From a5f3d4536d5a633acea5ba75c39d0b20fb74eefe Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Sun, 7 Jun 2020 18:58:47 +0200
Subject: [PATCH 001/210] Add RK-VS dimensionality reduction.

---
 docs/modules/preprocessing/dim_reduction.rst  |  18 +-
 .../variable_selection/__init__.py            |   1 +
 .../dim_reduction/variable_selection/_rkvs.py | 208 ++++++++++++++++++
 3 files changed, 226 insertions(+), 1 deletion(-)
 create mode 100644 skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py

diff --git a/docs/modules/preprocessing/dim_reduction.rst b/docs/modules/preprocessing/dim_reduction.rst
index ded6b831f..a7ff49b63 100644
--- a/docs/modules/preprocessing/dim_reduction.rst
+++ b/docs/modules/preprocessing/dim_reduction.rst
@@ -4,9 +4,25 @@ Dimensionality Reduction
 When dealing with data samples with high dimensionality, we often need to
 reduce the dimensions so we can better observe the data.
 
+Variable selection
+------------------
+One approach to reduce the dimensionality of the data is to select a subset of
+the original variables or features. This approach is called variable
+selection. In FDA, this means evaluating the function at a small number of
+points. These evaluations would be the selected features of the functional
+datum.
+
+The variable selection transformers implemented in scikit-fda are the
+following:
+
+.. autosummary::
+   :toctree: autosummary
+
+   skfda.preprocessing.dim_reduction.variable_selection.RKVS
+
 Projection
 ----------
-One way to reduce the dimension is through projection. For example, in
+Another way to reduce the dimension is through projection. For example, in
 functional principal component analysis, we project the data samples
 into a smaller sample of functions that preserve the maximum sample
 variance.
diff --git a/skfda/preprocessing/dim_reduction/variable_selection/__init__.py b/skfda/preprocessing/dim_reduction/variable_selection/__init__.py
index e69de29bb..f3b82a57e 100644
--- a/skfda/preprocessing/dim_reduction/variable_selection/__init__.py
+++ b/skfda/preprocessing/dim_reduction/variable_selection/__init__.py
@@ -0,0 +1 @@
+from ._rkvs import RKVS
diff --git a/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py b/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py
new file mode 100644
index 000000000..2ddd4a218
--- /dev/null
+++ b/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py
@@ -0,0 +1,208 @@
+import sklearn.utils.validation
+
+import numpy as np
+import numpy.linalg as linalg
+
+from ....representation import FDataGrid
+
+
+def _rkvs(X, Y, n_components: int=1):
+    '''
+    Parameters
+    ----------
+    X
+        Matrix of trajectories
+    Y
+        Vector of class labels
+    n_components
+        Number of selected components
+    '''
+
+    X = np.atleast_2d(X)
+    assert n_components >= 1
+    assert n_components <= X.shape[1]
+
+    Y = np.asarray(Y)
+
+    selected_features = np.zeros(n_components, dtype=int)
+    score = np.zeros(n_components)
+    indexes = np.arange(0, X.shape[1])
+
+    # Calculate means and covariance matrix
+    class_1_trajectories = X[Y.ravel() == 1]
+    class_0_trajectories = X[Y.ravel() == 0]
+
+    means = (np.mean(class_1_trajectories, axis=0) -
+             np.mean(class_0_trajectories, axis=0))
+
+    class_1_count = sum(Y)
+    class_0_count = Y.shape[0] - class_1_count
+
+    class_1_proportion = class_1_count / Y.shape[0]
+    class_0_proportion = class_0_count / Y.shape[0]
+
+    # The result should be cast to 2D because of bug #11502 in numpy
+    variances = (
+        class_1_proportion * np.atleast_2d(
+            np.cov(class_1_trajectories, rowvar=False, bias=True)) +
+        class_0_proportion * np.atleast_2d(
+            np.cov(class_0_trajectories, rowvar=False, bias=True)))
+
+    # The first variable maximizes |mu(t)|/sigma(t)
+    mu_sigma = np.abs(means) / np.sqrt(np.diag(variances))
+
+    selected_features[0] = np.argmax(mu_sigma)
+    score[0] = mu_sigma[selected_features[0]]
+    indexes = np.delete(indexes, selected_features[0])
+
+    for i in range(1, n_components):
+        aux = np.zeros_like(indexes, dtype=np.float_)
+
+        for j in range(0, indexes.shape[0]):
+            new_selection = np.concatenate([selected_features[0:i],
+                                            [indexes[j]]])
+
+            new_means = np.atleast_2d(means[new_selection])
+
+            lstsq_solution = linalg.lstsq(
+                variances[new_selection[:, np.newaxis], new_selection],
+                new_means.T, rcond=None)[0]
+
+            aux[j] = new_means @ lstsq_solution
+
+        aux2 = np.argmax(aux)
+        selected_features[i] = indexes[aux2]
+        score[i] = aux[aux2]
+        indexes = np.delete(indexes, aux2)
+
+    return selected_features, score
+
+
+class RKVS(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
+    r'''
+    Reproducing kernel variable selection.
+
+    This is a variable selection method for homoscedastic binary
+    classification problems. It aims to select the variables
+    :math:`t_1, \ldots, t_d` which maximize the following quantity
+
+    .. math::
+        \phi(t_1, \ldots, t_d) = m_{t_1, \ldots, t_d}^T
+        K_{t_1, \ldots, t_d}^{-1} m_{t_1, \ldots, t_d}
+
+    where :math:`m_{t_1, \ldots, t_d}` is the difference of the mean
+    functions of both classes evaluated at points :math:`t_1, \ldots, t_d`
+    and :math:`K_{t_1, \ldots, t_d}` is the common covariance function
+    evaluated at the same points.
+
+    This method is optimal for variable selection in homoscedastic binary
+    classification problems when all possible combinations of points are
+    taken into account. That means that for all possible selections of
+    :math:`t_1, \ldots, t_d`, the one in which :math:`\phi(t_1, \ldots, t_d)`
+    is greater minimizes the optimal misclassification error of all the
+    classification problems with the reduced dimensionality.
+
+    In practice, however, the points are selected one at a time, using
+    a greedy approach, so this optimality is not always guaranteed.
+
+    Examples:
+
+        >>> from skfda.preprocessing.dim_reduction import variable_selection
+        >>> from skfda.datasets import make_gaussian_process
+        >>> import skfda
+        >>> import numpy as np
+
+        We create trajectories from two classes, one with zero mean and the
+        other with a peak-like mean. Both have Brownian covariance.
+
+        >>> n_samples = 10000
+        >>> n_features = 1000
+        >>>
+        >>> def mean_1(t):
+        ...     return (np.abs(t - 0.25)
+        ...             - 2 * np.abs(t - 0.5)
+        ...             + np.abs(t - 0.75))
+        >>>
+        >>> X_0 = make_gaussian_process(n_samples=n_samples // 2,
+        ...                             n_features=n_features,
+        ...                             random_state=0)
+        >>> X_1 = make_gaussian_process(n_samples=n_samples // 2,
+        ...                             n_features=n_features,
+        ...                             mean=mean_1,
+        ...                             random_state=1)
+        >>> X = skfda.concatenate((X_0, X_1))
+        >>>
+        >>> y = np.zeros(n_samples)
+        >>> y[n_samples // 2:] = 1
+
+        Select the relevant points to distinguish the two classes
+
+        >>> rkvs = variable_selection.RKVS(n_components=3)
+        >>> _ = rkvs.fit(X, y)
+        >>> point_mask = rkvs.get_support()
+        >>> points = X.sample_points[0][point_mask]
+        >>> np.allclose(points, [0.25, 0.5, 0.75], rtol=1e-2)
+        True
+
+        Apply the learned dimensionality reduction
+
+        >>> X_dimred = rkvs.transform(X)
+        >>> len(X.sample_points[0])
+        1000
+        >>> len(X_dimred.sample_points[0])
+        3
+
+    References:
+        [1] J. R. Berrendero, A. Cuevas, and J. L. Torrecilla, "On the Use of
+        Reproducing Kernel Hilbert Spaces in Functional Classification",
+        Journal of the American Statistical Association, vol. 113, no. 523,
+        pp. 1210-1218, Jul. 2018, doi: 10.1080/01621459.2017.1320287.
+
+    '''
+
+    def __init__(self, n_components: int=1):
+        self.n_components = n_components
+
+    def fit(self, X: FDataGrid, y):
+
+        n_unique_labels = len(np.unique(y))
+        if n_unique_labels != 2:
+            raise ValueError(f"RK-VS can only be used when there are only "
+                             f"two different labels, but there are "
+                             f"{n_unique_labels}")
+
+        if X.dim_domain != 1 or X.dim_codomain != 1:
+            raise ValueError("Domain and codomain dimensions must be 1")
+
+        X, y = sklearn.utils.validation.check_X_y(X.data_matrix[..., 0], y)
+
+        self.features_shape_ = X.shape[1:]
+
+        self.results_ = _rkvs(
+            X=X,
+            Y=y,
+            n_components=self.n_components)
+
+        return self
+
+    def transform(self, X: FDataGrid, Y=None):
+
+        sklearn.utils.validation.check_is_fitted(self)
+
+        X_matrix = sklearn.utils.validation.check_array(X.data_matrix[..., 0])
+
+        if X_matrix.shape[1:] != self.features_shape_:
+            raise ValueError("The trajectories have a different number of "
+                             "points than the ones fitted")
+
+        return X.copy(data_matrix=X_matrix[:, self.results_[0]],
+                      sample_points=X.sample_points[0][self.results_[0]])
+
+    def get_support(self, indices: bool=False):
+        indexes_unraveled = self.results_[0]
+        if indices:
+            return indexes_unraveled
+        else:
+            mask = np.zeros(self.features_shape_[0], dtype=bool)
+            mask[self.results_[0]] = True
+            return mask

From 3ca0ceda62369fbda882b76b117005e9040221ae Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Tue, 9 Jun 2020 19:42:40 +0200
Subject: [PATCH 002/210] Change RKVS to return multivariate data.

---
 .../dim_reduction/variable_selection/_rkvs.py | 24 ++++++++++---------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py b/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py
index 2ddd4a218..06b62b195 100644
--- a/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py
+++ b/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py
@@ -105,6 +105,9 @@ class RKVS(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
     In practice, however, the points are selected one at a time, using
     a greedy approach, so this optimality is not always guaranteed.
 
+    Parameters:
+        n_components (int): number of variables to select.
+
     Examples:
 
         >>> from skfda.preprocessing.dim_reduction import variable_selection
@@ -149,8 +152,8 @@ class RKVS(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
         >>> X_dimred = rkvs.transform(X)
         >>> len(X.sample_points[0])
         1000
-        >>> len(X_dimred.sample_points[0])
-        3
+        >>> X_dimred.shape
+        (10000, 3)
 
     References:
         [1] J. R. Berrendero, A. Cuevas, and J. L. Torrecilla, "On the Use of
@@ -176,9 +179,9 @@ def fit(self, X: FDataGrid, y):
 
         X, y = sklearn.utils.validation.check_X_y(X.data_matrix[..., 0], y)
 
-        self.features_shape_ = X.shape[1:]
+        self._features_shape_ = X.shape[1:]
 
-        self.results_ = _rkvs(
+        self._features_, self._scores_ = _rkvs(
             X=X,
             Y=y,
             n_components=self.n_components)
@@ -191,18 +194,17 @@ def transform(self, X: FDataGrid, Y=None):
 
         X_matrix = sklearn.utils.validation.check_array(X.data_matrix[..., 0])
 
-        if X_matrix.shape[1:] != self.features_shape_:
+        if X_matrix.shape[1:] != self._features_shape_:
             raise ValueError("The trajectories have a different number of "
                              "points than the ones fitted")
 
-        return X.copy(data_matrix=X_matrix[:, self.results_[0]],
-                      sample_points=X.sample_points[0][self.results_[0]])
+        return X_matrix[:, self._features_]
 
     def get_support(self, indices: bool=False):
-        indexes_unraveled = self.results_[0]
+        features = self._features_
         if indices:
-            return indexes_unraveled
+            return features
         else:
-            mask = np.zeros(self.features_shape_[0], dtype=bool)
-            mask[self.results_[0]] = True
+            mask = np.zeros(self._features_shape_[0], dtype=bool)
+            mask[features] = True
             return mask

From f55c7caacfe373c82c465487b4b3c0032dc2669e Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Wed, 17 Jun 2020 17:19:47 +0200
Subject: [PATCH 003/210] Rename to `RKHSVariableSelection`.

---
 .../dim_reduction/variable_selection/_rkvs.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py b/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py
index 06b62b195..84f8e26cd 100644
--- a/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py
+++ b/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py
@@ -6,7 +6,7 @@
 from ....representation import FDataGrid
 
 
-def _rkvs(X, Y, n_components: int=1):
+def _rkhs_vs(X, Y, n_components: int=1):
     '''
     Parameters
     ----------
@@ -78,7 +78,8 @@ def _rkhs_vs(X, Y, n_components: int=1):
     return selected_features, score
 
 
-class RKVS(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
+class RKHSVariableSelection(sklearn.base.BaseEstimator,
+                            sklearn.base.TransformerMixin):
     r'''
     Reproducing kernel variable selection.
 
@@ -140,7 +141,7 @@ class RKHSVariableSelection(sklearn.base.BaseEstimator,
 
         Select the relevant points to distinguish the two classes
 
-        >>> rkvs = variable_selection.RKVS(n_components=3)
+        >>> rkvs = variable_selection.RKHSVariableSelection(n_components=3)
         >>> _ = rkvs.fit(X, y)
         >>> point_mask = rkvs.get_support()
        >>> points = X.sample_points[0][point_mask]
@@ -181,7 +182,7 @@ def fit(self, X: FDataGrid, y):
 
         self._features_shape_ = X.shape[1:]
 
-        self._features_, self._scores_ = _rkvs(
+        self._features_, self._scores_ = _rkhs_vs(
             X=X,
             Y=y,
             n_components=self.n_components)

From 71ad9f509dd9cde84c0176cb3a709f72e24e6a5f Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Wed, 17 Jun 2020 18:37:58 +0200
Subject: [PATCH 004/210] Update doc link.

---
 docs/modules/preprocessing/dim_reduction.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/modules/preprocessing/dim_reduction.rst b/docs/modules/preprocessing/dim_reduction.rst
index a7ff49b63..9082d931a 100644
--- a/docs/modules/preprocessing/dim_reduction.rst
+++ b/docs/modules/preprocessing/dim_reduction.rst
@@ -18,7 +18,7 @@ following:
 .. autosummary::
    :toctree: autosummary
 
-   skfda.preprocessing.dim_reduction.variable_selection.RKVS
+   skfda.preprocessing.dim_reduction.variable_selection.RKHSVariableSelection
 
 Projection
 ----------

From f5f2845cd28ef474c779d633e302a53928cda48d Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Wed, 17 Jun 2020 18:47:09 +0200
Subject: [PATCH 005/210] Fix imports.

---
 skfda/preprocessing/dim_reduction/__init__.py                  | 1 +
 .../preprocessing/dim_reduction/variable_selection/__init__.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/skfda/preprocessing/dim_reduction/__init__.py b/skfda/preprocessing/dim_reduction/__init__.py
index 641ba946c..b079520b4 100644
--- a/skfda/preprocessing/dim_reduction/__init__.py
+++ b/skfda/preprocessing/dim_reduction/__init__.py
@@ -1 +1,2 @@
 from . import projection
+from . import variable_selection
diff --git a/skfda/preprocessing/dim_reduction/variable_selection/__init__.py b/skfda/preprocessing/dim_reduction/variable_selection/__init__.py
index f3b82a57e..48c69de54 100644
--- a/skfda/preprocessing/dim_reduction/variable_selection/__init__.py
+++ b/skfda/preprocessing/dim_reduction/variable_selection/__init__.py
@@ -1 +1 @@
-from ._rkvs import RKVS
+from ._rkvs import RKHSVariableSelection

From 284827f69e58c387eb818e9130d4a48cc3758e01 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Thu, 16 Jul 2020 14:08:17 +0200
Subject: [PATCH 006/210] Improved documentation of RKHSVariableSelection.

---
 .../dim_reduction/variable_selection/_rkvs.py | 40 ++++++++++++-------
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py b/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py
index 84f8e26cd..310d2bd7a 100644
--- a/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py
+++ b/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py
@@ -83,9 +83,12 @@ class RKHSVariableSelection(sklearn.base.BaseEstimator,
     r'''
     Reproducing kernel variable selection.
 
-    This is a variable selection method for homoscedastic binary
-    classification problems. It aims to select the variables
-    :math:`t_1, \ldots, t_d` which maximize the following quantity
+    This is a filter variable selection method for binary classification
+    problems. With a fixed number :math:`d` of variables to select, it aims to
+    find the variables :math:`X(t_1), \ldots, X(t_d)` for the values
+    :math:`t_1, \ldots, t_d` that maximize the separation of
+    the class means in the reduced space, measured using the Mahalanobis
+    distance
 
     .. math::
         \phi(t_1, \ldots, t_d) = m_{t_1, \ldots, t_d}^T
@@ -96,19 +99,24 @@ class RKHSVariableSelection(sklearn.base.BaseEstimator,
     where :math:`m_{t_1, \ldots, t_d}` is the difference of the mean
     functions of both classes evaluated at points :math:`t_1, \ldots, t_d`
     and :math:`K_{t_1, \ldots, t_d}` is the common covariance function
     evaluated at the same points.
 
-    This method is optimal for variable selection in homoscedastic binary
-    classification problems when all possible combinations of points are
-    taken into account. That means that for all possible selections of
-    :math:`t_1, \ldots, t_d`, the one in which :math:`\phi(t_1, \ldots, t_d)`
-    is greater minimizes the optimal misclassification error of all the
-    classification problems with the reduced dimensionality.
-
-    In practice, however, the points are selected one at a time, using
+    This method is optimal, with a fixed value of :math:`d`, for variable
+    selection in Gaussian binary classification problems with the same
+    covariance in both classes (homoscedasticity), when all possible
+    combinations of points are taken into account. That means that for all
+    possible selections of :math:`t_1, \ldots, t_d`, the one in which
+    :math:`\phi(t_1, \ldots, t_d)` is greater minimizes the optimal
+    misclassification error of all the classification problems with the
+    reduced dimensionality. For a longer discussion about the optimality and
+    consistency of this method, we refer the reader to the original
+    article [1]_.
+
+    In practice, the points are selected one at a time, using
     a greedy approach, so this optimality is not always guaranteed.
 
     Parameters:
+
         n_components (int): number of variables to select.
 
     Examples:
@@ -157,10 +165,12 @@ class RKHSVariableSelection(sklearn.base.BaseEstimator,
         (10000, 3)
 
     References:
-        [1] J. R. Berrendero, A. Cuevas, and J. L. Torrecilla, "On the Use of
-        Reproducing Kernel Hilbert Spaces in Functional Classification",
-        Journal of the American Statistical Association, vol. 113, no. 523,
-        pp. 1210-1218, Jul. 2018, doi: 10.1080/01621459.2017.1320287.
+
+    .. [1] J. R. Berrendero, A. Cuevas, and J. L. Torrecilla, "On the Use
+           of Reproducing Kernel Hilbert Spaces in Functional
+           Classification", Journal of the American Statistical
+           Association, vol. 113, no. 523, pp. 1210-1218, Jul. 2018,
+           doi: 10.1080/01621459.2017.1320287.
 
     '''

From 22fc54980da40a8bf3d6305fda128a685ba7e40f Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Thu, 16 Jul 2020 14:37:30 +0200
Subject: [PATCH 007/210] Add documentation of `get_support`.

---
 .../dim_reduction/variable_selection/_rkvs.py | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py b/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py
index 310d2bd7a..db4518a71 100644
--- a/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py
+++ b/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py
@@ -212,6 +212,26 @@ def transform(self, X: FDataGrid, Y=None):
         return X_matrix[:, self._features_]
 
     def get_support(self, indices: bool=False):
+        """
+        Get a mask, or integer index, of the features selected.
+
+        Parameters:
+
+            indices : boolean (default False)
+                If True, the return value will be an array of integers, rather
+                than a boolean mask.
+
+        Returns:
+            support : array
+                An index that selects the retained features from an
+                `FDataGrid` object.
+                If `indices` is False, this is a boolean array of shape
+                [# input features], in which an element is True iff its
+                corresponding feature is selected for retention. If `indices`
+                is True, this is an integer array of shape [# output features]
+                whose values are indices into the input feature vector.
+
+        """
         features = self._features_
         if indices:
             return features

From b275e27178f29ab3527865b99537d3425303b335 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Thu, 23 Jul 2020 17:31:11 +0200
Subject: [PATCH 008/210] First version of sample names.
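A rough sketch of the intended behaviour of the new keyword (the data and
names below are invented for illustration): `sample_names` is preserved by
indexing, statistics receive descriptive names, and `concatenate` joins the
names of its operands.

    >>> import numpy as np
    >>> import skfda
    >>> fd = skfda.FDataGrid(
    ...     data_matrix=np.arange(15).reshape(3, 5),
    ...     sample_names=("patient_a", "patient_b", "patient_c"))
    >>> fd[1].sample_names
    ('patient_b',)
    >>> fd.mean().sample_names
    ('mean',)
    >>> fd.concatenate(fd[0]).sample_names
    ('patient_a', 'patient_b', 'patient_c', 'patient_a')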
---
 skfda/ml/clustering/kmeans.py                | 20 ++++----
 .../dim_reduction/projection/_fpca.py        |  6 ++-
 skfda/preprocessing/registration/elastic.py  |  3 +-
 skfda/representation/_functional_data.py     | 28 ++++++++++-
 skfda/representation/basis/_fdatabasis.py    | 44 ++++++++++++----
 skfda/representation/grid.py                 | 50 +++++++++++++------
 6 files changed, 111 insertions(+), 40 deletions(-)

diff --git a/skfda/ml/clustering/kmeans.py b/skfda/ml/clustering/kmeans.py
index aed86ce2c..7edacfe02 100644
--- a/skfda/ml/clustering/kmeans.py
+++ b/skfda/ml/clustering/kmeans.py
@@ -3,13 +3,15 @@
 from abc import abstractmethod
 import warnings
 
-import numpy as np
 from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
 from sklearn.utils import check_random_state
 from sklearn.utils.validation import check_is_fitted
 
+import numpy as np
+
 from ...misc.metrics import pairwise_distance, lp_distance
 
+
 __author__ = "Amanda Hernando Bernabé"
 __email__ = "amanda.hernando@estudiante.uam.es"
@@ -130,7 +132,7 @@ def _init_centroids(self, fdatagrid, random_state):
         if self.init is None:
             _, idx = np.unique(fdatagrid.data_matrix,
                                axis=0, return_index=True)
-            unique_data = fdatagrid.data_matrix[np.sort(idx)]
+            unique_data = fdatagrid[np.sort(idx)]
 
             if len(unique_data) < self.n_clusters:
                 return ValueError("Not enough unique data points to "
@@ -141,7 +143,7 @@ def _init_centroids(self, fdatagrid, random_state):
                 :self.n_clusters]
             centroids = unique_data[indices]
 
-            return fdatagrid.copy(data_matrix=centroids)
+            return centroids.copy()
         else:
             return self.init.copy()
@@ -289,21 +291,21 @@ def predict(self, X, sample_weight=None):
         """
         check_is_fitted(self)
         self._check_test_data(X)
-
+
         membership_matrix = self._create_membership(X.n_samples)
         centroids = self.cluster_centers_.copy()
-
+
         pairwise_metric = pairwise_distance(self.metric)
-
+
         distances_to_centroids = pairwise_metric(fdata1=X,
                                                  fdata2=centroids)
-
+
         self._update(
             fdata=X,
             membership_matrix=membership_matrix,
            distances_to_centroids=distances_to_centroids,
             centroids=centroids)
-
+
         return membership_matrix
 
     def transform(self, X):
@@ -725,7 +727,7 @@ def _update(self, fdata, membership_matrix, distances_to_centroids,
         membership_matrix_raised = np.power(
             membership_matrix, self.fuzzifier)
 
-        slice_denominator = ((slice(None),) + (np.newaxis,) *
+        slice_denominator = ((slice(None),) + (np.newaxis,) *
                              (fdata.data_matrix.ndim - 1))
         centroids.data_matrix[:] = (
             np.einsum('ij,i...->j...', membership_matrix_raised,
diff --git a/skfda/preprocessing/dim_reduction/projection/_fpca.py b/skfda/preprocessing/dim_reduction/projection/_fpca.py
index 95ca70b1f..5dd3d30a7 100644
--- a/skfda/preprocessing/dim_reduction/projection/_fpca.py
+++ b/skfda/preprocessing/dim_reduction/projection/_fpca.py
@@ -202,7 +202,8 @@ def _fit_basis(self, X: FDataBasis, y=None):
         self.explained_variance_ratio_ = pca.explained_variance_ratio_
         self.explained_variance_ = pca.explained_variance_
         self.components_ = X.copy(basis=components_basis,
-                                  coefficients=component_coefficients)
+                                  coefficients=component_coefficients,
+                                  sample_names=(None,) * self.n_components)
 
         return self
@@ -319,7 +320,8 @@ def _fit_grid(self, X: FDataGrid, y=None):
         pca.fit(final_matrix)
         self.components_ = X.copy(data_matrix=np.transpose(
             np.linalg.solve(np.sqrt(weights_matrix),
-                            np.transpose(pca.components_))))
+                            np.transpose(pca.components_))),
+            sample_names=(None,) * self.n_components)
 
         self.explained_variance_ratio_ = pca.explained_variance_ratio_
         self.explained_variance_ = pca.explained_variance_
diff --git a/skfda/preprocessing/registration/elastic.py b/skfda/preprocessing/registration/elastic.py
index a073c2438..a514dd7e9 100644
--- a/skfda/preprocessing/registration/elastic.py
+++ b/skfda/preprocessing/registration/elastic.py
@@ -754,7 +754,8 @@ def elastic_mean(fdatagrid, *, penalty=0., center=True, max_iter=20, tol=1e-3,
 
     # Karcher mean orbit in space L2/Gamma
     karcher_mean = srsf_transformer.inverse_transform(
-        fdatagrid.copy(data_matrix=[mu], sample_points=eval_points))
+        fdatagrid.copy(data_matrix=[mu], sample_points=eval_points,
+                       sample_names=("Karcher mean",)))
 
     if center:
         # Gamma mean in Hilbert Sphere
diff --git a/skfda/representation/_functional_data.py b/skfda/representation/_functional_data.py
index b62acdb92..7af3a9c9c 100644
--- a/skfda/representation/_functional_data.py
+++ b/skfda/representation/_functional_data.py
@@ -36,7 +36,8 @@ def __init__(self, *, extrapolation,
                  dataset_label=None,
                  axes_labels=None,
                  argument_names=None,
-                 coordinate_names=None):
+                 coordinate_names=None,
+                 sample_names=None):
 
         self.extrapolation = extrapolation
         self.dataset_name = dataset_name
@@ -47,6 +48,7 @@ def __init__(self, *, extrapolation,
         self.argument_names = argument_names
         self.coordinate_names = coordinate_names
         self.axes_labels = axes_labels
+        self.sample_names = sample_names
 
     @property
     def dataset_label(self):
@@ -123,6 +125,22 @@ def axes_labels(self, labels):
             self.argument_names = labels[:self.dim_domain]
             self.coordinate_names = labels[self.dim_domain:]
 
+    @property
+    def sample_names(self):
+        return self._sample_names
+
+    @sample_names.setter
+    def sample_names(self, names):
+        if names is None:
+            names = (None,) * self.n_samples
+        else:
+            names = tuple(names)
+            if len(names) != self.n_samples:
+                raise ValueError("There must be a name for each of the "
+                                 "samples.")
+
+        self._sample_names = names
+
     @property
     @abstractmethod
     def n_samples(self):
@@ -628,6 +646,14 @@ def __eq__(self, other):
             and self.coordinate_names == other.coordinate_names
         )
 
+    def _copy_op(self, other, **kwargs):
+
+        base_copy = (other if isinstance(other, type(self))
+                     and self.n_samples == 1 and other.n_samples != 1
+                     else self)
+
+        return base_copy.copy(**kwargs)
+
     @abstractmethod
     def __add__(self, other):
         """Addition for FData object."""
diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py
index 8a719a7db..dea6b79e5 100644
--- a/skfda/representation/basis/_fdatabasis.py
+++ b/skfda/representation/basis/_fdatabasis.py
@@ -1,5 +1,6 @@
 from builtins import isinstance
 import copy
+import numbers
 
 import pandas.api.extensions
 
@@ -88,7 +89,9 @@ def __len__(self):
 
     def __init__(self, basis, coefficients, *, dataset_label=None,
                  dataset_name=None, axes_labels=None, argument_names=None,
-                 coordinate_names=None, extrapolation=None):
+                 coordinate_names=None,
+                 sample_names=None,
+                 extrapolation=None):
         """Construct a FDataBasis object.
 
         Args:
@@ -110,7 +113,8 @@ def __init__(self, basis, coefficients, *, dataset_label=None,
             dataset_name=dataset_name,
             axes_labels=axes_labels,
             argument_names=argument_names,
-            coordinate_names=coordinate_names)
+            coordinate_names=coordinate_names,
+            sample_names=sample_names)
 
     @classmethod
     def from_data(cls, data_matrix, sample_points, basis,
@@ -395,10 +399,12 @@ def mean(self, weights=None):
             return self.copy(coefficients=np.average(self.coefficients,
                                                      weights=weights,
                                                      axis=0
-                                                     )[np.newaxis, ...]
+                                                     )[np.newaxis, ...],
+                             sample_names=("mean",)
                              )
 
-        return self.copy(coefficients=np.mean(self.coefficients, axis=0))
+        return self.copy(coefficients=np.mean(self.coefficients, axis=0),
+                         sample_names=("mean",))
 
     def gmean(self, eval_points=None):
         """Compute the geometric mean of the functional data object.
@@ -528,6 +534,7 @@ def copy(self, *, basis=None, coefficients=None,
              dataset_name=None,
             argument_names=None,
             coordinate_names=None,
+            sample_names=None,
             extrapolation=None):
         """FDataBasis copy"""
 
@@ -541,11 +548,17 @@ def copy(self, *, basis=None, coefficients=None,
             dataset_name = self.dataset_name
 
         if argument_names is None:
+            # Tuple, immutable
             argument_names = self.argument_names
 
         if coordinate_names is None:
+            # Tuple, immutable
            coordinate_names = self.coordinate_names
 
+        if sample_names is None:
+            # Tuple, immutable
+            sample_names = self.sample_names
+
         if extrapolation is None:
             extrapolation = self.extrapolation
 
@@ -553,6 +566,7 @@ def copy(self, *, basis=None, coefficients=None,
                          dataset_name=dataset_name,
                          argument_names=argument_names,
                          coordinate_names=coordinate_names,
+                         sample_names=sample_names,
                          extrapolation=extrapolation)
 
     def times(self, other):
@@ -689,7 +703,10 @@ def concatenate(self, *others, as_coordinates=False):
 
         data = [self.coefficients] + [other.coefficients for other in others]
 
-        return self.copy(coefficients=np.concatenate(data, axis=0))
+        sample_names = [fd.sample_names for fd in [self, *others]]
+
+        return self.copy(coefficients=np.concatenate(data, axis=0),
+                         sample_names=sum(sample_names, ()))
 
     def compose(self, fd, *, eval_points=None, **kwargs):
         """Composition of functions.
@@ -718,19 +735,24 @@ def compose(self, fd, *, eval_points=None, **kwargs):
 
     def __getitem__(self, key):
         """Return self[key]."""
 
-        if isinstance(key, int):
-            return self.copy(coefficients=self.coefficients[key:key + 1])
+        if isinstance(key, numbers.Integral):  # To accept also numpy ints
+            key = int(key)
+            return self.copy(coefficients=self.coefficients[key:key + 1],
+                             sample_names=self.sample_names[key:key + 1])
 
         else:
-            return self.copy(coefficients=self.coefficients[key])
+            return self.copy(coefficients=self.coefficients[key],
+                             sample_names=np.array(self.sample_names)[key])
 
     def __add__(self, other):
         """Addition for FDataBasis object."""
+
         if isinstance(other, FDataBasis):
             if self.basis != other.basis:
                 return NotImplemented
             else:
                 basis, coefs = self.basis._add_same_basis(self.coefficients,
                                                           other.coefficients)
+
         else:
             try:
                 basis, coefs = self.basis._add_constant(self.coefficients,
@@ -738,7 +760,7 @@ def __add__(self, other):
                                                         other)
             except TypeError:
                 return NotImplemented
 
-        return self.copy(basis=basis, coefficients=coefs)
+        return self._copy_op(other, basis=basis, coefficients=coefs)
 
     def __radd__(self, other):
         """Addition for FDataBasis object."""
@@ -760,7 +782,7 @@ def __sub__(self, other):
             except TypeError:
                 return NotImplemented
 
-        return self.copy(basis=basis, coefficients=coefs)
+        return self._copy_op(other, basis=basis, coefficients=coefs)
 
     def __rsub__(self, other):
         """Right subtraction for FDataBasis object."""
@@ -776,7 +798,7 @@ def __mul__(self, other):
             except TypeError:
                 return NotImplemented
 
-        return self.copy(basis=basis, coefficients=coefs)
+        return self._copy_op(other, basis=basis, coefficients=coefs)
 
     def __rmul__(self, other):
         """Multiplication for FDataBasis object."""
diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py
index 7d67d965e..2ecb74c75 100644
--- a/skfda/representation/grid.py
+++ b/skfda/representation/grid.py
@@ -139,6 +139,7 @@ def __init__(self, data_matrix, sample_points=None,
                  dataset_name=None,
                  argument_names=None,
                  coordinate_names=None,
+                 sample_names=None,
                  axes_labels=None, extrapolation=None,
                  interpolation=None):
         """Construct a FDataGrid object.
@@ -218,7 +219,8 @@ def __init__(self, data_matrix, sample_points=None,
             dataset_name=dataset_name,
             axes_labels=axes_labels,
             argument_names=argument_names,
-            coordinate_names=coordinate_names)
+            coordinate_names=coordinate_names,
+            sample_names=sample_names)
 
     def round(self, decimals=0):
         """Evenly round to the given number of decimals.
@@ -459,11 +461,13 @@ def mean(self, weights=None):
 
         if weights is not None:
 
             return self.copy(data_matrix=np.average(
-                self.data_matrix, weights=weights, axis=0)[np.newaxis, ...]
+                self.data_matrix, weights=weights, axis=0)[np.newaxis, ...],
+                sample_names=("mean",)
             )
 
         return self.copy(data_matrix=self.data_matrix.mean(axis=0,
-                                                           keepdims=True))
+                                                           keepdims=True),
+                         sample_names=("mean",))
 
     def var(self):
         """Compute the variance of a set of samples in a FDataGrid object.
@@ -473,7 +477,8 @@ def var(self):
             variance of all the samples in the original FDataGrid object.
 
         """
-        return self.copy(data_matrix=[np.var(self.data_matrix, 0)])
+        return self.copy(data_matrix=[np.var(self.data_matrix, 0)],
+                         sample_names=("variance",))
 
     def cov(self):
         """Compute the covariance.
@@ -502,7 +507,8 @@ def cov(self):
             domain_range=[self.domain_range[0],
                           self.domain_range[0]],
             dataset_name=dataset_name,
-            argument_names=self.argument_names * 2)
+            argument_names=self.argument_names * 2,
+            sample_names=("covariance",))
 
     def gmean(self):
         """Compute the geometric mean of all samples in the FDataGrid object.
@@ -514,7 +520,8 @@ def gmean(self):
 
         """
         return self.copy(data_matrix=[
-            scipy.stats.mstats.gmean(self.data_matrix, 0)])
+            scipy.stats.mstats.gmean(self.data_matrix, 0)],
+            sample_names=("geometric mean",))
 
     def __eq__(self, other):
         """Comparison of FDataGrid objects"""
@@ -575,7 +582,7 @@ def __add__(self, other):
         if data_matrix is None:
             return NotImplemented
 
-        return self.copy(data_matrix=self.data_matrix + data_matrix)
+        return self._copy_op(other, data_matrix=self.data_matrix + data_matrix)
 
     def __radd__(self, other):
         """Addition for FDataGrid object.
@@ -596,7 +603,7 @@ def __sub__(self, other):
         if data_matrix is None:
             return NotImplemented
 
-        return self.copy(data_matrix=self.data_matrix - data_matrix)
+        return self._copy_op(other, data_matrix=self.data_matrix - data_matrix)
 
     def __rsub__(self, other):
         """Right Subtraction for FDataGrid object.
@@ -620,7 +627,7 @@ def __mul__(self, other):
         if data_matrix is None:
             return NotImplemented
 
-        return self.copy(data_matrix=self.data_matrix * data_matrix)
+        return self._copy_op(other, data_matrix=self.data_matrix * data_matrix)
 
     def __rmul__(self, other):
         """Multiplication for FDataGrid object.
@@ -640,7 +647,7 @@ def __truediv__(self, other):
         if data_matrix is None:
             return NotImplemented
 
-        return self.copy(data_matrix=self.data_matrix / data_matrix)
+        return self._copy_op(other, data_matrix=self.data_matrix / data_matrix)
 
     def __rtruediv__(self, other):
         """Division for FDataGrid object.
@@ -652,7 +659,7 @@ def __rtruediv__(self, other):
         if data_matrix is None:
             return NotImplemented
 
-        return self.copy(data_matrix=data_matrix / self.data_matrix)
+        return self._copy_op(other, data_matrix=data_matrix / self.data_matrix)
 
     def concatenate(self, *others, as_coordinates=False):
         """Join samples from a similar FDataGrid object.
@@ -711,14 +718,17 @@ def concatenate(self, *others, as_coordinates=False):
 
         if as_coordinates:
-            coordinate_names = [
-                fd.coordinate_names for fd in [self, *others]]
+            coordinate_names = [fd.coordinate_names for fd in [self, *others]]
 
             return self.copy(data_matrix=np.concatenate(data, axis=-1),
                              coordinate_names=sum(coordinate_names, ()))
 
         else:
-            return self.copy(data_matrix=np.concatenate(data, axis=0))
+
+            sample_names = [fd.sample_names for fd in [self, *others]]
+
+            return self.copy(data_matrix=np.concatenate(data, axis=0),
+                             sample_names=sum(sample_names, ()))
 
     def scatter(self, *args, **kwargs):
         """Scatter plot of the FDatGrid object.
@@ -811,6 +821,7 @@ def copy(self, *,
              dataset_name=None,
             argument_names=None,
             coordinate_names=None,
+            sample_names=None,
             extrapolation=None,
             interpolation=None):
         """Returns a copy of the FDataGrid.
@@ -842,6 +853,10 @@ def copy(self, *,
             # Tuple, immutable
             coordinate_names = self.coordinate_names
 
+        if sample_names is None:
+            # Tuple, immutable
+            sample_names = self.sample_names
+
         if extrapolation is None:
             extrapolation = self.extrapolation
 
@@ -853,6 +868,7 @@ def copy(self, *,
             dataset_name=dataset_name,
             argument_names=argument_names,
             coordinate_names=coordinate_names,
+            sample_names=sample_names,
             extrapolation=extrapolation,
             interpolation=interpolation)
 
@@ -1030,10 +1046,12 @@ def __getitem__(self, key):
 
         if isinstance(key, numbers.Integral):  # To accept also numpy ints
             key = int(key)
 
-            return self.copy(data_matrix=self.data_matrix[key:key + 1])
+            return self.copy(data_matrix=self.data_matrix[key:key + 1],
+                             sample_names=self.sample_names[key:key + 1])
 
         else:
-            return self.copy(data_matrix=self.data_matrix[key])
+            return self.copy(data_matrix=self.data_matrix[key],
+                             sample_names=np.array(self.sample_names)[key])
 
     #####################################################################
     # Numpy methods

From 915093e8246a407c0f43d26d8d63014105eb51a6 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Fri, 24 Jul 2020 13:04:50 +0200
Subject: [PATCH 009/210] Pass constructor tests for Pandas ExtensionArrays.

---
 skfda/representation/_functional_data.py  |  16 +-
 skfda/representation/basis/_fdatabasis.py |   5 +-
 skfda/representation/grid.py              |   5 +-
 tests/test_pandas.py                      |  10 +-
 tests/test_pandas_fdatabasis.py           | 191 ++++++++++++++++++++++
 tests/test_pandas_fdatagrid.py            | 190 +++++++++++++++++++++
 6 files changed, 402 insertions(+), 15 deletions(-)
 create mode 100644 tests/test_pandas_fdatabasis.py
 create mode 100644 tests/test_pandas_fdatagrid.py

diff --git a/skfda/representation/_functional_data.py b/skfda/representation/_functional_data.py
index b62acdb92..7d6b4e29c 100644
--- a/skfda/representation/_functional_data.py
+++ b/skfda/representation/_functional_data.py
@@ -691,7 +691,7 @@ def __len__(self):
     # Numpy methods
     #####################################################################
 
-    def to_numpy(self):
+    def __array__(self, *args, **kwargs):
         """Returns a numpy array with the objects"""
 
         # This is to prevent numpy to access inner dimensions
@@ -719,6 +719,9 @@ def ndim(self):
 
     @classmethod
     def _from_sequence(cls, scalars, dtype=None, copy=False):
+
+        if isinstance(scalars, cls):
+            scalars = [scalars]
 
         if copy:
             scalars = [f.copy() for f in scalars]
@@ -794,7 +797,7 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=0):
         # If the ExtensionArray is backed by an ndarray, then
         # just pass that here instead of coercing to object.
-        data = self.to_numpy()
+        data = np.asarray(self)
         if allow_fill and fill_value is None:
             fill_value = self.dtype.na_value
         # fill value should always be translated from the scalar
@@ -817,11 +820,12 @@ def _concat_same_type(
 
         Returns:
             FData
-        """
-        first, *others = to_concat
+        """
+        if isinstance(to_concat, cls):
+            return to_concat
 
-        return first.concatenate(*others)
+        return concatenate(to_concat)
 
 
 def concatenate(objects, as_coordinates=False):
@@ -848,7 +852,7 @@ def concatenate(objects, as_coordinates=False):
     objects = iter(objects)
     first = next(objects, None)
 
-    if not first:
+    if first is None:
         raise ValueError("At least one FData object must be provided "
                          "to concatenate.")
diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py
index 8a719a7db..035214e74 100644
--- a/skfda/representation/basis/_fdatabasis.py
+++ b/skfda/representation/basis/_fdatabasis.py
@@ -805,7 +805,7 @@ def __rtruediv__(self, other):
     @property
     def dtype(self):
         """The dtype for this extension array, FDataGridDType"""
-        return FDataBasisDType
+        return FDataBasisDType()
 
     @property
     def nbytes(self) -> int:
@@ -815,11 +815,12 @@ def nbytes(self) -> int:
         return self.coefficients.nbytes()
 
 
+@pandas.api.extensions.register_extension_dtype
 class FDataBasisDType(pandas.api.extensions.ExtensionDtype):
     """
     DType corresponding to FDataBasis in Pandas
     """
-    name = 'functional data (basis)'
+    name = 'FDataBasis'
     kind = 'O'
     type = FDataBasis
     na_value = None
diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py
index 7d67d965e..40d95e7ed 100644
--- a/skfda/representation/grid.py
+++ b/skfda/representation/grid.py
@@ -1078,7 +1078,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
     @property
     def dtype(self):
         """The dtype for this extension array, FDataGridDType"""
-        return FDataGridDType
+        return FDataGridDType()
 
     @property
     def nbytes(self) -> int:
@@ -1089,11 +1089,12 @@ def nbytes(self) -> int:
             p.nbytes() for p in self.sample_points)
 
 
+@pandas.api.extensions.register_extension_dtype
 class FDataGridDType(pandas.api.extensions.ExtensionDtype):
     """
     DType corresponding to FDataGrid in Pandas
     """
-    name = 'functional data (grid)'
+    name = 'FDataGrid'
     kind = 'O'
     type = FDataGrid
     na_value = None
diff --git a/tests/test_pandas.py b/tests/test_pandas.py
index 5c0247a38..2f69fd590 100644
--- a/tests/test_pandas.py
+++ b/tests/test_pandas.py
@@ -1,7 +1,7 @@
+import skfda
 import unittest
 
 import pandas as pd
-import skfda
 
 
 class TestPandas(unittest.TestCase):
@@ -14,28 +14,28 @@ def setUp(self):
 
     def test_fdatagrid_series(self):
         series = pd.Series(self.fd)
-        self.assertEqual(
+        self.assertIsInstance(
            series.dtype, skfda.representation.grid.FDataGridDType)
         self.assertEqual(len(series), self.fd.n_samples)
         self.assertEqual(series[0], self.fd[0])
 
     def test_fdatabasis_series(self):
         series = pd.Series(self.fd_basis)
-        self.assertEqual(
+        self.assertIsInstance(
            series.dtype, skfda.representation.basis.FDataBasisDType)
         self.assertEqual(len(series), self.fd_basis.n_samples)
         self.assertEqual(series[0], self.fd_basis[0])
 
     def test_fdatagrid_dataframe(self):
         df = pd.DataFrame({"function": self.fd})
-        self.assertEqual(
+        self.assertIsInstance(
            df["function"].dtype, skfda.representation.grid.FDataGridDType)
         self.assertEqual(len(df["function"]), self.fd.n_samples)
         self.assertEqual(df["function"][0], self.fd[0])
 
     def test_fdatabasis_dataframe(self):
         df = pd.DataFrame({"function": self.fd_basis})
-        self.assertEqual(
+        self.assertIsInstance(
            df["function"].dtype, skfda.representation.basis.FDataBasisDType)
         self.assertEqual(len(df["function"]), self.fd_basis.n_samples)
         self.assertEqual(df["function"][0], self.fd_basis[0])
diff --git a/tests/test_pandas_fdatabasis.py b/tests/test_pandas_fdatabasis.py
new file mode 100644
index 000000000..b467d6f93
--- /dev/null
+++ b/tests/test_pandas_fdatabasis.py
@@ -0,0 +1,191 @@
+import skfda
+
+from pandas import Series
+from pandas.tests.extension import base
+import pytest
+import numpy as np
+
+##############################################################################
+# Fixtures
+##############################################################################
+
+
+@pytest.fixture
+def dtype():
+    """A fixture providing the ExtensionDtype to validate."""
+    return skfda.representation.basis.FDataBasisDType
+
+
+@pytest.fixture
+def data():
+    """
+    Length-100 array for this type.
+    * data[0] and data[1] should both be non missing
+    * data[0] and data[1] should not be equal
+    """
+
+    basis = skfda.representation.basis.Monomial(n_basis=5)
+    coef_matrix = np.arange(100 * 5).reshape(100, 5)
+
+    return skfda.FDataBasis(basis=basis, coefficients=coef_matrix)
+
+
+@pytest.fixture
+def data_for_twos():
+    """Length-100 array in which all the elements are two."""
+    raise NotImplementedError
+
+
+@pytest.fixture
+def data_missing():
+    """Length-2 array with [NA, Valid]"""
+    raise NotImplementedError
+
+
+@pytest.fixture(params=["data", "data_missing"])
+def all_data(request, data, data_missing):
+    """Parametrized fixture giving 'data' and 'data_missing'"""
+    if request.param == "data":
+        return data
+    elif request.param == "data_missing":
+        return data_missing
+
+
+@pytest.fixture
+def data_repeated(data):
+    """
+    Generate many datasets.
+    Parameters
+    ----------
+    data : fixture implementing `data`
+    Returns
+    -------
+    Callable[[int], Generator]:
+        A callable that takes a `count` argument and
+        returns a generator yielding `count` datasets.
+    """
+
+    def gen(count):
+        for _ in range(count):
+            yield data
+
+    return gen
+
+
+@pytest.fixture
+def data_for_sorting():
+    """
+    Length-3 array with a known sort order.
+    This should be three items [B, C, A] with
+    A < B < C
+    """
+    raise NotImplementedError
+
+
+@pytest.fixture
+def data_missing_for_sorting():
+    """
+    Length-3 array with a known sort order.
+    This should be three items [B, NA, A] with
+    A < B and NA missing.
+    """
+    raise NotImplementedError
+
+
+@pytest.fixture
+def na_cmp():
+    """
+    Binary operator for comparing NA values.
+    Should return a function of two arguments that returns
+    True if both arguments are (scalar) NA for your type.
+    By default, uses ``operator.is_``
+    """
+    return operator.is_
+
+
+@pytest.fixture
+def na_value():
+    """The scalar missing value for this type. Default 'None'"""
+    return None
+
+
+@pytest.fixture
+def data_for_grouping():
+    """
+    Data for factorization, grouping, and unique tests.
+    Expected to be like [B, B, NA, NA, A, A, B, C]
+    Where A < B < C and NA is missing
+    """
+    raise NotImplementedError
+
+
+@pytest.fixture(params=[True, False])
+def box_in_series(request):
+    """Whether to box the data in a Series"""
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        lambda x: 1,
+        lambda x: [1] * len(x),
+        lambda x: Series([1] * len(x)),
+        lambda x: x,
+    ],
+    ids=["scalar", "list", "series", "object"],
+)
+def groupby_apply_op(request):
+    """
+    Functions to test groupby.apply().
+    """
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def as_frame(request):
+    """
+    Boolean fixture to support Series and Series.to_frame() comparison testing.
+    """
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def as_series(request):
+    """
+    Boolean fixture to support arr and Series(arr) comparison testing.
+    """
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def use_numpy(request):
+    """
+    Boolean fixture to support comparison testing of ExtensionDtype array
+    and numpy array.
+    """
+    return request.param
+
+
+@pytest.fixture(params=["ffill", "bfill"])
+def fillna_method(request):
+    """
+    Parametrized fixture giving method parameters 'ffill' and 'bfill' for
+    Series.fillna(method=) testing.
+    """
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def as_array(request):
+    """
+    Boolean fixture to support ExtensionDtype _from_sequence method testing.
+    """
+    return request.param
+
+##############################################################################
+# Tests
+##############################################################################
+
+
+class TestConstructors(base.BaseConstructorsTests):
+    pass
diff --git a/tests/test_pandas_fdatagrid.py b/tests/test_pandas_fdatagrid.py
new file mode 100644
index 000000000..66e1300b0
--- /dev/null
+++ b/tests/test_pandas_fdatagrid.py
@@ -0,0 +1,190 @@
+import skfda
+
+from pandas import Series
+from pandas.tests.extension import base
+import pytest
+import numpy as np
+
+##############################################################################
+# Fixtures
+##############################################################################
+
+
+@pytest.fixture
+def dtype():
+    """A fixture providing the ExtensionDtype to validate."""
+    return skfda.representation.grid.FDataGridDType
+
+
+@pytest.fixture
+def data():
+    """
+    Length-100 array for this type.
+    * data[0] and data[1] should both be non missing
+    * data[0] and data[1] should not be equal
+    """
+
+    data_matrix = np.arange(100 * 10).reshape(100, 10)
+
+    return skfda.FDataGrid(data_matrix)
+
+
+@pytest.fixture
+def data_for_twos():
+    """Length-100 array in which all the elements are two."""
+    raise NotImplementedError
+
+
+@pytest.fixture
+def data_missing():
+    """Length-2 array with [NA, Valid]"""
+    raise NotImplementedError
+
+
+@pytest.fixture(params=["data", "data_missing"])
+def all_data(request, data, data_missing):
+    """Parametrized fixture giving 'data' and 'data_missing'"""
+    if request.param == "data":
+        return data
+    elif request.param == "data_missing":
+        return data_missing
+
+
+@pytest.fixture
+def data_repeated(data):
+    """
+    Generate many datasets.
+    Parameters
+    ----------
+    data : fixture implementing `data`
+    Returns
+    -------
+    Callable[[int], Generator]:
+        A callable that takes a `count` argument and
+        returns a generator yielding `count` datasets.
+    """
+
+    def gen(count):
+        for _ in range(count):
+            yield data
+
+    return gen
+
+
+@pytest.fixture
+def data_for_sorting():
+    """
+    Length-3 array with a known sort order.
+    This should be three items [B, C, A] with
+    A < B < C
+    """
+    raise NotImplementedError
+
+
+@pytest.fixture
+def data_missing_for_sorting():
+    """
+    Length-3 array with a known sort order.
+    This should be three items [B, NA, A] with
+    A < B and NA missing.
+    """
+    raise NotImplementedError
+
+
+@pytest.fixture
+def na_cmp():
+    """
+    Binary operator for comparing NA values.
+    Should return a function of two arguments that returns
+    True if both arguments are (scalar) NA for your type.
+    By default, uses ``operator.is_``
+    """
+    return operator.is_
+
+
+@pytest.fixture
+def na_value():
+    """The scalar missing value for this type. Default 'None'"""
+    return None
+
+
+@pytest.fixture
+def data_for_grouping():
+    """
+    Data for factorization, grouping, and unique tests.
+    Expected to be like [B, B, NA, NA, A, A, B, C]
+    Where A < B < C and NA is missing
+    """
+    raise NotImplementedError
+
+
+@pytest.fixture(params=[True, False])
+def box_in_series(request):
+    """Whether to box the data in a Series"""
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        lambda x: 1,
+        lambda x: [1] * len(x),
+        lambda x: Series([1] * len(x)),
+        lambda x: x,
+    ],
+    ids=["scalar", "list", "series", "object"],
+)
+def groupby_apply_op(request):
+    """
+    Functions to test groupby.apply().
+    """
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def as_frame(request):
+    """
+    Boolean fixture to support Series and Series.to_frame() comparison testing.
+    """
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def as_series(request):
+    """
+    Boolean fixture to support arr and Series(arr) comparison testing.
+    """
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def use_numpy(request):
+    """
+    Boolean fixture to support comparison testing of ExtensionDtype array
+    and numpy array.
+    """
+    return request.param
+
+
+@pytest.fixture(params=["ffill", "bfill"])
+def fillna_method(request):
+    """
+    Parametrized fixture giving method parameters 'ffill' and 'bfill' for
+    Series.fillna(method=) testing.
+    """
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def as_array(request):
+    """
+    Boolean fixture to support ExtensionDtype _from_sequence method testing.
+    """
+    return request.param
+
+##############################################################################
+# Tests
+##############################################################################
+
+
+class TestConstructors(base.BaseConstructorsTests):
+    pass

From 4ded16632c7f0d467ec7cc244853726d0c43542d Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Fri, 24 Jul 2020 13:41:22 +0200
Subject: [PATCH 010/210] Pass Pandas ExtensionArray Dtype tests.
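A minimal sketch of what the string aliases registered in the previous
commit, together with the new `astype`, enable (the objects below are
hypothetical; "FDataBasis" behaves analogously to "FDataGrid"):

    >>> import numpy as np
    >>> import pandas as pd
    >>> import skfda
    >>> fd = skfda.FDataGrid(np.arange(15).reshape(3, 5))
    >>> series = pd.Series(fd)
    >>> pd.api.types.pandas_dtype("FDataGrid") == series.dtype
    True
    >>> series2 = series.astype("FDataGrid")  # same dtype: just a copy

pandas resolves the string alias through the extension dtype registry,
and the `astype` override now returns a copy when the requested dtype is
already the dtype of the array, instead of coercing to a NumPy array.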
---
 skfda/representation/_functional_data.py  | 7 +++++++
 skfda/representation/basis/_fdatabasis.py | 4 ++--
 skfda/representation/grid.py              | 4 ++--
 tests/test_pandas_fdatabasis.py           | 6 +++++-
 tests/test_pandas_fdatagrid.py            | 6 +++++-
 5 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/skfda/representation/_functional_data.py b/skfda/representation/_functional_data.py
index 7d6b4e29c..1757af105 100644
--- a/skfda/representation/_functional_data.py
+++ b/skfda/representation/_functional_data.py
@@ -827,6 +827,13 @@ def _concat_same_type(
 
         return concatenate(to_concat)
 
+    def astype(self, dtype, copy=True):
+        if isinstance(dtype, type(self.dtype)):
+            if copy:
+                self = self.copy()
+            return self
+        return super().astype(dtype)
+
 
 def concatenate(objects, as_coordinates=False):
     """
diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py
index 035214e74..38ffed7d4 100644
--- a/skfda/representation/basis/_fdatabasis.py
+++ b/skfda/representation/basis/_fdatabasis.py
@@ -830,8 +830,8 @@ def construct_from_string(cls, string):
         if string == cls.name:
             return cls()
         else:
-            raise TypeError("Cannot construct a '{}' from "
-                            "'{}'".format(cls, string))
+            raise TypeError(
+                f"Cannot construct a '{cls.__name__}' from '{string}'")
 
     @classmethod
     def construct_array_type(cls):
diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py
index 40d95e7ed..cf9ee1f54 100644
--- a/skfda/representation/grid.py
+++ b/skfda/representation/grid.py
@@ -1104,8 +1104,8 @@ def construct_from_string(cls, string):
         if string == cls.name:
             return cls()
         else:
-            raise TypeError("Cannot construct a '{}' from "
-                            "'{}'".format(cls, string))
+            raise TypeError(
+                f"Cannot construct a '{cls.__name__}' from '{string}'")
 
     @classmethod
     def construct_array_type(cls):
diff --git a/tests/test_pandas_fdatabasis.py b/tests/test_pandas_fdatabasis.py
index b467d6f93..20f08cda3 100644
--- a/tests/test_pandas_fdatabasis.py
+++ b/tests/test_pandas_fdatabasis.py
@@ -13,7 +13,7 @@
 @pytest.fixture
 def dtype():
     """A fixture providing the ExtensionDtype to validate."""
-    return skfda.representation.basis.FDataBasisDType
+    return skfda.representation.basis.FDataBasisDType()
 
 
 @pytest.fixture
@@ -189,3 +189,7 @@ def as_array(request):
 
 class TestConstructors(base.BaseConstructorsTests):
     pass
+
+
+class TestDtype(base.BaseDtypeTests):
+    pass
diff --git a/tests/test_pandas_fdatagrid.py b/tests/test_pandas_fdatagrid.py
index 66e1300b0..28cf5b528 100644
--- a/tests/test_pandas_fdatagrid.py
+++ b/tests/test_pandas_fdatagrid.py
@@ -13,7 +13,7 @@
 @pytest.fixture
 def dtype():
     """A fixture providing the ExtensionDtype to validate."""
-    return skfda.representation.grid.FDataGridDType
+    return skfda.representation.grid.FDataGridDType()
 
 
 @pytest.fixture
@@ -188,3 +188,7 @@ def as_array(request):
 
 class TestConstructors(base.BaseConstructorsTests):
     pass
+
+
+class TestDtype(base.BaseDtypeTests):
+    pass

From 04e3d80f2bc5db1d03d949a48622e9f312ab0458 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Fri, 24 Jul 2020 16:01:44 +0200
Subject: [PATCH 011/210] Travis should not download release candidates of
 the dependencies.

---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index e48c60eda..412fb3656 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -36,6 +36,7 @@ install:
 # 'python3' is a 'command not found' error on Windows but 'py' works on Windows only
 script:
   - |
+    pip3 install .
     if [[ $PEP8COVERAGE == true ]]; then
       flake8 --exit-zero skfda;
       coverage run --source=skfda/ setup.py test;

From 7b19cafa6bc0fd321300d25f5e469af123920b8b Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Fri, 24 Jul 2020 17:13:08 +0200
Subject: [PATCH 012/210] Fix numpy errors and warnings.

---
 skfda/_utils/_utils.py                | 2 +-
 skfda/ml/regression/linear.py         | 7 +++----
 skfda/representation/interpolation.py | 3 ++-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py
index b2332041a..7d303b0de 100644
--- a/skfda/_utils/_utils.py
+++ b/skfda/_utils/_utils.py
@@ -327,7 +327,7 @@ def _evaluate_grid(axes, *, evaluate_method, eval_points,
     shape = zip(
         *[_one_grid_to_points(a, dim_domain=dim_domain) for a in axes])
 
-    eval_points = np.array(eval_points)
+    eval_points = _to_array_maybe_ragged(eval_points)
 
     # Evaluate the points
     res = evaluate_method(eval_points,
diff --git a/skfda/ml/regression/linear.py b/skfda/ml/regression/linear.py
index 30cbe5faf..322d122a1 100644
--- a/skfda/ml/regression/linear.py
+++ b/skfda/ml/regression/linear.py
@@ -218,13 +218,12 @@ def _argcheck_X_y(self, X, y, sample_weight=None, coef_basis=None):
 
         X = self._argcheck_X(X)
 
-        y = np.asarray(y)
-
-        if (np.issubdtype(y.dtype, np.object_)
-                and any(isinstance(i, FData) for i in y)):
+        if any(isinstance(i, FData) for i in y):
             raise ValueError(
                 "Some of the response variables are not scalar")
 
+        y = np.asarray(y)
+
         if coef_basis is None:
             coef_basis = [None] * len(X)
 
diff --git a/skfda/representation/interpolation.py b/skfda/representation/interpolation.py
index 2967c29a8..0b5a40f91 100644
--- a/skfda/representation/interpolation.py
+++ b/skfda/representation/interpolation.py
@@ -10,6 +10,7 @@
 
 import numpy as np
 
+from .._utils import _to_array_maybe_ragged
 from .evaluator import Evaluator
 
 
@@ -47,7 +48,7 @@ def evaluate(self, fdata, eval_points, *, derivative=0, aligned=True):
                                fdata.dim_codomain)
 
         else:
-            res = np.array([self._evaluate_codomain(
+            res = _to_array_maybe_ragged([self._evaluate_codomain(
                 s, e, derivative=derivative)
                 for s, e in zip(self.splines, eval_points)])

From b8ad1e1bd499c3014ed5a1700e4bad0497c663bd Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Tue, 28 Jul 2020 18:36:53 +0200
Subject: [PATCH 013/210] Preserve labels.
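A quick sanity sketch of the intended behaviour (hypothetical example),
showing that the labels now survive the conversion to a basis
representation:

    >>> import skfda
    >>> fd = skfda.FDataGrid(
    ...     [[1., 2., 3.]],
    ...     dataset_name="tiny example",
    ...     argument_names=("t",),
    ...     coordinate_names=("x(t)",))
    >>> basis = skfda.representation.basis.Monomial(n_basis=2)
    >>> fd_basis = fd.to_basis(basis)
    >>> fd_basis.dataset_name
    'tiny example'
    >>> fd_basis.argument_names
    ('t',)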
---
 skfda/preprocessing/smoothing/_basis.py |  5 ++++-
 skfda/representation/grid.py            | 14 ++++++++++----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/skfda/preprocessing/smoothing/_basis.py b/skfda/preprocessing/smoothing/_basis.py
index 8258dbbb9..577a61d91 100644
--- a/skfda/preprocessing/smoothing/_basis.py
+++ b/skfda/preprocessing/smoothing/_basis.py
@@ -462,7 +462,10 @@ def fit_transform(self, X: FDataGrid, y=None):
                     f"({data_matrix.shape[0]}).")
 
         fdatabasis = FDataBasis(
-            basis=self.basis, coefficients=coefficients)
+            basis=self.basis, coefficients=coefficients,
+            dataset_name=X.dataset_name,
+            argument_names=X.argument_names,
+            coordinate_names=X.coordinate_names)
 
         if self.return_basis:
             return fdatabasis
diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py
index cf9ee1f54..be65ca07f 100644
--- a/skfda/representation/grid.py
+++ b/skfda/representation/grid.py
@@ -791,10 +791,16 @@ def to_basis(self, basis, **kwargs):
             basis = basis.copy()
             basis.domain_range = self.domain_range
 
-        return fdbasis.FDataBasis.from_data(self.data_matrix,
-                                            self.sample_points,
-                                            basis,
-                                            **kwargs)
+        fdatabasis = fdbasis.FDataBasis.from_data(self.data_matrix,
+                                                  self.sample_points,
+                                                  basis,
+                                                  **kwargs)
+
+        fdatabasis.dataset_name = self.dataset_name
+        fdatabasis.argument_names = self.argument_names
+        fdatabasis.coordinate_names = self.coordinate_names
+
+        return fdatabasis
 
     def to_grid(self, sample_points=None):

From 53d870e7b8e89a07c2f8509d828e45ec615ceaed Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Mon, 3 Aug 2020 18:17:01 +0200
Subject: [PATCH 014/210] FDataBasis from_data uses FDataGrid to_basis.

---
 skfda/representation/basis/_fdatabasis.py |  8 +-------
 skfda/representation/grid.py              | 16 +++++++---------
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py
index 38ffed7d4..466da5144 100644
--- a/skfda/representation/basis/_fdatabasis.py
+++ b/skfda/representation/basis/_fdatabasis.py
@@ -186,7 +186,6 @@ def from_data(cls, data_matrix, sample_points, basis,
                Data Analysis* (pp. 86-87). Springer.
 
         """
-        from ...preprocessing.smoothing import BasisSmoother
         from ..grid import FDataGrid
 
         # n is the samples
@@ -198,12 +197,7 @@ def from_data(cls, data_matrix, sample_points, basis,
 
         fd = FDataGrid(data_matrix=data_matrix, sample_points=sample_points)
 
-        smoother = BasisSmoother(
-            basis=basis,
-            method=method,
-            return_basis=True)
-
-        return smoother.fit_transform(fd)
+        return fd.to_basis(basis=basis, method=method)
 
     @property
     def n_samples(self):
diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py
index be65ca07f..77ac5e484 100644
--- a/skfda/representation/grid.py
+++ b/skfda/representation/grid.py
@@ -775,6 +775,8 @@ def to_basis(self, basis, **kwargs):
             array([[ 2.  ,  0.71,  0.71]])
 
         """
+        from ..preprocessing.smoothing import BasisSmoother
+
         if self.dim_domain != basis.dim_domain:
             raise ValueError(f"The domain of the function has "
                              f"dimension {self.dim_domain} "
@@ -791,16 +793,12 @@ def to_basis(self, basis, **kwargs):
             basis = basis.copy()
             basis.domain_range = self.domain_range
 
-        fdatabasis = fdbasis.FDataBasis.from_data(self.data_matrix,
-                                                  self.sample_points,
-                                                  basis,
-                                                  **kwargs)
-
-        fdatabasis.dataset_name = self.dataset_name
-        fdatabasis.argument_names = self.argument_names
-        fdatabasis.coordinate_names = self.coordinate_names
+        smoother = BasisSmoother(
+            basis=basis,
+            **kwargs,
+            return_basis=True)
 
-        return fdatabasis
+        return smoother.fit_transform(self)
 
     def to_grid(self, sample_points=None):

From 3781b0027724de4c510bb1297b8c25bff2a806f9 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Mon, 3 Aug 2020 18:24:41 +0200
Subject: [PATCH 015/210] Preserve sample_labels.

---
 skfda/preprocessing/smoothing/_basis.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/skfda/preprocessing/smoothing/_basis.py b/skfda/preprocessing/smoothing/_basis.py
index 577a61d91..5bdad52b2 100644
--- a/skfda/preprocessing/smoothing/_basis.py
+++ b/skfda/preprocessing/smoothing/_basis.py
@@ -465,7 +465,8 @@ def fit_transform(self, X: FDataGrid, y=None):
             basis=self.basis, coefficients=coefficients,
             dataset_name=X.dataset_name,
             argument_names=X.argument_names,
-            coordinate_names=X.coordinate_names)
+            coordinate_names=X.coordinate_names,
+            sample_names=X.sample_names)
 
         if self.return_basis:
             return fdatabasis

From 62602b49884f28096858d8e9b43afdd386fb3218 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Wed, 5 Aug 2020 18:41:48 +0200
Subject: [PATCH 016/210] Try to make basis constant.

---
 skfda/_utils/__init__.py                      |  2 +-
 skfda/_utils/_utils.py                        |  8 ++--
 skfda/exploratory/visualization/_boxplot.py   |  4 +-
 .../visualization/_magnitude_shape_plot.py    |  2 +-
 .../visualization/representation.py           |  6 +--
 .../_linear_differential_operator.py          | 18 ++++----
 skfda/ml/regression/linear.py                 |  4 +-
 skfda/preprocessing/smoothing/validation.py   |  2 +-
 skfda/representation/basis/_basis.py          | 40 ++++++++++------
 skfda/representation/basis/_bspline.py        | 28 +++++------
 skfda/representation/basis/_fdatabasis.py     | 46 +++++++++++++------
 skfda/representation/basis/_fourier.py        | 26 +++++------
 skfda/representation/basis/_tensor_basis.py   | 14 +++++-
 skfda/representation/basis/_vector_basis.py   | 14 +++++-
 skfda/representation/grid.py                  | 19 ++++----
 tests/test_pandas_fdatabasis.py               | 22 ++++++++-
 tests/test_pandas_fdatagrid.py                | 12 ++---
 17 files changed, 168 insertions(+), 99 deletions(-)

diff --git a/skfda/_utils/__init__.py b/skfda/_utils/__init__.py
index c58ce4023..589169bf3 100644
--- a/skfda/_utils/__init__.py
+++ b/skfda/_utils/__init__.py
@@ -1,6 +1,6 @@
 from . import constants
 
-from ._utils import (_list_of_arrays, _cartesian_product,
+from ._utils import (_tuple_of_arrays, _cartesian_product,
                      _check_estimator, parameter_aliases,
                      _to_grid, check_is_univariate,
                      _same_domain, _to_array_maybe_ragged,
diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py
index 7d303b0de..f839c47a1 100644
--- a/skfda/_utils/_utils.py
+++ b/skfda/_utils/_utils.py
@@ -80,7 +80,7 @@ def _to_grid(X, y, eval_points=None):
     return X, y
 
 
-def _list_of_arrays(original_array):
+def _tuple_of_arrays(original_array):
     """Convert to a list of arrays.
 
     If the original list is one-dimensional (e.g. [1, 2, 3]), return list to
@@ -107,9 +107,9 @@ def _tuple_of_arrays(original_array):
             unidimensional = True
 
     if unidimensional:
-        return [np.asarray(original_array)]
+        return (np.asarray(original_array),)
     else:
-        return [np.asarray(i) for i in original_array]
+        return tuple(np.asarray(i) for i in original_array)
 
 
 def _to_array_maybe_ragged(array, *, row_shape=None):
@@ -251,7 +251,7 @@ def _one_grid_to_points(axes, *, dim_domain):
     Returns also the shape containing the information of how each
    point is formed.
     """
-    axes = _list_of_arrays(axes)
+    axes = _tuple_of_arrays(axes)
 
     if len(axes) != dim_domain:
         raise ValueError(f"Length of axes should be "
diff --git a/skfda/exploratory/visualization/_boxplot.py b/skfda/exploratory/visualization/_boxplot.py
index 90e1f0ab9..d509c6f27 100644
--- a/skfda/exploratory/visualization/_boxplot.py
+++ b/skfda/exploratory/visualization/_boxplot.py
@@ -190,7 +190,7 @@ class Boxplot(FDataBoxplot):
                 [-1. ],
                 [-1. ],
                 [-1. ]]]),
-            sample_points=[array([ 0,  2,  4,  6,  8, 10])],
+            sample_points=(array([ 0,  2,  4,  6,  8, 10]),),
             domain_range=array([[ 0, 10]]),
             dataset_name='dataset',
             argument_names=('x_label',),
@@ -516,7 +516,7 @@ class SurfaceBoxplot(FDataBoxplot):
               [[ 3. ],
                [ 0.6],
                [ 3. ]]]]),
-            sample_points=[array([2, 4]), array([3, 6, 8])],
+            sample_points=(array([2, 4]), array([3, 6, 8])),
            domain_range=array([[2, 4],
                    [3, 8]]),
             dataset_name='dataset',
diff --git a/skfda/exploratory/visualization/_magnitude_shape_plot.py b/skfda/exploratory/visualization/_magnitude_shape_plot.py
index 5b21f60f4..1053d94bf 100644
--- a/skfda/exploratory/visualization/_magnitude_shape_plot.py
+++ b/skfda/exploratory/visualization/_magnitude_shape_plot.py
@@ -145,7 +145,7 @@ class MagnitudeShapePlot:
                 [-1. ],
                 [-1. ],
                 [-1. ]]]),
-            sample_points=[array([ 0,  2,  4,  6,  8, 10])],
+            sample_points=(array([ 0,  2,  4,  6,  8, 10]),),
             domain_range=array([[ 0, 10]]),
             ...),
         depth_method=projection_depth,
diff --git a/skfda/exploratory/visualization/representation.py b/skfda/exploratory/visualization/representation.py
index 739fbbb57..3389a343f 100644
--- a/skfda/exploratory/visualization/representation.py
+++ b/skfda/exploratory/visualization/representation.py
@@ -4,7 +4,7 @@
 
 import numpy as np
 
-from ..._utils import _list_of_arrays, constants
+from ..._utils import _tuple_of_arrays, constants
 from ._utils import (_get_figure_and_axes, _set_figure_layout_for_fdata,
                      _set_labels)
 
@@ -148,7 +148,7 @@ def plot_graph(fdata, chart=None, *, fig=None, axes=None,
     if domain_range is None:
         domain_range = fdata.domain_range
     else:
-        domain_range = _list_of_arrays(domain_range)
+        domain_range = _tuple_of_arrays(domain_range)
 
     sample_colors, patches = _get_color_info(
         fdata, group, group_names, group_colors, legend, kwargs)
@@ -282,7 +282,7 @@ def plot_scatter(fdata, chart=None, *, sample_points=None,
     if domain_range is None:
         domain_range = fdata.domain_range
     else:
-        domain_range = _list_of_arrays(domain_range)
+        domain_range = _tuple_of_arrays(domain_range)
 
     sample_colors, patches = _get_color_info(
         fdata, group, group_names, group_colors, legend, kwargs)
diff --git a/skfda/misc/operators/_linear_differential_operator.py b/skfda/misc/operators/_linear_differential_operator.py
index fa55f4ee5..19329a949 100644
--- a/skfda/misc/operators/_linear_differential_operator.py
+++ b/skfda/misc/operators/_linear_differential_operator.py
@@ -43,15 +43,15 @@ class LinearDifferentialOperator(Operator):
     LinearDifferentialOperator(
         weights=[
             FDataBasis(
                 basis=Constant(domain_range=[array([0, 1])], n_basis=1),
+
basis=Constant(domain_range=(array([0, 1]),), n_basis=1), coefficients=[[0]], ...), FDataBasis( - basis=Constant(domain_range=[array([0, 1])], n_basis=1), + basis=Constant(domain_range=(array([0, 1]),), n_basis=1), coefficients=[[0]], ...), FDataBasis( - basis=Constant(domain_range=[array([0, 1])], n_basis=1), + basis=Constant(domain_range=(array([0, 1]),), n_basis=1), coefficients=[[1]], ...)] ) @@ -63,15 +63,15 @@ class LinearDifferentialOperator(Operator): LinearDifferentialOperator( weights=[ FDataBasis( - basis=Constant(domain_range=[array([0, 1])], n_basis=1), + basis=Constant(domain_range=(array([0, 1]),), n_basis=1), coefficients=[[0]], ...), FDataBasis( - basis=Constant(domain_range=[array([0, 1])], n_basis=1), + basis=Constant(domain_range=(array([0, 1]),), n_basis=1), coefficients=[[2]], ...), FDataBasis( - basis=Constant(domain_range=[array([0, 1])], n_basis=1), + basis=Constant(domain_range=(array([0, 1]),), n_basis=1), coefficients=[[3]], ...)] ) @@ -87,15 +87,15 @@ class LinearDifferentialOperator(Operator): LinearDifferentialOperator( weights=[ FDataBasis( - basis=Constant(domain_range=[array([0, 1])], n_basis=1), + basis=Constant(domain_range=(array([0, 1]),), n_basis=1), coefficients=[[0]], ...), FDataBasis( - basis=Constant(domain_range=[array([0, 1])], n_basis=1), + basis=Constant(domain_range=(array([0, 1]),), n_basis=1), coefficients=[[0]], ...), FDataBasis( - basis=Monomial(domain_range=[array([0, 1])], n_basis=3), + basis=Monomial(domain_range=(array([0, 1]),), n_basis=3), coefficients=[[1 2 3]], ...)] ) diff --git a/skfda/ml/regression/linear.py b/skfda/ml/regression/linear.py index 322d122a1..3ba07ee51 100644 --- a/skfda/ml/regression/linear.py +++ b/skfda/ml/regression/linear.py @@ -80,7 +80,7 @@ class LinearRegression(BaseEstimator, RegressorMixin): >>> _ = linear.fit(x_fd, y) >>> linear.coef_[0] FDataBasis( - basis=Monomial(domain_range=[array([0, 1])], n_basis=3), + basis=Monomial(domain_range=(array([0, 1]),), n_basis=3), coefficients=[[-15. 96. -90.]], ...) >>> linear.intercept_ @@ -106,7 +106,7 @@ class LinearRegression(BaseEstimator, RegressorMixin): array([ 2., 1.]) >>> linear.coef_[1] FDataBasis( - basis=Constant(domain_range=[array([0, 1])], n_basis=1), + basis=Constant(domain_range=(array([0, 1]),), n_basis=1), coefficients=[[ 1.]], ...) >>> linear.intercept_ diff --git a/skfda/preprocessing/smoothing/validation.py b/skfda/preprocessing/smoothing/validation.py index 9ae5139b2..f8117d79d 100644 --- a/skfda/preprocessing/smoothing/validation.py +++ b/skfda/preprocessing/smoothing/validation.py @@ -189,7 +189,7 @@ class SmoothingParameterSearch(GridSearchCV): [ 0.67], [ 1.67], [ 2.5 ]]]), - sample_points=[array([-2., -1., 0., 1., 2.])], + sample_points=(array([-2., -1., 0., 1., 2.]),), domain_range=array([[-2., 2.]]), ...) 
diff --git a/skfda/representation/basis/_basis.py b/skfda/representation/basis/_basis.py
index 4bc3e3ed1..5b0c11a3e 100644
--- a/skfda/representation/basis/_basis.py
+++ b/skfda/representation/basis/_basis.py
@@ -10,7 +10,7 @@
 import numpy as np
-from ..._utils import (_list_of_arrays, _same_domain,
+from ..._utils import (_tuple_of_arrays, _same_domain,
                        _reshape_eval_points, _evaluate_grid)
@@ -46,8 +46,8 @@ def __init__(self, domain_range=None, n_basis=1):
         """
         if domain_range is not None:
-            # TODO: Allow multiple dimensions
-            domain_range = _list_of_arrays(domain_range)
+
+            domain_range = _tuple_of_arrays(domain_range)
             # Some checks
             _check_domain(domain_range)
@@ -57,7 +57,7 @@
                 "positive.")
         self._domain_range = domain_range
-        self.n_basis = n_basis
+        self._n_basis = n_basis
         super().__init__()
@@ -72,13 +72,13 @@ def dim_codomain(self):
     @property
     def domain_range(self):
         if self._domain_range is None:
-            return [np.array([0, 1])]
+            return (np.array([0, 1]),) * self.dim_domain
         else:
             return self._domain_range
-    @domain_range.setter
-    def domain_range(self, value):
-        self._domain_range = value
+    @property
+    def n_basis(self):
+        return self._n_basis
     @abstractmethod
     def _evaluate(self, eval_points):
@@ -225,14 +225,22 @@ def rescale(self, domain_range=None):
             the original basis.
         """
-        if domain_range is None:
-            domain_range = self.domain_range
-
-        return type(self)(domain_range, self.n_basis)
+        return self.copy(domain_range=domain_range)
-    def copy(self):
+    def copy(self, domain_range=None):
         """Basis copy"""
-        return copy.deepcopy(self)
+
+        new_copy = copy.deepcopy(self)
+
+        if domain_range is not None:
+            domain_range = _tuple_of_arrays(domain_range)
+
+            # Some checks
+            _check_domain(domain_range)
+
+            new_copy._domain_range = domain_range
+
+        return new_copy
     def to_basis(self):
        from .
import FDataBasis @@ -356,3 +364,7 @@ def __eq__(self, other): return (type(self) == type(other) and _same_domain(self, other) and self.n_basis == other.n_basis) + + def __hash__(self): + """Hash of Basis""" + return hash((self.domain_range, self.n_basis)) diff --git a/skfda/representation/basis/_bspline.py b/skfda/representation/basis/_bspline.py index a85ce8da9..2bb1e110d 100644 --- a/skfda/representation/basis/_bspline.py +++ b/skfda/representation/basis/_bspline.py @@ -5,8 +5,8 @@ import numpy as np -from ..._utils import _list_of_arrays from ..._utils import _same_domain +from ..._utils import _tuple_of_arrays from ._basis import Basis @@ -104,7 +104,7 @@ def __init__(self, domain_range=None, n_basis=None, order=4, knots=None): """ if domain_range is not None: - domain_range = _list_of_arrays(domain_range) + domain_range = _tuple_of_arrays(domain_range) if len(domain_range) != 1: raise ValueError("Domain range should be unidimensional.") @@ -135,8 +135,8 @@ def __init__(self, domain_range=None, n_basis=None, order=4, knots=None): f"order of the bspline ({order}) should be " f"greater than 3.") - self.order = order - self.knots = None if knots is None else list(knots) + self._order = order + self._knots = None if knots is None else list(knots) super().__init__(domain_range, n_basis) # Checks @@ -153,9 +153,9 @@ def knots(self): else: return self._knots - @knots.setter - def knots(self, value): - self._knots = value + @property + def order(self): + return self._order def _evaluation_knots(self): """ @@ -246,12 +246,6 @@ def __repr__(self): f"n_basis={self.n_basis}, order={self.order}, " f"knots={self.knots})") - def __eq__(self, other): - """Equality of Basis""" - return (super().__eq__(other) - and self.order == other.order - and self.knots == other.knots) - def _gram_matrix(self): # Places m knots at the boundaries knots = self._evaluation_knots() @@ -403,3 +397,11 @@ def _from_scipy_BSpline(bspline): def inknots(self): """Return number of basis.""" return self.knots[1:len(self.knots) - 1] + + def __eq__(self, other): + return (super().__eq__(other) + and self.order == other.order + and self.knots == other.knots) + + def __hash__(self): + return hash((super().__hash__(), self.order, self.knots)) diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py index ecd8fcefd..6b90858af 100644 --- a/skfda/representation/basis/_fdatabasis.py +++ b/skfda/representation/basis/_fdatabasis.py @@ -1,6 +1,7 @@ from builtins import isinstance import copy import numbers +from typing import Any import pandas.api.extensions @@ -55,7 +56,7 @@ class FDataBasis(FData): >>> coefficients = [1, 1, 3, .5] >>> FDataBasis(basis, coefficients) FDataBasis( - basis=Monomial(domain_range=[array([0, 1])], n_basis=4), + basis=Monomial(domain_range=(array([0, 1]),), n_basis=4), coefficients=[[ 1. 1. 3. 0.5]], ...) @@ -389,7 +390,7 @@ def mean(self, weights=None): >>> coefficients = [[0.5, 1, 2, .5], [1.5, 1, 4, .5]] >>> FDataBasis(basis, coefficients).mean() FDataBasis( - basis=Monomial(domain_range=[array([0, 1])], n_basis=4), + basis=Monomial(domain_range=(array([0, 1]),), n_basis=4), coefficients=[[ 1. 1. 3. 0.5]], ...) @@ -495,7 +496,7 @@ def to_grid(self, sample_points=None): [[1], [2], [5]]]), - sample_points=[array([0, 1, 2])], + sample_points=(array([0, 1, 2]),), domain_range=array([[0, 5]]), ...) 
@@ -827,7 +828,7 @@ def __rtruediv__(self, other):
     @property
     def dtype(self):
         """The dtype for this extension array, FDataBasisDType"""
-        return FDataBasisDType()
+        return FDataBasisDType(basis=self.basis)
     @property
     def nbytes(self) -> int:
@@ -837,24 +838,39 @@ def nbytes(self) -> int:
         return self.coefficients.nbytes
-@pandas.api.extensions.register_extension_dtype
 class FDataBasisDType(pandas.api.extensions.ExtensionDtype):
     """
     DType corresponding to FDataBasis in Pandas
     """
-    name = 'FDataBasis'
     kind = 'O'
     type = FDataBasis
-    na_value = None
+    name = 'FDataBasis'
+    na_value = pandas.NA
-    @classmethod
-    def construct_from_string(cls, string):
-        if string == cls.name:
-            return cls()
-        else:
-            raise TypeError(
-                f"Cannot construct a '{cls.__name__}' from '{string}'")
+    _metadata = ("basis",)
+
+    def __init__(self, basis) -> None:
+        self.basis = basis
     @classmethod
-    def construct_array_type(cls):
+    def construct_array_type(cls) -> type:
         return FDataBasis
+
+    def __eq__(self, other: Any) -> bool:
+        """
+        Rules for equality (similar to categorical):
+        1) Any FDataBasisDType is equal to the string 'FDataBasis'
+        2) Any FDataBasisDType is equal to itself
+        3) Otherwise, they are equal if the arguments are equal.
+        4) Any other comparison returns False
+        """
+        if isinstance(other, str):
+            return other == self.name
+        elif other is self:
+            return True
+        else:
+            return (isinstance(other, FDataBasisDType)
+                    and self.basis == other.basis)
+
+    def __hash__(self):
+        return hash(self.basis)
diff --git a/skfda/representation/basis/_fourier.py b/skfda/representation/basis/_fourier.py
index 4da88672a..4a6181c7f 100644
--- a/skfda/representation/basis/_fourier.py
+++ b/skfda/representation/basis/_fourier.py
@@ -1,7 +1,7 @@
 import numpy as np
-from ..._utils import _list_of_arrays
 from ..._utils import _same_domain
+from ..._utils import _tuple_of_arrays
 from ._basis import Basis
@@ -84,14 +84,14 @@ def __init__(self, domain_range=None, n_basis=3, period=None):
         """
         if domain_range is not None:
-            domain_range = _list_of_arrays(domain_range)
+            domain_range = _tuple_of_arrays(domain_range)
             if len(domain_range) != 1:
                 raise ValueError("Domain range should be unidimensional.")
             domain_range = domain_range[0]
-        self.period = period
+        self._period = period
         # If number of basis is even, add 1
         n_basis += 1 - n_basis % 2
         super().__init__(domain_range, n_basis)
@@ -103,10 +103,6 @@ def period(self):
         else:
             return self._period
-    @period.setter
-    def period(self, value):
-        self._period = value
-
     def _evaluate(self, eval_points):
         # Input is scalar
@@ -197,15 +193,15 @@ def rescale(self, domain_range=None, *, rescale_period=False):
         rescale_basis = super().rescale(domain_range)
-        if rescale_period is False:
-            rescale_basis.period = self.period
-        else:
+        if rescale_period is True:
+
             domain_rescaled = rescale_basis.domain_range[0]
             domain = self.domain_range[0]
-            rescale_basis.period = (self.period *
-                                    (domain_rescaled[1] - domain_rescaled[0]) /
-                                    (domain[1] - domain[0]))
+            rescale_basis._period = (
+                self.period *
+                (domain_rescaled[1] - domain_rescaled[0]) /
+                (domain[1] - domain[0]))
         return rescale_basis
@@ -221,5 +217,7 @@ def __repr__(self):
                 f"n_basis={self.n_basis}, period={self.period})")
     def __eq__(self, other):
-        """Equality of Basis"""
         return super().__eq__(other) and self.period == other.period
+
+    def __hash__(self):
+        return hash((super().__hash__(), self.period))
diff --git a/skfda/representation/basis/_tensor_basis.py b/skfda/representation/basis/_tensor_basis.py
index b1d96aa35..b3b5c6c0e 100644
---
a/skfda/representation/basis/_tensor_basis.py
+++ b/skfda/representation/basis/_tensor_basis.py
@@ -62,17 +62,23 @@ class Tensor(Basis):
     def __init__(self, basis_list):
+        basis_list = tuple(basis_list)
+
         if not all(b.dim_domain == 1 and b.dim_codomain == 1
                    for b in basis_list):
             raise ValueError("The basis functions must be "
                              "univariate and scalar valued")
-        self.basis_list = basis_list
+        self._basis_list = basis_list
         super().__init__(
             domain_range=[b.domain_range[0] for b in basis_list],
             n_basis=np.prod([b.n_basis for b in basis_list]))
+    @property
+    def basis_list(self):
+        return self._basis_list
+
     @property
     def dim_domain(self):
         return len(self.basis_list)
@@ -110,3 +116,9 @@ def basis_of_product(self, other):
     def rbasis_of_product(self, other):
         pass
+
+    def __eq__(self, other):
+        return super().__eq__(other) and self.basis_list == other.basis_list
+
+    def __hash__(self):
+        return hash((super().__hash__(), self.basis_list))
diff --git a/skfda/representation/basis/_vector_basis.py b/skfda/representation/basis/_vector_basis.py
index c59c046c2..9a2bae2f1 100644
--- a/skfda/representation/basis/_vector_basis.py
+++ b/skfda/representation/basis/_vector_basis.py
@@ -59,6 +59,8 @@ class VectorValued(Basis):
     def __init__(self, basis_list):
+        basis_list = tuple(basis_list)
+
         if not all(b.dim_codomain == 1 for b in basis_list):
             raise ValueError("The basis functions must be "
                              "scalar valued")
@@ -69,12 +71,16 @@
             raise ValueError("The basis must all have the same domain "
                              "dimension and range")
-        self.basis_list = basis_list
+        self._basis_list = basis_list
         super().__init__(
             domain_range=basis_list[0].domain_range,
             n_basis=sum(b.n_basis for b in basis_list))
+    @property
+    def basis_list(self):
+        return self._basis_list
+
     @property
     def dim_domain(self):
         return self.basis_list[0].dim_domain
@@ -149,3 +155,9 @@ def basis_of_product(self, other):
     def rbasis_of_product(self, other):
         pass
+
+    def __eq__(self, other):
+        return super().__eq__(other) and self.basis_list == other.basis_list
+
+    def __hash__(self):
+        return hash((super().__hash__(), self.basis_list))
diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py
index e308d3fb0..158a5c534 100644
--- a/skfda/representation/grid.py
+++ b/skfda/representation/grid.py
@@ -16,7 +16,7 @@
 import numpy as np
 from . import basis as fdbasis
-from .._utils import _list_of_arrays, constants
+from .._utils import _tuple_of_arrays, constants
 from ._functional_data import FData
 from .interpolation import SplineInterpolation
@@ -68,7 +68,7 @@ class FDataGrid(FData):
                [[4],
                 [5],
                 [6]]]),
-            sample_points=[array([2, 4, 5])],
+            sample_points=(array([2, 4, 5]),),
            domain_range=array([[2, 5]]),
            ...)
        The number of columns of data_matrix has to be the length of
@@ -166,7 +166,7 @@ def __init__(self, data_matrix, sample_points=None,
         self.data_matrix = np.atleast_2d(data_matrix)
         if sample_points is None:
-            self.sample_points = _list_of_arrays(
+            self.sample_points = _tuple_of_arrays(
                 [np.linspace(0, 1, self.data_matrix.shape[i]) for i in
                  range(1, self.data_matrix.ndim)])
         else:
             # Check that the dimension of the data matches the sample_points
             # list
-            self.sample_points = _list_of_arrays(sample_points)
+            self.sample_points = _tuple_of_arrays(sample_points)
             data_shape = self.data_matrix.shape[1: 1 + self.dim_domain]
             sample_points_shape = [len(i) for i in self.sample_points]
@@ -402,7 +402,7 @@ def derivative(self, *, order=1):
                [ 1.5],
                [ 2. ],
                [ 4.
]]]), - sample_points=[array([0, 1, 2, 3, 4])], + sample_points=(array([0, 1, 2, 3, 4]),), domain_range=array([[0, 4]]), ...) @@ -416,7 +416,7 @@ def derivative(self, *, order=1): [-1.], [ 2.], [ 5.]]]), - sample_points=[array([0, 1, 2, 3, 4])], + sample_points=(array([0, 1, 2, 3, 4]),), domain_range=array([[0, 4]]), ...) @@ -693,7 +693,7 @@ def concatenate(self, *others, as_coordinates=False): [7], [9], [2]]]), - sample_points=[array([0, 1, 2, 3, 4])], + sample_points=(array([0, 1, 2, 3, 4]),), domain_range=array([[0, 4]]), ...) @@ -798,8 +798,7 @@ def to_basis(self, basis, **kwargs): # Readjust the domain range if there was not an explicit one if basis._domain_range is None: - basis = basis.copy() - basis.domain_range = self.domain_range + basis = basis.copy(domain_range=self.domain_range) return fdbasis.FDataBasis.from_data(self.data_matrix, self.sample_points, @@ -1115,7 +1114,7 @@ class FDataGridDType(pandas.api.extensions.ExtensionDtype): name = 'FDataGrid' kind = 'O' type = FDataGrid - na_value = None + na_value = pandas.NA @classmethod def construct_from_string(cls, string): diff --git a/tests/test_pandas_fdatabasis.py b/tests/test_pandas_fdatabasis.py index 20f08cda3..7ade52feb 100644 --- a/tests/test_pandas_fdatabasis.py +++ b/tests/test_pandas_fdatabasis.py @@ -13,7 +13,10 @@ @pytest.fixture def dtype(): """A fixture providing the ExtensionDtype to validate.""" - return skfda.representation.basis.FDataBasisDType() + + basis = skfda.representation.basis.BSpline(n_basis=5) + + return skfda.representation.basis.FDataBasisDType(basis=basis) @pytest.fixture @@ -192,4 +195,19 @@ class TestConstructors(base.BaseConstructorsTests): class TestDtype(base.BaseDtypeTests): - pass + + @pytest.mark.skip(reason="Unsupported") + def test_construct_from_string_own_name(self): + pass + + @pytest.mark.skip(reason="Unsupported") + def test_is_dtype_from_name(self): + pass + + @pytest.mark.skip(reason="Unsupported") + def test_eq_with_str(self): + pass + + @pytest.mark.skip(reason="Unsupported") + def test_construct_from_string(self, dtype): + pass diff --git a/tests/test_pandas_fdatagrid.py b/tests/test_pandas_fdatagrid.py index 28cf5b528..122f13187 100644 --- a/tests/test_pandas_fdatagrid.py +++ b/tests/test_pandas_fdatagrid.py @@ -186,9 +186,9 @@ def as_array(request): ############################################################################## -class TestConstructors(base.BaseConstructorsTests): - pass - - -class TestDtype(base.BaseDtypeTests): - pass +# class TestConstructors(base.BaseConstructorsTests): +# pass +# +# +# class TestDtype(base.BaseDtypeTests): +# pass From 012f965de9a9fb2751cbe7143725c41bb9e12989 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Wed, 5 Aug 2020 19:29:48 +0200 Subject: [PATCH 017/210] Set `domain_range` as tuple of tuples. 
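Storing ``domain_range`` as a tuple of 2-tuples (instead of a list of
arrays or a 2D array) keeps it hashable, which the ``__hash__`` methods
introduced in the previous commit rely on, and it makes the reprs shorter.
The normalization is done by a new ``_domain_range`` helper in
``skfda._utils``.

A minimal sketch of the normalization (illustrative only, not part of the
diff below):

    from skfda._utils import _domain_range

    _domain_range((0, 1))            # -> ((0, 1),)
    _domain_range([(0, 1), (2, 3)])  # -> ((0, 1), (2, 3))
    _domain_range([(0,)])            # -> ValueError: intervals need two
                                     #    bounds each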
--- skfda/_utils/__init__.py | 3 +- skfda/_utils/_utils.py | 15 +++++++ skfda/exploratory/visualization/_boxplot.py | 5 +-- .../visualization/_magnitude_shape_plot.py | 2 +- .../_linear_differential_operator.py | 18 ++++---- skfda/ml/regression/linear.py | 4 +- skfda/preprocessing/smoothing/validation.py | 2 +- skfda/representation/basis/_basis.py | 10 ++--- skfda/representation/basis/_fdatabasis.py | 6 +-- skfda/representation/grid.py | 38 ++++++++--------- tests/test_pandas_fdatabasis.py | 42 +++++++++---------- tests/test_registration.py | 2 +- 12 files changed, 80 insertions(+), 67 deletions(-) diff --git a/skfda/_utils/__init__.py b/skfda/_utils/__init__.py index 589169bf3..50c5975aa 100644 --- a/skfda/_utils/__init__.py +++ b/skfda/_utils/__init__.py @@ -6,4 +6,5 @@ _same_domain, _to_array_maybe_ragged, _reshape_eval_points, _evaluate_grid, nquad_vec, - _FDataCallable, _pairwise_commutative) + _FDataCallable, _pairwise_commutative, + _domain_range) diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index f839c47a1..9537071dc 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -112,6 +112,21 @@ def _tuple_of_arrays(original_array): return tuple(np.asarray(i) for i in original_array) +def _domain_range(sequence): + + try: + iter(sequence[0]) + except TypeError: + sequence = (sequence,) + + sequence = tuple(tuple(s) for s in sequence) + + if not all(len(s) == 2 for s in sequence): + raise ValueError("Domain intervals should have 2 bounds each") + + return sequence + + def _to_array_maybe_ragged(array, *, row_shape=None): """ Convert to an array where each element may or may not be of equal length. diff --git a/skfda/exploratory/visualization/_boxplot.py b/skfda/exploratory/visualization/_boxplot.py index d509c6f27..376c16157 100644 --- a/skfda/exploratory/visualization/_boxplot.py +++ b/skfda/exploratory/visualization/_boxplot.py @@ -191,7 +191,7 @@ class Boxplot(FDataBoxplot): [-1. ], [-1. ]]]), sample_points=(array([ 0, 2, 4, 6, 8, 10]),), - domain_range=array([[ 0, 10]]), + domain_range=((0, 10),), dataset_name='dataset', argument_names=('x_label',), coordinate_names=('y_label',), @@ -517,8 +517,7 @@ class SurfaceBoxplot(FDataBoxplot): [ 0.6], [ 3. ]]]]), sample_points=(array([2, 4]), array([3, 6, 8])), - domain_range=array([[2, 4], - [3, 8]]), + domain_range=((2, 4), (3, 8)), dataset_name='dataset', argument_names=('x1_label', 'x2_label'), coordinate_names=('y_label',), diff --git a/skfda/exploratory/visualization/_magnitude_shape_plot.py b/skfda/exploratory/visualization/_magnitude_shape_plot.py index 1053d94bf..8830a7ca9 100644 --- a/skfda/exploratory/visualization/_magnitude_shape_plot.py +++ b/skfda/exploratory/visualization/_magnitude_shape_plot.py @@ -146,7 +146,7 @@ class MagnitudeShapePlot: [-1. ], [-1. 
]]]), sample_points=(array([ 0, 2, 4, 6, 8, 10]),), - domain_range=array([[ 0, 10]]), + domain_range=((0, 10),), ...), depth_method=projection_depth, pointwise_weights=None, diff --git a/skfda/misc/operators/_linear_differential_operator.py b/skfda/misc/operators/_linear_differential_operator.py index 19329a949..a4e522bd4 100644 --- a/skfda/misc/operators/_linear_differential_operator.py +++ b/skfda/misc/operators/_linear_differential_operator.py @@ -43,15 +43,15 @@ class LinearDifferentialOperator(Operator): LinearDifferentialOperator( weights=[ FDataBasis( - basis=Constant(domain_range=(array([0, 1]),), n_basis=1), + basis=Constant(domain_range=((0, 1),), n_basis=1), coefficients=[[0]], ...), FDataBasis( - basis=Constant(domain_range=(array([0, 1]),), n_basis=1), + basis=Constant(domain_range=((0, 1),), n_basis=1), coefficients=[[0]], ...), FDataBasis( - basis=Constant(domain_range=(array([0, 1]),), n_basis=1), + basis=Constant(domain_range=((0, 1),), n_basis=1), coefficients=[[1]], ...)] ) @@ -63,15 +63,15 @@ class LinearDifferentialOperator(Operator): LinearDifferentialOperator( weights=[ FDataBasis( - basis=Constant(domain_range=(array([0, 1]),), n_basis=1), + basis=Constant(domain_range=((0, 1),), n_basis=1), coefficients=[[0]], ...), FDataBasis( - basis=Constant(domain_range=(array([0, 1]),), n_basis=1), + basis=Constant(domain_range=((0, 1),), n_basis=1), coefficients=[[2]], ...), FDataBasis( - basis=Constant(domain_range=(array([0, 1]),), n_basis=1), + basis=Constant(domain_range=((0, 1),), n_basis=1), coefficients=[[3]], ...)] ) @@ -87,15 +87,15 @@ class LinearDifferentialOperator(Operator): LinearDifferentialOperator( weights=[ FDataBasis( - basis=Constant(domain_range=(array([0, 1]),), n_basis=1), + basis=Constant(domain_range=((0, 1),), n_basis=1), coefficients=[[0]], ...), FDataBasis( - basis=Constant(domain_range=(array([0, 1]),), n_basis=1), + basis=Constant(domain_range=((0, 1),), n_basis=1), coefficients=[[0]], ...), FDataBasis( - basis=Monomial(domain_range=(array([0, 1]),), n_basis=3), + basis=Monomial(domain_range=((0, 1),), n_basis=3), coefficients=[[1 2 3]], ...)] ) diff --git a/skfda/ml/regression/linear.py b/skfda/ml/regression/linear.py index 3ba07ee51..383987cf0 100644 --- a/skfda/ml/regression/linear.py +++ b/skfda/ml/regression/linear.py @@ -80,7 +80,7 @@ class LinearRegression(BaseEstimator, RegressorMixin): >>> _ = linear.fit(x_fd, y) >>> linear.coef_[0] FDataBasis( - basis=Monomial(domain_range=(array([0, 1]),), n_basis=3), + basis=Monomial(domain_range=((0, 1),), n_basis=3), coefficients=[[-15. 96. -90.]], ...) >>> linear.intercept_ @@ -106,7 +106,7 @@ class LinearRegression(BaseEstimator, RegressorMixin): array([ 2., 1.]) >>> linear.coef_[1] FDataBasis( - basis=Constant(domain_range=(array([0, 1]),), n_basis=1), + basis=Constant(domain_range=((0, 1),), n_basis=1), coefficients=[[ 1.]], ...) >>> linear.intercept_ diff --git a/skfda/preprocessing/smoothing/validation.py b/skfda/preprocessing/smoothing/validation.py index f8117d79d..3b54305cb 100644 --- a/skfda/preprocessing/smoothing/validation.py +++ b/skfda/preprocessing/smoothing/validation.py @@ -190,7 +190,7 @@ class SmoothingParameterSearch(GridSearchCV): [ 1.67], [ 2.5 ]]]), sample_points=(array([-2., -1., 0., 1., 2.]),), - domain_range=array([[-2., 2.]]), + domain_range=((-2.0, 2.0),), ...) 
Other validation methods can be used such as cross-validation or
diff --git a/skfda/representation/basis/_basis.py b/skfda/representation/basis/_basis.py
index 5b0c11a3e..a03676847 100644
--- a/skfda/representation/basis/_basis.py
+++ b/skfda/representation/basis/_basis.py
@@ -10,8 +10,8 @@
 import numpy as np
-from ..._utils import (_tuple_of_arrays, _same_domain,
-                       _reshape_eval_points, _evaluate_grid)
+from ..._utils import (_domain_range, _same_domain,
+                       _reshape_eval_points)
@@ -47,7 +47,7 @@
         if domain_range is not None:
-            domain_range = _tuple_of_arrays(domain_range)
+            domain_range = _domain_range(domain_range)
             # Some checks
             _check_domain(domain_range)
@@ -72,7 +72,7 @@
     @property
     def domain_range(self):
         if self._domain_range is None:
-            return (np.array([0, 1]),) * self.dim_domain
+            return ((0, 1),) * self.dim_domain
         else:
             return self._domain_range
@@ -233,7 +233,7 @@ def copy(self, domain_range=None):
         new_copy = copy.deepcopy(self)
         if domain_range is not None:
-            domain_range = _tuple_of_arrays(domain_range)
+            domain_range = _domain_range(domain_range)
             # Some checks
             _check_domain(domain_range)
diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py
index 6b90858af..e1d41b08a 100644
--- a/skfda/representation/basis/_fdatabasis.py
+++ b/skfda/representation/basis/_fdatabasis.py
@@ -56,7 +56,7 @@ class FDataBasis(FData):
        >>> coefficients = [1, 1, 3, .5]
        >>> FDataBasis(basis, coefficients)
        FDataBasis(
-            basis=Monomial(domain_range=(array([0, 1]),), n_basis=4),
+            basis=Monomial(domain_range=((0, 1),), n_basis=4),
            coefficients=[[ 1.   1.   3.   0.5]],
            ...)
@@ -390,7 +390,7 @@ def mean(self, weights=None):
        >>> coefficients = [[0.5, 1, 2, .5], [1.5, 1, 4, .5]]
        >>> FDataBasis(basis, coefficients).mean()
        FDataBasis(
-            basis=Monomial(domain_range=(array([0, 1]),), n_basis=4),
+            basis=Monomial(domain_range=((0, 1),), n_basis=4),
            coefficients=[[ 1.   1.   3.   0.5]],
            ...)
@@ -497,7 +497,7 @@ def to_grid(self, sample_points=None):
                [2],
                [5]]]),
            sample_points=(array([0, 1, 2]),),
-            domain_range=array([[0, 5]]),
+            domain_range=((0, 5),),
            ...)
        """
diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py
index 158a5c534..90534fc5a 100644
--- a/skfda/representation/grid.py
+++ b/skfda/representation/grid.py
@@ -16,7 +16,7 @@
 import numpy as np
 from . import basis as fdbasis
-from .._utils import _tuple_of_arrays, constants
+from .._utils import _tuple_of_arrays, constants, _domain_range
 from ._functional_data import FData
 from .interpolation import SplineInterpolation
@@ -69,7 +69,7 @@ class FDataGrid(FData):
                [5],
                [6]]]),
            sample_points=(array([2, 4, 5]),),
-            domain_range=array([[2, 5]]),
+            domain_range=((2, 5),),
            ...)
        The number of columns of data_matrix has to be the length of
@@ -190,23 +190,21 @@ def __init__(self, data_matrix, sample_points=None,
                                       for i in range(self.dim_domain)])
         if domain_range is None:
-            self._domain_range = self.sample_range
+            domain_range = self.sample_range
             # Default value for domain_range is a list of tuples with
             # the first and last element of each list of the sample_points.
- else: - self._domain_range = np.atleast_2d(domain_range) - # sample range must by a 2 dimension matrix with as many rows as - # dimensions in the domain and 2 columns - if (self._domain_range.ndim != 2 - or self._domain_range.shape[1] != 2 - or self._domain_range.shape[0] != self.dim_domain): - raise ValueError("Incorrect shape of domain_range.") - for i in range(self.dim_domain): - if (self._domain_range[i, 0] > self.sample_points[i][0] - or self._domain_range[i, -1] < self.sample_points[i] - [-1]): - raise ValueError("Sample points must be within the domain " - "range.") + + self._domain_range = _domain_range(domain_range) + + if len(self._domain_range) != self.dim_domain: + raise ValueError("Incorrect shape of domain_range.") + + for i in range(self.dim_domain): + if (self._domain_range[i][0] > self.sample_points[i][0] + or self._domain_range[i][-1] < self.sample_points[i] + [-1]): + raise ValueError("Sample points must be within the domain " + "range.") # Adjust the data matrix if the dimension of the image is one if self.data_matrix.ndim == 1 + self.dim_domain: @@ -403,7 +401,7 @@ def derivative(self, *, order=1): [ 2. ], [ 4. ]]]), sample_points=(array([0, 1, 2, 3, 4]),), - domain_range=array([[0, 4]]), + domain_range=((0, 4),), ...) Second order derivative @@ -417,7 +415,7 @@ def derivative(self, *, order=1): [ 2.], [ 5.]]]), sample_points=(array([0, 1, 2, 3, 4]),), - domain_range=array([[0, 4]]), + domain_range=((0, 4),), ...) """ @@ -694,7 +692,7 @@ def concatenate(self, *others, as_coordinates=False): [9], [2]]]), sample_points=(array([0, 1, 2, 3, 4]),), - domain_range=array([[0, 4]]), + domain_range=((0, 4),), ...) """ diff --git a/tests/test_pandas_fdatabasis.py b/tests/test_pandas_fdatabasis.py index 7ade52feb..3c8811024 100644 --- a/tests/test_pandas_fdatabasis.py +++ b/tests/test_pandas_fdatabasis.py @@ -190,24 +190,24 @@ def as_array(request): ############################################################################## -class TestConstructors(base.BaseConstructorsTests): - pass - - -class TestDtype(base.BaseDtypeTests): - - @pytest.mark.skip(reason="Unsupported") - def test_construct_from_string_own_name(self): - pass - - @pytest.mark.skip(reason="Unsupported") - def test_is_dtype_from_name(self): - pass - - @pytest.mark.skip(reason="Unsupported") - def test_eq_with_str(self): - pass - - @pytest.mark.skip(reason="Unsupported") - def test_construct_from_string(self, dtype): - pass +# class TestConstructors(base.BaseConstructorsTests): +# pass +# +# +# class TestDtype(base.BaseDtypeTests): +# +# @pytest.mark.skip(reason="Unsupported") +# def test_construct_from_string_own_name(self): +# pass +# +# @pytest.mark.skip(reason="Unsupported") +# def test_is_dtype_from_name(self): +# pass +# +# @pytest.mark.skip(reason="Unsupported") +# def test_eq_with_str(self): +# pass +# +# @pytest.mark.skip(reason="Unsupported") +# def test_construct_from_string(self, dtype): +# pass diff --git a/tests/test_registration.py b/tests/test_registration.py index 411b0cacc..8c455bdca 100644 --- a/tests/test_registration.py +++ b/tests/test_registration.py @@ -299,7 +299,7 @@ def test_restrict_domain(self): fd_registered_1 = reg.fit_transform(self.fd) np.testing.assert_array_almost_equal( - fd_registered_1.domain_range.round(3), [[0.022, 0.969]]) + np.array(fd_registered_1.domain_range).round(3), [[0.022, 0.969]]) reg2 = ShiftRegistration(restrict_domain=True, template=reg.template_) fd_registered_2 = reg2.fit_transform(self.fd) From 9adba69142636b43fec4b7dd095bc7caa1be8c95 Mon Sep 17 00:00:00 
2001 From: vnmabus Date: Thu, 6 Aug 2020 01:45:04 +0200 Subject: [PATCH 018/210] Tests FDataBasis. --- skfda/_utils/__init__.py | 2 +- skfda/_utils/_utils.py | 7 +++ skfda/representation/_functional_data.py | 14 ++++-- skfda/representation/basis/_bspline.py | 12 ++--- skfda/representation/basis/_fdatabasis.py | 9 +++- tests/test_pandas_fdatabasis.py | 60 +++++++++++++---------- 6 files changed, 67 insertions(+), 37 deletions(-) diff --git a/skfda/_utils/__init__.py b/skfda/_utils/__init__.py index 50c5975aa..f0d1befed 100644 --- a/skfda/_utils/__init__.py +++ b/skfda/_utils/__init__.py @@ -7,4 +7,4 @@ _reshape_eval_points, _evaluate_grid, nquad_vec, _FDataCallable, _pairwise_commutative, - _domain_range) + _domain_range, _nanequals) diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index 9537071dc..c0f0b135e 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -413,6 +413,13 @@ def _pairwise_commutative(function, arg1, arg2=None, **kwargs): (len(arg1), len(arg2))) +def _nanequals(a, b): + """ + Compare two arrays considering that NaNs are equal. + """ + return (a == b) | (np.isnan(a) & np.isnan(b)) + + def parameter_aliases(**alias_assignments): """Allows using aliases for parameters""" def decorator(f): diff --git a/skfda/representation/_functional_data.py b/skfda/representation/_functional_data.py index 936d26069..893d77824 100644 --- a/skfda/representation/_functional_data.py +++ b/skfda/representation/_functional_data.py @@ -640,7 +640,8 @@ def __getitem__(self, key): def __eq__(self, other): return ( - self.extrapolation == other.extrapolation + type(self) == type(other) + and self.extrapolation == other.extrapolation and self.dataset_name == other.dataset_name and self.argument_names == other.argument_names and self.coordinate_names == other.coordinate_names @@ -751,8 +752,15 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): if copy: scalars = [f.copy() for f in scalars] - if dtype is not None and dtype != cls.dtype.fget(None): - raise ValueError(f"Invalid dtype {dtype}") + if dtype is None: + first_element = next(s for s in scalars if s is not pandas.NA) + dtype = first_element.dtype + + scalars = [s if s is not pandas.NA else dtype._na_repr() + for s in scalars] + + if len(scalars) == 0: + scalars = [dtype._na_repr()[0:0]] return cls._concat_same_type(scalars) diff --git a/skfda/representation/basis/_bspline.py b/skfda/representation/basis/_bspline.py index 2bb1e110d..cfd918d4e 100644 --- a/skfda/representation/basis/_bspline.py +++ b/skfda/representation/basis/_bspline.py @@ -117,7 +117,7 @@ def __init__(self, domain_range=None, n_basis=None, order=4, knots=None): raise ValueError("Must provide either a list of knots or the" "number of basis.") else: - knots = list(knots) + knots = tuple(knots) knots.sort() if domain_range is None: domain_range = (knots[0], knots[-1]) @@ -136,7 +136,7 @@ def __init__(self, domain_range=None, n_basis=None, order=4, knots=None): f"greater than 3.") self._order = order - self._knots = None if knots is None else list(knots) + self._knots = None if knots is None else tuple(knots) super().__init__(domain_range, n_basis) # Checks @@ -148,8 +148,8 @@ def __init__(self, domain_range=None, n_basis=None, order=4, knots=None): @property def knots(self): if self._knots is None: - return list(np.linspace(*self.domain_range[0], - self.n_basis - self.order + 2)) + return tuple(np.linspace(*self.domain_range[0], + self.n_basis - self.order + 2)) else: return self._knots @@ -166,8 +166,8 @@ def _evaluation_knots(self): .. 
[RS05] Ramsay, J., Silverman, B. W. (2005). *Functional Data Analysis*. Springer. 50-51. """ - return np.array([self.knots[0]] * (self.order - 1) + self.knots + - [self.knots[-1]] * (self.order - 1)) + return np.array((self.knots[0],) * (self.order - 1) + self.knots + + (self.knots[-1],) * (self.order - 1)) def _evaluate(self, eval_points): diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py index e1d41b08a..aa2d6ba33 100644 --- a/skfda/representation/basis/_fdatabasis.py +++ b/skfda/representation/basis/_fdatabasis.py @@ -8,7 +8,7 @@ import numpy as np from .. import grid -from ..._utils import constants +from ..._utils import constants, _nanequals from .._functional_data import FData @@ -670,7 +670,7 @@ def __eq__(self, other): # TODO check all other params return (super().__eq__(other) and self.basis == other.basis - and np.all(self.coefficients == other.coefficients)) + and np.all(_nanequals(self.coefficients, other.coefficients))) def concatenate(self, *others, as_coordinates=False): """Join samples from a similar FDataBasis object. @@ -856,6 +856,11 @@ def __init__(self, basis) -> None: def construct_array_type(cls) -> type: return FDataBasis + def _na_repr(self): + return FDataBasis( + basis=self.basis, + coefficients=((np.NaN,) * self.basis.n_basis,)) + def __eq__(self, other: Any) -> bool: """ Rules for equality (similar to categorical): diff --git a/tests/test_pandas_fdatabasis.py b/tests/test_pandas_fdatabasis.py index 3c8811024..6041f1d60 100644 --- a/tests/test_pandas_fdatabasis.py +++ b/tests/test_pandas_fdatabasis.py @@ -1,15 +1,17 @@ +import operator import skfda from pandas import Series +import pandas from pandas.tests.extension import base import pytest + import numpy as np + ############################################################################## # Fixtures ############################################################################## - - @pytest.fixture def dtype(): """A fixture providing the ExtensionDtype to validate.""" @@ -27,7 +29,7 @@ def data(): * data[0] and data[1] should not be equal """ - basis = skfda.representation.basis.Monomial(n_basis=5) + basis = skfda.representation.basis.BSpline(n_basis=5) coef_matrix = np.arange(100 * 5).reshape(100, 5) return skfda.FDataBasis(basis=basis, coefficients=coef_matrix) @@ -109,7 +111,7 @@ def na_cmp(): @pytest.fixture def na_value(): """The scalar missing value for this type. 
Default 'None'""" - return None + return pandas.NA @pytest.fixture @@ -190,24 +192,32 @@ def as_array(request): ############################################################################## -# class TestConstructors(base.BaseConstructorsTests): -# pass -# -# -# class TestDtype(base.BaseDtypeTests): -# -# @pytest.mark.skip(reason="Unsupported") -# def test_construct_from_string_own_name(self): -# pass -# -# @pytest.mark.skip(reason="Unsupported") -# def test_is_dtype_from_name(self): -# pass -# -# @pytest.mark.skip(reason="Unsupported") -# def test_eq_with_str(self): -# pass -# -# @pytest.mark.skip(reason="Unsupported") -# def test_construct_from_string(self, dtype): -# pass +class TestConstructors(base.BaseConstructorsTests): + + # Does not support scalars which are also ExtensionArrays + @pytest.mark.skip(reason="Unsupported") + def test_series_constructor_scalar_with_index(self): + pass + + @pytest.mark.skip(reason="Unsupported") + def test_from_dtype(self): + pass + + +class TestDtype(base.BaseDtypeTests): + + @pytest.mark.skip(reason="Unsupported") + def test_construct_from_string_own_name(self): + pass + + @pytest.mark.skip(reason="Unsupported") + def test_is_dtype_from_name(self): + pass + + @pytest.mark.skip(reason="Unsupported") + def test_eq_with_str(self): + pass + + @pytest.mark.skip(reason="Unsupported") + def test_construct_from_string(self, dtype): + pass From 5f8aef04d553f28072098c7ad99cc697cccac4eb Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 6 Aug 2020 01:57:08 +0200 Subject: [PATCH 019/210] Fix sort of BSpline knots. --- skfda/representation/basis/_bspline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/representation/basis/_bspline.py b/skfda/representation/basis/_bspline.py index cfd918d4e..004009c0e 100644 --- a/skfda/representation/basis/_bspline.py +++ b/skfda/representation/basis/_bspline.py @@ -118,7 +118,7 @@ def __init__(self, domain_range=None, n_basis=None, order=4, knots=None): "number of basis.") else: knots = tuple(knots) - knots.sort() + knots = sorted(knots) if domain_range is None: domain_range = (knots[0], knots[-1]) else: From 18968c7c2230d8e49dbf5c0499f33525d8e3bb9d Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 6 Aug 2020 18:20:43 +0200 Subject: [PATCH 020/210] Make __eq__ elementwise. --- skfda/_utils/__init__.py | 2 +- skfda/_utils/_utils.py | 7 - skfda/representation/_functional_data.py | 2 +- skfda/representation/basis/_fdatabasis.py | 26 ++- skfda/representation/grid.py | 86 ++++++-- tests/test_basis.py | 230 ++++++++++----------- tests/test_fdatagrid_numpy.py | 6 +- tests/test_grid.py | 2 +- tests/test_linear_differential_operator.py | 22 +- tests/test_pandas.py | 12 +- tests/test_pandas_fdatabasis.py | 63 +++--- tests/test_pandas_fdatagrid.py | 40 +++- 12 files changed, 296 insertions(+), 202 deletions(-) diff --git a/skfda/_utils/__init__.py b/skfda/_utils/__init__.py index f0d1befed..50c5975aa 100644 --- a/skfda/_utils/__init__.py +++ b/skfda/_utils/__init__.py @@ -7,4 +7,4 @@ _reshape_eval_points, _evaluate_grid, nquad_vec, _FDataCallable, _pairwise_commutative, - _domain_range, _nanequals) + _domain_range) diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index c0f0b135e..9537071dc 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -413,13 +413,6 @@ def _pairwise_commutative(function, arg1, arg2=None, **kwargs): (len(arg1), len(arg2))) -def _nanequals(a, b): - """ - Compare two arrays considering that NaNs are equal. 
- """ - return (a == b) | (np.isnan(a) & np.isnan(b)) - - def parameter_aliases(**alias_assignments): """Allows using aliases for parameters""" def decorator(f): diff --git a/skfda/representation/_functional_data.py b/skfda/representation/_functional_data.py index 893d77824..053a98ca7 100644 --- a/skfda/representation/_functional_data.py +++ b/skfda/representation/_functional_data.py @@ -638,7 +638,7 @@ def __getitem__(self, key): pass - def __eq__(self, other): + def equals(self, other): return ( type(self) == type(other) and self.extrapolation == other.extrapolation diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py index aa2d6ba33..1e99d8020 100644 --- a/skfda/representation/basis/_fdatabasis.py +++ b/skfda/representation/basis/_fdatabasis.py @@ -8,7 +8,7 @@ import numpy as np from .. import grid -from ..._utils import constants, _nanequals +from ..._utils import constants from .._functional_data import FData @@ -665,12 +665,26 @@ def __str__(self): f"\n_basis={self.basis}," f"\ncoefficients={self.coefficients})").replace('\n', '\n ') - def __eq__(self, other): + def equals(self, other): """Equality of FDataBasis""" # TODO check all other params - return (super().__eq__(other) + return (super().equals(other) and self.basis == other.basis - and np.all(_nanequals(self.coefficients, other.coefficients))) + and np.array_equal(self.coefficients, other.coefficients, + equal_nan=True)) + + def __eq__(self, other): + """Elementwise equality of FDataBasis""" + + if type(self) != type(other) or self.dtype != other.dtype: + raise TypeError("Types are not equal") + + if len(self) != len(other): + raise ValueError(f"Different lengths: " + f"len(self)={len(self)} and " + f"len(other)={len(other)}") + + return np.all(self.coefficients == other.coefficients, axis=1) def concatenate(self, *others, as_coordinates=False): """Join samples from a similar FDataBasis object. 
@@ -856,7 +870,7 @@ def __init__(self, basis) -> None:
     @classmethod
     def construct_array_type(cls) -> type:
         return FDataBasis
-    def _na_repr(self):
+    def _na_repr(self) -> FDataBasis:
         return FDataBasis(
             basis=self.basis,
             coefficients=((np.NaN,) * self.basis.n_basis,))
@@ -877,5 +891,5 @@ def __eq__(self, other: Any) -> bool:
             return (isinstance(other, FDataBasisDType)
                     and self.basis == other.basis)
-    def __hash__(self):
+    def __hash__(self) -> int:
         return hash(self.basis)
diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py
index 90534fc5a..3a7777475 100644
--- a/skfda/representation/grid.py
+++ b/skfda/representation/grid.py
@@ -8,8 +8,9 @@
 import copy
 import numbers
-import findiff
+from typing import Any
+import findiff
 import pandas.api.extensions
 import scipy.stats.mstats
@@ -521,15 +522,13 @@ def gmean(self):
                          scipy.stats.mstats.gmean(self.data_matrix, 0)],
                      sample_names=("geometric mean",))
-    def __eq__(self, other):
+    def equals(self, other):
         """Comparison of FDataGrid objects"""
-        if not isinstance(other, FDataGrid):
-            return NotImplemented
-
-        if not super().__eq__(other):
+        if not super().equals(other):
             return False
-        if not np.array_equal(self.data_matrix, other.data_matrix):
+        if not np.array_equal(self.data_matrix, other.data_matrix,
+                              equal_nan=True):
             return False
         if len(self.sample_points) != len(other.sample_points):
@@ -547,6 +546,19 @@
         return True
+    def __eq__(self, other):
+        """Elementwise equality of FDataGrid"""
+
+        if type(self) != type(other) or self.dtype != other.dtype:
+            raise TypeError("Types are not equal")
+
+        if len(self) != len(other):
+            raise ValueError(f"Different lengths: "
+                             f"len(self)={len(self)} and "
+                             f"len(other)={len(other)}")
+
+        # Reduce over all axes except the sample axis.
+        return np.all(self.data_matrix == other.data_matrix,
+                      axis=tuple(range(1, self.data_matrix.ndim)))
+
     def _get_op_matrix(self, other):
         if isinstance(other, numbers.Number):
             return other
@@ -1093,7 +1105,10 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
     @property
     def dtype(self):
         """The dtype for this extension array, FDataGridDType"""
-        return FDataGridDType()
+        return FDataGridDType(
+            sample_points=self.sample_points,
+            domain_range=self.domain_range,
+            dim_codomain=self.dim_codomain)
     @property
     def nbytes(self) -> int:
@@ -1104,7 +1119,6 @@ def nbytes(self) -> int:
             p.nbytes for p in self.sample_points)
-@pandas.api.extensions.register_extension_dtype
 class FDataGridDType(pandas.api.extensions.ExtensionDtype):
     """
     DType corresponding to FDataGrid in Pandas
     """
@@ -1114,14 +1128,54 @@
     name = 'FDataGrid'
     kind = 'O'
     type = FDataGrid
     na_value = pandas.NA
-    @classmethod
-    def construct_from_string(cls, string):
-        if string == cls.name:
-            return cls()
-        else:
-            raise TypeError(
-                f"Cannot construct a '{cls.__name__}' from '{string}'")
+    def __init__(self, sample_points, dim_codomain, domain_range=None) -> None:
+        sample_points = _tuple_of_arrays(sample_points)
+
+        self.sample_points = tuple(tuple(s) for s in sample_points)
+
+        if domain_range is None:
+            domain_range = np.array(
+                [(self.sample_points[i][0], self.sample_points[i][-1])
+                 for i in range(len(self.sample_points))])
+
+        self.domain_range = _domain_range(domain_range)
+        self.dim_codomain = dim_codomain
     @classmethod
     def construct_array_type(cls):
         return FDataGrid
+
+    def _na_repr(self) -> FDataGrid:
+
+        shape = ((1,)
+                 + tuple(len(s) for s in self.sample_points)
+                 + (self.dim_codomain,))
+
+        data_matrix = np.full(shape=shape, fill_value=np.NaN)
+
+        return FDataGrid(
+            sample_points=self.sample_points,
+            domain_range=self.domain_range,
data_matrix=data_matrix)
+
+    def __eq__(self, other: Any) -> bool:
+        """
+        Rules for equality (similar to categorical):
+        1) Any FDataGridDType is equal to the string 'FDataGrid'
+        2) Any FDataGridDType is equal to itself
+        3) Otherwise, they are equal if the arguments are equal.
+        4) Any other comparison returns False
+        """
+        if isinstance(other, str):
+            return other == self.name
+        elif other is self:
+            return True
+        else:
+            return (isinstance(other, FDataGridDType)
+                    and self.dim_codomain == other.dim_codomain
+                    and self.domain_range == other.domain_range
+                    and self.sample_points == other.sample_points)
+
+    def __hash__(self) -> int:
+        return hash((self.sample_points,
+                     self.domain_range, self.dim_codomain))
diff --git a/tests/test_basis.py b/tests/test_basis.py
index da2531eaf..3c0863a67 100644
--- a/tests/test_basis.py
+++ b/tests/test_basis.py
@@ -242,21 +242,21 @@ def test_fdatabasis__add__(self):
         monomial1 = FDataBasis(Monomial(n_basis=3), [1, 2, 3])
         monomial2 = FDataBasis(Monomial(n_basis=3), [[1, 2, 3], [3, 4, 5]])
-        np.testing.assert_equal(monomial1 + monomial2,
-                                FDataBasis(Monomial(n_basis=3),
-                                           [[2, 4, 6], [4, 6, 8]]))
-        np.testing.assert_equal(monomial2 + 1,
-                                FDataBasis(Monomial(n_basis=3),
-                                           [[2, 2, 3], [4, 4, 5]]))
-        np.testing.assert_equal(1 + monomial2,
-                                FDataBasis(Monomial(n_basis=3),
-                                           [[2, 2, 3], [4, 4, 5]]))
-        np.testing.assert_equal(monomial2 + [1, 2],
-                                FDataBasis(Monomial(n_basis=3),
-                                           [[2, 2, 3], [5, 4, 5]]))
-        np.testing.assert_equal([1, 2] + monomial2,
-                                FDataBasis(Monomial(n_basis=3),
-                                           [[2, 2, 3], [5, 4, 5]]))
+        self.assertTrue((monomial1 + monomial2).equals(
+            FDataBasis(Monomial(n_basis=3),
+                       [[2, 4, 6], [4, 6, 8]])))
+        self.assertTrue((monomial2 + 1).equals(
+            FDataBasis(Monomial(n_basis=3),
+                       [[2, 2, 3], [4, 4, 5]])))
+        self.assertTrue((1 + monomial2).equals(
+            FDataBasis(Monomial(n_basis=3),
+                       [[2, 2, 3], [4, 4, 5]])))
+        self.assertTrue((monomial2 + [1, 2]).equals(
+            FDataBasis(Monomial(n_basis=3),
+                       [[2, 2, 3], [5, 4, 5]])))
+        self.assertTrue(([1, 2] + monomial2).equals(
+            FDataBasis(Monomial(n_basis=3),
+                       [[2, 2, 3], [5, 4, 5]])))
         with np.testing.assert_raises(TypeError):
             monomial2 + FDataBasis(Fourier(n_basis=3),
@@ -266,21 +266,21 @@ def test_fdatabasis__sub__(self):
         monomial1 = FDataBasis(Monomial(n_basis=3), [1, 2, 3])
         monomial2 = FDataBasis(Monomial(n_basis=3), [[1, 2, 3], [3, 4, 5]])
-        np.testing.assert_equal(monomial1 - monomial2,
-                                FDataBasis(Monomial(n_basis=3),
-                                           [[0, 0, 0], [-2, -2, -2]]))
-        np.testing.assert_equal(monomial2 - 1,
-                                FDataBasis(Monomial(n_basis=3),
-                                           [[0, 2, 3], [2, 4, 5]]))
-        np.testing.assert_equal(1 - monomial2,
-                                FDataBasis(Monomial(n_basis=3),
-                                           [[0, -2, -3], [-2, -4, -5]]))
-        np.testing.assert_equal(monomial2 - [1, 2],
-                                FDataBasis(Monomial(n_basis=3),
-                                           [[0, 2, 3], [1, 4, 5]]))
-        np.testing.assert_equal([1, 2] - monomial2,
-                                FDataBasis(Monomial(n_basis=3),
-                                           [[0, -2, -3], [-1, -4, -5]]))
+        self.assertTrue((monomial1 - monomial2).equals(
+            FDataBasis(Monomial(n_basis=3),
+                       [[0, 0, 0], [-2, -2, -2]])))
+        self.assertTrue((monomial2 - 1).equals(
+            FDataBasis(Monomial(n_basis=3),
+                       [[0, 2, 3], [2, 4, 5]])))
+        self.assertTrue((1 - monomial2).equals(
+            FDataBasis(Monomial(n_basis=3),
+                       [[0, -2, -3], [-2, -4, -5]])))
+        self.assertTrue((monomial2 - [1, 2]).equals(
+            FDataBasis(Monomial(n_basis=3),
+                       [[0, 2, 3], [1, 4, 5]])))
+        self.assertTrue(([1, 2] - monomial2).equals(
+            FDataBasis(Monomial(n_basis=3),
+                       [[0, -2, -3], [-1, -4, -5]])))
         with np.testing.assert_raises(TypeError):
             monomial2 - FDataBasis(Fourier(n_basis=3),
                                    [[2, 2, 3], [5, 4, 5]])
     def
test_fdatabasis__mul__(self): monomial1 = FDataBasis(Monomial(n_basis=3), [1, 2, 3]) monomial2 = FDataBasis(Monomial(n_basis=3), [[1, 2, 3], [3, 4, 5]]) - np.testing.assert_equal(monomial1 * 2, - FDataBasis(Monomial(n_basis=3), - [[2, 4, 6]])) - np.testing.assert_equal(3 * monomial2, - FDataBasis(Monomial(n_basis=3), - [[3, 6, 9], [9, 12, 15]])) - np.testing.assert_equal(3 * monomial2, - monomial2 * 3) - - np.testing.assert_equal(monomial2 * [1, 2], - FDataBasis(Monomial(n_basis=3), - [[1, 2, 3], [6, 8, 10]])) - np.testing.assert_equal([1, 2] * monomial2, - FDataBasis(Monomial(n_basis=3), - [[1, 2, 3], [6, 8, 10]])) + self.assertTrue((monomial1 * 2).equals( + FDataBasis(Monomial(n_basis=3), + [[2, 4, 6]]))) + self.assertTrue((3 * monomial2).equals( + FDataBasis(Monomial(n_basis=3), + [[3, 6, 9], [9, 12, 15]]))) + self.assertTrue((3 * monomial2).equals( + monomial2 * 3)) + + self.assertTrue((monomial2 * [1, 2]).equals( + FDataBasis(Monomial(n_basis=3), + [[1, 2, 3], [6, 8, 10]]))) + self.assertTrue(([1, 2] * monomial2).equals( + FDataBasis(Monomial(n_basis=3), + [[1, 2, 3], [6, 8, 10]]))) with np.testing.assert_raises(TypeError): monomial2 * FDataBasis(Fourier(n_basis=3), @@ -313,48 +313,31 @@ def test_fdatabasis__mul__(self): with np.testing.assert_raises(TypeError): monomial2 * monomial2 - def test_fdatabasis__mul__2(self): + def test_fdatabasis__div__(self): monomial1 = FDataBasis(Monomial(n_basis=3), [1, 2, 3]) monomial2 = FDataBasis(Monomial(n_basis=3), [[1, 2, 3], [3, 4, 5]]) - np.testing.assert_equal(monomial1 / 2, - FDataBasis(Monomial(n_basis=3), - [[1 / 2, 1, 3 / 2]])) - np.testing.assert_equal(monomial2 / 2, - FDataBasis(Monomial(n_basis=3), - [[1 / 2, 1, 3 / 2], [3 / 2, 2, 5 / 2]])) + self.assertTrue((monomial1 / 2).equals( + FDataBasis(Monomial(n_basis=3), + [[1 / 2, 1, 3 / 2]]))) + self.assertTrue((monomial2 / 2).equals( + FDataBasis(Monomial(n_basis=3), + [[1 / 2, 1, 3 / 2], [3 / 2, 2, 5 / 2]]))) - np.testing.assert_equal(monomial2 / [1, 2], - FDataBasis(Monomial(n_basis=3), - [[1, 2, 3], [3 / 2, 2, 5 / 2]])) + self.assertTrue((monomial2 / [1, 2]).equals( + FDataBasis(Monomial(n_basis=3), + [[1, 2, 3], [3 / 2, 2, 5 / 2]]))) def test_fdatabasis_derivative_constant(self): - monomial = FDataBasis(Monomial(n_basis=8), - [1, 5, 8, 9, 7, 8, 4, 5]) - monomial2 = FDataBasis(Monomial(n_basis=5), - [[4, 9, 7, 4, 3], - [1, 7, 9, 8, 5], - [4, 6, 6, 6, 8]]) + constant = FDataBasis(Constant(), + [[1], [2], [3], [4]]) - np.testing.assert_equal(monomial.derivative(), - FDataBasis(Monomial(n_basis=7), - [5, 16, 27, 28, 40, 24, 35])) - np.testing.assert_equal(monomial.derivative(order=0), monomial) - np.testing.assert_equal(monomial.derivative(order=6), - FDataBasis(Monomial(n_basis=2), - [2880, 25200])) - - np.testing.assert_equal(monomial2.derivative(), - FDataBasis(Monomial(n_basis=4), - [[9, 14, 12, 12], - [7, 18, 24, 20], - [6, 12, 18, 32]])) - np.testing.assert_equal(monomial2.derivative(order=0), monomial2) - np.testing.assert_equal(monomial2.derivative(order=3), - FDataBasis(Monomial(n_basis=2), - [[24, 72], - [48, 120], - [36, 192]])) + self.assertTrue(constant.derivative().equals( + FDataBasis(Constant(), + [[0], [0], [0], [0]]))) + self.assertTrue(constant.derivative(order=0).equals( + FDataBasis(Constant(), + [[1], [2], [3], [4]]))) def test_fdatabasis_derivative_monomial(self): monomial = FDataBasis(Monomial(n_basis=8), @@ -364,25 +347,24 @@ def test_fdatabasis_derivative_monomial(self): [1, 7, 9, 8, 5], [4, 6, 6, 6, 8]]) - np.testing.assert_equal(monomial.derivative(), - 
FDataBasis(Monomial(n_basis=7), - [5, 16, 27, 28, 40, 24, 35])) - np.testing.assert_equal(monomial.derivative(order=0), monomial) - np.testing.assert_equal(monomial.derivative(order=6), - FDataBasis(Monomial(n_basis=2), - [2880, 25200])) - - np.testing.assert_equal(monomial2.derivative(), - FDataBasis(Monomial(n_basis=4), - [[9, 14, 12, 12], - [7, 18, 24, 20], - [6, 12, 18, 32]])) - np.testing.assert_equal(monomial2.derivative(order=0), monomial2) - np.testing.assert_equal(monomial2.derivative(order=3), - FDataBasis(Monomial(n_basis=2), - [[24, 72], - [48, 120], - [36, 192]])) + self.assertTrue(monomial.derivative().equals( + FDataBasis(Monomial(n_basis=7), + [5, 16, 27, 28, 40, 24, 35]))) + self.assertTrue(monomial.derivative(order=0).equals(monomial)) + self.assertTrue(monomial.derivative(order=6).equals( + FDataBasis(Monomial(n_basis=2), + [2880, 25200]))) + self.assertTrue(monomial2.derivative().equals( + FDataBasis(Monomial(n_basis=4), + [[9, 14, 12, 12], + [7, 18, 24, 20], + [6, 12, 18, 32]]))) + self.assertTrue(monomial2.derivative(order=0).equals(monomial2)) + self.assertTrue(monomial2.derivative(order=3).equals( + FDataBasis(Monomial(n_basis=2), + [[24, 72], + [48, 120], + [36, 192]]))) def test_fdatabasis_derivative_fourier(self): fourier = FDataBasis(Fourier(n_basis=7), @@ -397,34 +379,38 @@ def test_fdatabasis_derivative_fourier(self): fou2 = fourier.derivative(order=2) np.testing.assert_equal(fou1.basis, fourier.basis) - np.testing.assert_almost_equal(fou1.coefficients.round(5), - np.atleast_2d([0, -50.26548, 31.41593, - -100.53096, 113.09734, - -94.24778, 75.39822])) - np.testing.assert_equal(fou0, fourier) + np.testing.assert_almost_equal( + fou1.coefficients.round(5), + np.atleast_2d([0, -50.26548, 31.41593, + -100.53096, 113.09734, + -94.24778, 75.39822])) + self.assertTrue(fou0.equals(fourier)) np.testing.assert_equal(fou2.basis, fourier.basis) - np.testing.assert_almost_equal(fou2.coefficients.round(5), - np.atleast_2d([0, -197.39209, -315.82734, - -1421.22303, -1263.30936, - -1421.22303, -1776.52879])) + np.testing.assert_almost_equal( + fou2.coefficients.round(5), + np.atleast_2d([0, -197.39209, -315.82734, + -1421.22303, -1263.30936, + -1421.22303, -1776.52879])) fou0 = fourier2.derivative(order=0) fou1 = fourier2.derivative() fou2 = fourier2.derivative(order=2) np.testing.assert_equal(fou1.basis, fourier2.basis) - np.testing.assert_almost_equal(fou1.coefficients.round(5), - [[0, -43.98230, 56.54867, -37.69911, 50.26548], - [0, -56.54867, 43.98230, - - 62.83185, 100.53096], - [0, -37.69911, 37.69911, -100.53096, 75.39822]]) - np.testing.assert_equal(fou0, fourier2) + np.testing.assert_almost_equal( + fou1.coefficients.round(5), + [[0, -43.98230, 56.54867, -37.69911, 50.26548], + [0, -56.54867, 43.98230, - + 62.83185, 100.53096], + [0, -37.69911, 37.69911, -100.53096, 75.39822]]) + self.assertTrue(fou0.equals(fourier2)) np.testing.assert_equal(fou2.basis, fourier2.basis) - np.testing.assert_almost_equal(fou2.coefficients.round(5), - [[0, -355.30576, -276.34892, -631.65468, -473.74101], - [0, -276.34892, -355.30576, - - 1263.30936, -789.56835], - [0, -236.87051, -236.87051, -947.48202, -1263.30936]]) + np.testing.assert_almost_equal( + fou2.coefficients.round(5), + [[0, -355.30576, -276.34892, -631.65468, -473.74101], + [0, -276.34892, -355.30576, - + 1263.30936, -789.56835], + [0, -236.87051, -236.87051, -947.48202, -1263.30936]]) def test_fdatabasis_derivative_bspline(self): bspline = FDataBasis(BSpline(n_basis=8), @@ -441,7 +427,7 @@ def 
test_fdatabasis_derivative_bspline(self): np.testing.assert_almost_equal(bs1.coefficients, np.atleast_2d([60, 22.5, 5, -10, 5, -30, 15])) - np.testing.assert_equal(bs0, bspline) + self.assertTrue(bs0.equals(bspline)) np.testing.assert_equal(bs2.basis, BSpline(n_basis=6, order=2)) np.testing.assert_almost_equal(bs2.coefficients, np.atleast_2d([-375, -87.5, -75, @@ -456,7 +442,7 @@ def test_fdatabasis_derivative_bspline(self): [[30, -6, -9, -6], [36, 6, -3, -18], [12, 0, 0, 12]]) - np.testing.assert_equal(bs0, bspline2) + self.assertTrue(bs0.equals(bspline2)) np.testing.assert_equal(bs2.basis, BSpline(n_basis=3, order=2)) np.testing.assert_almost_equal(bs2.coefficients, [[-144, -6, 12], @@ -478,7 +464,7 @@ def test_concatenate(self): [fd1.coefficients, fd2.coefficients])) def test_vector_valued(self): - X, y = skfda.datasets.fetch_weather(return_X_y=True) + X, _ = skfda.datasets.fetch_weather(return_X_y=True) basis_dim = skfda.representation.basis.Fourier( n_basis=7, domain_range=X.domain_range) diff --git a/tests/test_fdatagrid_numpy.py b/tests/test_fdatagrid_numpy.py index b1a3b13cb..e58d5396d 100644 --- a/tests/test_fdatagrid_numpy.py +++ b/tests/test_fdatagrid_numpy.py @@ -14,7 +14,7 @@ def test_monary_ufunc(self): fd_sqrt_build = FDataGrid(np.sqrt(data_matrix)) - self.assertEqual(fd_sqrt, fd_sqrt_build) + self.assertTrue(fd_sqrt.equals(fd_sqrt_build)) def test_binary_ufunc(self): data_matrix = np.arange(15).reshape(3, 5) @@ -27,7 +27,7 @@ def test_binary_ufunc(self): fd_mul_build = FDataGrid(data_matrix * data_matrix2) - self.assertEqual(fd_mul, fd_mul_build) + self.assertTrue(fd_mul.equals(fd_mul_build)) def test_out_ufunc(self): data_matrix = np.arange(15.).reshape(3, 5) @@ -39,7 +39,7 @@ def test_out_ufunc(self): fd_sqrt_build = FDataGrid(np.sqrt(data_matrix_copy)) - self.assertEqual(fd, fd_sqrt_build) + self.assertTrue(fd.equals(fd_sqrt_build)) if __name__ == '__main__': diff --git a/tests/test_grid.py b/tests/test_grid.py index e39db303a..4e85ae5ea 100644 --- a/tests/test_grid.py +++ b/tests/test_grid.py @@ -23,7 +23,7 @@ def test_init(self): def test_copy_equals(self): fd = FDataGrid([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6]]) - self.assertEqual(fd, fd.copy()) + self.assertTrue(fd.equals(fd.copy())) def test_mean(self): fd = FDataGrid([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6]]) diff --git a/tests/test_linear_differential_operator.py b/tests/test_linear_differential_operator.py index 9bdd506a5..2e22e0948 100644 --- a/tests/test_linear_differential_operator.py +++ b/tests/test_linear_differential_operator.py @@ -7,12 +7,24 @@ class TestLinearDifferentialOperator(unittest.TestCase): + def _assert_equal_weights(self, weights, weights2, msg): + self.assertEqual(len(weights), len(weights2), msg) + + for w, w2 in zip(weights, weights2): + + eq = getattr(w, "equals", None) + + if eq is None: + self.assertEqual(w, w2, msg) + else: + self.assertTrue(eq(w2), msg) + def test_init_default(self): """Tests default initialization (do not penalize).""" lfd = LinearDifferentialOperator() weightfd = [FDataBasis(Constant((0, 1)), 0)] - np.testing.assert_equal( + self._assert_equal_weights( lfd.weights, weightfd, "Wrong list of weight functions of the linear operator") @@ -23,7 +35,7 @@ def test_init_integer(self): lfd_0 = LinearDifferentialOperator(order=0) weightfd = [FDataBasis(Constant((0, 1)), 1)] - np.testing.assert_equal( + self._assert_equal_weights( lfd_0.weights, weightfd, "Wrong list of weight functions of the linear operator") @@ -32,7 +44,7 @@ def test_init_integer(self): consfd = FDataBasis(Constant((0, 
1)), [[0], [0], [0], [1]]) bwtlist3 = list(consfd) - np.testing.assert_equal( + self._assert_equal_weights( lfd_3.weights, bwtlist3, "Wrong list of weight functions of the linear operator") @@ -50,7 +62,7 @@ def test_init_list_int(self): lfd = LinearDifferentialOperator(weights=coefficients) - np.testing.assert_equal( + self._assert_equal_weights( lfd.weights, list(fd), "Wrong list of weight functions of the linear operator") @@ -69,7 +81,7 @@ def test_init_list_fdatabasis(self): fdlist = [FDataBasis(monomial, w) for w in weights] lfd = LinearDifferentialOperator(weights=fdlist) - np.testing.assert_equal( + self._assert_equal_weights( lfd.weights, list(fd), "Wrong list of weight functions of the linear operator") diff --git a/tests/test_pandas.py b/tests/test_pandas.py index 2f69fd590..a05075eaa 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -17,29 +17,29 @@ def test_fdatagrid_series(self): self.assertIsInstance( series.dtype, skfda.representation.grid.FDataGridDType) self.assertEqual(len(series), self.fd.n_samples) - self.assertEqual(series[0], self.fd[0]) + self.assertTrue(series[0].equals(self.fd[0])) def test_fdatabasis_series(self): series = pd.Series(self.fd_basis) self.assertIsInstance( series.dtype, skfda.representation.basis.FDataBasisDType) self.assertEqual(len(series), self.fd_basis.n_samples) - self.assertEqual(series[0], self.fd_basis[0]) + self.assertTrue(series[0].equals(self.fd_basis[0])) def test_fdatagrid_dataframe(self): df = pd.DataFrame({"function": self.fd}) self.assertIsInstance( df["function"].dtype, skfda.representation.grid.FDataGridDType) self.assertEqual(len(df["function"]), self.fd.n_samples) - self.assertEqual(df["function"][0], self.fd[0]) + self.assertTrue(df["function"][0].equals(self.fd[0])) def test_fdatabasis_dataframe(self): df = pd.DataFrame({"function": self.fd_basis}) self.assertIsInstance( df["function"].dtype, skfda.representation.basis.FDataBasisDType) self.assertEqual(len(df["function"]), self.fd_basis.n_samples) - self.assertEqual(df["function"][0], self.fd_basis[0]) + self.assertTrue(df["function"][0].equals(self.fd_basis[0])) def test_take(self): - self.assertEqual(self.fd.take(0), self.fd[0]) - self.assertEqual(self.fd.take(0, axis=0), self.fd[0]) + self.assertTrue(self.fd.take(0).equals(self.fd[0])) + self.assertTrue(self.fd.take(0, axis=0).equals(self.fd[0])) diff --git a/tests/test_pandas_fdatabasis.py b/tests/test_pandas_fdatabasis.py index 6041f1d60..de0e7fbe7 100644 --- a/tests/test_pandas_fdatabasis.py +++ b/tests/test_pandas_fdatabasis.py @@ -192,32 +192,37 @@ def as_array(request): ############################################################################## -class TestConstructors(base.BaseConstructorsTests): - - # Does not support scalars which are also ExtensionArrays - @pytest.mark.skip(reason="Unsupported") - def test_series_constructor_scalar_with_index(self): - pass - - @pytest.mark.skip(reason="Unsupported") - def test_from_dtype(self): - pass - - -class TestDtype(base.BaseDtypeTests): - - @pytest.mark.skip(reason="Unsupported") - def test_construct_from_string_own_name(self): - pass - - @pytest.mark.skip(reason="Unsupported") - def test_is_dtype_from_name(self): - pass - - @pytest.mark.skip(reason="Unsupported") - def test_eq_with_str(self): - pass - - @pytest.mark.skip(reason="Unsupported") - def test_construct_from_string(self, dtype): - pass +# class TestConstructors(base.BaseConstructorsTests): +# +# # Does not support scalars which are also ExtensionArrays +# @pytest.mark.skip(reason="Unsupported") +# 
def test_series_constructor_scalar_with_index(self): +# pass +# +# # Tries to construct dtype from string +# @pytest.mark.skip(reason="Unsupported") +# def test_from_dtype(self): +# pass +# +# +# class TestDtype(base.BaseDtypeTests): +# +# # Tries to construct dtype from string +# @pytest.mark.skip(reason="Unsupported") +# def test_construct_from_string_own_name(self): +# pass +# +# # Tries to construct dtype from string +# @pytest.mark.skip(reason="Unsupported") +# def test_is_dtype_from_name(self): +# pass +# +# # Tries to construct dtype from string +# @pytest.mark.skip(reason="Unsupported") +# def test_eq_with_str(self): +# pass +# +# # Tries to construct dtype from string +# @pytest.mark.skip(reason="Unsupported") +# def test_construct_from_string(self, dtype): +# pass diff --git a/tests/test_pandas_fdatagrid.py b/tests/test_pandas_fdatagrid.py index 122f13187..0b4badbb7 100644 --- a/tests/test_pandas_fdatagrid.py +++ b/tests/test_pandas_fdatagrid.py @@ -1,15 +1,17 @@ +import operator import skfda from pandas import Series +import pandas from pandas.tests.extension import base import pytest + import numpy as np + ############################################################################## # Fixtures ############################################################################## - - @pytest.fixture def dtype(): """A fixture providing the ExtensionDtype to validate.""" @@ -105,7 +107,7 @@ def na_cmp(): @pytest.fixture def na_value(): """The scalar missing value for this type. Default 'None'""" - return None + return pandas.NA @pytest.fixture @@ -187,8 +189,36 @@ def as_array(request): # class TestConstructors(base.BaseConstructorsTests): -# pass +# +# # Does not support scalars which are also ExtensionArrays +# @pytest.mark.skip(reason="Unsupported") +# def test_series_constructor_scalar_with_index(self): +# pass +# +# # Tries to construct dtype from string +# @pytest.mark.skip(reason="Unsupported") +# def test_from_dtype(self): +# pass # # # class TestDtype(base.BaseDtypeTests): -# pass +# +# # Tries to construct dtype from string +# @pytest.mark.skip(reason="Unsupported") +# def test_construct_from_string_own_name(self): +# pass +# +# # Tries to construct dtype from string +# @pytest.mark.skip(reason="Unsupported") +# def test_is_dtype_from_name(self): +# pass +# +# # Tries to construct dtype from string +# @pytest.mark.skip(reason="Unsupported") +# def test_eq_with_str(self): +# pass +# +# # Tries to construct dtype from string +# @pytest.mark.skip(reason="Unsupported") +# def test_construct_from_string(self, dtype): +# pass From 7f1e41591511105c05dc696723540f1db356130d Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 6 Aug 2020 18:54:39 +0200 Subject: [PATCH 021/210] Passing constructor and dtype tests. --- skfda/representation/_functional_data.py | 9 --- skfda/representation/basis/_fdatabasis.py | 12 +++- skfda/representation/grid.py | 22 ++++-- tests/test_pandas_fdatabasis.py | 68 +++++++++---------- tests/test_pandas_fdatagrid.py | 82 +++++++++++++---------- 5 files changed, 104 insertions(+), 89 deletions(-) diff --git a/skfda/representation/_functional_data.py b/skfda/representation/_functional_data.py index 053a98ca7..05afe47bc 100644 --- a/skfda/representation/_functional_data.py +++ b/skfda/representation/_functional_data.py @@ -769,15 +769,6 @@ def _from_factorized(cls, values, original): raise NotImplementedError("Factorization does not make sense for " "functional data") - def isna(self): - """ - A 1-D array indicating if each value is missing. 
- - Returns: - na_values (np.ndarray): Array full of False values. - """ - return np.zeros(self.n_samples, dtype=bool) - def take(self, indices, allow_fill=False, fill_value=None, axis=0): """Take elements from an array. diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py index 1e99d8020..f63766af2 100644 --- a/skfda/representation/basis/_fdatabasis.py +++ b/skfda/representation/basis/_fdatabasis.py @@ -670,8 +670,7 @@ def equals(self, other): # TODO check all other params return (super().equals(other) and self.basis == other.basis - and np.array_equal(self.coefficients, other.coefficients, - equal_nan=True)) + and np.array_equal(self.coefficients, other.coefficients)) def __eq__(self, other): """Elementwise equality of FDataBasis""" @@ -851,6 +850,15 @@ def nbytes(self) -> int: """ return self.coefficients.nbytes() + def isna(self): + """ + A 1-D array indicating if each value is missing. + + Returns: + na_values (np.ndarray): Positions of NA. + """ + return np.all(np.isnan(self.coefficients), axis=1) + class FDataBasisDType(pandas.api.extensions.ExtensionDtype): """ diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py index 3a7777475..f97a55f57 100644 --- a/skfda/representation/grid.py +++ b/skfda/representation/grid.py @@ -187,8 +187,7 @@ def __init__(self, data_matrix, sample_points=None, .format(data_shape, sample_points_shape)) self._sample_range = np.array( - [(self.sample_points[i][0], self.sample_points[i][-1]) - for i in range(self.dim_domain)]) + [(s[0], s[-1]) for s in self.sample_points]) if domain_range is None: domain_range = self.sample_range @@ -527,8 +526,7 @@ def equals(self, other): if not super().equals(other): return False - if not np.array_equal(self.data_matrix, other.data_matrix, - equal_nan=True): + if not np.array_equal(self.data_matrix, other.data_matrix): return False if len(self.sample_points) != len(other.sample_points): @@ -557,7 +555,8 @@ def __eq__(self, other): f"len(self)={len(self)} and " f"len(other)={len(other)}") - return np.all(self.coefficients == other.coefficients, axis=1) + return np.all(self.data_matrix == other.data_matrix, + axis=tuple(range(1, self.data_matrix.ndim))) def _get_op_matrix(self, other): if isinstance(other, numbers.Number): @@ -1118,6 +1117,16 @@ def nbytes(self) -> int: return self.data_matrix.nbytes() + sum( p.nbytes() for p in self.sample_points) + def isna(self): + """ + A 1-D array indicating if each value is missing. + + Returns: + na_values (np.ndarray): Positions of NA. 
+ """ + return np.all(np.isnan(self.data_matrix), + axis=tuple(range(1, self.data_matrix.ndim))) + class FDataGridDType(pandas.api.extensions.ExtensionDtype): """ @@ -1135,8 +1144,7 @@ def __init__(self, sample_points, dim_codomain, domain_range=None) -> None: if domain_range is None: domain_range = np.array( - [(self.sample_points[i][0], self.sample_points[i][-1]) - for i in range(self.dim_domain)]) + [(s[0], s[-1]) for s in self.sample_points]) self.domain_range = _domain_range(domain_range) self.dim_codomain = dim_codomain diff --git a/tests/test_pandas_fdatabasis.py b/tests/test_pandas_fdatabasis.py index de0e7fbe7..4b7480be2 100644 --- a/tests/test_pandas_fdatabasis.py +++ b/tests/test_pandas_fdatabasis.py @@ -192,37 +192,37 @@ def as_array(request): ############################################################################## -# class TestConstructors(base.BaseConstructorsTests): -# -# # Does not support scalars which are also ExtensionArrays -# @pytest.mark.skip(reason="Unsupported") -# def test_series_constructor_scalar_with_index(self): -# pass -# -# # Tries to construct dtype from string -# @pytest.mark.skip(reason="Unsupported") -# def test_from_dtype(self): -# pass -# -# -# class TestDtype(base.BaseDtypeTests): -# -# # Tries to construct dtype from string -# @pytest.mark.skip(reason="Unsupported") -# def test_construct_from_string_own_name(self): -# pass -# -# # Tries to construct dtype from string -# @pytest.mark.skip(reason="Unsupported") -# def test_is_dtype_from_name(self): -# pass -# -# # Tries to construct dtype from string -# @pytest.mark.skip(reason="Unsupported") -# def test_eq_with_str(self): -# pass -# -# # Tries to construct dtype from string -# @pytest.mark.skip(reason="Unsupported") -# def test_construct_from_string(self, dtype): -# pass +class TestConstructors(base.BaseConstructorsTests): + + # Does not support scalars which are also ExtensionArrays + @pytest.mark.skip(reason="Unsupported") + def test_series_constructor_scalar_with_index(self): + pass + + # Tries to construct dtype from string + @pytest.mark.skip(reason="Unsupported") + def test_from_dtype(self): + pass + + +class TestDtype(base.BaseDtypeTests): + + # Tries to construct dtype from string + @pytest.mark.skip(reason="Unsupported") + def test_construct_from_string_own_name(self): + pass + + # Tries to construct dtype from string + @pytest.mark.skip(reason="Unsupported") + def test_is_dtype_from_name(self): + pass + + # Tries to construct dtype from string + @pytest.mark.skip(reason="Unsupported") + def test_eq_with_str(self): + pass + + # Tries to construct dtype from string + @pytest.mark.skip(reason="Unsupported") + def test_construct_from_string(self, dtype): + pass diff --git a/tests/test_pandas_fdatagrid.py b/tests/test_pandas_fdatagrid.py index 0b4badbb7..04bb55cb9 100644 --- a/tests/test_pandas_fdatagrid.py +++ b/tests/test_pandas_fdatagrid.py @@ -15,7 +15,12 @@ @pytest.fixture def dtype(): """A fixture providing the ExtensionDtype to validate.""" - return skfda.representation.grid.FDataGridDType() + return skfda.representation.grid.FDataGridDType( + sample_points=[ + np.arange(10), + np.arange(10) / 10], + dim_codomain=3 + ) @pytest.fixture @@ -26,9 +31,12 @@ def data(): * data[0] and data[1] should not be equal """ - data_matrix = np.arange(100 * 10).reshape(100, 10) + data_matrix = np.arange(100 * 10 * 10 * 3).reshape(100, 10, 10, 3) + sample_points = [ + np.arange(10), + np.arange(10) / 10] - return skfda.FDataGrid(data_matrix) + return skfda.FDataGrid(data_matrix, 
sample_points=sample_points) @pytest.fixture @@ -188,37 +196,37 @@ def as_array(request): ############################################################################## -# class TestConstructors(base.BaseConstructorsTests): -# -# # Does not support scalars which are also ExtensionArrays -# @pytest.mark.skip(reason="Unsupported") -# def test_series_constructor_scalar_with_index(self): -# pass -# -# # Tries to construct dtype from string -# @pytest.mark.skip(reason="Unsupported") -# def test_from_dtype(self): -# pass -# -# -# class TestDtype(base.BaseDtypeTests): -# -# # Tries to construct dtype from string -# @pytest.mark.skip(reason="Unsupported") -# def test_construct_from_string_own_name(self): -# pass -# -# # Tries to construct dtype from string -# @pytest.mark.skip(reason="Unsupported") -# def test_is_dtype_from_name(self): -# pass -# -# # Tries to construct dtype from string -# @pytest.mark.skip(reason="Unsupported") -# def test_eq_with_str(self): -# pass -# -# # Tries to construct dtype from string -# @pytest.mark.skip(reason="Unsupported") -# def test_construct_from_string(self, dtype): -# pass +class TestConstructors(base.BaseConstructorsTests): + + # Does not support scalars which are also ExtensionArrays + @pytest.mark.skip(reason="Unsupported") + def test_series_constructor_scalar_with_index(self): + pass + + # Tries to construct dtype from string + @pytest.mark.skip(reason="Unsupported") + def test_from_dtype(self): + pass + + +class TestDtype(base.BaseDtypeTests): + + # Tries to construct dtype from string + @pytest.mark.skip(reason="Unsupported") + def test_construct_from_string_own_name(self): + pass + + # Tries to construct dtype from string + @pytest.mark.skip(reason="Unsupported") + def test_is_dtype_from_name(self): + pass + + # Tries to construct dtype from string + @pytest.mark.skip(reason="Unsupported") + def test_eq_with_str(self): + pass + + # Tries to construct dtype from string + @pytest.mark.skip(reason="Unsupported") + def test_construct_from_string(self, dtype): + pass From 8aaa1cbf996c85fb069f9e893d32c7de9872ca02 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 6 Aug 2020 19:27:22 +0200 Subject: [PATCH 022/210] Pandas casting tests. 
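
The new ``data_missing`` fixtures encode a missing functional datum as an
all-NaN row, which the ``isna`` methods introduced in the previous commit
detect. A rough sketch of the intended behaviour (illustrative only, not
part of the test suite; note that the coefficient matrix has to be floating
point for the NaN assignment to be valid):

    >>> import numpy as np
    >>> import skfda
    >>> basis = skfda.representation.basis.BSpline(n_basis=5)
    >>> coefs = np.arange(2 * 5, dtype=float).reshape(2, 5)
    >>> coefs[0, :] = np.nan  # mark the first sample as missing
    >>> skfda.FDataBasis(basis=basis, coefficients=coefs).isna()
    array([ True, False])
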
--- tests/test_pandas_fdatabasis.py | 20 +++++++++++++++++++- tests/test_pandas_fdatagrid.py | 22 +++++++++++++++++++++- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/tests/test_pandas_fdatabasis.py b/tests/test_pandas_fdatabasis.py index 4b7480be2..9ae0df0a1 100644 --- a/tests/test_pandas_fdatabasis.py +++ b/tests/test_pandas_fdatabasis.py @@ -44,7 +44,12 @@ def data_for_twos(): @pytest.fixture def data_missing(): """Length-2 array with [NA, Valid]""" - raise NotImplementedError + + basis = skfda.representation.basis.BSpline(n_basis=5) + coef_matrix = np.arange(2 * 5).reshape(2, 5) + coef_matrix[0, :] = np.NaN + + return skfda.FDataBasis(basis=basis, coefficients=coef_matrix) @pytest.fixture(params=["data", "data_missing"]) @@ -192,6 +197,19 @@ def as_array(request): ############################################################################## +class TestCasting(base.BaseCastingTests): + + # Tries to construct dtype from string + @pytest.mark.skip(reason="Unsupported") + def test_astype_str(self): + pass + + # Tries to construct dtype from string + @pytest.mark.skip(reason="Unsupported") + def test_astype_string(self): + pass + + class TestConstructors(base.BaseConstructorsTests): # Does not support scalars which are also ExtensionArrays diff --git a/tests/test_pandas_fdatagrid.py b/tests/test_pandas_fdatagrid.py index 04bb55cb9..4847129cd 100644 --- a/tests/test_pandas_fdatagrid.py +++ b/tests/test_pandas_fdatagrid.py @@ -48,7 +48,14 @@ def data_for_twos(): @pytest.fixture def data_missing(): """Length-2 array with [NA, Valid]""" - raise NotImplementedError + + data_matrix = np.arange(2 * 10 * 10 * 3).reshape(2, 10, 10, 3) + data_matrix[0, ...] = np.NaN + sample_points = [ + np.arange(10), + np.arange(10) / 10] + + return skfda.FDataGrid(data_matrix, sample_points=sample_points) @pytest.fixture(params=["data", "data_missing"]) @@ -196,6 +203,19 @@ def as_array(request): ############################################################################## +class TestCasting(base.BaseCastingTests): + + # Tries to construct dtype from string + @pytest.mark.skip(reason="Unsupported") + def test_astype_str(self): + pass + + # Tries to construct dtype from string + @pytest.mark.skip(reason="Unsupported") + def test_astype_string(self): + pass + + class TestConstructors(base.BaseConstructorsTests): # Does not support scalars which are also ExtensionArrays From 8964af462c4e45ddc714191d69a9d18f8d9eb4f9 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 6 Aug 2020 20:17:26 +0200 Subject: [PATCH 023/210] Basis coefficients must be reals. --- skfda/_utils/__init__.py | 2 +- skfda/_utils/_utils.py | 58 +------------------ .../_linear_differential_operator.py | 24 ++++---- skfda/representation/basis/_fdatabasis.py | 19 +++--- tests/test_pandas_fdatabasis.py | 10 +++- 5 files changed, 35 insertions(+), 78 deletions(-) diff --git a/skfda/_utils/__init__.py b/skfda/_utils/__init__.py index 50c5975aa..839661be6 100644 --- a/skfda/_utils/__init__.py +++ b/skfda/_utils/__init__.py @@ -1,7 +1,7 @@ from . 
import constants from ._utils import (_tuple_of_arrays, _cartesian_product, - _check_estimator, parameter_aliases, + _check_estimator, _int_to_real, _to_grid, check_is_univariate, _same_domain, _to_array_maybe_ragged, _reshape_eval_points, diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index 9537071dc..0c90af43c 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -413,63 +413,9 @@ def _pairwise_commutative(function, arg1, arg2=None, **kwargs): (len(arg1), len(arg2))) -def parameter_aliases(**alias_assignments): - """Allows using aliases for parameters""" - def decorator(f): - - if isinstance(f, (types.FunctionType, types.LambdaType)): - # f is a function - @functools.wraps(f) - def aliasing_function(*args, **kwargs): - nonlocal alias_assignments - for parameter_name, aliases in alias_assignments.items(): - aliases = tuple(aliases) - aliases_used = [a for a in kwargs - if a in aliases + (parameter_name,)] - if len(aliases_used) > 1: - raise ValueError( - f"Several arguments with the same meaning used: " + - str(aliases_used)) - - elif len(aliases_used) == 1: - arg = kwargs.pop(aliases_used[0]) - kwargs[parameter_name] = arg - - return f(*args, **kwargs) - return aliasing_function +def _int_to_real(array): - else: - # f is a class - - class cls(f): - pass - - nonlocal alias_assignments - init = cls.__init__ - cls.__init__ = parameter_aliases(**alias_assignments)(init) - - set_params = getattr(cls, "set_params", None) - if set_params is not None: # For estimators - cls.set_params = parameter_aliases( - **alias_assignments)(set_params) - - for key, value in alias_assignments.items(): - def getter(self): - return getattr(self, key) - - def setter(self, new_value): - return setattr(self, key, new_value) - - for alias in value: - setattr(cls, alias, property(getter, setter)) - - cls.__name__ = f.__name__ - cls.__doc__ = f.__doc__ - cls.__module__ = f.__module__ - - return cls - - return decorator + return array + 0.0 def _check_estimator(estimator): diff --git a/skfda/misc/operators/_linear_differential_operator.py b/skfda/misc/operators/_linear_differential_operator.py index a4e522bd4..06a67d817 100644 --- a/skfda/misc/operators/_linear_differential_operator.py +++ b/skfda/misc/operators/_linear_differential_operator.py @@ -44,15 +44,15 @@ class LinearDifferentialOperator(Operator): weights=[ FDataBasis( basis=Constant(domain_range=((0, 1),), n_basis=1), - coefficients=[[0]], + coefficients=[[ 0.]], ...), FDataBasis( basis=Constant(domain_range=((0, 1),), n_basis=1), - coefficients=[[0]], + coefficients=[[ 0.]], ...), FDataBasis( basis=Constant(domain_range=((0, 1),), n_basis=1), - coefficients=[[1]], + coefficients=[[ 1.]], ...)] ) @@ -64,15 +64,15 @@ class LinearDifferentialOperator(Operator): weights=[ FDataBasis( basis=Constant(domain_range=((0, 1),), n_basis=1), - coefficients=[[0]], + coefficients=[[ 0.]], ...), FDataBasis( basis=Constant(domain_range=((0, 1),), n_basis=1), - coefficients=[[2]], + coefficients=[[ 2.]], ...), FDataBasis( basis=Constant(domain_range=((0, 1),), n_basis=1), - coefficients=[[3]], + coefficients=[[ 3.]], ...)] ) @@ -80,23 +80,23 @@ class LinearDifferentialOperator(Operator): >>> constant = Constant() >>> monomial = Monomial((0, 1), n_basis=3) - >>> fdlist = [FDataBasis(constant, [0]), - ... FDataBasis(constant, [0]), - ... FDataBasis(monomial, [1, 2, 3])] + >>> fdlist = [FDataBasis(constant, [0.]), + ... FDataBasis(constant, [0.]), + ... 
FDataBasis(monomial, [1., 2., 3.])] >>> LinearDifferentialOperator(weights=fdlist) LinearDifferentialOperator( weights=[ FDataBasis( basis=Constant(domain_range=((0, 1),), n_basis=1), - coefficients=[[0]], + coefficients=[[ 0.]], ...), FDataBasis( basis=Constant(domain_range=((0, 1),), n_basis=1), - coefficients=[[0]], + coefficients=[[ 0.]], ...), FDataBasis( basis=Monomial(domain_range=((0, 1),), n_basis=3), - coefficients=[[1 2 3]], + coefficients=[[ 1. 2. 3.]], ...)] ) diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py index f63766af2..b5333e266 100644 --- a/skfda/representation/basis/_fdatabasis.py +++ b/skfda/representation/basis/_fdatabasis.py @@ -8,7 +8,7 @@ import numpy as np from .. import grid -from ..._utils import constants +from ..._utils import constants, _int_to_real from .._functional_data import FData @@ -101,7 +101,7 @@ def __init__(self, basis, coefficients, *, dataset_label=None, have the same length or number of columns as the number of basis function in the basis. """ - coefficients = np.atleast_2d(coefficients) + coefficients = _int_to_real(np.atleast_2d(coefficients)) if coefficients.shape[1] != basis.n_basis: raise ValueError("The length or number of columns of coefficients " "has to be the same equal to the number of " @@ -490,12 +490,12 @@ def to_grid(self, sample_points=None): ... basis=Monomial((0,5), n_basis=3)) >>> fd.to_grid([0, 1, 2]) FDataGrid( - array([[[1], - [3], - [7]], - [[1], - [2], - [5]]]), + array([[[ 1.], + [ 3.], + [ 7.]], + [[ 1.], + [ 2.], + [ 5.]]]), sample_points=(array([0, 1, 2]),), domain_range=((0, 5),), ...) @@ -751,6 +751,9 @@ def __getitem__(self, key): if isinstance(key, numbers.Integral): # To accept also numpy ints key = int(key) + if key < 0: + key = range(len(self))[key] + return self.copy(coefficients=self.coefficients[key:key + 1], sample_names=self.sample_names[key:key + 1]) else: diff --git a/tests/test_pandas_fdatabasis.py b/tests/test_pandas_fdatabasis.py index 9ae0df0a1..a66e9ebff 100644 --- a/tests/test_pandas_fdatabasis.py +++ b/tests/test_pandas_fdatabasis.py @@ -110,7 +110,11 @@ def na_cmp(): True if both arguments are (scalar) NA for your type. By default, uses ``operator.is_`` """ - return operator.is_ + def isna(x, y): + return ((x is pandas.NA or all(x.isna())) + and (y is pandas.NA or all(y.isna()))) + + return isna @pytest.fixture @@ -244,3 +248,7 @@ def test_eq_with_str(self): @pytest.mark.skip(reason="Unsupported") def test_construct_from_string(self, dtype): pass + + +# class TestGetitem(base.BaseGetitemTests): +# pass From a3fbaaddc48bece336a9362dcc5e193af0eb2e35 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 6 Aug 2020 23:14:08 +0200 Subject: [PATCH 024/210] Test getitem FDataBasis. 
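
``FDataBasis.__getitem__`` is now routed through the new ``_check_array_key``
helper, which delegates validation to
``pandas.api.indexers.check_array_indexer``; as a result, negative integers
and boolean masks become valid keys. A rough sketch of the intended
behaviour (outputs illustrative, not part of the commit):

    >>> import numpy as np
    >>> import skfda
    >>> fd = skfda.FDataBasis(skfda.representation.basis.Monomial(n_basis=3),
    ...                       [[1., 2., 3.], [3., 4., 5.]])
    >>> fd[-1].n_samples          # negative indices are now accepted
    1
    >>> fd[np.array([True, False])].n_samples   # as are boolean masks
    1
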
--- skfda/_utils/__init__.py | 2 +- skfda/_utils/_utils.py | 22 +++++++++++++++++++++- skfda/representation/basis/_fdatabasis.py | 14 ++++---------- tests/test_pandas_fdatabasis.py | 6 +++--- 4 files changed, 29 insertions(+), 15 deletions(-) diff --git a/skfda/_utils/__init__.py b/skfda/_utils/__init__.py index 839661be6..ae95c7872 100644 --- a/skfda/_utils/__init__.py +++ b/skfda/_utils/__init__.py @@ -7,4 +7,4 @@ _reshape_eval_points, _evaluate_grid, nquad_vec, _FDataCallable, _pairwise_commutative, - _domain_range) + _domain_range, _check_array_key) diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index 0c90af43c..d791df0a0 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -1,8 +1,10 @@ """Module with generic methods""" import functools +import numbers import types +from pandas.api.indexers import check_array_indexer import scipy.integrate import numpy as np @@ -414,10 +416,28 @@ def _pairwise_commutative(function, arg1, arg2=None, **kwargs): def _int_to_real(array): - + """ + Convert integer arrays to floating point. + """ return array + 0.0 +def _check_array_key(array, key): + """ + Checks a getitem key. + """ + + key = check_array_indexer(array, key) + + if isinstance(key, numbers.Integral): # To accept also numpy ints + key = int(key) + key = range(len(array))[key] + + return slice(key, key + 1) + else: + return key + + def _check_estimator(estimator): from sklearn.utils.estimator_checks import ( check_get_params_invariance, check_set_params) diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py index b5333e266..580577901 100644 --- a/skfda/representation/basis/_fdatabasis.py +++ b/skfda/representation/basis/_fdatabasis.py @@ -8,7 +8,7 @@ import numpy as np from .. import grid -from ..._utils import constants, _int_to_real +from ..._utils import constants, _int_to_real, _check_array_key from .._functional_data import FData @@ -749,16 +749,10 @@ def compose(self, fd, *, eval_points=None, **kwargs): def __getitem__(self, key): """Return self[key].""" - if isinstance(key, numbers.Integral): # To accept also numpy ints - key = int(key) - if key < 0: - key = range(len(self))[key] + key = _check_array_key(self.coefficients, key) - return self.copy(coefficients=self.coefficients[key:key + 1], - sample_names=self.sample_names[key:key + 1]) - else: - return self.copy(coefficients=self.coefficients[key], - sample_names=np.array(self.sample_names)[key]) + return self.copy(coefficients=self.coefficients[key], + sample_names=np.array(self.sample_names)[key]) def __add__(self, other): """Addition for FDataBasis object.""" diff --git a/tests/test_pandas_fdatabasis.py b/tests/test_pandas_fdatabasis.py index a66e9ebff..d120c5157 100644 --- a/tests/test_pandas_fdatabasis.py +++ b/tests/test_pandas_fdatabasis.py @@ -46,7 +46,7 @@ def data_missing(): """Length-2 array with [NA, Valid]""" basis = skfda.representation.basis.BSpline(n_basis=5) - coef_matrix = np.arange(2 * 5).reshape(2, 5) + coef_matrix = np.arange(2 * 5, dtype=np.float_).reshape(2, 5) coef_matrix[0, :] = np.NaN return skfda.FDataBasis(basis=basis, coefficients=coef_matrix) @@ -250,5 +250,5 @@ def test_construct_from_string(self, dtype): pass -# class TestGetitem(base.BaseGetitemTests): -# pass +class TestGetitem(base.BaseGetitemTests): + pass From 1fa08e093d76b91f047b7df8b1223dbb5c3432f4 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 6 Aug 2020 23:46:08 +0200 Subject: [PATCH 025/210] Make `data_matrix` and `sample_points` always floating point. 
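
The promotion is performed by the new ``_int_to_real`` helper, which simply
computes ``array + 0.0``, so integer input becomes ``float64`` while
floating-point input passes through unchanged. A sketch of the resulting
behaviour (illustrative):

    >>> import skfda
    >>> fd = skfda.FDataGrid([[1, 2, 3]], sample_points=[0, 1, 2])
    >>> fd.data_matrix.dtype      # integer input is promoted
    dtype('float64')
    >>> fd.sample_points[0].dtype
    dtype('float64')
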
--- skfda/_utils/_utils.py | 4 +- skfda/exploratory/visualization/_boxplot.py | 8 +-- .../visualization/_magnitude_shape_plot.py | 6 +-- .../representation/_evaluation_trasformer.py | 4 +- skfda/representation/basis/_bspline.py | 4 +- skfda/representation/basis/_fdatabasis.py | 2 +- skfda/representation/basis/_fourier.py | 4 +- skfda/representation/grid.py | 54 +++++++++---------- 8 files changed, 43 insertions(+), 43 deletions(-) diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index d791df0a0..333d74360 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -109,9 +109,9 @@ def _tuple_of_arrays(original_array): unidimensional = True if unidimensional: - return (np.asarray(original_array),) + return (_int_to_real(np.asarray(original_array)),) else: - return tuple(np.asarray(i) for i in original_array) + return tuple(_int_to_real(np.asarray(i)) for i in original_array) def _domain_range(sequence): diff --git a/skfda/exploratory/visualization/_boxplot.py b/skfda/exploratory/visualization/_boxplot.py index 376c16157..6b1ce495e 100644 --- a/skfda/exploratory/visualization/_boxplot.py +++ b/skfda/exploratory/visualization/_boxplot.py @@ -190,8 +190,8 @@ class Boxplot(FDataBoxplot): [-1. ], [-1. ], [-1. ]]]), - sample_points=(array([ 0, 2, 4, 6, 8, 10]),), - domain_range=((0, 10),), + sample_points=(array([ 0., 2., 4., 6., 8., 10.]),), + domain_range=((0.0, 10.0),), dataset_name='dataset', argument_names=('x_label',), coordinate_names=('y_label',), @@ -516,8 +516,8 @@ class SurfaceBoxplot(FDataBoxplot): [[ 3. ], [ 0.6], [ 3. ]]]]), - sample_points=(array([2, 4]), array([3, 6, 8])), - domain_range=((2, 4), (3, 8)), + sample_points=(array([ 2., 4.]), array([ 3., 6., 8.])), + domain_range=((2.0, 4.0), (3.0, 8.0)), dataset_name='dataset', argument_names=('x1_label', 'x2_label'), coordinate_names=('y_label',), diff --git a/skfda/exploratory/visualization/_magnitude_shape_plot.py b/skfda/exploratory/visualization/_magnitude_shape_plot.py index 8830a7ca9..6992e4e87 100644 --- a/skfda/exploratory/visualization/_magnitude_shape_plot.py +++ b/skfda/exploratory/visualization/_magnitude_shape_plot.py @@ -116,7 +116,7 @@ class MagnitudeShapePlot: ... [0.5, 0.5, 1, 2, 1.5, 1], ... [-1, -1, -0.5, 1, 1, 0.5], ... [-0.5, -0.5, -0.5, -1, -1, -1]] - >>> sample_points = [0, 2, 4, 6, 8, 10] + >>> sample_points = [ 0., 2., 4., 6., 8., 10.] >>> fd = skfda.FDataGrid(data_matrix, sample_points) >>> MagnitudeShapePlot(fd) MagnitudeShapePlot( @@ -145,8 +145,8 @@ class MagnitudeShapePlot: [-1. ], [-1. ], [-1. ]]]), - sample_points=(array([ 0, 2, 4, 6, 8, 10]),), - domain_range=((0, 10),), + sample_points=(array([ 0., 2., 4., 6., 8., 10.]),), + domain_range=((0.0, 10.0),), ...), depth_method=projection_depth, pointwise_weights=None, diff --git a/skfda/representation/_evaluation_trasformer.py b/skfda/representation/_evaluation_trasformer.py index 927304a30..feadd24f8 100644 --- a/skfda/representation/_evaluation_trasformer.py +++ b/skfda/representation/_evaluation_trasformer.py @@ -42,8 +42,8 @@ class EvaluationTransformer(BaseEstimator, TransformerMixin): >>> >>> transformer = EvaluationTransformer() >>> transformer.fit_transform(fd) - array([[1, 2], - [2, 3]]) + array([[ 1., 2.], + [ 2., 3.]]) Functional data object with 2 samples representing a function :math:`f : \mathbb{R}\longmapsto\mathbb{R}^2`. 
diff --git a/skfda/representation/basis/_bspline.py b/skfda/representation/basis/_bspline.py index 004009c0e..323687453 100644 --- a/skfda/representation/basis/_bspline.py +++ b/skfda/representation/basis/_bspline.py @@ -5,8 +5,8 @@ import numpy as np +from ..._utils import _domain_range from ..._utils import _same_domain -from ..._utils import _tuple_of_arrays from ._basis import Basis @@ -104,7 +104,7 @@ def __init__(self, domain_range=None, n_basis=None, order=4, knots=None): """ if domain_range is not None: - domain_range = _tuple_of_arrays(domain_range) + domain_range = _domain_range(domain_range) if len(domain_range) != 1: raise ValueError("Domain range should be unidimensional.") diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py index 580577901..580016889 100644 --- a/skfda/representation/basis/_fdatabasis.py +++ b/skfda/representation/basis/_fdatabasis.py @@ -496,7 +496,7 @@ def to_grid(self, sample_points=None): [[ 1.], [ 2.], [ 5.]]]), - sample_points=(array([0, 1, 2]),), + sample_points=(array([ 0., 1., 2.]),), domain_range=((0, 5),), ...) diff --git a/skfda/representation/basis/_fourier.py b/skfda/representation/basis/_fourier.py index 4a6181c7f..ee9302010 100644 --- a/skfda/representation/basis/_fourier.py +++ b/skfda/representation/basis/_fourier.py @@ -1,7 +1,7 @@ import numpy as np +from ..._utils import _domain_range from ..._utils import _same_domain -from ..._utils import _tuple_of_arrays from ._basis import Basis @@ -84,7 +84,7 @@ def __init__(self, domain_range=None, n_basis=3, period=None): """ if domain_range is not None: - domain_range = _tuple_of_arrays(domain_range) + domain_range = _domain_range(domain_range) if len(domain_range) != 1: raise ValueError("Domain range should be unidimensional.") diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py index f97a55f57..8aecd5f0a 100644 --- a/skfda/representation/grid.py +++ b/skfda/representation/grid.py @@ -17,7 +17,7 @@ import numpy as np from . import basis as fdbasis -from .._utils import _tuple_of_arrays, constants, _domain_range +from .._utils import _tuple_of_arrays, constants, _domain_range, _int_to_real from ._functional_data import FData from .interpolation import SplineInterpolation @@ -62,15 +62,15 @@ class FDataGrid(FData): >>> sample_points = [2, 4, 5] >>> FDataGrid(data_matrix, sample_points) FDataGrid( - array([[[1], - [2], - [3]], + array([[[ 1.], + [ 2.], + [ 3.]], - [[4], - [5], - [6]]]), - sample_points=(array([2, 4, 5]),), - domain_range=((2, 5),), + [[ 4.], + [ 5.], + [ 6.]]]), + sample_points=(array([ 2., 4., 5.]),), + domain_range=((2.0, 5.0),), ...) The number of columns of data_matrix have to be the length of @@ -164,11 +164,11 @@ def __init__(self, data_matrix, sample_points=None, of the number of dimensions of the domain plus the number of dimensions of the image. """ - self.data_matrix = np.atleast_2d(data_matrix) + self.data_matrix = _int_to_real(np.atleast_2d(data_matrix)) if sample_points is None: self.sample_points = _tuple_of_arrays( - [np.linspace(0, 1, self.data_matrix.shape[i]) for i in + [np.linspace(0., 1., self.data_matrix.shape[i]) for i in range(1, self.data_matrix.ndim)]) else: @@ -400,8 +400,8 @@ def derivative(self, *, order=1): [ 1.5], [ 2. ], [ 4. ]]]), - sample_points=(array([0, 1, 2, 3, 4]),), - domain_range=((0, 4),), + sample_points=(array([ 0., 1., 2., 3., 4.]),), + domain_range=((0.0, 4.0),), ...) 
Second order derivative @@ -414,8 +414,8 @@ def derivative(self, *, order=1): [-1.], [ 2.], [ 5.]]]), - sample_points=(array([0, 1, 2, 3, 4]),), - domain_range=((0, 4),), + sample_points=(array([ 0., 1., 2., 3., 4.]),), + domain_range=((0.0, 4.0),), ...) """ @@ -691,19 +691,19 @@ def concatenate(self, *others, as_coordinates=False): >>> fd_2 = FDataGrid([3,4,7,9,2], range(5)) >>> fd.concatenate(fd_2) FDataGrid( - array([[[1], - [2], - [4], - [5], - [8]], + array([[[ 1.], + [ 2.], + [ 4.], + [ 5.], + [ 8.]], - [[3], - [4], - [7], - [9], - [2]]]), - sample_points=(array([0, 1, 2, 3, 4]),), - domain_range=((0, 4),), + [[ 3.], + [ 4.], + [ 7.], + [ 9.], + [ 2.]]]), + sample_points=(array([ 0., 1., 2., 3., 4.]),), + domain_range=((0.0, 4.0),), ...) """ From e723738de7c17eb5604128e92b66269a3cac2b5a Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 6 Aug 2020 23:55:40 +0200 Subject: [PATCH 026/210] Getitem tests for FDataGrid. --- skfda/representation/grid.py | 13 +++++-------- tests/test_pandas_fdatagrid.py | 13 +++++++++++-- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py index 8aecd5f0a..a647c9729 100644 --- a/skfda/representation/grid.py +++ b/skfda/representation/grid.py @@ -17,7 +17,8 @@ import numpy as np from . import basis as fdbasis -from .._utils import _tuple_of_arrays, constants, _domain_range, _int_to_real +from .._utils import (_tuple_of_arrays, constants, + _domain_range, _int_to_real, _check_array_key) from ._functional_data import FData from .interpolation import SplineInterpolation @@ -1052,14 +1053,10 @@ def __repr__(self): def __getitem__(self, key): """Return self[key].""" - if isinstance(key, numbers.Integral): # To accept also numpy ints - key = int(key) - return self.copy(data_matrix=self.data_matrix[key:key + 1], - sample_names=self.sample_names[key:key + 1]) + key = _check_array_key(self.data_matrix, key) - else: - return self.copy(data_matrix=self.data_matrix[key], - sample_names=np.array(self.sample_names)[key]) + return self.copy(data_matrix=self.data_matrix[key], + sample_names=np.array(self.sample_names)[key]) ##################################################################### # Numpy methods diff --git a/tests/test_pandas_fdatagrid.py b/tests/test_pandas_fdatagrid.py index 4847129cd..6455459fc 100644 --- a/tests/test_pandas_fdatagrid.py +++ b/tests/test_pandas_fdatagrid.py @@ -49,7 +49,8 @@ def data_for_twos(): def data_missing(): """Length-2 array with [NA, Valid]""" - data_matrix = np.arange(2 * 10 * 10 * 3).reshape(2, 10, 10, 3) + data_matrix = np.arange( + 2 * 10 * 10 * 3, dtype=np.float_).reshape(2, 10, 10, 3) data_matrix[0, ...] = np.NaN sample_points = [ np.arange(10), @@ -116,7 +117,11 @@ def na_cmp(): True if both arguments are (scalar) NA for your type. By default, uses ``operator.is_`` """ - return operator.is_ + def isna(x, y): + return ((x is pandas.NA or all(x.isna())) + and (y is pandas.NA or all(y.isna()))) + + return isna @pytest.fixture @@ -250,3 +255,7 @@ def test_eq_with_str(self): @pytest.mark.skip(reason="Unsupported") def test_construct_from_string(self, dtype): pass + + +class TestGetitem(base.BaseGetitemTests): + pass From 3d75ac9abd4082f2e5a1111ffb488223527437d7 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Fri, 7 Aug 2020 00:11:13 +0200 Subject: [PATCH 027/210] Interface tests. 
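
This mirrors the ``FDataBasis`` change two commits back:
``FDataGrid.__getitem__`` now goes through the shared ``_check_array_key``
helper, so negative integers and boolean masks are accepted as keys. Sketch
(outputs illustrative, not part of the commit):

    >>> import numpy as np
    >>> import skfda
    >>> fd = skfda.FDataGrid([[1, 2], [3, 4], [5, 6]])
    >>> fd[-1].n_samples
    1
    >>> fd[np.array([True, False, True])].n_samples
    2
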
--- skfda/representation/basis/_fdatabasis.py | 2 +- skfda/representation/grid.py | 4 ++-- tests/test_pandas_fdatabasis.py | 18 ++++++++++++++++++ tests/test_pandas_fdatagrid.py | 18 ++++++++++++++++++ 4 files changed, 39 insertions(+), 3 deletions(-) diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py index 580016889..24d222b83 100644 --- a/skfda/representation/basis/_fdatabasis.py +++ b/skfda/representation/basis/_fdatabasis.py @@ -845,7 +845,7 @@ def nbytes(self) -> int: """ The number of bytes needed to store this object in memory. """ - return self.coefficients.nbytes() + return self.coefficients.nbytes def isna(self): """ diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py index a647c9729..65aa46f10 100644 --- a/skfda/representation/grid.py +++ b/skfda/representation/grid.py @@ -1111,8 +1111,8 @@ def nbytes(self) -> int: """ The number of bytes needed to store this object in memory. """ - return self.data_matrix.nbytes() + sum( - p.nbytes() for p in self.sample_points) + return self.data_matrix.nbytes + sum( + p.nbytes for p in self.sample_points) def isna(self): """ diff --git a/tests/test_pandas_fdatabasis.py b/tests/test_pandas_fdatabasis.py index d120c5157..2b734d24e 100644 --- a/tests/test_pandas_fdatabasis.py +++ b/tests/test_pandas_fdatabasis.py @@ -252,3 +252,21 @@ def test_construct_from_string(self, dtype): class TestGetitem(base.BaseGetitemTests): pass + + +class TestInterface(base.BaseInterfaceTests): + + # Does not support scalars which are also array_like + @pytest.mark.skip(reason="Unsupported") + def test_array_interface(self): + pass + + # We do not implement setitem + @pytest.mark.skip(reason="Unsupported") + def test_copy(self, dtype): + pass + + # We do not implement setitem + @pytest.mark.skip(reason="Unsupported") + def test_view(self, dtype): + pass diff --git a/tests/test_pandas_fdatagrid.py b/tests/test_pandas_fdatagrid.py index 6455459fc..7c0c30e06 100644 --- a/tests/test_pandas_fdatagrid.py +++ b/tests/test_pandas_fdatagrid.py @@ -259,3 +259,21 @@ def test_construct_from_string(self, dtype): class TestGetitem(base.BaseGetitemTests): pass + + +class TestInterface(base.BaseInterfaceTests): + + # Does not support scalars which are also array_like + @pytest.mark.skip(reason="Unsupported") + def test_array_interface(self): + pass + + # We do not implement setitem + @pytest.mark.skip(reason="Unsupported") + def test_copy(self, dtype): + pass + + # We do not implement setitem + @pytest.mark.skip(reason="Unsupported") + def test_view(self, dtype): + pass From c4aaaf8dfa8e9bcafb06119e717cf2a49cec24a3 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Fri, 7 Aug 2020 03:28:37 +0200 Subject: [PATCH 028/210] Fix the documentation. 
--- examples/plot_interpolation.py | 20 +++++++++---------- examples/plot_oneway_synthetic.py | 16 +++++++++------ .../dim_reduction/projection/_fpca.py | 3 ++- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/examples/plot_interpolation.py b/examples/plot_interpolation.py index 1c3abf7fc..13bd3d17b 100644 --- a/examples/plot_interpolation.py +++ b/examples/plot_interpolation.py @@ -16,7 +16,6 @@ from mpl_toolkits.mplot3d import axes3d -import matplotlib.pyplot as plt import numpy as np @@ -64,8 +63,9 @@ # # Sample with noise -fd_smooth = skfda.datasets.make_sinusoidal_process(n_samples=1, n_features=30, - random_state=1, error_std=.3) +fd_smooth = skfda.datasets.make_sinusoidal_process( + n_samples=1, n_features=30, + random_state=1, error_std=.3) # Cubic interpolation fd_smooth.interpolation = SplineInterpolation(interpolation_order=3) @@ -104,12 +104,12 @@ fig.legend() ############################################################################## -# All the interpolations will work regardless of the dimension of the image, but -# depending on the domain dimension some methods will not be available. +# All the interpolations will work regardless of the dimension of the image, +# but depending on the domain dimension some methods will not be available. # -# For the next examples it is constructed a surface, :math:`x_i: \mathbb{R}^2 -# \longmapsto \mathbb{R}`. By default, as in unidimensional samples, it is used -# linear interpolation. +# For the next examples it is constructed a surface, +# :math:`x_i: \mathbb{R}^2 \longmapsto \mathbb{R}`. By default, as in +# unidimensional samples, it is used linear interpolation. # X, Y, Z = axes3d.get_test_data(1.2) @@ -128,8 +128,8 @@ # # The degree of the interpolation polynomial does not have to coincide in both # directions, for example, cubic interpolation in the first -# component and quadratic in the second one could be defined using a tuple with -# the values (3,2). +# component and quadratic in the second one could be defined using a tuple +# with the values (3,2). # fd.interpolation = SplineInterpolation(interpolation_order=3) diff --git a/examples/plot_oneway_synthetic.py b/examples/plot_oneway_synthetic.py index 2d210d08a..cc99d7144 100644 --- a/examples/plot_oneway_synthetic.py +++ b/examples/plot_oneway_synthetic.py @@ -19,14 +19,16 @@ ########################################################################## -# *One-way ANOVA* (analysis of variance) is a test that can be used to +# **One-way ANOVA** (analysis of variance) is a test that can be used to # compare the means of different samples of data. # Let :math:`X_{ij}(t), j=1, \dots, n_i` be trajectories corresponding to -# :math:`k` independent samples :math:`(i=1,\dots,k)` and let :math:`E(X_i(t)) = -# m_i(t)`. Thus, the null hypothesis in the statistical test is: +# :math:`k` independent samples :math:`(i=1,\dots,k)` and let +# :math:`E(X_i(t)) = m_i(t)`. Thus, the null hypothesis in the statistical +# test is: # # .. math:: -# H_0: m_1(t) = \dots = m_k(t) +# +# H_0: m_1(t) = \dots = m_k(t) # # In this example we will explain the nature of ANOVA method and its behavior # under certain conditions simulating data. Specifically, we will generate @@ -34,6 +36,8 @@ # process by adding to them white noise. The main objective of the # test is to illustrate the differences in the results of the ANOVA method # when the covariance function of the brownian processes changes. 
+ + ########################################################################## # First, the means for the future processes are drawn. n_samples = 10 @@ -52,8 +56,8 @@ dataset_name="Means to be used in the simulation").plot() ########################################################################## -# A total of `n_samples` trajectories will be created for each mean, so a array -# of labels is created to identify them when plotting. +# A total of ``n_samples`` trajectories will be created for each mean, so an +# array of labels is created to identify them when plotting. groups = np.full(n_samples * n_groups, 'Sample 1') groups[10:20] = 'Sample 2' diff --git a/skfda/preprocessing/dim_reduction/projection/_fpca.py b/skfda/preprocessing/dim_reduction/projection/_fpca.py index 5dd3d30a7..37fe82ed2 100644 --- a/skfda/preprocessing/dim_reduction/projection/_fpca.py +++ b/skfda/preprocessing/dim_reduction/projection/_fpca.py @@ -147,7 +147,8 @@ def _fit_basis(self, X: FDataBasis, y=None): components_basis = self.components_basis if components_basis is not None: # First fix domain range if not already done - components_basis.domain_range = X.basis.domain_range + components_basis = components_basis.copy( + domain_range=X.basis.domain_range) g_matrix = components_basis.gram_matrix() # the matrix that are in charge of changing the computed principal # components to target matrix is essentially the inner product From 0d4327ea3def6d29a3dc39dbba8baac4ccdf8d39 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Fri, 7 Aug 2020 17:12:10 +0200 Subject: [PATCH 029/210] Tests arithmetic operators. --- skfda/representation/basis/_fdatabasis.py | 8 +-- tests/test_pandas_fdatabasis.py | 70 ++++++++++++++++++++--- tests/test_pandas_fdatagrid.py | 52 ++++++++++++++++- 3 files changed, 115 insertions(+), 15 deletions(-) diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py index 74802fe13..9fabb348a 100644 --- a/skfda/representation/basis/_fdatabasis.py +++ b/skfda/representation/basis/_fdatabasis.py @@ -762,7 +762,7 @@ def __add__(self, other): try: basis, coefs = self.basis._add_constant(self.coefficients, other) - except TypeError: + except Exception: return NotImplemented return self._copy_op(other, basis=basis, coefficients=coefs) @@ -784,7 +784,7 @@ def __sub__(self, other): try: basis, coefs = self.basis._sub_constant(self.coefficients, other) - except TypeError: + except Exception: return NotImplemented return self._copy_op(other, basis=basis, coefficients=coefs) @@ -800,7 +800,7 @@ def __mul__(self, other): try: basis, coefs = self.basis._mul_constant(self.coefficients, other) - except TypeError: + except Exception: return NotImplemented return self._copy_op(other, basis=basis, coefficients=coefs) @@ -816,7 +816,7 @@ def __truediv__(self, other): try: other = 1 / other - except TypeError: + except Exception: return NotImplemented return self * other diff --git a/tests/test_pandas_fdatabasis.py b/tests/test_pandas_fdatabasis.py index 2b734d24e..caf9d22f8 100644 --- a/tests/test_pandas_fdatabasis.py +++ b/tests/test_pandas_fdatabasis.py @@ -1,5 +1,6 @@ import operator import skfda +from skfda.representation.basis import Monomial, Fourier, BSpline from pandas import Series import pandas @@ -12,25 +13,30 @@ ############################################################################## # Fixtures ############################################################################## -@pytest.fixture -def dtype(): +@pytest.fixture(params=[Monomial(n_basis=5), Fourier(n_basis=5), + 
BSpline(n_basis=5)]) +def basis(request): """A fixture providing the ExtensionDtype to validate.""" - basis = skfda.representation.basis.BSpline(n_basis=5) + return request.param + + +@pytest.fixture +def dtype(basis): + """A fixture providing the ExtensionDtype to validate.""" return skfda.representation.basis.FDataBasisDType(basis=basis) @pytest.fixture -def data(): +def data(basis): """ Length-100 array for this type. * data[0] and data[1] should both be non missing * data[0] and data[1] should not be equal """ - basis = skfda.representation.basis.BSpline(n_basis=5) - coef_matrix = np.arange(100 * 5).reshape(100, 5) + coef_matrix = np.arange(100 * basis.n_basis).reshape(100, basis.n_basis) return skfda.FDataBasis(basis=basis, coefficients=coef_matrix) @@ -42,11 +48,11 @@ def data_for_twos(): @pytest.fixture -def data_missing(): +def data_missing(basis): """Length-2 array with [NA, Valid]""" - basis = skfda.representation.basis.BSpline(n_basis=5) - coef_matrix = np.arange(2 * 5, dtype=np.float_).reshape(2, 5) + coef_matrix = np.arange( + 2 * basis.n_basis, dtype=np.float_).reshape(2, basis.n_basis) coef_matrix[0, :] = np.NaN return skfda.FDataBasis(basis=basis, coefficients=coef_matrix) @@ -196,6 +202,32 @@ def as_array(request): """ return request.param + +_all_arithmetic_operators = [ + "__add__", + "__radd__", + "__sub__", + "__rsub__", + # "__mul__", + # "__rmul__", + # "__floordiv__", + # "__rfloordiv__", + # "__truediv__", + # "__rtruediv__", + # "__pow__", + # "__rpow__", + # "__mod__", + # "__rmod__", +] + + +@pytest.fixture(params=_all_arithmetic_operators) +def all_arithmetic_operators(request): + """ + Fixture for dunder names for common arithmetic operations. + """ + return request.param + ############################################################################## # Tests ############################################################################## @@ -270,3 +302,23 @@ def test_copy(self, dtype): @pytest.mark.skip(reason="Unsupported") def test_view(self, dtype): pass + + +class TestArithmeticOps(base.BaseArithmeticOpsTests): + + series_scalar_exc = None + + # FDatabasis does not implement division by non constant + @pytest.mark.skip(reason="Unsupported") + def test_divmod_series_array(self, dtype): + pass + + # Does not convert properly a list of FData to a FData + @pytest.mark.skip(reason="Unsupported") + def test_arith_series_with_array(self, dtype): + pass + + # Does not error on operations + @pytest.mark.skip(reason="Unsupported") + def test_error(self, dtype): + pass diff --git a/tests/test_pandas_fdatagrid.py b/tests/test_pandas_fdatagrid.py index 7c0c30e06..f95089527 100644 --- a/tests/test_pandas_fdatagrid.py +++ b/tests/test_pandas_fdatagrid.py @@ -31,7 +31,7 @@ def data(): * data[0] and data[1] should not be equal """ - data_matrix = np.arange(100 * 10 * 10 * 3).reshape(100, 10, 10, 3) + data_matrix = np.arange(1, 100 * 10 * 10 * 3 + 1).reshape(100, 10, 10, 3) sample_points = [ np.arange(10), np.arange(10) / 10] @@ -42,7 +42,14 @@ def data(): @pytest.fixture def data_for_twos(): """Length-100 array in which all the elements are two.""" - raise NotImplementedError + + data_matrix = np.full( + 100 * 10 * 10 * 3, fill_value=2).reshape(100, 10, 10, 3) + sample_points = [ + np.arange(10), + np.arange(10) / 10] + + return skfda.FDataGrid(data_matrix, sample_points=sample_points) @pytest.fixture @@ -203,6 +210,32 @@ def as_array(request): """ return request.param + +_all_arithmetic_operators = [ + "__add__", + "__radd__", + "__sub__", + "__rsub__", + "__mul__", + 
"__rmul__", + # "__floordiv__", + # "__rfloordiv__", + "__truediv__", + "__rtruediv__", + # "__pow__", + # "__rpow__", + # "__mod__", + # "__rmod__", +] + + +@pytest.fixture(params=_all_arithmetic_operators) +def all_arithmetic_operators(request): + """ + Fixture for dunder names for common arithmetic operations. + """ + return request.param + ############################################################################## # Tests ############################################################################## @@ -277,3 +310,18 @@ def test_copy(self, dtype): @pytest.mark.skip(reason="Unsupported") def test_view(self, dtype): pass + + +class TestArithmeticOps(base.BaseArithmeticOpsTests): + + series_scalar_exc = None + + # Does not convert properly a list of FData to a FData + @pytest.mark.skip(reason="Unsupported") + def test_arith_series_with_array(self, dtype): + pass + + # Does not error on operations + @pytest.mark.skip(reason="Unsupported") + def test_error(self, dtype): + pass From 366880899b9e9f7f57b0b760049081979ba7bec9 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Fri, 7 Aug 2020 23:12:01 +0200 Subject: [PATCH 030/210] Test comparison operators. --- skfda/representation/_functional_data.py | 14 ++++++++++++++ skfda/representation/basis/_fdatabasis.py | 6 +++++- skfda/representation/grid.py | 6 +++++- tests/test_pandas_fdatabasis.py | 23 +++++++++++++++++++++++ tests/test_pandas_fdatagrid.py | 23 +++++++++++++++++++++++ 5 files changed, 70 insertions(+), 2 deletions(-) diff --git a/skfda/representation/_functional_data.py b/skfda/representation/_functional_data.py index 05afe47bc..d724c014e 100644 --- a/skfda/representation/_functional_data.py +++ b/skfda/representation/_functional_data.py @@ -647,6 +647,20 @@ def equals(self, other): and self.coordinate_names == other.coordinate_names ) + @abstractmethod + def __eq__(self, key): + pass + + def __ne__(self, other): + """ + Return for `self != other` (element-wise in-equality). 
+ """ + result = self.__eq__(other) + if result is NotImplemented: + return NotImplemented + + return ~result + def _copy_op(self, other, **kwargs): base_copy = (other if isinstance(other, type(self)) diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py index 9fabb348a..3ddd33da7 100644 --- a/skfda/representation/basis/_fdatabasis.py +++ b/skfda/representation/basis/_fdatabasis.py @@ -670,7 +670,11 @@ def __eq__(self, other): """Elementwise equality of FDataBasis""" if type(self) != type(other) or self.dtype != other.dtype: - raise TypeError("Types are not equal") + if pandas.api.types.is_list_like(other) and not isinstance( + other, (pandas.Series, pandas.Index)): + return np.concatenate([x == y for x, y in zip(self, other)]) + else: + return NotImplemented if len(self) != len(other): raise ValueError(f"Different lengths: " diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py index fbe38773e..ce02760ac 100644 --- a/skfda/representation/grid.py +++ b/skfda/representation/grid.py @@ -549,7 +549,11 @@ def __eq__(self, other): """Elementwise equality of FDataGrid""" if type(self) != type(other) or self.dtype != other.dtype: - raise TypeError("Types are not equal") + if pandas.api.types.is_list_like(other) and not isinstance( + other, (pandas.Series, pandas.Index)): + return np.concatenate([x == y for x, y in zip(self, other)]) + else: + return NotImplemented if len(self) != len(other): raise ValueError(f"Different lengths: " diff --git a/tests/test_pandas_fdatabasis.py b/tests/test_pandas_fdatabasis.py index caf9d22f8..68719c15d 100644 --- a/tests/test_pandas_fdatabasis.py +++ b/tests/test_pandas_fdatabasis.py @@ -228,6 +228,16 @@ def all_arithmetic_operators(request): """ return request.param + +@pytest.fixture(params=["__eq__", "__ne__", + # "__le__", "__lt__", "__ge__", "__gt__" + ]) +def all_compare_operators(request): + """ + Fixture for dunder names for common compare operations + """ + return request.param + ############################################################################## # Tests ############################################################################## @@ -322,3 +332,16 @@ def test_arith_series_with_array(self, dtype): @pytest.mark.skip(reason="Unsupported") def test_error(self, dtype): pass + + +class TestComparisonOps(base.BaseComparisonOpsTests): + + # Cannot be compared with 0 + @pytest.mark.skip(reason="Unsupported") + def test_compare_scalar(self, data, all_compare_operators): + pass + + # Not sure how to pass it. Should it be reimplemented? 
+ @pytest.mark.skip(reason="Unsupported") + def test_compare_array(self, data, all_compare_operators): + pass diff --git a/tests/test_pandas_fdatagrid.py b/tests/test_pandas_fdatagrid.py index f95089527..774b50d89 100644 --- a/tests/test_pandas_fdatagrid.py +++ b/tests/test_pandas_fdatagrid.py @@ -236,6 +236,16 @@ def all_arithmetic_operators(request): """ return request.param + +@pytest.fixture(params=["__eq__", "__ne__", + # "__le__", "__lt__", "__ge__", "__gt__" + ]) +def all_compare_operators(request): + """ + Fixture for dunder names for common compare operations + """ + return request.param + ############################################################################## # Tests ############################################################################## @@ -325,3 +335,16 @@ def test_arith_series_with_array(self, dtype): @pytest.mark.skip(reason="Unsupported") def test_error(self, dtype): pass + + +class TestComparisonOps(base.BaseComparisonOpsTests): + + # Cannot be compared with 0 + @pytest.mark.skip(reason="Unsupported") + def test_compare_scalar(self, data, all_compare_operators): + pass + + # Not sure how to pass it. Should it be reimplemented? + @pytest.mark.skip(reason="Unsupported") + def test_compare_array(self, data, all_compare_operators): + pass From 8126365abd85f8e522c34ca7fc0e84896624ec82 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Sun, 9 Aug 2020 00:03:05 +0200 Subject: [PATCH 031/210] Make mean compatible with numpy. --- skfda/_neighbors/base.py | 16 ++++++- skfda/exploratory/stats/_stats.py | 6 +-- skfda/representation/_functional_data.py | 21 +++++++--- skfda/representation/basis/_fdatabasis.py | 23 ++++------ skfda/representation/grid.py | 51 ++++++++++++----------- tests/test_grid.py | 12 +----- tests/test_neighbors.py | 15 ++++--- tests/test_pandas_fdatabasis.py | 22 ++++++++++ 8 files changed, 98 insertions(+), 68 deletions(-) diff --git a/skfda/_neighbors/base.py b/skfda/_neighbors/base.py index 8b2ffc76d..17f8b5287 100644 --- a/skfda/_neighbors/base.py +++ b/skfda/_neighbors/base.py @@ -9,7 +9,6 @@ import numpy as np from .. import FDataGrid, FData -from ..exploratory.stats import mean as l2_mean from ..misc.metrics import lp_distance @@ -440,6 +439,19 @@ def predict(self, X): class NeighborsRegressorMixin(NeighborsMixin, RegressorMixin): """Mixin class for the regressors based on neighbors""" + def _mean_regressor(self, X, weights=None): + """ + Default regressor using weighted average. + + """ + + if weights is None: + return X.mean() + else: + weights /= np.sum(weights) + + return (X * weights).sum() + def fit(self, X, y): """Fit the model using X as training data and y as responses. @@ -500,7 +512,7 @@ def _functional_fit(self, X, y): self.estimator_.fit(self._transform_to_multivariate(X)) if self.regressor == 'mean': - self._regressor = l2_mean + self._regressor = self._mean_regressor else: self._regressor = self.regressor diff --git a/skfda/exploratory/stats/_stats.py b/skfda/exploratory/stats/_stats.py index d84fece80..5f0aafc34 100644 --- a/skfda/exploratory/stats/_stats.py +++ b/skfda/exploratory/stats/_stats.py @@ -2,7 +2,8 @@ """ from ..depth import modified_band_depth -def mean(fdata, weights=None): + +def mean(fdata): """Compute the mean of all the samples in a FData object. Computes the mean of all the samples in a FDataGrid or FDataBasis object. @@ -10,7 +11,6 @@ def mean(fdata, weights=None): Args: fdata (FDataGrid or FDataBasis): Object containing all the samples whose mean is wanted. - weight (array-like, optional): List of weights. 
     Returns:
 
@@ -19,7 +19,7 @@
     object.
 
     """
-    return fdata.mean(weights)
+    return fdata.mean()
 
 
 def var(fdatagrid):
diff --git a/skfda/representation/_functional_data.py b/skfda/representation/_functional_data.py
index d724c014e..422e492ee 100644
--- a/skfda/representation/_functional_data.py
+++ b/skfda/representation/_functional_data.py
@@ -551,17 +551,26 @@ def copy(self, **kwargs):
         pass
 
     @abstractmethod
-    def mean(self, weights=None):
-        """Compute the mean of all the samples.
-
-        weights (array-like, optional): List of weights.
+    def sum(self, *, axis=None, out=None, keepdims=False):
+        """Compute the sum of all the samples.
 
         Returns:
             FData : A FData object with just one sample representing
-                the mean of all the samples in the original object.
+                the sum of all the samples in the original object.
 
         """
-        pass
+        if ((axis is not None and axis != 0) or
+                out is not None or keepdims is not False):
+            raise NotImplementedError(
+                "Not implemented for that parameter combination")
+
+    def mean(self, *, axis=None, dtype=None, out=None, keepdims=False):
+
+        if dtype is not None:
+            raise NotImplementedError(
+                "Not implemented for that parameter combination")
+
+        return self.sum(axis=axis, out=out, keepdims=keepdims) / self.n_samples
 
     @abstractmethod
     def to_grid(self, sample_points=None):
diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py
index 3ddd33da7..b141c838d 100644
--- a/skfda/representation/basis/_fdatabasis.py
+++ b/skfda/representation/basis/_fdatabasis.py
@@ -369,12 +369,12 @@ def derivative(self, *, order=1):
 
         return FDataBasis(basis, coefficients)
 
-    def mean(self, weights=None):
-        """Compute the mean of all the samples in a FDataBasis object.
+    def sum(self, *, axis=None, out=None, keepdims=False):
+        """Compute the sum of all the samples in a FDataBasis object.
 
         Returns:
             :obj:`FDataBasis`: A FDataBasis object with just one sample
-                representing the mean of all the samples in the original
+                representing the sum of all the samples in the original
                 FDataBasis object.
 
         Examples:
 
             >>> from skfda.representation.basis import FDataBasis, Monomial
             >>> basis = Monomial(n_basis=4)
             >>> coefficients = [[0.5, 1, 2, .5], [1.5, 1, 4, .5]]
-            >>> FDataBasis(basis, coefficients).mean()
+            >>> FDataBasis(basis, coefficients).sum()
             FDataBasis(
                 basis=Monomial(domain_range=((0, 1),), n_basis=4),
-                coefficients=[[ 1.   1.   3.   0.5]],
+                coefficients=[[ 2.  2.  6.  1.]],
                 ...)
 
         """
+        super().sum(axis=axis, out=out, keepdims=keepdims)
 
-        if weights is not None:
-            return self.copy(coefficients=np.average(self.coefficients,
-                                                     weights=weights,
-                                                     axis=0
-                                                     )[np.newaxis, ...],
-                             sample_names=("mean",)
-                             )
-
-        return self.copy(coefficients=np.mean(self.coefficients, axis=0),
-                         sample_names=("mean",))
+        return self.copy(coefficients=np.sum(self.coefficients, axis=0),
+                         sample_names=(None,))
 
     def gmean(self, eval_points=None):
         """Compute the geometric mean of the functional data object.
diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py
index ce02760ac..ff44a8408 100644
--- a/skfda/representation/grid.py
+++ b/skfda/representation/grid.py
@@ -446,27 +446,31 @@ def __check_same_dimensions(self, other):
         if not np.array_equal(self.sample_points, other.sample_points):
             raise ValueError("Sample points for both objects must be equal")
 
-    def mean(self, weights=None):
-        """Compute the mean of all the samples.
-
-        Args:
-            weights (array-like, optional): List of weights.
+ def sum(self, *, axis=None, out=None, keepdims=False): + """Compute the sum of all the samples. Returns: FDataGrid : A FDataGrid object with just one sample representing - the mean of all the samples in the original object. + the sum of all the samples in the original object. - """ - if weights is not None: + Examples: + + >>> from skfda import FDataGrid + >>> data_matrix = [[0.5, 1, 2, .5], [1.5, 1, 4, .5]] + >>> FDataGrid(data_matrix).sum() + FDataGrid( + array([[[ 2.], + [ 2.], + [ 6.], + [ 1.]]]), + ...) - return self.copy(data_matrix=np.average( - self.data_matrix, weights=weights, axis=0)[np.newaxis, ...], - sample_names=("mean",) - ) + """ + super().sum(axis=axis, out=out, keepdims=keepdims) - return self.copy(data_matrix=self.data_matrix.mean(axis=0, - keepdims=True), - sample_names=("mean",)) + return self.copy(data_matrix=self.data_matrix.sum(axis=0, + keepdims=True), + sample_names=(None,)) def var(self): """Compute the variance of a set of samples in a FDataGrid object. @@ -567,18 +571,17 @@ def _get_op_matrix(self, other): if isinstance(other, numbers.Number): return other elif isinstance(other, np.ndarray): - # Product by number or matrix with equal dimensions, or - # matrix with same shape but only one sample - if(other.shape == () or other.shape == (1) - or other.shape == self.data_matrix.shape - or other.shape == self.data_matrix.shape[1:]): + + if other.shape == () or other.shape == (1,): return other - # Missing last dimension (codomain dimension) - elif (other.shape == self.data_matrix.shape[:-1] - or other.shape == self.data_matrix.shape[1:-1]): - return other[..., np.newaxis] + elif other.shape == (self.n_samples,): + other_index = ((slice(None),) + (np.newaxis,) * + (self.data_matrix.ndim - 1)) + + return other[other_index] else: return None + elif isinstance(other, FDataGrid): self.__check_same_dimensions(other) return other.data_matrix diff --git a/tests/test_grid.py b/tests/test_grid.py index 4e85ae5ea..028524c85 100644 --- a/tests/test_grid.py +++ b/tests/test_grid.py @@ -169,17 +169,9 @@ def test_add(self): np.testing.assert_array_equal(fd2.data_matrix[..., 0], [[3, 4, 5, 6], [4, 5, 6, 7]]) - fd2 = fd1 + np.array([1, 2, 3, 4]) + fd2 = fd1 + np.array([1, 2]) np.testing.assert_array_equal(fd2.data_matrix[..., 0], - [[2, 4, 6, 8], [3, 5, 7, 9]]) - - fd2 = fd1 + fd1.data_matrix - np.testing.assert_array_equal(fd2.data_matrix[..., 0], - [[2, 4, 6, 8], [4, 6, 8, 10]]) - - fd2 = fd1 + fd1.data_matrix[..., 0] - np.testing.assert_array_equal(fd2.data_matrix[..., 0], - [[2, 4, 6, 8], [4, 6, 8, 10]]) + [[2, 3, 4, 5], [4, 5, 6, 7]]) def test_composition(self): X, Y, Z = axes3d.get_test_data(1.2) diff --git a/tests/test_neighbors.py b/tests/test_neighbors.py index 22cded6c3..efea8946a 100644 --- a/tests/test_neighbors.py +++ b/tests/test_neighbors.py @@ -1,8 +1,6 @@ """Test neighbors classifiers and regressors""" -import unittest - -import numpy as np +from skfda._neighbors.outlier import LocalOutlierFactor # Pending theory from skfda.datasets import make_multimodal_samples, make_sinusoidal_process from skfda.exploratory.stats import mean as l2_mean from skfda.misc.metrics import lp_distance, pairwise_distance @@ -11,11 +9,13 @@ NearestCentroid) from skfda.ml.clustering import NearestNeighbors from skfda.ml.regression import KNeighborsRegressor, RadiusNeighborsRegressor -#from skfda.exploratory.outliers import LocalOutlierFactor -from skfda._neighbors.outlier import LocalOutlierFactor # Pending theory from skfda.representation.basis import Fourier +import unittest + 
+import numpy as np +#from skfda.exploratory.outliers import LocalOutlierFactor class TestNeighbors(unittest.TestCase): def setUp(self): @@ -186,8 +186,7 @@ def test_knn_functional_response_precomputed(self): def test_radius_functional_response(self): knnr = RadiusNeighborsRegressor(metric=lp_distance, - weights='distance', - regressor=l2_mean) + weights='distance') knnr.fit(self.X, self.X) @@ -222,7 +221,7 @@ def test_functional_regression_distance_weights(self): weights = 1 / distances weights /= weights.sum() - response = self.X[:10].mean(weights=weights) + response = (self.X[:10] * weights).sum() np.testing.assert_array_almost_equal(res.data_matrix, response.data_matrix) diff --git a/tests/test_pandas_fdatabasis.py b/tests/test_pandas_fdatabasis.py index 68719c15d..4ada51a14 100644 --- a/tests/test_pandas_fdatabasis.py +++ b/tests/test_pandas_fdatabasis.py @@ -238,6 +238,28 @@ def all_compare_operators(request): """ return request.param + +_all_numeric_reductions = [ + "sum", + # "max", + # "min", + # "mean", + # "prod", + # "std", + # "var", + # "median", + # "kurt", + # "skew", +] + + +@pytest.fixture(params=_all_numeric_reductions) +def all_numeric_reductions(request): + """ + Fixture for numeric reduction names. + """ + return request.param + ############################################################################## # Tests ############################################################################## From 8c9d6582c46a4cfcccc995c7a432abbd6ab7842b Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 10 Aug 2020 01:23:09 +0200 Subject: [PATCH 032/210] Implement sum and mean reduction for FData. --- skfda/representation/_functional_data.py | 18 +++++++++++--- skfda/representation/basis/_fdatabasis.py | 15 +++++++++--- skfda/representation/grid.py | 17 +++++++++---- tests/test_pandas_fdatabasis.py | 9 ++++++- tests/test_pandas_fdatagrid.py | 29 +++++++++++++++++++++++ 5 files changed, 77 insertions(+), 11 deletions(-) diff --git a/skfda/representation/_functional_data.py b/skfda/representation/_functional_data.py index 422e492ee..f7a809663 100644 --- a/skfda/representation/_functional_data.py +++ b/skfda/representation/_functional_data.py @@ -551,7 +551,8 @@ def copy(self, **kwargs): pass @abstractmethod - def sum(self, *, axis=None, out=None, keepdims=False): + def sum(self, *, axis=None, out=None, keepdims=False, skipna=False, + min_count=0): """Compute the sum of all the samples. 
Returns: @@ -564,13 +565,15 @@ def sum(self, *, axis=None, out=None, keepdims=False): raise NotImplementedError( "Not implemented for that parameter combination") - def mean(self, *, axis=None, dtype=None, out=None, keepdims=False): + def mean(self, *, axis=None, dtype=None, out=None, keepdims=False, + skipna=False): if dtype is not None: raise NotImplementedError( "Not implemented for that parameter combination") - return self.sum(axis=axis, out=out, keepdims=keepdims) / self.n_samples + return (self.sum(axis=axis, out=out, keepdims=keepdims, skipna=skipna) + / self.n_samples) @abstractmethod def to_grid(self, sample_points=None): @@ -882,6 +885,15 @@ def astype(self, dtype, copy=True): return self return super().astype(dtype) + def _reduce(self, name, skipna=True, **kwargs): + meth = getattr(self, name, None) + if meth: + return meth(skipna=skipna, **kwargs) + else: + msg = (f"'{type(self).__name__}' does not implement " + f"reduction '{name}'") + raise TypeError(msg) + def concatenate(objects, as_coordinates=False): """ diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py index b141c838d..93d6ff4c3 100644 --- a/skfda/representation/basis/_fdatabasis.py +++ b/skfda/representation/basis/_fdatabasis.py @@ -369,7 +369,8 @@ def derivative(self, *, order=1): return FDataBasis(basis, coefficients) - def sum(self, *, axis=None, out=None, keepdims=False): + def sum(self, *, axis=None, out=None, keepdims=False, skipna=False, + min_count=0): """Compute the sum of all the samples in a FDataBasis object. Returns: @@ -389,9 +390,17 @@ def sum(self, *, axis=None, out=None, keepdims=False): ...) """ - super().sum(axis=axis, out=out, keepdims=keepdims) + super().sum(axis=axis, out=out, keepdims=keepdims, skipna=skipna) - return self.copy(coefficients=np.sum(self.coefficients, axis=0), + coefs = (np.nansum(self.coefficients, axis=0) if skipna + else np.sum(self.coefficients, axis=0)) + + if min_count > 0: + valid = ~np.isnan(self.coefficients) + n_valid = np.sum(valid, axis=0) + coefs[n_valid < min_count] = np.NaN + + return self.copy(coefficients=coefs, sample_names=(None,)) def gmean(self, eval_points=None): diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py index ff44a8408..bd90bce1b 100644 --- a/skfda/representation/grid.py +++ b/skfda/representation/grid.py @@ -12,6 +12,7 @@ import findiff import pandas.api.extensions +from pandas.tests.test_nanops import skipna import scipy.stats.mstats import numpy as np @@ -446,7 +447,8 @@ def __check_same_dimensions(self, other): if not np.array_equal(self.sample_points, other.sample_points): raise ValueError("Sample points for both objects must be equal") - def sum(self, *, axis=None, out=None, keepdims=False): + def sum(self, *, axis=None, out=None, keepdims=False, skipna=False, + min_count=0): """Compute the sum of all the samples. Returns: @@ -466,10 +468,17 @@ def sum(self, *, axis=None, out=None, keepdims=False): ...) 
""" - super().sum(axis=axis, out=out, keepdims=keepdims) + super().sum(axis=axis, out=out, keepdims=keepdims, skipna=skipna) - return self.copy(data_matrix=self.data_matrix.sum(axis=0, - keepdims=True), + data = (np.nansum(self.data_matrix, axis=0, keepdims=True) if skipna + else np.sum(self.data_matrix, axis=0, keepdims=True)) + + if min_count > 0: + valid = ~np.isnan(self.data_matrix) + n_valid = np.sum(valid, axis=0) + data[n_valid < min_count] = np.NaN + + return self.copy(data_matrix=data, sample_names=(None,)) def var(self): diff --git a/tests/test_pandas_fdatabasis.py b/tests/test_pandas_fdatabasis.py index 4ada51a14..a43857665 100644 --- a/tests/test_pandas_fdatabasis.py +++ b/tests/test_pandas_fdatabasis.py @@ -243,7 +243,7 @@ def all_compare_operators(request): "sum", # "max", # "min", - # "mean", + "mean", # "prod", # "std", # "var", @@ -367,3 +367,10 @@ def test_compare_scalar(self, data, all_compare_operators): @pytest.mark.skip(reason="Unsupported") def test_compare_array(self, data, all_compare_operators): pass + + +class TestNumericReduce(base.BaseNumericReduceTests): + + def check_reduce(self, s, op_name, skipna): + result = getattr(s, op_name)(skipna=skipna) + assert result.n_samples == 1 diff --git a/tests/test_pandas_fdatagrid.py b/tests/test_pandas_fdatagrid.py index 774b50d89..73ba3df22 100644 --- a/tests/test_pandas_fdatagrid.py +++ b/tests/test_pandas_fdatagrid.py @@ -246,6 +246,28 @@ def all_compare_operators(request): """ return request.param + +_all_numeric_reductions = [ + "sum", + # "max", + # "min", + "mean", + # "prod", + # "std", + # "var", + # "median", + # "kurt", + # "skew", +] + + +@pytest.fixture(params=_all_numeric_reductions) +def all_numeric_reductions(request): + """ + Fixture for numeric reduction names. + """ + return request.param + ############################################################################## # Tests ############################################################################## @@ -348,3 +370,10 @@ def test_compare_scalar(self, data, all_compare_operators): @pytest.mark.skip(reason="Unsupported") def test_compare_array(self, data, all_compare_operators): pass + + +class TestNumericReduce(base.BaseNumericReduceTests): + + def check_reduce(self, s, op_name, skipna): + result = getattr(s, op_name)(skipna=skipna) + assert result.n_samples == 1 From a37b409ad8988adbc292d3135209383845f5b097 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 10 Aug 2020 13:35:06 +0200 Subject: [PATCH 033/210] Add Pytest to RTD dependencies. --- readthedocs-requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/readthedocs-requirements.txt b/readthedocs-requirements.txt index 3562a73ad..e97c6a51e 100644 --- a/readthedocs-requirements.txt +++ b/readthedocs-requirements.txt @@ -12,4 +12,5 @@ mpldatacursor setuptools>=41.2 multimethod>=1.2 findiff -jupyter-sphinx \ No newline at end of file +jupyter-sphinx +pytest \ No newline at end of file From 3e08de7a4fd6d3c17a8a55693ec48337b9d5743b Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 10 Aug 2020 18:12:17 +0200 Subject: [PATCH 034/210] Remove `times` function from `FDataBasis`. Multiplication is impossible for basis expansion in the general case, so do not try to approximate it. Also currently there is no reason to do so. 
--- skfda/inference/anova/anova_oneway.py | 17 ++--- skfda/representation/basis/_basis.py | 20 ----- skfda/representation/basis/_bspline.py | 54 -------------- skfda/representation/basis/_constant.py | 11 --- skfda/representation/basis/_fdatabasis.py | 52 ------------- skfda/representation/basis/_fourier.py | 15 ---- skfda/representation/basis/_monomial.py | 14 ---- skfda/representation/basis/_tensor_basis.py | 6 -- skfda/representation/basis/_vector_basis.py | 6 -- tests/test_basis.py | 81 --------------------- 10 files changed, 8 insertions(+), 268 deletions(-) diff --git a/skfda/inference/anova/anova_oneway.py b/skfda/inference/anova/anova_oneway.py index 9432f4daa..d806b7fd1 100644 --- a/skfda/inference/anova/anova_oneway.py +++ b/skfda/inference/anova/anova_oneway.py @@ -1,10 +1,11 @@ -import numpy as np -from sklearn.utils import check_random_state - from skfda import concatenate +from skfda.datasets import make_gaussian_process from skfda.misc.metrics import lp_distance from skfda.representation import FData, FDataGrid -from skfda.datasets import make_gaussian_process + +from sklearn.utils import check_random_state + +import numpy as np def v_sample_stat(fd, weights, p=2): @@ -152,10 +153,7 @@ def v_asymptotic_stat(fd, weights, p=2): t_ind = np.tril_indices(fd.n_samples, -1) coef = np.sqrt(weights[t_ind[1]] / weights[t_ind[0]]) left_fd = fd[t_ind[1]] - if isinstance(fd, FDataGrid): - right_fd = coef[:, None, np.newaxis] * fd[t_ind[0]] - else: - right_fd = fd[t_ind[0]].times(coef) + right_fd = fd[t_ind[0]] * coef return np.sum(lp_distance(left_fd, right_fd, p=p) ** p) @@ -173,7 +171,8 @@ def _anova_bootstrap(fd_grouped, n_reps, random_state=None, p=2, start, stop = fd_grouped[0].domain_range[0] - sizes = [fd.n_samples for fd in fd_grouped] # List with sizes of each group + # List with sizes of each group + sizes = [fd.n_samples for fd in fd_grouped] # Instance a random state object in case random_state is an int random_state = check_random_state(random_state) diff --git a/skfda/representation/basis/_basis.py b/skfda/representation/basis/_basis.py index a03676847..f7e9c207f 100644 --- a/skfda/representation/basis/_basis.py +++ b/skfda/representation/basis/_basis.py @@ -195,26 +195,6 @@ def _coordinate(self, fdatabasis, key): return self._coordinate_nonfull(fdatabasis=fdatabasis, key=r_key) - @abstractmethod - def basis_of_product(self, other): - pass - - @abstractmethod - def rbasis_of_product(self, other): - pass - - @staticmethod - def default_basis_of_product(one, other): - """Default multiplication for a pair of basis""" - from ._bspline import BSpline - - if not _same_domain(one, other): - raise ValueError("Ranges are not equal.") - - norder = min(8, one.n_basis + other.n_basis) - n_basis = max(one.n_basis + other.n_basis, norder + 1) - return BSpline(one.domain_range, n_basis, norder) - def rescale(self, domain_range=None): r"""Return a copy of the basis with a new domain range, with the corresponding values rescaled to the new bounds. 
diff --git a/skfda/representation/basis/_bspline.py b/skfda/representation/basis/_bspline.py index 323687453..7aea4593d 100644 --- a/skfda/representation/basis/_bspline.py +++ b/skfda/representation/basis/_bspline.py @@ -316,60 +316,6 @@ def _gram_matrix(self): return matrix - def basis_of_product(self, other): - from ._constant import Constant - - """Multiplication of two Bspline Basis""" - if not _same_domain(self, other): - raise ValueError("Ranges are not equal.") - - if isinstance(other, Constant): - return other.rbasis_of_product(self) - - if isinstance(other, BSpline): - uniqueknots = np.union1d(self.inknots, other.inknots) - - multunique = np.zeros(len(uniqueknots), dtype=np.int32) - for i in range(len(uniqueknots)): - mult1 = np.count_nonzero(self.inknots == uniqueknots[i]) - mult2 = np.count_nonzero(other.inknots == uniqueknots[i]) - multunique[i] = max(mult1, mult2) - - m2 = 0 - allknots = np.zeros(np.sum(multunique)) - for i in range(len(uniqueknots)): - m1 = m2 - m2 = m2 + multunique[i] - allknots[m1:m2] = uniqueknots[i] - - norder1 = self.n_basis - len(self.inknots) - norder2 = other.n_basis - len(other.inknots) - norder = min(norder1 + norder2 - 1, 20) - - allbreaks = ([self.domain_range[0][0]] + - np.ndarray.tolist(allknots) + - [self.domain_range[0][1]]) - n_basis = len(allbreaks) + norder - 2 - return BSpline(self.domain_range, n_basis, norder, allbreaks) - else: - norder = min(self.n_basis - len(self.inknots) + 2, 8) - n_basis = max(self.n_basis + other.n_basis, norder + 1) - return BSpline(self.domain_range, n_basis, norder) - - def rbasis_of_product(self, other): - """Multiplication of a Bspline Basis with other basis""" - - norder = min(self.n_basis - len(self.inknots) + 2, 8) - n_basis = max(self.n_basis + other.n_basis, norder + 1) - return BSpline(self.domain_range, n_basis, norder) - - def _to_R(self): - drange = self.domain_range[0] - return ("create.bspline.basis(rangeval = c(" + str(drange[0]) + "," + - str(drange[1]) + "), nbasis = " + str(self.n_basis) + - ", norder = " + str(self.order) + ", breaks = " + - self._list_to_R(self.knots) + ")") - def _to_scipy_BSpline(self, coefs): knots = np.concatenate(( diff --git a/skfda/representation/basis/_constant.py b/skfda/representation/basis/_constant.py index fe139b826..283d5a93b 100644 --- a/skfda/representation/basis/_constant.py +++ b/skfda/representation/basis/_constant.py @@ -41,17 +41,6 @@ def _gram_matrix(self): return np.array([[self.domain_range[0][1] - self.domain_range[0][0]]]) - def basis_of_product(self, other): - """Multiplication of a Constant Basis with other Basis""" - if not _same_domain(self, other): - raise ValueError("Ranges are not equal.") - - return other.copy() - - def rbasis_of_product(self, other): - """Multiplication of a Constant Basis with other Basis""" - return other.copy() - def _to_R(self): drange = self.domain_range[0] return "create.constant.basis(rangeval = c(" + str(drange[0]) + "," +\ diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py index 93d6ff4c3..8cae42c01 100644 --- a/skfda/representation/basis/_fdatabasis.py +++ b/skfda/representation/basis/_fdatabasis.py @@ -566,58 +566,6 @@ def copy(self, *, basis=None, coefficients=None, sample_names=sample_names, extrapolation=extrapolation) - def times(self, other): - """"Provides a numerical approximation of the multiplication between - an FDataObject to other object - - Args: - other (int, list, FDataBasis): Object to multiply with the - FDataBasis object. 
- - * int: Multiplies all samples with the value - * list: multiply each values with the samples respectively. - Length should match with FDataBasis samples - * FDataBasis: if there is one sample it multiplies this with - all the samples in the object. If not, it multiplies each - sample respectively. Samples should match - - Returns: - (FDataBasis): FDataBasis object containing the multiplication - - """ - if isinstance(other, FDataBasis): - - if not _same_domain(self.domain_range, other.domain_range): - raise ValueError("The functions domains are different.") - - basisobj = self.basis.basis_of_product(other.basis) - neval = max(constants.BASIS_MIN_FACTOR * - max(self.n_basis, other.n_basis) + 1, - constants.N_POINTS_COARSE_MESH) - (left, right) = self.domain_range[0] - evalarg = np.linspace(left, right, neval) - - first = self.copy(coefficients=(np.repeat(self.coefficients, - other.n_samples, axis=0) - if (self.n_samples == 1 and - other.n_samples > 1) - else self.coefficients.copy())) - second = other.copy(coefficients=(np.repeat(other.coefficients, - self.n_samples, axis=0) - if (other.n_samples == 1 and - self.n_samples > 1) - else other.coefficients.copy())) - - fdarray = first.evaluate(evalarg) * second.evaluate(evalarg) - - return FDataBasis.from_data(fdarray, evalarg, basisobj) - - if isinstance(other, int): - other = [other for _ in range(self.n_samples)] - - coefs = np.transpose(np.atleast_2d(other)) - return self.copy(coefficients=self.coefficients * coefs) - def _to_R(self): """Gives the code to build the object on fda package on R""" return ("fd(coef = " + self._array_to_R(self.coefficients, True) + diff --git a/skfda/representation/basis/_fourier.py b/skfda/representation/basis/_fourier.py index ee9302010..3895e322e 100644 --- a/skfda/representation/basis/_fourier.py +++ b/skfda/representation/basis/_fourier.py @@ -163,21 +163,6 @@ def _gram_matrix(self): else: return super()._gram_matrix() - def basis_of_product(self, other): - """Multiplication of two Fourier Basis""" - if not _same_domain(self, other): - raise ValueError("Ranges are not equal.") - - if isinstance(other, Fourier) and self.period == other.period: - return Fourier(self.domain_range, self.n_basis + other.n_basis - 1, - self.period) - else: - return other.rbasis_of_product(self) - - def rbasis_of_product(self, other): - """Multiplication of a Fourier Basis with other Basis""" - return Basis.default_basis_of_product(other, self) - def rescale(self, domain_range=None, *, rescale_period=False): r"""Return a copy of the basis with a new domain range, with the corresponding values rescaled to the new bounds. 
diff --git a/skfda/representation/basis/_monomial.py b/skfda/representation/basis/_monomial.py index ce1442cdd..f6782ef0c 100644 --- a/skfda/representation/basis/_monomial.py +++ b/skfda/representation/basis/_monomial.py @@ -107,20 +107,6 @@ def _gram_matrix(self): ordered_evaluated_points[:self.n_basis], ordered_evaluated_points[self.n_basis - 1:]) - def basis_of_product(self, other): - """Multiplication of a Monomial Basis with other Basis""" - if not _same_domain(self, other): - raise ValueError("Ranges are not equal.") - - if isinstance(other, Monomial): - return Monomial(self.domain_range, self.n_basis + other.n_basis) - - return other.rbasis_of_product(self) - - def rbasis_of_product(self, other): - """Multiplication of a Monomial Basis with other Basis""" - return Basis.default_basis_of_product(self, other) - def _to_R(self): drange = self.domain_range[0] return "create.monomial.basis(rangeval = c(" + str(drange[0]) + "," +\ diff --git a/skfda/representation/basis/_tensor_basis.py b/skfda/representation/basis/_tensor_basis.py index b3b5c6c0e..a6bbf2f16 100644 --- a/skfda/representation/basis/_tensor_basis.py +++ b/skfda/representation/basis/_tensor_basis.py @@ -111,12 +111,6 @@ def _gram_matrix(self): return gram.reshape((self.n_basis, self.n_basis)) - def basis_of_product(self, other): - pass - - def rbasis_of_product(self, other): - pass - def __eq__(self, other): return super().__eq__(other) and self.basis_list == other.basis_list diff --git a/skfda/representation/basis/_vector_basis.py b/skfda/representation/basis/_vector_basis.py index 9a2bae2f1..1a9fa0fe7 100644 --- a/skfda/representation/basis/_vector_basis.py +++ b/skfda/representation/basis/_vector_basis.py @@ -150,12 +150,6 @@ def _coordinate_nonfull(self, fdatabasis, key): return fdatabasis.copy(basis=basis, coefficients=coefs, coordinate_names=coordinate_names) - def basis_of_product(self, other): - pass - - def rbasis_of_product(self, other): - pass - def __eq__(self, other): return super().__eq__(other) and self.basis_list == other.basis_list diff --git a/tests/test_basis.py b/tests/test_basis.py index 3c0863a67..c12d5af1d 100644 --- a/tests/test_basis.py +++ b/tests/test_basis.py @@ -33,51 +33,6 @@ def test_from_data_qr(self): np.array([[1., 2.78, -3., -0.78, 1.]]) ) - def test_basis_product_generic(self): - monomial = Monomial(n_basis=5) - fourier = Fourier(n_basis=3) - prod = BSpline(n_basis=9, order=8) - self.assertEqual(Basis.default_basis_of_product( - monomial, fourier), prod) - - def test_basis_constant_product(self): - constant = Constant() - monomial = Monomial() - fourier = Fourier() - bspline = BSpline(n_basis=5, order=3) - self.assertEqual(constant.basis_of_product(monomial), monomial) - self.assertEqual(constant.basis_of_product(fourier), fourier) - self.assertEqual(constant.basis_of_product(bspline), bspline) - self.assertEqual(monomial.basis_of_product(constant), monomial) - self.assertEqual(fourier.basis_of_product(constant), fourier) - self.assertEqual(bspline.basis_of_product(constant), bspline) - - def test_basis_fourier_product(self): - # Test when periods are the same - fourier = Fourier(n_basis=5) - fourier2 = Fourier(n_basis=3) - prod = Fourier(n_basis=7) - self.assertEqual(fourier.basis_of_product(fourier2), prod) - - # Test when periods are different - fourier2 = Fourier(n_basis=3, period=2) - prod = BSpline(n_basis=9, order=8) - self.assertEqual(fourier.basis_of_product(fourier2), prod) - - def test_basis_monomial_product(self): - monomial = Monomial(n_basis=5) - monomial2 = 
Monomial(n_basis=3) - prod = Monomial(n_basis=8) - self.assertEqual(monomial.basis_of_product(monomial2), prod) - - def test_basis_bspline_product(self): - bspline = BSpline(n_basis=6, order=4) - bspline2 = BSpline(domain_range=(0, 1), n_basis=6, - order=4, knots=[0, 0.3, 1 / 3, 1]) - prod = BSpline(domain_range=(0, 1), n_basis=10, order=7, - knots=[0, 0.3, 1 / 3, 2 / 3, 1]) - self.assertEqual(bspline.basis_of_product(bspline2), prod) - def test_basis_inner_matrix(self): np.testing.assert_array_almost_equal( Monomial(n_basis=3).inner_product_matrix(), @@ -202,42 +157,6 @@ def test_comutativity_inprod(self): np.transpose(inner_product_matrix(monomial, bsplinefd)) ) - def test_fdatabasis_times_fdatabasis_fdatabasis(self): - monomial = FDataBasis(Monomial(n_basis=3), [1, 2, 3]) - bspline = FDataBasis(BSpline(n_basis=6, order=4), [1, 2, 4, 1, 0, 1]) - times_fdar = monomial.times(bspline) - - prod_basis = BSpline(n_basis=9, order=6, knots=[0, 0.25, 0.5, 0.75, 1]) - prod_coefs = np.array([[0.9788352, 1.6289955, 2.7004969, 6.2678739, - 8.7636441, 4.0069960, 0.7126961, 2.8826708, - 6.0052311]]) - - self.assertEqual(prod_basis, times_fdar.basis) - np.testing.assert_array_almost_equal( - prod_coefs, times_fdar.coefficients) - - def test_fdatabasis_times_fdatabasis_list(self): - monomial = FDataBasis(Monomial(n_basis=3), - [[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - result = monomial.times([3, 2, 1]) - - expec_basis = Monomial(n_basis=3) - expec_coefs = np.array([[3, 6, 9], [8, 10, 12], [7, 8, 9]]) - - self.assertEqual(expec_basis, result.basis) - np.testing.assert_array_almost_equal(expec_coefs, result.coefficients) - - def test_fdatabasis_times_fdatabasis_int(self): - monomial = FDataBasis(Monomial(n_basis=3), - [[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - result = monomial.times(3) - - expec_basis = Monomial(n_basis=3) - expec_coefs = np.array([[3, 6, 9], [12, 15, 18], [21, 24, 27]]) - - self.assertEqual(expec_basis, result.basis) - np.testing.assert_array_almost_equal(expec_coefs, result.coefficients) - def test_fdatabasis__add__(self): monomial1 = FDataBasis(Monomial(n_basis=3), [1, 2, 3]) monomial2 = FDataBasis(Monomial(n_basis=3), [[1, 2, 3], [3, 4, 5]]) From e5d0e3116aac0c3410372944e7ac5b7e2809ab06 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 10 Aug 2020 20:23:08 +0200 Subject: [PATCH 035/210] Add Maxima Hunting method. --- docs/modules/preprocessing/dim_reduction.rst | 1 + setup.py | 25 ++- .../variable_selection/__init__.py | 1 + .../variable_selection/_maxima_hunting.py | 192 ++++++++++++++++++ .../dim_reduction/variable_selection/_rkvs.py | 4 +- 5 files changed, 210 insertions(+), 13 deletions(-) create mode 100644 skfda/preprocessing/dim_reduction/variable_selection/_maxima_hunting.py diff --git a/docs/modules/preprocessing/dim_reduction.rst b/docs/modules/preprocessing/dim_reduction.rst index 9082d931a..1a27db215 100644 --- a/docs/modules/preprocessing/dim_reduction.rst +++ b/docs/modules/preprocessing/dim_reduction.rst @@ -18,6 +18,7 @@ following: .. 
autosummary:: :toctree: autosummary + skfda.preprocessing.dim_reduction.variable_selection.MaximaHunting skfda.preprocessing.dim_reduction.variable_selection.RKHSVariableSelection Projection diff --git a/setup.py b/setup.py index 6d2ca1f3f..32f5741ef 100644 --- a/setup.py +++ b/setup.py @@ -79,17 +79,20 @@ 'Topic :: Scientific/Engineering :: Mathematics', 'Topic :: Software Development :: Libraries :: Python Modules', ], - install_requires=['numpy>=1.16', - 'scipy>=1.3.0', - 'scikit-learn>=0.20', - 'pandas', - 'matplotlib', - 'scikit-datasets[cran]>=0.1.24', - 'rdata', - 'cython', - 'mpldatacursor', - 'multimethod>=1.2', - 'findiff'], + install_requires=[ + 'cython', + 'dcor,' + 'findiff' + 'matplotlib', + 'mpldatacursor', + 'multimethod>=1.2', + 'numpy>=1.16', + 'pandas', + 'rdata', + 'scikit-datasets[cran]>=0.1.24', + 'scikit-learn>=0.20', + 'scipy>=1.3.0', + ], setup_requires=pytest_runner, tests_require=['pytest'], test_suite='tests', diff --git a/skfda/preprocessing/dim_reduction/variable_selection/__init__.py b/skfda/preprocessing/dim_reduction/variable_selection/__init__.py index 48c69de54..41cc407a2 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/__init__.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/__init__.py @@ -1 +1,2 @@ +from ._maxima_hunting import MaximaHunting from ._rkvs import RKHSVariableSelection diff --git a/skfda/preprocessing/dim_reduction/variable_selection/_maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/_maxima_hunting.py new file mode 100644 index 000000000..3e102e994 --- /dev/null +++ b/skfda/preprocessing/dim_reduction/variable_selection/_maxima_hunting.py @@ -0,0 +1,192 @@ +import dcor + +import sklearn.base +import sklearn.utils +import numpy as np +from ....representation import FDataGrid + + +def dependency(X, Y, dependency_measure=dcor.u_distance_correlation_sqr): + ''' + Computes the dependency of each point in each trajectory in X with the + corresponding class label in Y. + ''' + + def vectorial_dependency_measure(x): + x = np.atleast_2d(x).transpose() + + return dependency_measure(x, Y) + + vectorial_dependency_measure = np.vectorize( + vectorial_dependency_measure, + otypes=[float], + signature="(m,n)->()" + ) + + X_view = np.rollaxis(X, 0, len(X.shape)) + + return vectorial_dependency_measure(X_view) + + +def select_local_maxima(curve, smoothing: int=1): + + selected_features = [] + scores = [] + + # Grow the curve at left and right with non maxima points so that points + # near the extremes can be processed in the same way. + extra_1 = np.repeat(curve[0], smoothing) + extra_2 = np.repeat(curve[-1], smoothing) + new_curve = np.concatenate([extra_1, curve, extra_2]) + + for i in range(0, len(curve)): + interval = new_curve[i:i + 2 * smoothing + 1] + candidate_maximum = interval[smoothing] + assert candidate_maximum == curve[i] + + is_maxima_in_interval = np.all(interval <= candidate_maximum) + is_not_flat = (candidate_maximum > interval[smoothing - 1] or + candidate_maximum > interval[smoothing + 1]) + + # If the maximum is the point in the middle of the interval, it is + # selected. + if is_maxima_in_interval and is_not_flat: + selected_features.append(i) + scores.append(candidate_maximum) + + return np.array(selected_features), np.array(scores) + + +def maxima_hunting(X, y, + dependency_measure=dcor.u_distance_correlation_sqr, + smoothing=1): + r''' + Maxima Hunting variable selection. + + This is a filter variable selection method for problems with a target + variable. 
It evaluates a dependency measure between each point of the + function and the target variable, and keeps those points in which this + dependency is a local maximum. + + Selecting the local maxima serves two purposes. First, it ensures that + the points that are relevant in isolation are selected, as they must + maximice their dependency with the target variable. Second, the points + that are relevant only because they are near a relevant point (and are + thus highly correlated with it) are NOT selected, as only local maxima + are selected, minimizing the redundancy of the selected variables. + + For a longer explanation about the method, and comparison with other + functional variable selection methods, we refer the reader to the + original article [1]_. + + Parameters: + + dependency_measure (callable): dependency measure to use. By default, + it uses the bias corrected squared distance correlation. + + Examples: + + >>> from skfda.preprocessing.dim_reduction import variable_selection + >>> from skfda.datasets import make_gaussian_process + >>> import skfda + >>> import numpy as np + + We create trajectories from two classes, one with zero mean and the + other with a peak-like mean. Both have Brownian covariance. + + >>> n_samples = 10000 + >>> n_features = 100 + >>> + >>> def mean_1(t): + ... return (np.abs(t - 0.25) + ... - 2 * np.abs(t - 0.5) + ... + np.abs(t - 0.75)) + >>> + >>> X_0 = make_gaussian_process(n_samples=n_samples // 2, + ... n_features=n_features, + ... random_state=0) + >>> X_1 = make_gaussian_process(n_samples=n_samples // 2, + ... n_features=n_features, + ... mean=mean_1, + ... random_state=1) + >>> X = skfda.concatenate((X_0, X_1)) + >>> + >>> y = np.zeros(n_samples) + >>> y [n_samples // 2:] = 1 + + Select the relevant points to distinguish the two classes + + >>> rkvs = variable_selection.MaximaHunting(smoothing=10) + >>> _ = rkvs.fit(X, y) + >>> point_mask = rkvs.get_support() + >>> points = X.sample_points[0][point_mask] + >>> np.allclose(points, [0.5], rtol=0.1) + True + + Apply the learned dimensionality reduction + + >>> X_dimred = rkvs.transform(X) + >>> len(X.sample_points[0]) + 100 + >>> X_dimred.shape + (10000, 1) + + References: + + .. [1] J. R. Berrendero, A. Cuevas, and J. L. Torrecilla, “Variable + selection in functional data classification: a maxima-hunting + proposal,” STAT SINICA, vol. 26, no. 2, pp. 619–638, 2016, + doi: 10.5705/ss.202014.0014. 
+
+    '''
+
+    curve = dependency(X[..., None], y, dependency_measure)
+
+    return select_local_maxima(curve, smoothing)
+
+
+class MaximaHunting(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
+
+    def __init__(self,
+                 dependency_measure=dcor.u_distance_correlation_sqr,
+                 smoothing=1):
+        self.dependency_measure = dependency_measure
+        self.smoothing = smoothing
+
+    def fit(self, X: FDataGrid, y):
+
+        X, y = sklearn.utils.validation.check_X_y(X.data_matrix[..., 0], y)
+
+        self.features_shape_ = X.shape[1:]
+
+        self.results_ = maxima_hunting(
+            X=X,
+            y=y,
+            dependency_measure=self.dependency_measure,
+            smoothing=self.smoothing)
+
+        indexes = np.argsort(self.results_[1])[::-1]
+        self.sorted_indexes_ = self.results_[0][indexes]
+
+        return self
+
+    def get_support(self, indices: bool=False):
+        indexes_unraveled = self.results_[0]
+        if indices:
+            return indexes_unraveled
+        else:
+            mask = np.zeros(self.features_shape_[0], dtype=bool)
+            mask[self.results_[0]] = True
+            return mask
+
+    def transform(self, X, y=None):
+
+        sklearn.utils.validation.check_is_fitted(self)
+
+        X = sklearn.utils.validation.check_array(X.data_matrix[..., 0])
+
+        if X.shape[1:] != self.features_shape_:
+            raise ValueError("The trajectories have a different number of "
+                             "points than the ones fitted")
+
+        return X[:, self.sorted_indexes_]
diff --git a/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py b/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py
index db4518a71..569edae35 100644
--- a/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py
+++ b/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py
@@ -166,10 +166,10 @@
 
     References:
 
-        .. [1] J. R. Berrendero, A. Cuevas, y J. L. Torrecilla, «On the Use
+        .. [1] J. R. Berrendero, A. Cuevas, and J. L. Torrecilla, «On the Use
               of Reproducing Kernel Hilbert Spaces in Functional
               Classification», Journal of the American Statistical
-               Association, vol. 113, n.º 523, pp. 1210-1218, jul. 2018,
+               Association, vol. 113, no. 523, pp. 1210-1218, jul. 2018,
               doi: 10.1080/01621459.2017.1320287.
 
     '''
From 50f8d53a088bd1c8e0401d2e11c67bb0f3e5c1c5 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Mon, 10 Aug 2020 20:27:29 +0200
Subject: [PATCH 036/210] Fix setup.py.

---
 setup.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index 32f5741ef..f1a50bd45 100644
--- a/setup.py
+++ b/setup.py
@@ -81,8 +81,8 @@
     ],
     install_requires=[
         'cython',
-        'dcor,'
-        'findiff'
+        'dcor',
+        'findiff',
         'matplotlib',
         'mpldatacursor',
         'multimethod>=1.2',
@@ -91,7 +91,7 @@
         'rdata',
         'scikit-datasets[cran]>=0.1.24',
         'scikit-learn>=0.20',
-        'scipy>=1.3.0',
+        'scipy>=1.3.0'
     ],
     setup_requires=pytest_runner,
     tests_require=['pytest'],
     test_suite='tests',
From 240993799a87a78e67696ea0d1499ff087e12359 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Tue, 11 Aug 2020 17:56:11 +0200
Subject: [PATCH 037/210] Fix bug with matplotlib backend.

---
 skfda/representation/grid.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py
index bd90bce1b..782ce4ca3 100644
--- a/skfda/representation/grid.py
+++ b/skfda/representation/grid.py
@@ -12,7 +12,6 @@
 
 import findiff
 import pandas.api.extensions
-from pandas.tests.test_nanops import skipna
 import scipy.stats.mstats
 
 import numpy as np
From 911befeab412d5ef8e05d8969c251f8af088ca23 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Wed, 12 Aug 2020 20:25:15 +0200
Subject: [PATCH 038/210] Vectorize local maxima function.
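
The explicit Python loop is replaced by ``scipy.signal.argrelextrema`` with
``np.greater_equal`` as comparator, followed by a vectorized mask that
discards the interior points of flat regions. A minimal sketch of the SciPy
call (illustrative, not part of the patch itself):

    >>> import numpy as np
    >>> import scipy.signal
    >>> x = np.array([2, 1, 1, 1, 2, 3, 3, 3, 2, 3, 4, 3, 2])
    >>> scipy.signal.argrelextrema(
    ...     x, comparator=np.greater_equal, order=1)[0].tolist()
    [0, 2, 5, 6, 7, 10]

Indexes 2 and 6 are interior points of flat regions; the new
``is_not_flat`` mask removes them, leaving the ``[0, 5, 7, 10]`` shown in
the doctest.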
--- .../variable_selection/_maxima_hunting.py | 120 ++++++++++-------- 1 file changed, 66 insertions(+), 54 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/variable_selection/_maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/_maxima_hunting.py index 3e102e994..dcf4394ad 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/_maxima_hunting.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/_maxima_hunting.py @@ -1,76 +1,96 @@ import dcor +import scipy.signal import sklearn.base import sklearn.utils + import numpy as np + from ....representation import FDataGrid -def dependency(X, Y, dependency_measure=dcor.u_distance_correlation_sqr): +def compute_dependence(X, Y, *, dependence_measure): ''' - Computes the dependency of each point in each trajectory in X with the + Computes the dependence of each point in each trajectory in X with the corresponding class label in Y. ''' - def vectorial_dependency_measure(x): + def vectorial_dependence_measure(x): x = np.atleast_2d(x).transpose() - return dependency_measure(x, Y) + return dependence_measure(x, Y) - vectorial_dependency_measure = np.vectorize( - vectorial_dependency_measure, + vectorial_dependence_measure = np.vectorize( + vectorial_dependence_measure, otypes=[float], signature="(m,n)->()" ) X_view = np.rollaxis(X, 0, len(X.shape)) - return vectorial_dependency_measure(X_view) + return vectorial_dependence_measure(X_view) + + +def select_local_maxima(X, order: int=1): + r''' + Compute local maxima of a function. + + Points near the boundary are considered maxima looking only at one side. + + For flat regions only the boundary points of the flat region could be + considered maxima. + + Parameters: + + order (callable): How many points on each side to look, to check if + a point is a maximum in that interval. + + Examples: + + >>> from skfda.preprocessing.dim_reduction import variable_selection + >>> from skfda.datasets import make_gaussian_process + >>> import skfda + >>> import numpy as np + >>> x = np.array([2, 1, 1, 1, 2, 3, 3, 3, 2, 3, 4, 3, 2]) + >>> select_local_maxima(x) + array([ 0, 5, 7, 10], dtype=int64) -def select_local_maxima(curve, smoothing: int=1): + The ``order`` parameter can be used to check a larger interval to see + if a point is still a maxima, effectively eliminating small local + maxima. - selected_features = [] - scores = [] + >>> x = np.array([2, 1, 1, 1, 2, 3, 3, 3, 2, 3, 4, 3, 2]) + >>> select_local_maxima(x, order=3) + array([ 0, 5, 10], dtype=int64) - # Grow the curve at left and right with non maxima points so that points - # near the extremes can be processed in the same way. - extra_1 = np.repeat(curve[0], smoothing) - extra_2 = np.repeat(curve[-1], smoothing) - new_curve = np.concatenate([extra_1, curve, extra_2]) + ''' + indexes = scipy.signal.argrelextrema( + X, comparator=np.greater_equal, order=order)[0] - for i in range(0, len(curve)): - interval = new_curve[i:i + 2 * smoothing + 1] - candidate_maximum = interval[smoothing] - assert candidate_maximum == curve[i] + # Discard flat + maxima = X[indexes] - is_maxima_in_interval = np.all(interval <= candidate_maximum) - is_not_flat = (candidate_maximum > interval[smoothing - 1] or - candidate_maximum > interval[smoothing + 1]) + left_points = np.take(X, indexes - 1, mode='clip') + right_points = np.take(X, indexes + 1, mode='clip') - # If the maximum is the point in the middle of the interval, it is - # selected. 
- if is_maxima_in_interval and is_not_flat: - selected_features.append(i) - scores.append(candidate_maximum) + is_not_flat = (maxima > left_points) | (maxima > right_points) - return np.array(selected_features), np.array(scores) + return indexes[is_not_flat] -def maxima_hunting(X, y, - dependency_measure=dcor.u_distance_correlation_sqr, - smoothing=1): +class MaximaHunting(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin): r''' Maxima Hunting variable selection. This is a filter variable selection method for problems with a target - variable. It evaluates a dependency measure between each point of the + variable. It evaluates a dependence measure between each point of the function and the target variable, and keeps those points in which this - dependency is a local maximum. + dependence is a local maximum. Selecting the local maxima serves two purposes. First, it ensures that the points that are relevant in isolation are selected, as they must - maximice their dependency with the target variable. Second, the points + maximice their dependence with the target variable. Second, the points that are relevant only because they are near a relevant point (and are thus highly correlated with it) are NOT selected, as only local maxima are selected, minimizing the redundancy of the selected variables. @@ -81,7 +101,7 @@ def maxima_hunting(X, y, Parameters: - dependency_measure (callable): dependency measure to use. By default, + dependence_measure (callable): Dependence measure to use. By default, it uses the bias corrected squared distance correlation. Examples: @@ -140,17 +160,10 @@ def maxima_hunting(X, y, ''' - curve = dependency(X[..., None], y, dependency_measure) - - return select_local_maxima(curve, smoothing) - - -class MaximaHunting(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin): - def __init__(self, - dependency_measure=dcor.u_distance_correlation_sqr, + dependence_measure=dcor.u_distance_correlation_sqr, smoothing=1): - self.dependency_measure = dependency_measure + self.dependence_measure = dependence_measure self.smoothing = smoothing def fit(self, X: FDataGrid, y): @@ -158,25 +171,24 @@ def fit(self, X: FDataGrid, y): X, y = sklearn.utils.validation.check_X_y(X.data_matrix[..., 0], y) self.features_shape_ = X.shape[1:] + self.dependence_ = compute_dependence( + X[..., np.newaxis], y, + dependence_measure=self.dependence_measure) - self.results_ = maxima_hunting( - X=X, - y=y, - dependency_measure=self.dependency_measure, - smoothing=self.smoothing) + self.indexes_ = select_local_maxima(self.dependence_, + self.smoothing) - indexes = np.argsort(self.results_[1])[::-1] - self.sorted_indexes_ = self.results_[0][indexes] + sorting_indexes = np.argsort(self.dependence_[self.indexes_])[::-1] + self.sorted_indexes_ = self.indexes_[sorting_indexes] return self def get_support(self, indices: bool=False): - indexes_unraveled = self.results_[0] if indices: - return indexes_unraveled + return self.indexes_ else: mask = np.zeros(self.features_shape_[0], dtype=bool) - mask[self.results_[0]] = True + mask[self.indexes_] = True return mask def transform(self, X, y=None): From e969b214b91c12e550e144032b911b6c3e459b26 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Wed, 12 Aug 2020 20:34:27 +0200 Subject: [PATCH 039/210] Fix doctest. 
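
The previous doctest output included ``dtype=int64``, which only matches on
platforms where that is the default integer type. Casting the result with
``astype(np.int_)`` makes NumPy omit the dtype from the repr, so the
expected output becomes platform independent. A sketch of the idea (assuming
NumPy's usual repr behaviour of hiding the default integer dtype):

    >>> import numpy as np
    >>> np.array([0, 5, 10], dtype=np.int64).astype(np.int_)
    array([ 0,  5, 10])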
--- .../dim_reduction/variable_selection/_maxima_hunting.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/variable_selection/_maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/_maxima_hunting.py index dcf4394ad..3f5ffb587 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/_maxima_hunting.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/_maxima_hunting.py @@ -53,16 +53,16 @@ def select_local_maxima(X, order: int=1): >>> import numpy as np >>> x = np.array([2, 1, 1, 1, 2, 3, 3, 3, 2, 3, 4, 3, 2]) - >>> select_local_maxima(x) - array([ 0, 5, 7, 10], dtype=int64) + >>> select_local_maxima(x).astype(np.int_) + array([ 0, 5, 7, 10]) The ``order`` parameter can be used to check a larger interval to see if a point is still a maxima, effectively eliminating small local maxima. >>> x = np.array([2, 1, 1, 1, 2, 3, 3, 3, 2, 3, 4, 3, 2]) - >>> select_local_maxima(x, order=3) - array([ 0, 5, 10], dtype=int64) + >>> select_local_maxima(x, order=3).astype(np.int_) + array([ 0, 5, 10]) ''' indexes = scipy.signal.argrelextrema( From e3e5cc0e11c7dbc08341237c7afa062a973f53d8 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Wed, 12 Aug 2020 22:38:53 +0200 Subject: [PATCH 040/210] Local maxima selector is customizable. --- .../variable_selection/__init__.py | 4 ++- .../{_maxima_hunting.py => maxima_hunting.py} | 27 ++++++++++--------- 2 files changed, 18 insertions(+), 13 deletions(-) rename skfda/preprocessing/dim_reduction/variable_selection/{_maxima_hunting.py => maxima_hunting.py} (87%) diff --git a/skfda/preprocessing/dim_reduction/variable_selection/__init__.py b/skfda/preprocessing/dim_reduction/variable_selection/__init__.py index 41cc407a2..12793b5cb 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/__init__.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/__init__.py @@ -1,2 +1,4 @@ -from ._maxima_hunting import MaximaHunting +from . import maxima_hunting + from ._rkvs import RKHSVariableSelection +from .maxima_hunting import MaximaHunting diff --git a/skfda/preprocessing/dim_reduction/variable_selection/_maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py similarity index 87% rename from skfda/preprocessing/dim_reduction/variable_selection/_maxima_hunting.py rename to skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py index 3f5ffb587..a3f40522b 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/_maxima_hunting.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py @@ -9,7 +9,7 @@ from ....representation import FDataGrid -def compute_dependence(X, Y, *, dependence_measure): +def _compute_dependence(X, Y, *, dependence_measure): ''' Computes the dependence of each point in each trajectory in X with the corresponding class label in Y. @@ -31,9 +31,9 @@ def vectorial_dependence_measure(x): return vectorial_dependence_measure(X_view) -def select_local_maxima(X, order: int=1): +def select_local_maxima(X, *, order: int=1): r''' - Compute local maxima of a function. + Compute local maxima of an array. Points near the boundary are considered maxima looking only at one side. @@ -47,9 +47,8 @@ def select_local_maxima(X, order: int=1): Examples: - >>> from skfda.preprocessing.dim_reduction import variable_selection - >>> from skfda.datasets import make_gaussian_process - >>> import skfda + >>> from skfda.preprocessing.dim_reduction.variable_selection.\ + ... 
maxima_hunting import select_local_maxima >>> import numpy as np >>> x = np.array([2, 1, 1, 1, 2, 3, 3, 3, 2, 3, 4, 3, 2]) @@ -107,7 +106,10 @@ class MaximaHunting(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin): Examples: >>> from skfda.preprocessing.dim_reduction import variable_selection + >>> from skfda.preprocessing.dim_reduction.variable_selection.\ + ... maxima_hunting import select_local_maxima >>> from skfda.datasets import make_gaussian_process + >>> from functools import partial >>> import skfda >>> import numpy as np @@ -136,7 +138,9 @@ class MaximaHunting(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin): Select the relevant points to distinguish the two classes - >>> rkvs = variable_selection.MaximaHunting(smoothing=10) + >>> local_maxima_selector = partial(select_local_maxima, order=10) + >>> rkvs = variable_selection.MaximaHunting( + ... local_maxima_selector=local_maxima_selector) >>> _ = rkvs.fit(X, y) >>> point_mask = rkvs.get_support() >>> points = X.sample_points[0][point_mask] @@ -162,21 +166,20 @@ class MaximaHunting(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin): def __init__(self, dependence_measure=dcor.u_distance_correlation_sqr, - smoothing=1): + local_maxima_selector=select_local_maxima): self.dependence_measure = dependence_measure - self.smoothing = smoothing + self.local_maxima_selector = local_maxima_selector def fit(self, X: FDataGrid, y): X, y = sklearn.utils.validation.check_X_y(X.data_matrix[..., 0], y) self.features_shape_ = X.shape[1:] - self.dependence_ = compute_dependence( + self.dependence_ = _compute_dependence( X[..., np.newaxis], y, dependence_measure=self.dependence_measure) - self.indexes_ = select_local_maxima(self.dependence_, - self.smoothing) + self.indexes_ = self.local_maxima_selector(self.dependence_) sorting_indexes = np.argsort(self.dependence_[self.indexes_])[::-1] self.sorted_indexes_ = self.indexes_[sorting_indexes] From c1010a0dc7c873dc56a431402187961ff83649d6 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Wed, 12 Aug 2020 22:52:14 +0200 Subject: [PATCH 041/210] Update documentation. --- .../dim_reduction/variable_selection/maxima_hunting.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py index a3f40522b..8ad2d0fab 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py @@ -102,6 +102,10 @@ class MaximaHunting(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin): dependence_measure (callable): Dependence measure to use. By default, it uses the bias corrected squared distance correlation. + local_maxima_selector (callable): Function to detect local maxima. The + default is :func:`select_local_maxima` with ``order`` parameter + equal to one. The original article used a similar function testing + different values of ``order``. Examples: From dd26fbcc89aff13ffa89c841a1dbad3133ef8816 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 13 Aug 2020 13:38:52 +0200 Subject: [PATCH 042/210] Change Canadian weather dataset so that the domain range is right. The sample points have also been moved to the middle of the day, similar to AEMET. 
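Previously the sample points were ``range(1, 366)``, so the domain range inferred from them was ``(1, 365)`` rather than the intended yearly domain. Evaluating each day at its midpoint keeps every sample point strictly inside the explicit ``(0, 365)`` domain, mirroring the convention of the AEMET dataset. A sketch of the new grid, assuming only NumPy:

    >>> import numpy as np
    >>> sample_points = np.arange(0, 365) + 0.5
    >>> sample_points[0], sample_points[-1]
    (0.5, 364.5)

The expected values in ``test_magnitude_shape.py`` change accordingly, as the magnitude-shape outlyingness is now computed on the shifted grid.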
--- skfda/datasets/_real_datasets.py | 3 +- tests/test_magnitude_shape.py | 70 ++++++++++++++++---------------- 2 files changed, 37 insertions(+), 36 deletions(-) diff --git a/skfda/datasets/_real_datasets.py b/skfda/datasets/_real_datasets.py index 2b3f362a2..f89ea15f9 100644 --- a/skfda/datasets/_real_datasets.py +++ b/skfda/datasets/_real_datasets.py @@ -470,7 +470,8 @@ def fetch_weather(return_X_y: bool = False): temp_prec_daily = np.transpose(weather_daily[:, :, 0:2], axes=(1, 0, 2)) curves = FDataGrid(data_matrix=temp_prec_daily, - sample_points=range(1, 366), + sample_points=np.arange(0, 365) + 0.5, + domain_range=(0, 365), dataset_name="Canadian Weather", argument_names=("day",), coordinate_names=("temperature (ºC)", diff --git a/tests/test_magnitude_shape.py b/tests/test_magnitude_shape.py index 50509e483..4911af0f0 100644 --- a/tests/test_magnitude_shape.py +++ b/tests/test_magnitude_shape.py @@ -15,41 +15,41 @@ def test_magnitude_shape_plot(self): msplot = MagnitudeShapePlot( fd_temperatures, depth_method=modified_band_depth) np.testing.assert_allclose(msplot.points, - np.array([[0.2591055, 3.15861149], - [1.3811996, 0.91806814], - [0.94648379, 2.75695426], - [2.11346208, 7.24045853], - [0.82557436, 0.82727771], - [1.23249759, 0.22004329], - [-2.66259589, 0.96925352], - [0.15827963, 1.00235557], - [-0.43751765, 0.66236714], - [0.70695162, 0.66482897], - [0.73095525, 0.33165117], - [3.48445368, 12.59745018], - [3.15539264, 13.85234879], - [3.52759979, 10.49810783], - [3.95518808, 15.28317686], - [-0.48486514, 0.5035343], - [0.64492781, 6.83385521], - [-0.83185751, 0.81125541], - [-3.47125418, 1.10683451], - [0.22241054, 1.76783493], - [-0.54402406, 0.95229119], - [-1.71310618, 0.61875513], - [-0.44161441, 0.77815135], - [0.13851408, 1.02560672], - [7.59909246, 40.82126568], - [7.57868277, 36.03923856], - [7.12930634, 45.96866318], - [0.05746528, 1.75817588], - [1.53092075, 8.85227], - [-1.48696387, 0.22472872], - [-2.853082, 4.50814844], - [-2.42297615, 1.46926902], - [-5.8873129, 5.35742609], - [-5.44346193, 5.18338576], - [-16.38949483, 0.94027717]] + np.array([[0.25839562, 3.14995827], + [1.3774155, 0.91556716], + [0.94389069, 2.74940766], + [2.10767177, 7.22065509], + [0.82331252, 0.8250163], + [1.22912089, 0.2194518], + [-2.65530111, 0.9666511], + [0.15784599, 0.99960958], + [-0.43631897, 0.66055387], + [0.70501476, 0.66301126], + [0.72895263, 0.33074653], + [3.47490723, 12.5630275], + [3.14674773, 13.81447167], + [3.51793514, 10.46943904], + [3.94435195, 15.24142224], + [-0.48353674, 0.50215652], + [0.64316089, 6.81513544], + [-0.82957845, 0.80903798], + [-3.4617439, 1.10389229], + [0.2218012, 1.76299192], + [-0.54253359, 0.94968438], + [-1.70841274, 0.61708188], + [-0.44040451, 0.77602089], + [0.13813459, 1.02279698], + [7.57827303, 40.70985885], + [7.55791925, 35.94093086], + [7.10977399, 45.84310211], + [0.05730784, 1.75335899], + [1.52672644, 8.82803475], + [-1.48288999, 0.22412958], + [-2.84526533, 4.49585828], + [-2.41633786, 1.46528758], + [-5.87118328, 5.34300766], + [-5.42854833, 5.1694065], + [-16.34459211, 0.9397118]] )) np.testing.assert_array_almost_equal(msplot.outliers, np.array( From 34eaf0daf50e36e0d0c73f953f8ee1c9486e534f Mon Sep 17 00:00:00 2001 From: VNMabus Date: Sat, 15 Aug 2020 15:09:03 +0200 Subject: [PATCH 043/210] First version of RMH. 
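Recursive Maxima Hunting (RMH) repeatedly selects the point whose value has the greatest dependence with the class label, removes the influence of that point from every trajectory, and searches again, so it can uncover points that are only informative once previous selections are accounted for. A condensed sketch of that loop, using the uniform correction (subtract the selected value from the whole trajectory) and ``dcor`` as the dependence measure; the implementation added here is more general, with redundancy masks, several corrections and pluggable stopping conditions:

    import dcor
    import numpy as np

    def rmh_sketch(X, y, n_points=3):
        # X: (n_samples, n_features) discretized trajectories
        # y: (n_samples,) class labels
        X = X.astype(float).copy()
        selected = []
        for _ in range(n_points):
            # dependence of each point with the target
            dep = np.array([dcor.u_distance_correlation_sqr(X[:, j], y)
                            for j in range(X.shape[1])])
            dep[selected] = -np.inf  # never pick a point twice
            j_max = int(np.argmax(dep))
            selected.append(j_max)
            # uniform correction: remove the influence of the
            # selected point from every trajectory
            X -= X[:, [j_max]]
        return selected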
--- .../variable_selection/__init__.py | 1 + .../variable_selection/maxima_hunting.py | 11 +- .../recursive_maxima_hunting.py | 963 ++++++++++++++++++ 3 files changed, 969 insertions(+), 6 deletions(-) create mode 100644 skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py diff --git a/skfda/preprocessing/dim_reduction/variable_selection/__init__.py b/skfda/preprocessing/dim_reduction/variable_selection/__init__.py index 12793b5cb..ecb8cb0f6 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/__init__.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/__init__.py @@ -2,3 +2,4 @@ from ._rkvs import RKHSVariableSelection from .maxima_hunting import MaximaHunting +from .recursive_maxima_hunting import RecursiveMaximaHunting diff --git a/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py index 8ad2d0fab..ba7ee36d2 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py @@ -1,9 +1,8 @@ -import dcor - import scipy.signal import sklearn.base import sklearn.utils +import dcor import numpy as np from ....representation import FDataGrid @@ -143,17 +142,17 @@ class MaximaHunting(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin): Select the relevant points to distinguish the two classes >>> local_maxima_selector = partial(select_local_maxima, order=10) - >>> rkvs = variable_selection.MaximaHunting( + >>> mh = variable_selection.MaximaHunting( ... local_maxima_selector=local_maxima_selector) - >>> _ = rkvs.fit(X, y) - >>> point_mask = rkvs.get_support() + >>> _ = mh.fit(X, y) + >>> point_mask = mh.get_support() >>> points = X.sample_points[0][point_mask] >>> np.allclose(points, [0.5], rtol=0.1) True Apply the learned dimensionality reduction - >>> X_dimred = rkvs.transform(X) + >>> X_dimred = mh.transform(X) >>> len(X.sample_points[0]) 100 >>> X_dimred.shape diff --git a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py new file mode 100644 index 000000000..5c1c611ed --- /dev/null +++ b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py @@ -0,0 +1,963 @@ +import abc +import copy +import numbers +import random + +import scipy.stats +import sklearn.base +import sklearn.utils + +import dcor +import numpy as np +import numpy.linalg as linalg +import numpy.ma as ma + +from ....misc.covariances import Brownian +from ....representation import FDataGrid +from .maxima_hunting import _compute_dependence + + +def _transform_to_2d(t): + t = np.asarray(t) + + dim = len(t.shape) + assert dim <= 2 + + if dim < 2: + t = np.atleast_2d(t).T + + return t + + +def _execute_kernel(kernel, t_0, t_1): + t_0 = _transform_to_2d(t_0) + t_1 = _transform_to_2d(t_1) + + if isinstance(kernel, numbers.Number): + return kernel + else: + if callable(kernel): + result = kernel(t_0, t_1) + else: + # GPy kernel + result = kernel.K(t_0, t_1) + + assert result.shape[0] == len(t_0) + assert result.shape[1] == len(t_1) + return result + + +def _absolute_argmax(function, *, mask): + ''' + Computes the absolute maximum of a discretized function. + + Some values of the function may be masked in order not to consider them + as maximum. + + Parameters: + function (numpy array): Discretized function. + mask (numpy boolean array): Masked values. 
+ + Returns: + int: Index of the absolute maximum. + + ''' + masked_function = ma.array(function, mask=mask) + + t_max = ma.argmax(masked_function) + + t_max = np.unravel_index(t_max, function.shape) + + return t_max + + +class Correction(abc.ABC): + ''' + Base class for applying a correction after a point is taken, eliminating + its influence over the rest + ''' + + def begin(self, X: FDataGrid, Y): + ''' + Initialization + ''' + pass + + def conditioned(self, **kwargs): + ''' + Returns a correction object that is conditioned to the value of a point + ''' + return self + + @abc.abstractmethod + def correct(self, X, selected_index): + ''' + Correct the trajectories. + + Arguments: + X: Matrix with one trajectory per row + T: Times of each measure + selected_index: index of the selected value + ''' + pass + + def __call__(self, *args, **kwargs): + self.correct(*args, **kwargs) + + +class ConditionalExpectationCorrection(Correction): + + @abc.abstractmethod + def conditional_expectation(self, T, t_0, x_0, selected_index): + pass + + def correct(self, X, selected_index): + T = X.sample_points[0] + + t_0 = T[selected_index] + + x_index = (slice(None),) + tuple(selected_index) + (np.newaxis,) + x_0 = X.data_matrix[x_index] + + T = _transform_to_2d(T) + + X.data_matrix[...] -= self.conditional_expectation(T, t_0, x_0, + selected_index).T + + X.data_matrix[:, selected_index] = 0 + + +class SampleGPCorrection(ConditionalExpectationCorrection): + ''' + Correction assuming that the process is Gaussian and using as the kernel + the sample covariance. + ''' + + def __init__(self, markov=False): + self.gaussian_correction = None + self.covariance_matrix = None + self.T = None + self.time = 0 + self.markov = markov + self.cond_points = [] + + def begin(self, X: FDataGrid, Y): + T = X.sample_points + + X_copy = np.copy(X.data_matrix[..., 0]) + + Y = np.ravel(Y) + for class_label in np.unique(Y): + trajectories = X_copy[Y == class_label, :] + + mean = np.mean(trajectories, axis=0) + X_copy[Y == class_label, :] -= mean + + self.covariance_matrix = np.cov(X_copy, rowvar=False) + self.T = np.ravel(T) + self.gaussian_correction = GaussianCorrection(kernel=self.__kernel) + + def __kernel(self, t_0, t_1): + i = np.searchsorted(self.T, t_0) + j = np.searchsorted(self.T, t_1) + + i = np.ravel(i) + j = np.ravel(j) + + return self.covariance_matrix[np.ix_(i, j)] + + def conditioned(self, t_0, **kwargs): + self.cond_points.append(t_0) + self.cond_points.sort() + self.gaussian_correction = self.gaussian_correction.conditioned( + t_0=t_0, **kwargs) + return self + + def conditional_expectation(self, T, t_0, x_0, selected_index): + + gp_condexp = self.gaussian_correction.conditional_expectation( + T, t_0, x_0, selected_index) + + if self.markov: + left_index = np.searchsorted(self.cond_points, t_0) + + left_value = (self.cond_points[left_index - 1] + if left_index != 0 else None) + right_value = (self.cond_points[left_index] + if left_index != len(self.cond_points) else None) + + if left_value is not None: + gp_condexp[:, T.ravel() < left_value, :] = 0 + + if right_value is not None: + gp_condexp[:, T.ravel() > right_value, :] = 0 + + return gp_condexp + + +class PicklableKernel(): + + def __init__(self, kernel): + super().__setattr__('_PicklableKernel__kernel', kernel) + + def __getattr__(self, name): + if name != '__deepcopy__': + return getattr(self.__kernel, name) + + def __setattr__(self, name, value): + setattr(self.__kernel, name, value) + + def __getstate__(self): + return {'class': self.__kernel.__class__, + 
'input_dim': self.__kernel.input_dim, + 'values': self.__kernel.param_array} + + def __setstate__(self, state): + super().__setattr__('_PicklableKernel__kernel', state['class']( + input_dim=state['input_dim'])) + self.__kernel.param_array[...] = state['values'] + + def __call__(self, *args, **kwargs): + return self.__kernel.K(*args, **kwargs) + + +def make_kernel(k): + try: + import GPy + except ImportError: + return k + + if isinstance(k, GPy.kern.Kern): + return PicklableKernel(k) + else: + return k + + +class UniformCorrection(Correction): + ''' + Correction assuming that the underlying process is an Ornstein-Uhlenbeck + process with infinite lengthscale. + ''' + + def __init__(self): + pass + + def conditioned(self, X, t_0, **kwargs): + return GaussianCorrection(kernel=Brownian(origin=t_0)) + + def correct(self, X, selected_index): + x_index = (slice(None),) + tuple(selected_index) + (np.newaxis,) + + # Have to copy it because otherwise is a view and shouldn't be + # subtracted from the original matrix + x_0 = np.copy(X.data_matrix[x_index]) + + X.data_matrix[...] -= x_0 + + +class GaussianCorrection(ConditionalExpectationCorrection): + ''' + Correction assuming that the underlying process is Gaussian. + ''' + + def __init__(self, expectation=0, kernel=1, optimize_kernel=False): + super(GaussianCorrection, self).__init__() + + self.__expectation = expectation + self.__kernel = make_kernel(kernel) + self.optimize_kernel = optimize_kernel + self.kernel_params_optimized_names = None + self.kernel_params_optimized_values = None + + def begin(self, X, Y): + if self.optimize_kernel: + import GPy + + T = X.sample_points[0] + X_copy = np.copy(X.data_matrix[..., 0]) + + Y = np.ravel(Y) + for class_label in np.unique(Y): + trajectories = X_copy[Y == class_label, :] + + mean = np.mean(trajectories, axis=0) + X_copy[Y == class_label, :] -= mean + + m = GPy.models.GPRegression( + T[:, None], X_copy.T, + kernel=self.__kernel._PicklableKernel__kernel) + m.constrain_positive('') + m.optimize() + + self.kernel_params_optimized_names = m.parameter_names(), + self.kernel_params_optimized_values = m.param_array + + self.__kernel = copy.deepcopy(make_kernel(m.kern)) + + def conditioned(self, X, t_0, **kwargs): + # If the point makes the matrix singular, don't change the correction + try: + return GaussianConditionedCorrection( + expectation=self.expectation, + kernel=self.kernel, + point_list=t_0) + except linalg.LinAlgError: + return self + + def expectation(self, t): + + if isinstance(self.__expectation, numbers.Number): + expectation = np.ones_like(t, dtype=float) * self.__expectation + else: + expectation = self.__expectation(t) + + return expectation + + def kernel(self, t_0, t_1): + return _execute_kernel(self.__kernel, t_0, t_1) + + covariance = kernel + + def variance(self, t): + return self.covariance(t, t) + + def conditional_expectation(self, T, t_0, x_0, selected_index): + + var = self.variance(t_0) + + expectation = self.expectation(T) + assert expectation.shape == T.shape + + t_0_expectation = expectation[selected_index] + + b_T = self.covariance(T, t_0) + assert b_T.shape == T.shape + + cond_expectation = (expectation + + b_T / var * + (x_0.T - t_0_expectation) + ) if var else expectation + np.zeros_like(x_0.T) + + return cond_expectation + + +class GaussianConditionedCorrection(GaussianCorrection): + ''' + Correction assuming that the underlying process is a Gaussian conditioned + to several points with value 0. 
+ ''' + + def __init__(self, point_list, expectation=0, + kernel=1, **kwargs): + super(GaussianConditionedCorrection, self).__init__( + expectation=self.__expectation, + kernel=self.__kernel, + **kwargs) + + self.point_list = _transform_to_2d(point_list) + self.__gaussian_expectation = expectation + self.__gaussian_kernel = make_kernel(kernel) + self.__covariance_matrix = self.gaussian_kernel( + self.point_list, self.point_list + ) + self.__covariance_matrix_inv = np.linalg.inv(self.__covariance_matrix) + + def conditioned(self, X, t_0, **kwargs): + + # If the point makes the matrix singular, don't change the correction + try: + return GaussianConditionedCorrection( + expectation=self.__gaussian_expectation, + kernel=self.__gaussian_kernel, + point_list=np.concatenate((self.point_list, [[t_0]])) + ) + except linalg.LinAlgError: + return self + + def gaussian_expectation(self, t): + if isinstance(self.__gaussian_expectation, numbers.Number): + expectation = (np.ones_like(t, dtype=float) * + self.__gaussian_expectation) + else: + expectation = self.__gaussian_expectation(t) + + return expectation + + def gaussian_kernel(self, t_0, t_1): + return _execute_kernel(self.__gaussian_kernel, t_0, t_1) + + def __expectation(self, t): + + A_inv = self.__covariance_matrix_inv + + b_T = self.gaussian_kernel(t, self.point_list) + # assert b_T.shape[0] == np.shape(t)[-1] + # assert b_T.shape[1] == np.shape(point_list)[-1] + + c = -self.gaussian_expectation(self.point_list) + assert c.shape == np.shape(self.point_list) + + original_expect = self.gaussian_expectation(t) + assert original_expect.shape == t.shape + + modified_expect = b_T.dot(A_inv).dot(c) + assert modified_expect.shape == t.shape + + expectation = original_expect + modified_expect + assert expectation.shape == t.shape + + return expectation + + def __kernel(self, t_0, t_1): + + A_inv = self.__covariance_matrix_inv + + b_t_0_T = self.gaussian_kernel(t_0, self.point_list) + # assert b_t_0_T.shape[0] == np.shape(np.atleast_2d(t_0))[0] + # assert b_t_0_T.shape[1] == np.shape(point_list)[-1] + + b_t_1 = self.gaussian_kernel(self.point_list, t_1) + # assert b_t_1.shape[0] == np.shape(point_list)[-1] + # assert b_t_1.shape[1] == np.shape(np.atleast_2d(t_1))[0] + + return (self.gaussian_kernel(t_0, t_1) - + b_t_0_T @ A_inv @ b_t_1) + + +class RMHResult(object): + + def __init__(self, index, score): + self.index = index + self.score = score + self.matrix_after_correction = None + self.original_dependence = None + self.influence_mask = None + self.current_mask = None + + def __repr__(self): + return (self.__class__.__name__ + + "(index={index}, score={score})" + .format(index=self.index, score=self.score)) + + +def get_influence_mask(X, t_max_index, min_redundancy, dependence_measure, + old_mask): + ''' + Get the mask of the points that have much dependence with the + selected point. 
+ ''' + + sl = slice(None) + + def get_index(index): + return (sl,) + tuple(index) + (np.newaxis,) + + def is_redundant(index): + + max_point = np.squeeze(X[get_index(t_max_index)], axis=1) + test_point = np.squeeze(X[get_index(index)], axis=1) + + return (dependence_measure(max_point, test_point) > + min_redundancy) + + def adjacent_indexes(index): + for i, coord in enumerate(index): + # Out of bounds right check + if coord < (X.shape[i + 1] - 1): + new_index = list(index) + new_index[i] += 1 + yield tuple(new_index) + # Out of bounds left check + if coord > 0: + new_index = list(index) + new_index[i] -= 1 + yield tuple(new_index) + + def update_mask(new_mask, index): + indexes = [index] + + while indexes: + index = indexes.pop() + # Check if it wasn't masked before + if ( + not old_mask[index] and not new_mask[index] and + is_redundant(index) + ): + new_mask[index] = True + for i in adjacent_indexes(index): + indexes.append(i) + + new_mask = np.zeros_like(old_mask) + + update_mask(new_mask, t_max_index) + + # The selected point is masked even if min_redundancy is high + new_mask[t_max_index] = True + + return new_mask + + +class StoppingCondition(abc.ABC): + '''Stopping condition for RMH.''' + + def begin(self, **kwargs): + pass + + @abc.abstractmethod + def __call__(self, **kwargs): + pass + + +class ScoreThresholdStop(StoppingCondition): + '''Stop when the score is under a threshold.''' + + def __init__(self, threshold=None): + super().__init__() + self.threshold = threshold + self.threshold_specified = threshold is not None + + def begin(self, min_relevance, **kwargs): + if not self.threshold_specified: + self.threshold = min_relevance + + def __call__(self, *, score, **kwargs): + return score < self.threshold + + +def chi_bound(x, y, significance): + + x_dist = dcor.distances.pairwise_distances(x) + y_dist = dcor.distances.pairwise_distances(y) + + t2 = np.mean(x_dist) * np.mean(y_dist) + + chi_quant = scipy.stats.chi2.ppf(1 - significance, df=1) + + return chi_quant * t2 / x_dist.shape[0] + + +def normal_bound(x, y, significance): + + x_dist = dcor.distances.pairwise_distances(x) + y_dist = dcor.distances.pairwise_distances(y) + + t2 = np.mean(x_dist) * np.mean(y_dist) + + norm_quant = scipy.stats.norm.ppf(1 - significance / 2, df=1) + + return norm_quant ** 2 * t2 / x_dist.shape[0] + + +class Chi2BoundStop(StoppingCondition): + '''Stop when the score is under a threshold.''' + + def __init__(self, significance=0.01): + super().__init__() + self.significance = significance + + def __call__(self, *, selected_variable, X, Y, + **kwargs): + bound = chi_bound(selected_variable, Y, self.significance) + # print(f'bound = {bound}') + return dcor.u_distance_covariance_sqr(selected_variable, Y) < bound + + +class NormalBoundStop(StoppingCondition): + '''Stop when the score is under a threshold.''' + + def __init__(self, significance=0.01): + super().__init__() + self.significance = significance + + def __call__(self, *, selected_variable, X, Y, + **kwargs): + bound = normal_bound(selected_variable, Y, self.significance) + # print(f'bound = {bound}') + return dcor.u_distance_covariance_sqr(selected_variable, Y) < bound + + +class DcovTestStop(StoppingCondition): + '''Stop when the score is under a threshold.''' + + def __init__(self, significance=0.01, num_resamples=200, + random_state=None): + super().__init__() + + if random_state == -1: + random_state = None + + self.significance = significance + self.num_resamples = num_resamples + self.random_state = random_state + + def __call__(self, 
*, selected_variable, X, Y, + **kwargs): + return dcor.independence.distance_covariance_test( + selected_variable, Y, + num_resamples=self.num_resamples, + random_state=self.random_state).p_value >= self.significance + + +class NComponentsStop(StoppingCondition): + '''Stop when the first n components are selected.''' + + def __init__(self, n_components=1): + super().__init__() + self.n_components = n_components + + def begin(self, min_relevance, **kwargs): + self.selected_components = 0 + + def __call__(self, *, score, **kwargs): + stop = self.selected_components >= self.n_components + self.selected_components += 1 + return stop + + +def redundancy_distance_covariance(x, y): + dcov = dcor.u_distance_covariance_sqr(x, y) + dvar = dcor.u_distance_covariance_sqr(x, x) + + return dcov / dvar + + +def rec_maxima_hunting_gen_no_copy( + X: FDataGrid, Y, min_redundancy=0.9, min_relevance=0.2, + dependence_measure=dcor.u_distance_correlation_sqr, + redundancy_dependence_measure=None, + correction=None, + mask=None, + get_intermediate_results=False, + stopping_condition=None): + ''' + Find the most relevant features of a function using recursive maxima + hunting. It changes the original matrix. + + Arguments: + X: Matrix with one trajectory per row + Y: Vector for the response variable + min_redundancy: Minimum dependence between two features to be + considered redundant. + min_relevance: Minimum score to consider a point relevant + dependence_measure: Measure of the dependence between variables + correction: Class that defines the correction to apply to eliminate the + influence of the selected feature. + ''' + + # X = np.asfarray(X) + Y = np.asfarray(Y) + + if correction is None: + correction = UniformCorrection() + + if redundancy_dependence_measure is None: + redundancy_dependence_measure = dependence_measure + + if mask is None: + mask = np.zeros([len(t) for t in X.sample_points], dtype=bool) + + if stopping_condition is None: + stopping_condition = ScoreThresholdStop() + + first_pass = True + + correction.begin(X, Y) + + try: + stopping_condition.begin(X=X.data_matrix, Y=Y, T=X.sample_points[0], + min_relevance=min_relevance, + dependence_measure=dependence_measure) + except AttributeError: + pass + + while True: + dependencies = _compute_dependence( + X=X.data_matrix, Y=Y, + dependence_measure=dependence_measure) + + t_max_index = _absolute_argmax(dependencies, + mask=mask) + score = dependencies[t_max_index] + + repeated_point = mask[t_max_index] + + stopping_condition_reached = stopping_condition( + selected_index=t_max_index, + selected_variable=X.data_matrix[(slice(None),) + + tuple(t_max_index)], + score=score, + X=X.data_matrix, Y=Y) + + if ((repeated_point or stopping_condition_reached) and + not first_pass): + return + + influence_mask = get_influence_mask( + X=X.data_matrix, t_max_index=t_max_index, + min_redundancy=min_redundancy, + dependence_measure=redundancy_dependence_measure, + old_mask=mask) + + mask |= influence_mask + + # Correct the influence of t_max + correction(X=X, + selected_index=t_max_index) + result = RMHResult(index=t_max_index, score=score) + + # Additional info, useful for debugging + if get_intermediate_results: + result.matrix_after_correction = np.copy(X.data_matrix) + result.original_dependence = dependencies + result.influence_mask = influence_mask + result.current_mask = mask + + new_X = yield result # Accept modifications to the matrix + if new_X is not None: + X.data_matrix = new_X + + correction = correction.conditioned( + X=X.data_matrix, + 
T=X.sample_points[0], + t_0=X.sample_points[0][t_max_index]) + + first_pass = False + + +def rec_maxima_hunting_gen(X, *args, **kwargs): + yield from rec_maxima_hunting_gen_no_copy(copy.copy(X), + *args, **kwargs) + + +def rec_maxima_hunting(*args, **kwargs): + return list(rec_maxima_hunting_gen(*args, **kwargs)) + + +class RecursiveMaximaHunting( + sklearn.base.BaseEstimator, sklearn.base.TransformerMixin): + r''' + Recursive Maxima Hunting variable selection. + + This is a filter variable selection method for problems with a target + variable. It evaluates a dependence measure between each point of the + function and the target variable, selects the point that maximizes this + dependence, subtracts the information of the selected point from + the original functions and repeat the process. + + This method is inspired by :class:`MaximaHunting`, and shares + similarities with it. However, as the information of the selected point + is subtracted from each function in each step of the algorithm, this + algorithm can uncover points that are not relevant by themselves but are + relevant once other points are selected. Those points would not be + selected by :class:`MaximaHunting` alone. + + This method was originally described in a special case in article [1]_. + + Parameters: + + dependence_measure (callable): Dependence measure to use. By default, + it uses the bias corrected squared distance correlation. + local_maxima_selector (callable): Function to detect local maxima. The + default is :func:`select_local_maxima` with ``order`` parameter + equal to one. The original article used a similar function testing + different values of ``order``. + + Examples: + + >>> from skfda.preprocessing.dim_reduction import variable_selection + >>> from skfda.datasets import make_gaussian_process + >>> import skfda + >>> import numpy as np + + We create trajectories from two classes, one with zero mean and the + other with a peak-like mean. Both have Brownian covariance. + + >>> n_samples = 10000 + >>> n_features = 100 + >>> + >>> def mean_1(t): + ... return (np.abs(t - 0.25) + ... - 2 * np.abs(t - 0.5) + ... + np.abs(t - 0.75)) + >>> + >>> X_0 = make_gaussian_process(n_samples=n_samples // 2, + ... n_features=n_features, + ... random_state=0) + >>> X_1 = make_gaussian_process(n_samples=n_samples // 2, + ... n_features=n_features, + ... mean=mean_1, + ... random_state=1) + >>> X = skfda.concatenate((X_0, X_1)) + >>> + >>> y = np.zeros(n_samples) + >>> y [n_samples // 2:] = 1 + + Select the relevant points to distinguish the two classes + + >>> rmh = variable_selection.RecursiveMaximaHunting() + >>> _ = rmh.fit(X, y) + >>> point_mask = rmh.get_support() + >>> points = X.sample_points[0][point_mask] + >>> np.allclose(points, [0.25, 0.5, 0.75], rtol=1e-2) + True + + Apply the learned dimensionality reduction + + >>> X_dimred = rmh.transform(X) + >>> len(X.sample_points[0]) + 100 + >>> X_dimred.shape + (10000, 3) + + References: + + .. [1] J. L. Torrecilla and A. Suárez, “Feature selection in + functional data classification with recursive maxima hunting,” + in Advances in Neural Information Processing Systems 29, + Curran Associates, Inc., 2016, pp. 4835–4843. 
+ + ''' + + def __init__(self, + T=None, + min_redundancy=0.9, + min_relevance=0.2, + dependence_measure=dcor.u_distance_correlation_sqr, + redundancy_dependence_measure=None, + n_components=None, + correction=None, + stopping_condition=None, + return_matrix=False, + num_extra_features=0): + self.min_redundancy = min_redundancy + self.min_relevance = min_relevance + self.T = T + self.dependence_measure = dependence_measure + self.redundancy_dependence_measure = redundancy_dependence_measure + self.n_components = n_components + self.correction = correction + self.stopping_condition = stopping_condition + self.return_matrix = return_matrix + self.num_extra_features = num_extra_features + + def fit(self, X, y): + + if isinstance(X, FDataGrid): + T = X.sample_points[0] + X_all = X.data_matrix + X = X.data_matrix[..., 0] + else: + T = self.T + X_all = X + + X, y = sklearn.utils.validation.check_X_y(X, y) + + self.features_shape_ = X.shape[1:] + + red_dep_measure = self.redundancy_dependence_measure + + indexes = [] + for i, result in enumerate( + rec_maxima_hunting_gen( + X=FDataGrid(data_matrix=np.copy(X_all), sample_points=T), + Y=y, + min_redundancy=self.min_redundancy, + min_relevance=self.min_relevance, + dependence_measure=self.dependence_measure, + redundancy_dependence_measure=red_dep_measure, + correction=self.correction, + stopping_condition=self.stopping_condition, + get_intermediate_results=(self.num_extra_features != 0))): + + # print(f'{i+1}...', end='', flush=True) + + if self.n_components is None or i < self.n_components: + indexes.append(result.index) + + if self.num_extra_features: + mask = result.influence_mask + new_indexes = [a[0] for a in np.ndenumerate(mask) if a[1]] + new_indexes.remove(result.index) + new_indexes = random.sample(new_indexes, min( + len(new_indexes), self.num_extra_features)) + + indexes = indexes + new_indexes + + else: + break + + self.indexes_ = tuple(np.transpose(indexes).tolist()) + + return self + + def transform(self, X): + + if isinstance(X, FDataGrid): + sample_points = X.sample_points[0] + X_all = X.data_matrix[...] 
+ X = X.data_matrix[..., 0] + else: + X_all = X + + sklearn.utils.validation.check_is_fitted(self, ['features_shape_', + 'indexes_']) + + X = sklearn.utils.validation.check_array(X) + + if X.shape[1:] != self.features_shape_: + raise ValueError("The trajectories have a different number of " + "points than the ones fitted") + + matrix = X_all[(slice(None),) + self.indexes_] + + if self.return_matrix: + return matrix + else: + sample_points_new = sample_points[self.indexes_] + return FDataGrid(data_matrix=matrix, + sample_points=sample_points_new) + + def get_support(self, indices: bool=False): + indexes_unraveled = self.indexes_ + if indices: + return indexes_unraveled + else: + mask = np.zeros(self.features_shape_[0], dtype=bool) + mask[self.indexes_] = True + return mask + + def fit_all(self, param_grid, X_train, y_train): + + # We can fit at the same time all n_components, but nothing else + + if len(param_grid) == 0: + return NotImplemented + + n_components_max = 1 + + for param in param_grid: + if len(param) != 1: + return NotImplemented + + n_components = param.get("n_components", None) + + if n_components is None: + return NotImplemented + + n_components_max = max(n_components_max, n_components) + + print(f'Fitting RMH with n_components={n_components_max}') + + cloned = sklearn.base.clone(self) + cloned.set_params(n_components=n_components_max) + cloned.fit(X_train, y_train) + + fitted_estimators = [None] * len(param_grid) + + for i, param in enumerate(param_grid): + n_components = param["n_components"] + fitted_estimators[i] = copy.copy(cloned) + fitted_estimators[i].set_params(n_components=n_components) + fitted_estimators[i].indexes_ = cloned.indexes_[:n_components] + + return fitted_estimators From c64ddedc2ec9616f3476ffffb673bf0f165ffa89 Mon Sep 17 00:00:00 2001 From: VNMabus Date: Sat, 15 Aug 2020 21:33:51 +0200 Subject: [PATCH 044/210] Fix doctest. 
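With ``n_features=100`` the trajectories in the example are evaluated on a uniform grid of the unit interval, so any selected point must fall on a multiple of 1/99. The grid point closest to 0.25 is 25/99 (about 0.2525), which already fails a relative tolerance of 1e-2, hence the relaxed ``rtol=1e-1``. Assuming the default unit-interval grid of ``make_gaussian_process``:

    >>> import numpy as np
    >>> grid = np.linspace(0, 1, 100)
    >>> nearest = grid[np.abs(grid - 0.25).argmin()]  # 25/99
    >>> np.allclose(nearest, 0.25, rtol=1e-2)
    False
    >>> np.allclose(nearest, 0.25, rtol=1e-1)
    True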
--- .../recursive_maxima_hunting.py | 50 ++++--------------- 1 file changed, 11 insertions(+), 39 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py index 5c1c611ed..97b991477 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py @@ -12,7 +12,6 @@ import numpy.linalg as linalg import numpy.ma as ma -from ....misc.covariances import Brownian from ....representation import FDataGrid from .maxima_hunting import _compute_dependence @@ -243,6 +242,8 @@ def __init__(self): pass def conditioned(self, X, t_0, **kwargs): + from ....misc.covariances import Brownian + return GaussianCorrection(kernel=Brownian(origin=t_0)) def correct(self, X, selected_index): @@ -657,7 +658,7 @@ def rec_maxima_hunting_gen_no_copy( mask = np.zeros([len(t) for t in X.sample_points], dtype=bool) if stopping_condition is None: - stopping_condition = ScoreThresholdStop() + stopping_condition = Chi2BoundStop() first_pass = True @@ -798,7 +799,7 @@ class RecursiveMaximaHunting( >>> _ = rmh.fit(X, y) >>> point_mask = rmh.get_support() >>> points = X.sample_points[0][point_mask] - >>> np.allclose(points, [0.25, 0.5, 0.75], rtol=1e-2) + >>> np.allclose(points, [0.25, 0.5, 0.75], rtol=1e-1) True Apply the learned dimensionality reduction @@ -819,7 +820,6 @@ class RecursiveMaximaHunting( ''' def __init__(self, - T=None, min_redundancy=0.9, min_relevance=0.2, dependence_measure=dcor.u_distance_correlation_sqr, @@ -827,39 +827,26 @@ def __init__(self, n_components=None, correction=None, stopping_condition=None, - return_matrix=False, num_extra_features=0): self.min_redundancy = min_redundancy self.min_relevance = min_relevance - self.T = T self.dependence_measure = dependence_measure self.redundancy_dependence_measure = redundancy_dependence_measure self.n_components = n_components self.correction = correction self.stopping_condition = stopping_condition - self.return_matrix = return_matrix self.num_extra_features = num_extra_features def fit(self, X, y): - if isinstance(X, FDataGrid): - T = X.sample_points[0] - X_all = X.data_matrix - X = X.data_matrix[..., 0] - else: - T = self.T - X_all = X - - X, y = sklearn.utils.validation.check_X_y(X, y) - - self.features_shape_ = X.shape[1:] + self.features_shape_ = X.data_matrix.shape[1:] red_dep_measure = self.redundancy_dependence_measure indexes = [] for i, result in enumerate( rec_maxima_hunting_gen( - X=FDataGrid(data_matrix=np.copy(X_all), sample_points=T), + X=X.copy(), Y=y, min_redundancy=self.min_redundancy, min_relevance=self.min_relevance, @@ -869,8 +856,6 @@ def fit(self, X, y): stopping_condition=self.stopping_condition, get_intermediate_results=(self.num_extra_features != 0))): - # print(f'{i+1}...', end='', flush=True) - if self.n_components is None or i < self.n_components: indexes.append(result.index) @@ -892,30 +877,17 @@ def fit(self, X, y): def transform(self, X): - if isinstance(X, FDataGrid): - sample_points = X.sample_points[0] - X_all = X.data_matrix[...] 
- X = X.data_matrix[..., 0] - else: - X_all = X - - sklearn.utils.validation.check_is_fitted(self, ['features_shape_', - 'indexes_']) + X_matrix = X.data_matrix - X = sklearn.utils.validation.check_array(X) + sklearn.utils.validation.check_is_fitted(self) - if X.shape[1:] != self.features_shape_: + if X_matrix.shape[1:] != self.features_shape_: raise ValueError("The trajectories have a different number of " "points than the ones fitted") - matrix = X_all[(slice(None),) + self.indexes_] + output = X_matrix[(slice(None),) + self.indexes_] - if self.return_matrix: - return matrix - else: - sample_points_new = sample_points[self.indexes_] - return FDataGrid(data_matrix=matrix, - sample_points=sample_points_new) + return output.reshape(X.n_samples, -1) def get_support(self, indices: bool=False): indexes_unraveled = self.indexes_ From f28af5e4763e4e1c2a6c5c66eac3183c18b06fc3 Mon Sep 17 00:00:00 2001 From: VNMabus Date: Mon, 17 Aug 2020 00:04:59 +0200 Subject: [PATCH 045/210] Small changes. --- .../recursive_maxima_hunting.py | 73 +++---------------- 1 file changed, 9 insertions(+), 64 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py index 97b991477..50dcb4c11 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py @@ -29,21 +29,9 @@ def _transform_to_2d(t): def _execute_kernel(kernel, t_0, t_1): - t_0 = _transform_to_2d(t_0) - t_1 = _transform_to_2d(t_1) + from ....misc.covariances import _execute_covariance - if isinstance(kernel, numbers.Number): - return kernel - else: - if callable(kernel): - result = kernel(t_0, t_1) - else: - # GPy kernel - result = kernel.K(t_0, t_1) - - assert result.shape[0] == len(t_0) - assert result.shape[1] == len(t_1) - return result + return _execute_covariance(kernel, t_0, t_1) def _absolute_argmax(function, *, mask): @@ -238,9 +226,6 @@ class UniformCorrection(Correction): process with infinite lengthscale. 
''' - def __init__(self): - pass - def conditioned(self, X, t_0, **kwargs): from ....misc.covariances import Brownian @@ -622,7 +607,7 @@ def redundancy_distance_covariance(x, y): return dcov / dvar -def rec_maxima_hunting_gen_no_copy( +def _rec_maxima_hunting_gen_no_copy( X: FDataGrid, Y, min_redundancy=0.9, min_relevance=0.2, dependence_measure=dcor.u_distance_correlation_sqr, redundancy_dependence_measure=None, @@ -725,13 +710,9 @@ def rec_maxima_hunting_gen_no_copy( first_pass = False -def rec_maxima_hunting_gen(X, *args, **kwargs): - yield from rec_maxima_hunting_gen_no_copy(copy.copy(X), - *args, **kwargs) - - -def rec_maxima_hunting(*args, **kwargs): - return list(rec_maxima_hunting_gen(*args, **kwargs)) +def _rec_maxima_hunting_gen(X, *args, **kwargs): + yield from _rec_maxima_hunting_gen_no_copy(copy.copy(X), + *args, **kwargs) class RecursiveMaximaHunting( @@ -845,7 +826,7 @@ def fit(self, X, y): indexes = [] for i, result in enumerate( - rec_maxima_hunting_gen( + _rec_maxima_hunting_gen( X=X.copy(), Y=y, min_redundancy=self.min_redundancy, @@ -890,46 +871,10 @@ def transform(self, X): return output.reshape(X.n_samples, -1) def get_support(self, indices: bool=False): - indexes_unraveled = self.indexes_ + if indices: - return indexes_unraveled + return self.indexes_ else: mask = np.zeros(self.features_shape_[0], dtype=bool) mask[self.indexes_] = True return mask - - def fit_all(self, param_grid, X_train, y_train): - - # We can fit at the same time all n_components, but nothing else - - if len(param_grid) == 0: - return NotImplemented - - n_components_max = 1 - - for param in param_grid: - if len(param) != 1: - return NotImplemented - - n_components = param.get("n_components", None) - - if n_components is None: - return NotImplemented - - n_components_max = max(n_components_max, n_components) - - print(f'Fitting RMH with n_components={n_components_max}') - - cloned = sklearn.base.clone(self) - cloned.set_params(n_components=n_components_max) - cloned.fit(X_train, y_train) - - fitted_estimators = [None] * len(param_grid) - - for i, param in enumerate(param_grid): - n_components = param["n_components"] - fitted_estimators[i] = copy.copy(cloned) - fitted_estimators[i].set_params(n_components=n_components) - fitted_estimators[i].indexes_ = cloned.indexes_[:n_components] - - return fitted_estimators From a27ee62f41248d17ebdab362f98b71d02b0317a9 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 17 Aug 2020 15:55:02 +0200 Subject: [PATCH 046/210] Refactor RMH stopping conditions. --- .../variable_selection/maxima_hunting.py | 7 +- .../recursive_maxima_hunting.py | 245 ++++++++---------- 2 files changed, 111 insertions(+), 141 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py index ba7ee36d2..a88a530d9 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py @@ -1,14 +1,15 @@ +import dcor + import scipy.signal import sklearn.base import sklearn.utils -import dcor import numpy as np from ....representation import FDataGrid -def _compute_dependence(X, Y, *, dependence_measure): +def _compute_dependence(X, y, *, dependence_measure): ''' Computes the dependence of each point in each trajectory in X with the corresponding class label in Y. 
@@ -17,7 +18,7 @@ def _compute_dependence(X, Y, *, dependence_measure): def vectorial_dependence_measure(x): x = np.atleast_2d(x).transpose() - return dependence_measure(x, Y) + return dependence_measure(x, y) vectorial_dependence_measure = np.vectorize( vectorial_dependence_measure, diff --git a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py index 50dcb4c11..6c1b4a1da 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py @@ -1,5 +1,6 @@ import abc import copy +import dcor import numbers import random @@ -7,7 +8,6 @@ import sklearn.base import sklearn.utils -import dcor import numpy as np import numpy.linalg as linalg import numpy.ma as ma @@ -35,7 +35,7 @@ def _execute_kernel(kernel, t_0, t_1): def _absolute_argmax(function, *, mask): - ''' + """ Computes the absolute maximum of a discretized function. Some values of the function may be masked in order not to consider them @@ -48,7 +48,7 @@ def _absolute_argmax(function, *, mask): Returns: int: Index of the absolute maximum. - ''' + """ masked_function = ma.array(function, mask=mask) t_max = ma.argmax(masked_function) @@ -59,33 +59,34 @@ def _absolute_argmax(function, *, mask): class Correction(abc.ABC): - ''' + """ Base class for applying a correction after a point is taken, eliminating - its influence over the rest - ''' + its influence over the rest. + + """ def begin(self, X: FDataGrid, Y): - ''' - Initialization - ''' + """ + Initialization. + + The initial parameters of Recursive Maxima Hunting can be used there. + + """ pass def conditioned(self, **kwargs): - ''' - Returns a correction object that is conditioned to the value of a point - ''' + """ + Returns a correction object that is conditioned to the value of a point. + + """ return self @abc.abstractmethod def correct(self, X, selected_index): - ''' + """ Correct the trajectories. - Arguments: - X: Matrix with one trajectory per row - T: Times of each measure - selected_index: index of the selected value - ''' + """ pass def __call__(self, *args, **kwargs): @@ -115,10 +116,11 @@ def correct(self, X, selected_index): class SampleGPCorrection(ConditionalExpectationCorrection): - ''' + """ Correction assuming that the process is Gaussian and using as the kernel the sample covariance. - ''' + + """ def __init__(self, markov=False): self.gaussian_correction = None @@ -221,10 +223,11 @@ def make_kernel(k): class UniformCorrection(Correction): - ''' + """ Correction assuming that the underlying process is an Ornstein-Uhlenbeck process with infinite lengthscale. - ''' + + """ def conditioned(self, X, t_0, **kwargs): from ....misc.covariances import Brownian @@ -242,9 +245,10 @@ def correct(self, X, selected_index): class GaussianCorrection(ConditionalExpectationCorrection): - ''' + """ Correction assuming that the underlying process is Gaussian. - ''' + + """ def __init__(self, expectation=0, kernel=1, optimize_kernel=False): super(GaussianCorrection, self).__init__() @@ -328,10 +332,11 @@ def conditional_expectation(self, T, t_0, x_0, selected_index): class GaussianConditionedCorrection(GaussianCorrection): - ''' + """ Correction assuming that the underlying process is a Gaussian conditioned to several points with value 0. 
- ''' + + """ def __init__(self, point_list, expectation=0, kernel=1, **kwargs): @@ -377,8 +382,6 @@ def __expectation(self, t): A_inv = self.__covariance_matrix_inv b_T = self.gaussian_kernel(t, self.point_list) - # assert b_T.shape[0] == np.shape(t)[-1] - # assert b_T.shape[1] == np.shape(point_list)[-1] c = -self.gaussian_expectation(self.point_list) assert c.shape == np.shape(self.point_list) @@ -426,13 +429,13 @@ def __repr__(self): .format(index=self.index, score=self.score)) -def get_influence_mask(X, t_max_index, min_redundancy, dependence_measure, - old_mask): - ''' +def _get_influence_mask(X, t_max_index, min_redundancy, dependence_measure, + old_mask): + """ Get the mask of the points that have much dependence with the selected point. - ''' + """ sl = slice(None) def get_index(index): @@ -483,11 +486,14 @@ def update_mask(new_mask, index): return new_mask -class StoppingCondition(abc.ABC): - '''Stopping condition for RMH.''' +class StoppingCondition(abc.ABC, sklearn.base.BaseEstimator): + """ + Stopping condition for RMH. - def begin(self, **kwargs): - pass + This is a callable that should return ``True`` if the algorithm must stop + and the current point should not be selected. + + """ @abc.abstractmethod def __call__(self, **kwargs): @@ -495,120 +501,94 @@ def __call__(self, **kwargs): class ScoreThresholdStop(StoppingCondition): - '''Stop when the score is under a threshold.''' + """ + Stop when the score is under a threshold. - def __init__(self, threshold=None): - super().__init__() - self.threshold = threshold - self.threshold_specified = threshold is not None - - def begin(self, min_relevance, **kwargs): - if not self.threshold_specified: - self.threshold = min_relevance - - def __call__(self, *, score, **kwargs): - return score < self.threshold + This stopping condition requires that the score has a known bound, for + example that it takes values in the interval :math:`[0, 1]`. + This is one of the simplest stopping criterions, but it requires that + the user chose a threshold parameter, which controls the number of + points chosen and can vary per problem. -def chi_bound(x, y, significance): - - x_dist = dcor.distances.pairwise_distances(x) - y_dist = dcor.distances.pairwise_distances(y) + Parameters: - t2 = np.mean(x_dist) * np.mean(y_dist) + threshold (float): Value compared with the score. If the score + of the selected point is not higher than that, + the point will not be selected (unless it is + the first iteration) and RMH will end. - chi_quant = scipy.stats.chi2.ppf(1 - significance, df=1) + """ - return chi_quant * t2 / x_dist.shape[0] + def __init__(self, threshold=0.2): + super().__init__() + self.threshold = threshold -def normal_bound(x, y, significance): + def __call__(self, *, selected_index, dependences, **kwargs): - x_dist = dcor.distances.pairwise_distances(x) - y_dist = dcor.distances.pairwise_distances(y) + score = dependences[selected_index] - t2 = np.mean(x_dist) * np.mean(y_dist) + return score < self.threshold - norm_quant = scipy.stats.norm.ppf(1 - significance / 2, df=1) - return norm_quant ** 2 * t2 / x_dist.shape[0] +class AsymptoticIndependenceTestStop(StoppingCondition): + r""" + Stop when the selected point is independent from the target. + It uses an asymptotic test based on the chi-squared distribution described + in [1]_. The test rejects independence if -class Chi2BoundStop(StoppingCondition): - '''Stop when the score is under a threshold.''' + .. 
math:: - def __init__(self, significance=0.01): - super().__init__() - self.significance = significance + \frac{n \mathcal{V}_n}{T_2} \geq \mathcal{X}_{1-\alpha}^2, - def __call__(self, *, selected_variable, X, Y, - **kwargs): - bound = chi_bound(selected_variable, Y, self.significance) - # print(f'bound = {bound}') - return dcor.u_distance_covariance_sqr(selected_variable, Y) < bound + where :math:`n` is the number of samples, :math:`\mathcal{V}_n` is the + sample distance correlation between the selected point and the target, + :math:`\mathcal{X}_{1-\alpha}^2` is the :math:`1-\alpha` quantile of a + chi-squared variable with 1 degree of freedom. :math:`T_2` is the product + of the means of the distance matrices of the selected point and the + target, a term which is involved in the standard computation of the sample + distance covariance. + Parameters: -class NormalBoundStop(StoppingCondition): - '''Stop when the score is under a threshold.''' + significance (float): Significance used in the independence test. By + default is 0.01 (1%). - def __init__(self, significance=0.01): - super().__init__() - self.significance = significance + References: - def __call__(self, *, selected_variable, X, Y, - **kwargs): - bound = normal_bound(selected_variable, Y, self.significance) - # print(f'bound = {bound}') - return dcor.u_distance_covariance_sqr(selected_variable, Y) < bound + .. [1] G. J. Székely and M. L. Rizzo, “Brownian distance covariance,” + Ann. Appl. Stat., vol. 3, no. 4, pp. 1236–1265, Dec. 2009, + doi: 10.1214/09-AOAS312. -class DcovTestStop(StoppingCondition): - '''Stop when the score is under a threshold.''' + """ - def __init__(self, significance=0.01, num_resamples=200, - random_state=None): + def __init__(self, significance=0.01): super().__init__() - - if random_state == -1: - random_state = None - self.significance = significance - self.num_resamples = num_resamples - self.random_state = random_state - def __call__(self, *, selected_variable, X, Y, - **kwargs): - return dcor.independence.distance_covariance_test( - selected_variable, Y, - num_resamples=self.num_resamples, - random_state=self.random_state).p_value >= self.significance + def chi_bound(self, x, y, significance): + x_dist = dcor.distances.pairwise_distances(x) + y_dist = dcor.distances.pairwise_distances(y) -class NComponentsStop(StoppingCondition): - '''Stop when the first n components are selected.''' + t2 = np.mean(x_dist) * np.mean(y_dist) - def __init__(self, n_components=1): - super().__init__() - self.n_components = n_components - - def begin(self, min_relevance, **kwargs): - self.selected_components = 0 + chi_quant = scipy.stats.chi2.ppf(1 - significance, df=1) - def __call__(self, *, score, **kwargs): - stop = self.selected_components >= self.n_components - self.selected_components += 1 - return stop + return chi_quant * t2 / x_dist.shape[0] + def __call__(self, *, selected_variable, y, **kwargs): -def redundancy_distance_covariance(x, y): - dcov = dcor.u_distance_covariance_sqr(x, y) - dvar = dcor.u_distance_covariance_sqr(x, x) + bound = self.chi_bound(selected_variable, y, self.significance) - return dcov / dvar + return dcor.u_distance_covariance_sqr(selected_variable, y) < bound def _rec_maxima_hunting_gen_no_copy( - X: FDataGrid, Y, min_redundancy=0.9, min_relevance=0.2, + X: FDataGrid, y, min_redundancy=0.9, dependence_measure=dcor.u_distance_correlation_sqr, redundancy_dependence_measure=None, correction=None, @@ -621,17 +601,16 @@ def _rec_maxima_hunting_gen_no_copy( Arguments: X: Matrix with one 
trajectory per row - Y: Vector for the response variable + y: Vector for the response variable min_redundancy: Minimum dependence between two features to be considered redundant. - min_relevance: Minimum score to consider a point relevant dependence_measure: Measure of the dependence between variables correction: Class that defines the correction to apply to eliminate the influence of the selected feature. ''' # X = np.asfarray(X) - Y = np.asfarray(Y) + y = np.asfarray(y) if correction is None: correction = UniformCorrection() @@ -643,42 +622,35 @@ def _rec_maxima_hunting_gen_no_copy( mask = np.zeros([len(t) for t in X.sample_points], dtype=bool) if stopping_condition is None: - stopping_condition = Chi2BoundStop() + stopping_condition = AsymptoticIndependenceTestStop() first_pass = True - correction.begin(X, Y) - - try: - stopping_condition.begin(X=X.data_matrix, Y=Y, T=X.sample_points[0], - min_relevance=min_relevance, - dependence_measure=dependence_measure) - except AttributeError: - pass + correction.begin(X, y) while True: - dependencies = _compute_dependence( - X=X.data_matrix, Y=Y, + dependences = _compute_dependence( + X=X.data_matrix, y=y, dependence_measure=dependence_measure) - t_max_index = _absolute_argmax(dependencies, + t_max_index = _absolute_argmax(dependences, mask=mask) - score = dependencies[t_max_index] + score = dependences[t_max_index] repeated_point = mask[t_max_index] stopping_condition_reached = stopping_condition( selected_index=t_max_index, + dependences=dependences, selected_variable=X.data_matrix[(slice(None),) + tuple(t_max_index)], - score=score, - X=X.data_matrix, Y=Y) + X=X, y=y) if ((repeated_point or stopping_condition_reached) and not first_pass): return - influence_mask = get_influence_mask( + influence_mask = _get_influence_mask( X=X.data_matrix, t_max_index=t_max_index, min_redundancy=min_redundancy, dependence_measure=redundancy_dependence_measure, @@ -694,7 +666,7 @@ def _rec_maxima_hunting_gen_no_copy( # Additional info, useful for debugging if get_intermediate_results: result.matrix_after_correction = np.copy(X.data_matrix) - result.original_dependence = dependencies + result.original_dependence = dependences result.influence_mask = influence_mask result.current_mask = mask @@ -802,7 +774,6 @@ class RecursiveMaximaHunting( def __init__(self, min_redundancy=0.9, - min_relevance=0.2, dependence_measure=dcor.u_distance_correlation_sqr, redundancy_dependence_measure=None, n_components=None, @@ -810,7 +781,6 @@ def __init__(self, stopping_condition=None, num_extra_features=0): self.min_redundancy = min_redundancy - self.min_relevance = min_relevance self.dependence_measure = dependence_measure self.redundancy_dependence_measure = redundancy_dependence_measure self.n_components = n_components @@ -828,9 +798,8 @@ def fit(self, X, y): for i, result in enumerate( _rec_maxima_hunting_gen( X=X.copy(), - Y=y, + y=y, min_redundancy=self.min_redundancy, - min_relevance=self.min_relevance, dependence_measure=self.dependence_measure, redundancy_dependence_measure=red_dep_measure, correction=self.correction, From adbbaa6d78987fbdcea6315736937edc00bd818b Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 17 Aug 2020 19:46:07 +0200 Subject: [PATCH 047/210] Simplify corrections. 
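The corrections are now written in terms of the conditional mean of the assumed underlying process. For a Gaussian process, the conditional mean given one observed value has the closed form used by ``GaussianCorrection``:

    E[X(t) | X(t_0) = x_0] = E[X(t)]
        + Cov[X(t), X(t_0)] / Cov[X(t_0), X(t_0)] * (x_0 - E[X(t_0)])

A small numeric sketch for a centered Brownian motion, whose covariance is Cov(s, t) = min(s, t) (the function below is illustrative, not part of the patch):

    import numpy as np

    def brownian_conditional_mean(T, t0, x0):
        # E[X(t) | X(t0) = x0] for a centered Brownian motion:
        # cov(t, t0) / cov(t0, t0) * x0 = min(t, t0) / t0 * x0
        return np.minimum(T, t0) / t0 * x0

    T = np.array([0.1, 0.25, 0.5, 0.75, 1.0])
    print(brownian_conditional_mean(T, t0=0.5, x0=2.0))
    # [0.4 1.  2.  2.  2. ]

Before t_0 the conditional mean shrinks linearly towards the origin; from t_0 onwards it stays at the observed value. This is exactly the quantity that ``ConditionalMeanCorrection.correct`` subtracts from each trajectory.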
--- .../recursive_maxima_hunting.py | 493 ++++++++++-------- 1 file changed, 278 insertions(+), 215 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py index 6c1b4a1da..7aa69b89e 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py @@ -34,6 +34,45 @@ def _execute_kernel(kernel, t_0, t_1): return _execute_covariance(kernel, t_0, t_1) +class _PicklableKernel(): + """ Class used to pickle GPy kernels.""" + + def __init__(self, kernel): + super().__setattr__('_PicklableKernel__kernel', kernel) + + def __getattr__(self, name): + if name != '__deepcopy__': + return getattr(self.__kernel, name) + + def __setattr__(self, name, value): + setattr(self.__kernel, name, value) + + def __getstate__(self): + return {'class': self.__kernel.__class__, + 'input_dim': self.__kernel.input_dim, + 'values': self.__kernel.param_array} + + def __setstate__(self, state): + super().__setattr__('_PicklableKernel__kernel', state['class']( + input_dim=state['input_dim'])) + self.__kernel.param_array[...] = state['values'] + + def __call__(self, *args, **kwargs): + return self.__kernel.K(*args, **kwargs) + + +def make_kernel(k): + try: + import GPy + except ImportError: + return k + + if isinstance(k, GPy.kern.Kern): + return _PicklableKernel(k) + else: + return k + + def _absolute_argmax(function, *, mask): """ Computes the absolute maximum of a discretized function. @@ -58,7 +97,7 @@ def _absolute_argmax(function, *, mask): return t_max -class Correction(abc.ABC): +class Correction(abc.ABC, sklearn.base.BaseEstimator): """ Base class for applying a correction after a point is taken, eliminating its influence over the rest. @@ -76,7 +115,10 @@ def begin(self, X: FDataGrid, Y): def conditioned(self, **kwargs): """ - Returns a correction object that is conditioned to the value of a point. + Returns a correction object conditioned to the value of a point. + + This method is necessary because after the RMH correction step, the + functions follow a different model. """ return self @@ -86,6 +128,15 @@ def correct(self, X, selected_index): """ Correct the trajectories. + This method subtracts the influence of the selected point from the + other points in the function. + + Parameters: + + X (FDataGrid): Functions in the current iteration of the algorithm. + selected_index (int or tuple of int): Index of the selected point + in the ``data_matrix``. + """ pass @@ -93,324 +144,336 @@ def __call__(self, *args, **kwargs): self.correct(*args, **kwargs) -class ConditionalExpectationCorrection(Correction): +class ConditionalMeanCorrection(Correction): + """ + Base class for applying a correction based on the conditional expectation. + + The functions are assumed to be realizations of a particular stochastic + process. The information subtracted in each iteration would be the + mean of the process conditioned to the value observed at the + selected point. + + """ @abc.abstractmethod - def conditional_expectation(self, T, t_0, x_0, selected_index): + def conditional_mean(self, X, selected_index): + """ + Mean of the process conditioned to the value observed at the + selected point. + + Parameters: + + X (FDataGrid): Functions in the current iteration of the algorithm. + selected_index (int or tuple of int): Index of the selected point + in the ``data_matrix``. 
+
+        """
+        pass
 
     def correct(self, X, selected_index):
-        T = X.sample_points[0]
-        t_0 = T[selected_index]
+        X.data_matrix[...] -= self.conditional_mean(
+            X, selected_index).T
 
-        x_index = (slice(None),) + tuple(selected_index) + (np.newaxis,)
-        x_0 = X.data_matrix[x_index]
+        X.data_matrix[:, selected_index] = 0
 
-        T = _transform_to_2d(T)
-        X.data_matrix[...] -= self.conditional_expectation(T, t_0, x_0,
-                                                           selected_index).T
 
-        X.data_matrix[:, selected_index] = 0
+class GaussianCorrection(ConditionalMeanCorrection):
+    r"""
+    Correction assuming that the underlying process is Gaussian.
 
+    The conditional mean of a Gaussian process :math:`X(t)` is
 
-class SampleGPCorrection(ConditionalExpectationCorrection):
-    """
-    Correction assuming that the process is Gaussian and using as the kernel
-    the sample covariance.
+    .. math::
 
+        \mathbb{E}[X(t) \mid X(t_0) = x_0] = \mathbb{E}[X(t)] +
+        \frac{\mathrm{Cov}[X(t), X(t_0)]}{\mathrm{Cov}[X(t_0), X(t_0)]}
+        (x_0 - \mathbb{E}[X(t_0)])
+
+    The corrections applied after this one are of type
+    :class:`GaussianConditionedCorrection`.
+
+    Parameters:
+
+        mean (number or function): Mean function of the Gaussian process.
+        cov (number or function): Covariance function of the Gaussian process.
+        fit_hyperparameters (boolean): If ``True`` the hyperparameters of the
+            covariance function are optimized for the data.
 
     """
 
-    def __init__(self, markov=False):
-        self.gaussian_correction = None
-        self.covariance_matrix = None
-        self.T = None
-        self.time = 0
-        self.markov = markov
-        self.cond_points = []
+    def __init__(self, *, mean=0, cov=1, fit_hyperparameters=False):
+        super(GaussianCorrection, self).__init__()
 
-    def begin(self, X: FDataGrid, Y):
-        T = X.sample_points
+        self.mean = mean
+        self.cov = make_kernel(cov)
+        self.fit_hyperparameters = fit_hyperparameters
 
-        X_copy = np.copy(X.data_matrix[..., 0])
+    def begin(self, X, y):
+        if self.fit_hyperparameters:
+            import GPy
 
-        Y = np.ravel(Y)
-        for class_label in np.unique(Y):
-            trajectories = X_copy[Y == class_label, :]
+            T = X.sample_points[0]
+            X_copy = np.copy(X.data_matrix[..., 0])
 
-            mean = np.mean(trajectories, axis=0)
-            X_copy[Y == class_label, :] -= mean
+            y = np.ravel(y)
+            for class_label in np.unique(y):
+                trajectories = X_copy[y == class_label, :]
 
-        self.covariance_matrix = np.cov(X_copy, rowvar=False)
-        self.T = np.ravel(T)
-        self.gaussian_correction = GaussianCorrection(kernel=self.__kernel)
+                mean = np.mean(trajectories, axis=0)
+                X_copy[y == class_label, :] -= mean
 
-    def __kernel(self, t_0, t_1):
-        i = np.searchsorted(self.T, t_0)
-        j = np.searchsorted(self.T, t_1)
+            m = GPy.models.GPRegression(
+                T[:, None], X_copy.T,
+                kernel=self.cov._PicklableKernel__kernel)
+            m.constrain_positive('')
+            m.optimize()
 
-        i = np.ravel(i)
-        j = np.ravel(j)
+            self.cov_ = copy.deepcopy(make_kernel(m.kern))
 
-        return self.covariance_matrix[np.ix_(i, j)]
+    def _evaluate_mean(self, t):
 
-    def conditioned(self, t_0, **kwargs):
-        self.cond_points.append(t_0)
-        self.cond_points.sort()
-        self.gaussian_correction = self.gaussian_correction.conditioned(
-            t_0=t_0, **kwargs)
-        return self
+        mean = self.mean
 
-    def conditional_expectation(self, T, t_0, x_0, selected_index):
+        if isinstance(mean, numbers.Number):
+            expectation = np.ones_like(t, dtype=float) * mean
+        else:
+            expectation = mean(t)
 
-        gp_condexp = self.gaussian_correction.conditional_expectation(
-            T, t_0, x_0, selected_index)
+        return expectation
 
-        if self.markov:
-            left_index = np.searchsorted(self.cond_points, t_0)
+    def _evaluate_cov(self, t_0, t_1):
+        cov = getattr(self, "cov_", 
self.cov) - left_value = (self.cond_points[left_index - 1] - if left_index != 0 else None) - right_value = (self.cond_points[left_index] - if left_index != len(self.cond_points) else None) + return _execute_kernel(cov, t_0, t_1) - if left_value is not None: - gp_condexp[:, T.ravel() < left_value, :] = 0 + def conditioned(self, X, t_0, **kwargs): + # If the point makes the matrix singular, don't change the correction - if right_value is not None: - gp_condexp[:, T.ravel() > right_value, :] = 0 + cov = getattr(self, "cov_", self.cov) - return gp_condexp + try: + correction = GaussianConditionedCorrection( + mean=self.mean, + cov=cov, + conditioning_points=t_0) -class PicklableKernel(): + correction._covariance_matrix_inv() - def __init__(self, kernel): - super().__setattr__('_PicklableKernel__kernel', kernel) + return correction - def __getattr__(self, name): - if name != '__deepcopy__': - return getattr(self.__kernel, name) + except linalg.LinAlgError: - def __setattr__(self, name, value): - setattr(self.__kernel, name, value) + return self - def __getstate__(self): - return {'class': self.__kernel.__class__, - 'input_dim': self.__kernel.input_dim, - 'values': self.__kernel.param_array} + def conditional_mean(self, X, selected_index): - def __setstate__(self, state): - super().__setattr__('_PicklableKernel__kernel', state['class']( - input_dim=state['input_dim'])) - self.__kernel.param_array[...] = state['values'] + T = X.sample_points[0] - def __call__(self, *args, **kwargs): - return self.__kernel.K(*args, **kwargs) + t_0 = T[selected_index] + x_index = (slice(None),) + tuple(selected_index) + (np.newaxis,) + x_0 = X.data_matrix[x_index] -def make_kernel(k): - try: - import GPy - except ImportError: - return k + T = _transform_to_2d(T) - if isinstance(k, GPy.kern.Kern): - return PicklableKernel(k) - else: - return k + var = self._evaluate_cov(t_0, t_0) + expectation = self._evaluate_mean(T) + assert expectation.shape == T.shape -class UniformCorrection(Correction): - """ - Correction assuming that the underlying process is an Ornstein-Uhlenbeck - process with infinite lengthscale. + t_0_expectation = expectation[selected_index] - """ + b_T = self._evaluate_cov(T, t_0) + assert b_T.shape == T.shape - def conditioned(self, X, t_0, **kwargs): - from ....misc.covariances import Brownian + cond_expectation = (expectation + + b_T / var * + (x_0.T - t_0_expectation) + ) if var else expectation + np.zeros_like(x_0.T) - return GaussianCorrection(kernel=Brownian(origin=t_0)) + return cond_expectation - def correct(self, X, selected_index): - x_index = (slice(None),) + tuple(selected_index) + (np.newaxis,) - # Have to copy it because otherwise is a view and shouldn't be - # subtracted from the original matrix - x_0 = np.copy(X.data_matrix[x_index]) +class GaussianConditionedCorrection(GaussianCorrection): + r""" + Correction assuming that the underlying process is Gaussian, with several + values conditioned to 0. - X.data_matrix[...] -= x_0 + The conditional mean is inherited from :class:`GaussianConditioned`, with + the conditioned mean and covariance. + The corrections after this is applied are of type + :class:`GaussianConditionedCorrection`, adding additional points. -class GaussianCorrection(ConditionalExpectationCorrection): - """ - Correction assuming that the underlying process is Gaussian. + Parameters: + + mean (number or function): Mean function of the (unconditioned) + Gaussian process. + cov (number or function): Covariance function of the (unconditioned) + Gaussian process. 
""" - def __init__(self, expectation=0, kernel=1, optimize_kernel=False): - super(GaussianCorrection, self).__init__() + def __init__(self, conditioning_points, *, mean=0, cov=1): - self.__expectation = expectation - self.__kernel = make_kernel(kernel) - self.optimize_kernel = optimize_kernel - self.kernel_params_optimized_names = None - self.kernel_params_optimized_values = None + super(GaussianConditionedCorrection, self).__init__( + mean=mean, cov=cov) - def begin(self, X, Y): - if self.optimize_kernel: - import GPy + self.conditioning_points = conditioning_points - T = X.sample_points[0] - X_copy = np.copy(X.data_matrix[..., 0]) + def _covariance_matrix_inv(self): - Y = np.ravel(Y) - for class_label in np.unique(Y): - trajectories = X_copy[Y == class_label, :] + cond_points = self._conditioning_points() - mean = np.mean(trajectories, axis=0) - X_copy[Y == class_label, :] -= mean + cov_matrix_inv = getattr(self, "_cov_matrix_inv", None) + if cov_matrix_inv is None: - m = GPy.models.GPRegression( - T[:, None], X_copy.T, - kernel=self.__kernel._PicklableKernel__kernel) - m.constrain_positive('') - m.optimize() + cov_matrix = super()._evaluate_cov( + cond_points, cond_points + ) + + self._cov_matrix_inv = np.linalg.inv(cov_matrix) + cov_matrix_inv = self._cov_matrix_inv - self.kernel_params_optimized_names = m.parameter_names(), - self.kernel_params_optimized_values = m.param_array + return cov_matrix_inv - self.__kernel = copy.deepcopy(make_kernel(m.kern)) + def _conditioning_points(self): + return _transform_to_2d(self.conditioning_points) def conditioned(self, X, t_0, **kwargs): + # If the point makes the matrix singular, don't change the correction try: - return GaussianConditionedCorrection( - expectation=self.expectation, - kernel=self.kernel, - point_list=t_0) + + correction = GaussianConditionedCorrection( + mean=self.mean, + cov=self.cov, + conditioning_points=np.concatenate( + (self._conditioning_points(), [[t_0]])) + ) + + correction._covariance_matrix_inv() + + return correction + except linalg.LinAlgError: + return self - def expectation(self, t): + def _evaluate_mean(self, t): - if isinstance(self.__expectation, numbers.Number): - expectation = np.ones_like(t, dtype=float) * self.__expectation - else: - expectation = self.__expectation(t) + cond_points = self._conditioning_points() - return expectation + A_inv = self._covariance_matrix_inv() - def kernel(self, t_0, t_1): - return _execute_kernel(self.__kernel, t_0, t_1) + b_T = super()._evaluate_cov(t, cond_points) - covariance = kernel + c = -super()._evaluate_mean(cond_points) + assert c.shape == np.shape(cond_points) - def variance(self, t): - return self.covariance(t, t) + original_expect = super()._evaluate_mean(t) + assert original_expect.shape == t.shape - def conditional_expectation(self, T, t_0, x_0, selected_index): + modified_expect = b_T.dot(A_inv).dot(c) + assert modified_expect.shape == t.shape - var = self.variance(t_0) + expectation = original_expect + modified_expect + assert expectation.shape == t.shape - expectation = self.expectation(T) - assert expectation.shape == T.shape + return expectation - t_0_expectation = expectation[selected_index] + def _evaluate_cov(self, t_0, t_1): - b_T = self.covariance(T, t_0) - assert b_T.shape == T.shape + cond_points = self._conditioning_points() - cond_expectation = (expectation + - b_T / var * - (x_0.T - t_0_expectation) - ) if var else expectation + np.zeros_like(x_0.T) + A_inv = self._covariance_matrix_inv() - return cond_expectation + b_t_0_T = 
super()._evaluate_cov(t_0, cond_points) + b_t_1 = super()._evaluate_cov(cond_points, t_1) -class GaussianConditionedCorrection(GaussianCorrection): + return (super()._evaluate_cov(t_0, t_1) - + b_t_0_T @ A_inv @ b_t_1) + + +class SampleGaussianCorrection(ConditionalMeanCorrection): """ - Correction assuming that the underlying process is a Gaussian conditioned - to several points with value 0. + Correction assuming that the process is Gaussian and using as the kernel + the sample covariance. """ - def __init__(self, point_list, expectation=0, - kernel=1, **kwargs): - super(GaussianConditionedCorrection, self).__init__( - expectation=self.__expectation, - kernel=self.__kernel, - **kwargs) - - self.point_list = _transform_to_2d(point_list) - self.__gaussian_expectation = expectation - self.__gaussian_kernel = make_kernel(kernel) - self.__covariance_matrix = self.gaussian_kernel( - self.point_list, self.point_list - ) - self.__covariance_matrix_inv = np.linalg.inv(self.__covariance_matrix) + def __init__(self): + self.gaussian_correction = None + self.covariance_matrix = None + self.T = None + self.time = 0 + self.cond_points = [] - def conditioned(self, X, t_0, **kwargs): + def begin(self, X: FDataGrid, Y): + T = X.sample_points - # If the point makes the matrix singular, don't change the correction - try: - return GaussianConditionedCorrection( - expectation=self.__gaussian_expectation, - kernel=self.__gaussian_kernel, - point_list=np.concatenate((self.point_list, [[t_0]])) - ) - except linalg.LinAlgError: - return self + X_copy = np.copy(X.data_matrix[..., 0]) - def gaussian_expectation(self, t): - if isinstance(self.__gaussian_expectation, numbers.Number): - expectation = (np.ones_like(t, dtype=float) * - self.__gaussian_expectation) - else: - expectation = self.__gaussian_expectation(t) + Y = np.ravel(Y) + for class_label in np.unique(Y): + trajectories = X_copy[Y == class_label, :] - return expectation + mean = np.mean(trajectories, axis=0) + X_copy[Y == class_label, :] -= mean - def gaussian_kernel(self, t_0, t_1): - return _execute_kernel(self.__gaussian_kernel, t_0, t_1) + self.covariance_matrix = np.cov(X_copy, rowvar=False) + self.T = np.ravel(T) + self.gaussian_correction = GaussianCorrection(kernel=self.__kernel) - def __expectation(self, t): + def __kernel(self, t_0, t_1): + i = np.searchsorted(self.T, t_0) + j = np.searchsorted(self.T, t_1) - A_inv = self.__covariance_matrix_inv + i = np.ravel(i) + j = np.ravel(j) - b_T = self.gaussian_kernel(t, self.point_list) + return self.covariance_matrix[np.ix_(i, j)] - c = -self.gaussian_expectation(self.point_list) - assert c.shape == np.shape(self.point_list) + def conditioned(self, t_0, **kwargs): + self.cond_points.append(t_0) + self.cond_points.sort() + self.gaussian_correction = self.gaussian_correction.conditioned( + t_0=t_0, **kwargs) + return self - original_expect = self.gaussian_expectation(t) - assert original_expect.shape == t.shape + def conditional_mean(self, X, selected_index): - modified_expect = b_T.dot(A_inv).dot(c) - assert modified_expect.shape == t.shape + return self.gaussian_correction.conditional_expectation( + X, selected_index) - expectation = original_expect + modified_expect - assert expectation.shape == t.shape - return expectation +class UniformCorrection(Correction): + """ + Correction assuming that the underlying process is an Ornstein-Uhlenbeck + process with infinite lengthscale. 
- def __kernel(self, t_0, t_1): + The initial conditional mean subtracts the observed value from every + point, and the following correction is a :class:`GaussianCorrection` + with a :class:`skfda.misc.covariances.Brownian` covariance function. - A_inv = self.__covariance_matrix_inv + """ - b_t_0_T = self.gaussian_kernel(t_0, self.point_list) - # assert b_t_0_T.shape[0] == np.shape(np.atleast_2d(t_0))[0] - # assert b_t_0_T.shape[1] == np.shape(point_list)[-1] + def conditioned(self, X, t_0, **kwargs): + from ....misc.covariances import Brownian - b_t_1 = self.gaussian_kernel(self.point_list, t_1) - # assert b_t_1.shape[0] == np.shape(point_list)[-1] - # assert b_t_1.shape[1] == np.shape(np.atleast_2d(t_1))[0] + return GaussianCorrection(cov=Brownian(origin=t_0)) - return (self.gaussian_kernel(t_0, t_1) - - b_t_0_T @ A_inv @ b_t_1) + def correct(self, X, selected_index): + x_index = (slice(None),) + tuple(selected_index) + (np.newaxis,) + + # Have to copy it because otherwise is a view and shouldn't be + # subtracted from the original matrix + x_0 = np.copy(X.data_matrix[x_index]) + + X.data_matrix[...] -= x_0 class RMHResult(object): @@ -432,7 +495,7 @@ def __repr__(self): def _get_influence_mask(X, t_max_index, min_redundancy, dependence_measure, old_mask): """ - Get the mask of the points that have much dependence with the + Get the mask of the points that have a large dependence with the selected point. """ From 1183bb320021c5be0f76d0c7d9a3f75639fde1c4 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 17 Aug 2020 20:44:16 +0200 Subject: [PATCH 048/210] Refactor redundancy as its own class. --- .../recursive_maxima_hunting.py | 292 ++++++++++-------- 1 file changed, 158 insertions(+), 134 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py index 7aa69b89e..1ab57b2ee 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py @@ -106,7 +106,8 @@ class Correction(abc.ABC, sklearn.base.BaseEstimator): def begin(self, X: FDataGrid, Y): """ - Initialization. + Initialization for a particular application of Recursive Maxima + Hunting. The initial parameters of Recursive Maxima Hunting can be used there. @@ -309,6 +310,8 @@ class GaussianConditionedCorrection(GaussianCorrection): Parameters: + conditioning_points (iterable of ints or tuples of ints): Points where + the process is conditioned to have the value 0. mean (number or function): Mean function of the (unconditioned) Gaussian process. 
cov (number or function): Covariance function of the (unconditioned) @@ -405,15 +408,7 @@ class SampleGaussianCorrection(ConditionalMeanCorrection): """ - def __init__(self): - self.gaussian_correction = None - self.covariance_matrix = None - self.T = None - self.time = 0 - self.cond_points = [] - def begin(self, X: FDataGrid, Y): - T = X.sample_points X_copy = np.copy(X.data_matrix[..., 0]) @@ -424,29 +419,30 @@ def begin(self, X: FDataGrid, Y): mean = np.mean(trajectories, axis=0) X_copy[Y == class_label, :] -= mean - self.covariance_matrix = np.cov(X_copy, rowvar=False) - self.T = np.ravel(T) - self.gaussian_correction = GaussianCorrection(kernel=self.__kernel) + self.conditioning_points_ = [] + self.cov_matrix_ = np.cov(X_copy, rowvar=False) + self.t_ = np.ravel(X.sample_points) + self.gaussian_correction_ = GaussianCorrection(kernel=self.cov) - def __kernel(self, t_0, t_1): - i = np.searchsorted(self.T, t_0) - j = np.searchsorted(self.T, t_1) + def cov(self, t_0, t_1): + i = np.searchsorted(self.t_, t_0) + j = np.searchsorted(self.t_, t_1) i = np.ravel(i) j = np.ravel(j) - return self.covariance_matrix[np.ix_(i, j)] + return self.cov_matrix_[np.ix_(i, j)] def conditioned(self, t_0, **kwargs): - self.cond_points.append(t_0) - self.cond_points.sort() - self.gaussian_correction = self.gaussian_correction.conditioned( + self.conditioning_points.append(t_0) + self.conditioning_points.sort() + self.gaussian_correction_ = self.gaussian_correction_.conditioned( t_0=t_0, **kwargs) return self def conditional_mean(self, X, selected_index): - return self.gaussian_correction.conditional_expectation( + return self.gaussian_correction_.conditional_expectation( X, selected_index) @@ -476,79 +472,6 @@ def correct(self, X, selected_index): X.data_matrix[...] -= x_0 -class RMHResult(object): - - def __init__(self, index, score): - self.index = index - self.score = score - self.matrix_after_correction = None - self.original_dependence = None - self.influence_mask = None - self.current_mask = None - - def __repr__(self): - return (self.__class__.__name__ + - "(index={index}, score={score})" - .format(index=self.index, score=self.score)) - - -def _get_influence_mask(X, t_max_index, min_redundancy, dependence_measure, - old_mask): - """ - Get the mask of the points that have a large dependence with the - selected point. 
-
-    """
-    sl = slice(None)
-
-    def get_index(index):
-        return (sl,) + tuple(index) + (np.newaxis,)
-
-    def is_redundant(index):
-
-        max_point = np.squeeze(X[get_index(t_max_index)], axis=1)
-        test_point = np.squeeze(X[get_index(index)], axis=1)
-
-        return (dependence_measure(max_point, test_point) >
-                min_redundancy)
-
-    def adjacent_indexes(index):
-        for i, coord in enumerate(index):
-            # Out of bounds right check
-            if coord < (X.shape[i + 1] - 1):
-                new_index = list(index)
-                new_index[i] += 1
-                yield tuple(new_index)
-            # Out of bounds left check
-            if coord > 0:
-                new_index = list(index)
-                new_index[i] -= 1
-                yield tuple(new_index)
-
-    def update_mask(new_mask, index):
-        indexes = [index]
-
-        while indexes:
-            index = indexes.pop()
-            # Check if it wasn't masked before
-            if (
-                not old_mask[index] and not new_mask[index] and
-                is_redundant(index)
-            ):
-                new_mask[index] = True
-                for i in adjacent_indexes(index):
-                    indexes.append(i)
-
-    new_mask = np.zeros_like(old_mask)
-
-    update_mask(new_mask, t_max_index)
-
-    # The selected point is masked even if min_redundancy is high
-    new_mask[t_max_index] = True
-
-    return new_mask
-
-
 class StoppingCondition(abc.ABC, sklearn.base.BaseEstimator):
     """
     Stopping condition for RMH.
@@ -650,14 +573,131 @@ def __call__(self, *, selected_variable, y, **kwargs):
         return dcor.u_distance_covariance_sqr(selected_variable, y) < bound
 
 
+class RedundancyCondition(abc.ABC, sklearn.base.BaseEstimator):
+    """
+    Redundancy condition for RMH.
+
+    This is a callable that should return ``True`` if the two points are
+    redundant and ``False`` otherwise.
+
+    """
+
+    @abc.abstractmethod
+    def __call__(self, max_point, test_point, **kwargs):
+        pass
+
+
+class DependenceThresholdRedundancy(RedundancyCondition):
+    """
+    The points are redundant if their dependency is above a given
+    threshold.
+
+    This redundancy condition requires that the dependence measure has a
+    known bound, for example that it takes values in the interval
+    :math:`[0, 1]`.
+
+    Parameters:
+
+        threshold (float): Minimum dependence between the selected
+            maximum and another point for that point to be
+            considered redundant.
+        dependence_measure (callable): Dependence measure to use. By default,
+            it uses the bias corrected squared distance correlation.
+
+    """
+
+    def __init__(self, threshold=0.9, *,
+                 dependence_measure=dcor.u_distance_correlation_sqr):
+
+        super().__init__()
+        self.threshold = threshold
+        self.dependence_measure = dependence_measure
+
+    def __call__(self, *, max_point, test_point, **kwargs):
+
+        return self.dependence_measure(max_point, test_point) > self.threshold
+
+
+class RMHResult(object):
+
+    def __init__(self, index, score):
+        self.index = index
+        self.score = score
+        self.matrix_after_correction = None
+        self.original_dependence = None
+        self.influence_mask = None
+        self.current_mask = None
+
+    def __repr__(self):
+        return (self.__class__.__name__ +
+                "(index={index}, score={score})"
+                .format(index=self.index, score=self.score))
+
+
+def _get_influence_mask(X, t_max_index, redundancy_condition, old_mask):
+    """
+    Get the mask of the points that have a large dependence with the
+    selected point.
+ + """ + sl = slice(None) + + def get_index(index): + return (sl,) + tuple(index) + (np.newaxis,) + + def is_redundant(index): + + max_point = np.squeeze(X[get_index(t_max_index)], axis=1) + test_point = np.squeeze(X[get_index(index)], axis=1) + + return redundancy_condition(max_point=max_point, + test_point=test_point) + + def adjacent_indexes(index): + for i, coord in enumerate(index): + # Out of bounds right check + if coord < (X.shape[i + 1] - 1): + new_index = list(index) + new_index[i] += 1 + yield tuple(new_index) + # Out of bounds left check + if coord > 0: + new_index = list(index) + new_index[i] -= 1 + yield tuple(new_index) + + def update_mask(new_mask, index): + indexes = [index] + + while indexes: + index = indexes.pop() + # Check if it wasn't masked before + if ( + not old_mask[index] and not new_mask[index] and + is_redundant(index) + ): + new_mask[index] = True + for i in adjacent_indexes(index): + indexes.append(i) + + new_mask = np.zeros_like(old_mask) + + update_mask(new_mask, t_max_index) + + # The selected point is masked even if min_redundancy is high + new_mask[t_max_index] = True + + return new_mask + + def _rec_maxima_hunting_gen_no_copy( - X: FDataGrid, y, min_redundancy=0.9, + X: FDataGrid, y, *, dependence_measure=dcor.u_distance_correlation_sqr, - redundancy_dependence_measure=None, correction=None, + redundancy_condition=None, + stopping_condition=None, mask=None, - get_intermediate_results=False, - stopping_condition=None): + get_intermediate_results=False): ''' Find the most relevant features of a function using recursive maxima hunting. It changes the original matrix. @@ -678,12 +718,12 @@ def _rec_maxima_hunting_gen_no_copy( if correction is None: correction = UniformCorrection() - if redundancy_dependence_measure is None: - redundancy_dependence_measure = dependence_measure - if mask is None: mask = np.zeros([len(t) for t in X.sample_points], dtype=bool) + if redundancy_condition is None: + redundancy_condition = DependenceThresholdRedundancy() + if stopping_condition is None: stopping_condition = AsymptoticIndependenceTestStop() @@ -715,8 +755,7 @@ def _rec_maxima_hunting_gen_no_copy( influence_mask = _get_influence_mask( X=X.data_matrix, t_max_index=t_max_index, - min_redundancy=min_redundancy, - dependence_measure=redundancy_dependence_measure, + redundancy_condition=redundancy_condition, old_mask=mask) mask |= influence_mask @@ -774,10 +813,13 @@ class RecursiveMaximaHunting( dependence_measure (callable): Dependence measure to use. By default, it uses the bias corrected squared distance correlation. - local_maxima_selector (callable): Function to detect local maxima. The - default is :func:`select_local_maxima` with ``order`` parameter - equal to one. The original article used a similar function testing - different values of ``order``. + max_features (int): Maximum number of features to select. + correction (Correction): Correction used to subtract the information + of each selected point in each iteration. + redundancy_condition (callable): Condition to consider a point + redundant with the selected maxima and discard it from future + consideration as a maximum. + stopping_condition (callable): Condition to stop the algorithm. 
Examples: @@ -835,53 +877,35 @@ class RecursiveMaximaHunting( ''' - def __init__(self, - min_redundancy=0.9, + def __init__(self, *, dependence_measure=dcor.u_distance_correlation_sqr, - redundancy_dependence_measure=None, - n_components=None, + max_features=None, correction=None, - stopping_condition=None, - num_extra_features=0): - self.min_redundancy = min_redundancy + redundancy_condition=None, + stopping_condition=None): self.dependence_measure = dependence_measure - self.redundancy_dependence_measure = redundancy_dependence_measure - self.n_components = n_components + self.max_features = max_features self.correction = correction + self.redundancy_condition = redundancy_condition self.stopping_condition = stopping_condition - self.num_extra_features = num_extra_features def fit(self, X, y): self.features_shape_ = X.data_matrix.shape[1:] - red_dep_measure = self.redundancy_dependence_measure - indexes = [] for i, result in enumerate( _rec_maxima_hunting_gen( X=X.copy(), y=y, - min_redundancy=self.min_redundancy, dependence_measure=self.dependence_measure, - redundancy_dependence_measure=red_dep_measure, correction=self.correction, - stopping_condition=self.stopping_condition, - get_intermediate_results=(self.num_extra_features != 0))): - - if self.n_components is None or i < self.n_components: - indexes.append(result.index) - - if self.num_extra_features: - mask = result.influence_mask - new_indexes = [a[0] for a in np.ndenumerate(mask) if a[1]] - new_indexes.remove(result.index) - new_indexes = random.sample(new_indexes, min( - len(new_indexes), self.num_extra_features)) + redundancy_condition=self.redundancy_condition, + stopping_condition=self.stopping_condition)): - indexes = indexes + new_indexes + indexes.append(result.index) - else: + if self.max_features is not None and i >= self.max_features: break self.indexes_ = tuple(np.transpose(indexes).tolist()) From 92f887e177b0c18334d49458be4a305d06cafdfe Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 17 Aug 2020 23:20:13 +0200 Subject: [PATCH 049/210] Improve documentation. --- docs/modules/preprocessing/dim_reduction.rst | 8 +++ .../recursive_maxima_hunting.rst | 61 +++++++++++++++++++ .../recursive_maxima_hunting.py | 51 ++++++++++------ 3 files changed, 101 insertions(+), 19 deletions(-) create mode 100644 docs/modules/preprocessing/dim_reduction/recursive_maxima_hunting.rst diff --git a/docs/modules/preprocessing/dim_reduction.rst b/docs/modules/preprocessing/dim_reduction.rst index 1a27db215..d45a85909 100644 --- a/docs/modules/preprocessing/dim_reduction.rst +++ b/docs/modules/preprocessing/dim_reduction.rst @@ -19,7 +19,15 @@ following: :toctree: autosummary skfda.preprocessing.dim_reduction.variable_selection.MaximaHunting + skfda.preprocessing.dim_reduction.variable_selection.RecursiveMaximaHunting skfda.preprocessing.dim_reduction.variable_selection.RKHSVariableSelection + +.. toctree:: + :hidden: + :maxdepth: 4 + :caption: Modules: + + dim_reduction/recursive_maxima_hunting Projection ---------- diff --git a/docs/modules/preprocessing/dim_reduction/recursive_maxima_hunting.rst b/docs/modules/preprocessing/dim_reduction/recursive_maxima_hunting.rst new file mode 100644 index 000000000..58d8efab4 --- /dev/null +++ b/docs/modules/preprocessing/dim_reduction/recursive_maxima_hunting.rst @@ -0,0 +1,61 @@ +Recursive Maxima Hunting +======================== + +The recursive maxima hunting method is described in +:class:`~skfda.preprocessing.dim_reduction.variable_selection.RecursiveMaximaHunting`. 
+ +This method has several parts that can be customized and are described here. + +Correction +---------- + +Recursive Maxima Hunting is an iterative variable selection method that +modifies the data functions in each iteration subtracting the information of +the selected points, in order to uncover points that become relevant once +other points are selected. Thus, the correction applied depends on how we +define the information of the selected points. This can be customized using +the ``correction`` parameter, passing a object with one of the following +interfaces: + +.. autosummary:: + :toctree: autosummary + + skfda.preprocessing.dim_reduction.variable_selection.recursive_maxima_hunting.Correction + skfda.preprocessing.dim_reduction.variable_selection.recursive_maxima_hunting.ConditionalMeanCorrection + +Currently the available objects are: + +.. autosummary:: + :toctree: autosummary + + skfda.preprocessing.dim_reduction.variable_selection.recursive_maxima_hunting.UniformCorrection + skfda.preprocessing.dim_reduction.variable_selection.recursive_maxima_hunting.GaussianCorrection + skfda.preprocessing.dim_reduction.variable_selection.recursive_maxima_hunting.GaussianConditionedCorrection + skfda.preprocessing.dim_reduction.variable_selection.recursive_maxima_hunting.GaussianSampleCorrection + +Redundancy +---------- + +Although redundant points should be eliminated by the correction, numerical +errors and inappropriate corrections may cause redundant points to become +maxima. Thus, redundant points are explicitly masked to exclude them for +future considerations. + +Currently there is only one way to detect if a point is redundant: + +.. autosummary:: + :toctree: autosummary + + skfda.preprocessing.dim_reduction.variable_selection.recursive_maxima_hunting.DependenceThresholdRedundancy + +Stopping criterion +------------------ + +In order for the algorithm to stop, the remaining points should not be relevant +enough. There are several ways to check this condition: + +.. autosummary:: + :toctree: autosummary + + skfda.preprocessing.dim_reduction.variable_selection.recursive_maxima_hunting.ScoreThresholdStop + skfda.preprocessing.dim_reduction.variable_selection.recursive_maxima_hunting.AsymptoticIndependenceTestStop diff --git a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py index 1ab57b2ee..39e648014 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py @@ -302,7 +302,7 @@ class GaussianConditionedCorrection(GaussianCorrection): Correction assuming that the underlying process is Gaussian, with several values conditioned to 0. - The conditional mean is inherited from :class:`GaussianConditioned`, with + The conditional mean is inherited from :class:`GaussianCorrection`, with the conditioned mean and covariance. The corrections after this is applied are of type @@ -401,7 +401,7 @@ def _evaluate_cov(self, t_0, t_1): b_t_0_T @ A_inv @ b_t_1) -class SampleGaussianCorrection(ConditionalMeanCorrection): +class GaussianSampleCorrection(ConditionalMeanCorrection): """ Correction assuming that the process is Gaussian and using as the kernel the sample covariance. 
@@ -453,7 +453,8 @@ class UniformCorrection(Correction): The initial conditional mean subtracts the observed value from every point, and the following correction is a :class:`GaussianCorrection` - with a :class:`skfda.misc.covariances.Brownian` covariance function. + with a :class:`~skfda.misc.covariances.Brownian` covariance function with + the selected point as its origin. """ @@ -698,20 +699,25 @@ def _rec_maxima_hunting_gen_no_copy( stopping_condition=None, mask=None, get_intermediate_results=False): - ''' + """ Find the most relevant features of a function using recursive maxima hunting. It changes the original matrix. - Arguments: - X: Matrix with one trajectory per row - y: Vector for the response variable - min_redundancy: Minimum dependence between two features to be - considered redundant. - dependence_measure: Measure of the dependence between variables - correction: Class that defines the correction to apply to eliminate the - influence of the selected feature. - ''' + Parameters: + dependence_measure (callable): Dependence measure to use. By default, + it uses the bias corrected squared distance correlation. + max_features (int): Maximum number of features to select. + correction (Correction): Correction used to subtract the information + of each selected point in each iteration. + redundancy_condition (callable): Condition to consider a point + redundant with the selected maxima and discard it from future + consideration as a maximum. + stopping_condition (callable): Condition to stop the algorithm. + mask (boolean array): Masked values. + get_intermediate_results (boolean): Return additional debug info. + + """ # X = np.asfarray(X) y = np.asfarray(y) @@ -791,7 +797,7 @@ def _rec_maxima_hunting_gen(X, *args, **kwargs): class RecursiveMaximaHunting( sklearn.base.BaseEstimator, sklearn.base.TransformerMixin): - r''' + r""" Recursive Maxima Hunting variable selection. This is a filter variable selection method for problems with a target @@ -808,18 +814,25 @@ class RecursiveMaximaHunting( selected by :class:`MaximaHunting` alone. This method was originally described in a special case in article [1]_. + Additional information about the usage of this method can be found in + :doc:`/modules/preprocessing/dim_reduction/recursive_maxima_hunting`. Parameters: dependence_measure (callable): Dependence measure to use. By default, it uses the bias corrected squared distance correlation. - max_features (int): Maximum number of features to select. + max_features (int): Maximum number of features to select. By default + there is no limit. correction (Correction): Correction used to subtract the information - of each selected point in each iteration. + of each selected point in each iteration. By default it is + a :class:`.UniformCorrection` object. redundancy_condition (callable): Condition to consider a point redundant with the selected maxima and discard it from future - consideration as a maximum. - stopping_condition (callable): Condition to stop the algorithm. + consideration as a maximum. By default it is a + :class:`.DependenceThresholdRedundancy` object. + stopping_condition (callable): Condition to stop the algorithm. By + default it is a :class:`.AsymptoticIndependenceTestStop` + object. Examples: @@ -875,7 +888,7 @@ class RecursiveMaximaHunting( in Advances in Neural Information Processing Systems 29, Curran Associates, Inc., 2016, pp. 4835–4843. 
- ''' + """ def __init__(self, *, dependence_measure=dcor.u_distance_correlation_sqr, From ccea8e538f4915ac84c9e131353fe1ac42718ee7 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Sun, 23 Aug 2020 18:14:50 +0200 Subject: [PATCH 050/210] Fix GaussianSampleCorrection. --- .../variable_selection/maxima_hunting.py | 37 ++++++++------- .../recursive_maxima_hunting.py | 8 ++-- tests/test_recursive_maxima_hunting.py | 46 +++++++++++++++++++ 3 files changed, 67 insertions(+), 24 deletions(-) create mode 100644 tests/test_recursive_maxima_hunting.py diff --git a/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py index a88a530d9..db0e0ff92 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py @@ -15,20 +15,22 @@ def _compute_dependence(X, y, *, dependence_measure): corresponding class label in Y. ''' - def vectorial_dependence_measure(x): - x = np.atleast_2d(x).transpose() + # Move n_samples to the end + # The shape is now input_shape + n_samples + n_output + X = np.moveaxis(X, 0, -2) - return dependence_measure(x, y) + input_shape = X.shape[:-2] - vectorial_dependence_measure = np.vectorize( - vectorial_dependence_measure, - otypes=[float], - signature="(m,n)->()" - ) + # Join input in a list for rowwise + X = X.reshape(-1, X.shape[-2], X.shape[-1]) - X_view = np.rollaxis(X, 0, len(X.shape)) + if y.ndim == 1: + y = np.atleast_2d(y).T + Y = np.array([y] * len(X)) - return vectorial_dependence_measure(X_view) + dependence_results = dcor.rowwise(dependence_measure, X, Y) + + return dependence_results.reshape(input_shape) def select_local_maxima(X, *, order: int=1): @@ -42,6 +44,7 @@ def select_local_maxima(X, *, order: int=1): Parameters: + X (numpy array): Where to compute the local maxima. order (callable): How many points on each side to look, to check if a point is a maximum in that interval. 
@@ -176,11 +179,9 @@ def __init__(self, def fit(self, X: FDataGrid, y): - X, y = sklearn.utils.validation.check_X_y(X.data_matrix[..., 0], y) - - self.features_shape_ = X.shape[1:] + self.features_shape_ = X.data_matrix.shape[1:] self.dependence_ = _compute_dependence( - X[..., np.newaxis], y, + X.data_matrix, y, dependence_measure=self.dependence_measure) self.indexes_ = self.local_maxima_selector(self.dependence_) @@ -194,7 +195,7 @@ def get_support(self, indices: bool=False): if indices: return self.indexes_ else: - mask = np.zeros(self.features_shape_[0], dtype=bool) + mask = np.zeros(self.features_shape_[0:-1], dtype=bool) mask[self.indexes_] = True return mask @@ -202,10 +203,8 @@ def transform(self, X, y=None): sklearn.utils.validation.check_is_fitted(self) - X = sklearn.utils.validation.check_array(X.data_matrix[..., 0]) - - if X.shape[1:] != self.features_shape_: + if X.data_matrix.shape[1:] != self.features_shape_: raise ValueError("The trajectories have a different number of " "points than the ones fitted") - return X[:, self.sorted_indexes_] + return X.data_matrix[:, self.sorted_indexes_].reshape(X.n_samples, -1) diff --git a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py index 39e648014..1b4a83793 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py @@ -419,10 +419,10 @@ def begin(self, X: FDataGrid, Y): mean = np.mean(trajectories, axis=0) X_copy[Y == class_label, :] -= mean - self.conditioning_points_ = [] self.cov_matrix_ = np.cov(X_copy, rowvar=False) self.t_ = np.ravel(X.sample_points) - self.gaussian_correction_ = GaussianCorrection(kernel=self.cov) + self.gaussian_correction_ = GaussianCorrection( + cov=self.cov) def cov(self, t_0, t_1): i = np.searchsorted(self.t_, t_0) @@ -434,15 +434,13 @@ def cov(self, t_0, t_1): return self.cov_matrix_[np.ix_(i, j)] def conditioned(self, t_0, **kwargs): - self.conditioning_points.append(t_0) - self.conditioning_points.sort() self.gaussian_correction_ = self.gaussian_correction_.conditioned( t_0=t_0, **kwargs) return self def conditional_mean(self, X, selected_index): - return self.gaussian_correction_.conditional_expectation( + return self.gaussian_correction_.conditional_mean( X, selected_index) diff --git a/tests/test_recursive_maxima_hunting.py b/tests/test_recursive_maxima_hunting.py new file mode 100644 index 000000000..1834d3902 --- /dev/null +++ b/tests/test_recursive_maxima_hunting.py @@ -0,0 +1,46 @@ +import skfda +from skfda.datasets import make_gaussian_process +from skfda.preprocessing.dim_reduction import variable_selection as vs +import unittest + +import numpy as np + + +class TestRMH(unittest.TestCase): + + def test_rmh(self): + n_samples = 10000 + n_features = 100 + + def mean_1(t): + return (np.abs(t - 0.25) + - 2 * np.abs(t - 0.5) + + np.abs(t - 0.75)) + + X_0 = make_gaussian_process(n_samples=n_samples // 2, + n_features=n_features, + random_state=0) + X_1 = make_gaussian_process(n_samples=n_samples // 2, + n_features=n_features, + mean=mean_1, + random_state=1) + X = skfda.concatenate((X_0, X_1)) + + y = np.zeros(n_samples) + y[n_samples // 2:] = 1 + + correction = vs.recursive_maxima_hunting.GaussianSampleCorrection() + stopping_condition = vs.recursive_maxima_hunting.ScoreThresholdStop( + threshold=0.05) + + rmh = vs.RecursiveMaximaHunting( + correction=correction, + 
stopping_condition=stopping_condition) + _ = rmh.fit(X, y) + point_mask = rmh.get_support() + points = X.sample_points[0][point_mask] + np.testing.assert_allclose(points, [0.25, 0.5, 0.75], rtol=1e-1) + + +if __name__ == '__main__': + unittest.main() From 7c4bd26697669619d20759e6d27d69ac5a0fd62d Mon Sep 17 00:00:00 2001 From: vnmabus Date: Tue, 25 Aug 2020 15:39:20 +0200 Subject: [PATCH 051/210] Change name of one parameter of RKHSVariableSelection. --- .../dim_reduction/variable_selection/_rkvs.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py b/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py index 569edae35..acdbb0dfd 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py @@ -6,7 +6,7 @@ from ....representation import FDataGrid -def _rkhs_vs(X, Y, n_components: int=1): +def _rkhs_vs(X, Y, n_features_to_select: int=1): ''' Parameters ---------- @@ -14,18 +14,18 @@ def _rkhs_vs(X, Y, n_components: int=1): Matrix of trajectories Y Vector of class labels - n_components - Number of selected components + n_features_to_select + Number of selected features ''' X = np.atleast_2d(X) - assert n_components >= 1 - assert n_components <= X.shape[1] + assert n_features_to_select >= 1 + assert n_features_to_select <= X.shape[1] Y = np.asarray(Y) - selected_features = np.zeros(n_components, dtype=int) - score = np.zeros(n_components) + selected_features = np.zeros(n_features_to_select, dtype=int) + score = np.zeros(n_features_to_select) indexes = np.arange(0, X.shape[1]) # Calculate means and covariance matrix @@ -55,7 +55,7 @@ def _rkhs_vs(X, Y, n_components: int=1): score[0] = mu_sigma[selected_features[0]] indexes = np.delete(indexes, selected_features[0]) - for i in range(1, n_components): + for i in range(1, n_features_to_select): aux = np.zeros_like(indexes, dtype=np.float_) for j in range(0, indexes.shape[0]): @@ -115,7 +115,7 @@ class RKHSVariableSelection(sklearn.base.BaseEstimator, Parameters: - n_components (int): number of variables to select. + n_features_to_select (int): number of features to select. Examples: @@ -149,7 +149,8 @@ class RKHSVariableSelection(sklearn.base.BaseEstimator, Select the relevant points to distinguish the two classes - >>> rkvs = variable_selection.RKHSVariableSelection(n_components=3) + >>> rkvs = variable_selection.RKHSVariableSelection( + ... n_features_to_select=3) >>> _ = rkvs.fit(X, y) >>> point_mask = rkvs.get_support() >>> points = X.sample_points[0][point_mask] @@ -174,8 +175,8 @@ class RKHSVariableSelection(sklearn.base.BaseEstimator, ''' - def __init__(self, n_components: int=1): - self.n_components = n_components + def __init__(self, n_features_to_select: int=1): + self.n_features_to_select = n_features_to_select def fit(self, X: FDataGrid, y): @@ -195,7 +196,7 @@ def fit(self, X: FDataGrid, y): self._features_, self._scores_ = _rkhs_vs( X=X, Y=y, - n_components=self.n_components) + n_features_to_select=self.n_features_to_select) return self From 70cdf8984e81f1d727de282f5053187e3b86e51f Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 27 Aug 2020 17:10:57 +0200 Subject: [PATCH 052/210] Mention the default value of `cov` in `make_gaussian_process`. 
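
This only documents existing behavior: when ``cov`` is not given, a
Brownian covariance is constructed internally. A small sketch of the
equivalence being documented (sample sizes and seed are arbitrary):

    from skfda.datasets import make_gaussian_process
    from skfda.misc.covariances import Brownian

    # Omitting ``cov`` should yield the same trajectories as passing a
    # Brownian covariance explicitly, given the same random state.
    X_default = make_gaussian_process(n_samples=5, random_state=0)
    X_explicit = make_gaussian_process(n_samples=5, cov=Brownian(),
                                       random_state=0)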
--- skfda/datasets/_samples_generators.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/skfda/datasets/_samples_generators.py b/skfda/datasets/_samples_generators.py index 059cc3489..c45fc9b57 100644 --- a/skfda/datasets/_samples_generators.py +++ b/skfda/datasets/_samples_generators.py @@ -26,7 +26,8 @@ def make_gaussian_process(n_samples: int = 100, n_features: int = 100, *, ``n_features``. cov: The covariance function of the process. Can be a callable accepting two vectors with the locations, or a - matrix with size ``n_features`` x ``n_features``. + matrix with size ``n_features`` x ``n_features``. By default, + the Brownian covariance function is used. noise: Standard deviation of Gaussian noise added to the data. random_state: Random state. From 9df271db89eb1061bbe0f5ef59d715b6aa4bf5e1 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Tue, 1 Sep 2020 13:17:55 +0200 Subject: [PATCH 053/210] First version of `make_gaussian`. --- skfda/datasets/__init__.py | 3 +- skfda/datasets/_samples_generators.py | 56 +++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/skfda/datasets/__init__.py b/skfda/datasets/__init__.py index 48ead6663..7fa549473 100644 --- a/skfda/datasets/__init__.py +++ b/skfda/datasets/__init__.py @@ -4,7 +4,8 @@ fetch_tecator, fetch_medflies, fetch_weather, fetch_aemet, fetch_octane, fetch_gait) -from ._samples_generators import (make_gaussian_process, +from ._samples_generators import (make_gaussian, + make_gaussian_process, make_sinusoidal_process, make_multimodal_samples, make_multimodal_landmarks, diff --git a/skfda/datasets/_samples_generators.py b/skfda/datasets/_samples_generators.py index c45fc9b57..6035ab548 100644 --- a/skfda/datasets/_samples_generators.py +++ b/skfda/datasets/_samples_generators.py @@ -5,11 +5,67 @@ import numpy as np from .. import FDataGrid +from .._utils import _cartesian_product from ..misc import covariances from ..preprocessing.registration import normalize_warping from ..representation.interpolation import SplineInterpolation +def make_gaussian(n_samples: int = 100, *, + sample_points, + domain_range=None, + mean=0, cov=None, noise: float = 0., + random_state=None): + """Generate Gaussian process or fields. + + Args: + n_samples: The total number of trajectories. + n_features: The total number of features (points of evaluation). + start: Starting point of the trajectories. + stop: Ending point of the trajectories. + mean: The mean function of the process. Can be a callable accepting + a vector with the locations, or a vector with length + ``n_features``. + cov: The covariance function of the process. Can be a + callable accepting two vectors with the locations, or a + matrix with size ``n_features`` x ``n_features``. By default, + the Brownian covariance function is used. + noise: Standard deviation of Gaussian noise added to the data. + random_state: Random state. + + Returns: + :class:`FDataGrid` object comprising all the trajectories. 
+ + """ + + random_state = sklearn.utils.check_random_state(random_state) + + if cov is None: + cov = covariances.Brownian() + + input_points = _cartesian_product(sample_points) + + covariance = covariances._execute_covariance( + cov, input_points, input_points) + + if noise: + covariance += np.eye(len(covariance)) * noise ** 2 + + mu = np.zeros(len(input_points)) + if callable(mean): + mean = mean(sample_points) + mu += mean + + data_matrix = random_state.multivariate_normal( + mu, covariance, n_samples) + + data_matrix = data_matrix.reshape( + [n_samples] + [len(t) for t in sample_points]) + + return FDataGrid(sample_points=sample_points, data_matrix=data_matrix, + domain_range=domain_range) + + def make_gaussian_process(n_samples: int = 100, n_features: int = 100, *, start: float = 0., stop: float = 1., mean=0, cov=None, noise: float = 0., From 7858e57448f10e735cc97ea0bac90db68bcb14c6 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Wed, 2 Sep 2020 02:16:36 +0200 Subject: [PATCH 054/210] Implement `make_gaussian_process` in terms of `make_gaussian`. --- skfda/datasets/_samples_generators.py | 31 ++++++++------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/skfda/datasets/_samples_generators.py b/skfda/datasets/_samples_generators.py index 6035ab548..47ef2ab49 100644 --- a/skfda/datasets/_samples_generators.py +++ b/skfda/datasets/_samples_generators.py @@ -51,13 +51,13 @@ def make_gaussian(n_samples: int = 100, *, if noise: covariance += np.eye(len(covariance)) * noise ** 2 - mu = np.zeros(len(input_points)) + mu = np.zeros_like(input_points) if callable(mean): - mean = mean(sample_points) + mean = mean(input_points) mu += mean data_matrix = random_state.multivariate_normal( - mu, covariance, n_samples) + mu.ravel(), covariance, n_samples) data_matrix = data_matrix.reshape( [n_samples] + [len(t) for t in sample_points]) @@ -92,26 +92,13 @@ def make_gaussian_process(n_samples: int = 100, n_features: int = 100, *, """ - random_state = sklearn.utils.check_random_state(random_state) - - x = np.linspace(start, stop, n_features) - - if cov is None: - cov = covariances.Brownian() - - covariance = covariances._execute_covariance(cov, x, x) - - if noise: - covariance += np.eye(n_features) * noise ** 2 - - mu = np.zeros(n_features) - if callable(mean): - mean = mean(x) - mu += mean - - y = random_state.multivariate_normal(mu, covariance, n_samples) + t = np.linspace(start, stop, n_features) - return FDataGrid(sample_points=x, data_matrix=y) + return make_gaussian(n_samples=n_samples, + sample_points=[t], + mean=mean, cov=cov, + noise=noise, + random_state=random_state) def make_sinusoidal_process(n_samples: int = 15, n_features: int = 100, *, From 29e160183a70e2b3ad9e1db7aba660eb93525230 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Wed, 2 Sep 2020 12:56:30 +0200 Subject: [PATCH 055/210] Fixed error in `make_gaussian`. 
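
The mean accumulator was built with ``np.zeros_like(input_points)``,
which follows the shape of the matrix of evaluation points (one row per
point, one column per dimension) instead of the flat vector that
``multivariate_normal`` expects; it is now ``np.zeros(len(input_points))``
and the evaluated mean is raveled before being added. A sketch of a call
exercising the fixed path (the callable mean is only illustrative and
assumes it receives the array of evaluation points):

    import numpy as np
    from skfda.datasets import make_gaussian

    X = make_gaussian(n_samples=3,
                      sample_points=[np.linspace(0, 1, 20)],
                      mean=lambda t: np.cos(2 * np.pi * t[..., 0]),
                      random_state=0)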
--- skfda/datasets/_samples_generators.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/skfda/datasets/_samples_generators.py b/skfda/datasets/_samples_generators.py index 47ef2ab49..41b9f105a 100644 --- a/skfda/datasets/_samples_generators.py +++ b/skfda/datasets/_samples_generators.py @@ -51,10 +51,11 @@ def make_gaussian(n_samples: int = 100, *, if noise: covariance += np.eye(len(covariance)) * noise ** 2 - mu = np.zeros_like(input_points) + mu = np.zeros(len(input_points)) if callable(mean): mean = mean(input_points) - mu += mean + + mu += np.ravel(mean) data_matrix = random_state.multivariate_normal( mu.ravel(), covariance, n_samples) From f410d065298c64dfbbe6a962c3894799de35829a Mon Sep 17 00:00:00 2001 From: vnmabus Date: Wed, 2 Sep 2020 18:23:21 +0200 Subject: [PATCH 056/210] Extended Brownian to several dimensions. --- skfda/misc/covariances.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/skfda/misc/covariances.py b/skfda/misc/covariances.py index 1ba97f2c2..2ea24e301 100644 --- a/skfda/misc/covariances.py +++ b/skfda/misc/covariances.py @@ -203,7 +203,7 @@ class Brownian(Covariance): _parameters = [("variance", r"\sigma^2"), ("origin", r"\mathcal{O}")] - def __init__(self, *, variance: float = 1., origin: float = 0.): + def __init__(self, *, variance: float = 1., origin=0.): self.variance = variance self.origin = origin @@ -211,7 +211,13 @@ def __call__(self, x, y): x = _transform_to_2d(x) - self.origin y = _transform_to_2d(y) - self.origin - return self.variance * (np.abs(x) + np.abs(y.T) - np.abs(x - y.T)) / 2 + sum_norms = np.add.outer( + np.linalg.norm(x, axis=-1), + np.linalg.norm(y, axis=-1)) + norm_sub = np.linalg.norm( + x[:, np.newaxis, :] - y[np.newaxis, :, :], axis=-1) + + return self.variance * (sum_norms - norm_sub) / 2 class Linear(Covariance): From a6bf6942088b5864619a13116349865d25fdbcc4 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Wed, 2 Sep 2020 20:19:18 +0200 Subject: [PATCH 057/210] Update documentation. --- docs/modules/datasets.rst | 3 ++- skfda/datasets/_samples_generators.py | 26 +++++++++++++++++--------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/docs/modules/datasets.rst b/docs/modules/datasets.rst index fc09bb486..b4968fdd0 100644 --- a/docs/modules/datasets.rst +++ b/docs/modules/datasets.rst @@ -43,7 +43,8 @@ The following functions are used to make synthetic functional datasets: .. autosummary:: :toctree: autosummary - + + skfda.datasets.make_gaussian skfda.datasets.make_gaussian_process skfda.datasets.make_sinusoidal_process skfda.datasets.make_multimodal_samples diff --git a/skfda/datasets/_samples_generators.py b/skfda/datasets/_samples_generators.py index 41b9f105a..0771f8a87 100644 --- a/skfda/datasets/_samples_generators.py +++ b/skfda/datasets/_samples_generators.py @@ -16,19 +16,18 @@ def make_gaussian(n_samples: int = 100, *, domain_range=None, mean=0, cov=None, noise: float = 0., random_state=None): - """Generate Gaussian process or fields. + """Generate Gaussian random fields. Args: n_samples: The total number of trajectories. - n_features: The total number of features (points of evaluation). - start: Starting point of the trajectories. - stop: Ending point of the trajectories. - mean: The mean function of the process. Can be a callable accepting - a vector with the locations, or a vector with length - ``n_features``. + sample_points: Sample points for the evaluation grid of the + Gaussian field. + mean: The mean function of the random field. 
Can be a callable
+            accepting a vector with the locations, or a vector with
+            appropriate size.
         cov: The covariance function of the process. Can be a
             callable accepting two vectors with the locations, or a
-            matrix with size ``n_features`` x ``n_features``. By default,
+            matrix with appropriate size. By default,
             the Brownian covariance function is used.
         noise: Standard deviation of Gaussian noise added to the data.
         random_state: Random state.
@@ -36,6 +35,10 @@ def make_gaussian(n_samples: int = 100, *,
     Returns:
         :class:`FDataGrid` object comprising all the trajectories.
 
+    See also:
+        :func:`make_gaussian_process`: Simpler function for generating
+                                       Gaussian processes.
+
     """
 
     random_state = sklearn.utils.check_random_state(random_state)
@@ -92,6 +95,11 @@ def make_gaussian_process(n_samples: int = 100, n_features: int = 100, *,
     Returns:
         :class:`FDataGrid` object comprising all the trajectories.
 
+    See also:
+        :func:`make_gaussian`: More general function that allows to
+                               select the points of evaluation and to
+                               generate data in higher dimensions.
+
     """
 
     t = np.linspace(start, stop, n_features)

From cd44e2e1dc0738b3f91af98d70b31b52e2f2ee9a Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Wed, 2 Sep 2020 20:57:28 +0200
Subject: [PATCH 058/210] Update required Sphinx version.

---
 readthedocs-requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/readthedocs-requirements.txt b/readthedocs-requirements.txt
index e97c6a51e..a3cb14b8f 100644
--- a/readthedocs-requirements.txt
+++ b/readthedocs-requirements.txt
@@ -3,7 +3,7 @@ numpy
 scipy
 Cython
 sklearn
-Sphinx
+Sphinx>=3
 sphinx_rtd_theme
 sphinx-gallery
 pillow

From 0639d2d17cb0afca8d51f402d800597c33e65e1b Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Sun, 11 Oct 2020 19:21:41 +0200
Subject: [PATCH 059/210] Rename `sample_points` to `grid_points`.
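
The parameter and attribute are renamed throughout the package, the
examples and the tests. Downstream code changes roughly like this
(whether a deprecation alias for the old name remains is not shown in
this excerpt):

    import numpy as np
    from skfda import FDataGrid

    grid = np.linspace(0, 1, 100)
    data = np.sin(2 * np.pi * grid)[np.newaxis, :]

    # Before: FDataGrid(data, sample_points=grid); fd.sample_points[0]
    fd = FDataGrid(data, grid_points=grid)
    points = fd.grid_points[0]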
--- examples/plot_composition.py | 4 +- examples/plot_discrete_representation.py | 6 +- examples/plot_interpolation.py | 4 +- .../plot_radius_neighbors_classification.py | 11 +- examples/plot_representation.py | 2 +- examples/plot_surface_boxplot.py | 2 +- skfda/_neighbors/base.py | 30 ++-- skfda/_neighbors/outlier.py | 4 +- skfda/_utils/_utils.py | 4 +- skfda/datasets/_real_datasets.py | 37 ++-- skfda/datasets/_samples_generators.py | 24 +-- skfda/exploratory/depth/_depth.py | 26 +-- .../outliers/_directional_outlyingness.py | 18 +- skfda/exploratory/outliers/_iqr.py | 4 +- skfda/exploratory/visualization/_boxplot.py | 40 ++--- .../visualization/_magnitude_shape_plot.py | 6 +- skfda/exploratory/visualization/clustering.py | 4 +- .../visualization/representation.py | 16 +- skfda/inference/anova/anova_oneway.py | 6 +- skfda/misc/_math.py | 6 +- skfda/misc/metrics.py | 36 ++-- .../_linear_differential_operator.py | 6 +- skfda/ml/clustering/kmeans.py | 8 +- .../dim_reduction/projection/_fpca.py | 16 +- .../dim_reduction/variable_selection/_rkvs.py | 4 +- .../variable_selection/maxima_hunting.py | 4 +- .../recursive_maxima_hunting.py | 16 +- .../registration/_landmark_registration.py | 14 +- .../registration/_shift_registration.py | 8 +- skfda/preprocessing/registration/_warping.py | 8 +- skfda/preprocessing/registration/elastic.py | 28 +-- .../preprocessing/registration/validation.py | 2 +- skfda/preprocessing/smoothing/_basis.py | 20 +-- skfda/preprocessing/smoothing/_linear.py | 6 +- .../smoothing/kernel_smoothers.py | 8 +- skfda/preprocessing/smoothing/validation.py | 2 +- .../representation/_evaluation_trasformer.py | 12 +- skfda/representation/_functional_data.py | 4 +- skfda/representation/basis/_fdatabasis.py | 46 +++-- skfda/representation/grid.py | 160 +++++++++++------- skfda/representation/interpolation.py | 14 +- tests/test_basis.py | 6 +- tests/test_clustering.py | 26 +-- tests/test_elastic.py | 2 +- tests/test_fdata_boxplot.py | 10 +- tests/test_fpca.py | 4 +- tests/test_grid.py | 14 +- tests/test_interpolation.py | 36 ++-- tests/test_math.py | 8 +- tests/test_metrics.py | 14 +- tests/test_oneway_anova.py | 2 +- tests/test_outliers.py | 14 +- tests/test_pandas_fdatagrid.py | 14 +- tests/test_recursive_maxima_hunting.py | 2 +- tests/test_registration.py | 8 +- tests/test_smoothing.py | 6 +- 56 files changed, 444 insertions(+), 398 deletions(-) diff --git a/examples/plot_composition.py b/examples/plot_composition.py index ff9b33566..5dded0d1b 100644 --- a/examples/plot_composition.py +++ b/examples/plot_composition.py @@ -38,9 +38,9 @@ # Constructs example surface X, Y, Z = axes3d.get_test_data(1.2) data_matrix = [Z.T] -sample_points = [X[0, :], Y[:, 0]] +grid_points = [X[0, :], Y[:, 0]] -g = skfda.FDataGrid(data_matrix, sample_points) +g = skfda.FDataGrid(data_matrix, grid_points) # Sets cubic interpolation g.interpolation = skfda.representation.interpolation.SplineInterpolation( diff --git a/examples/plot_discrete_representation.py b/examples/plot_discrete_representation.py index 47e6afb80..a1543f8f1 100644 --- a/examples/plot_discrete_representation.py +++ b/examples/plot_discrete_representation.py @@ -20,15 +20,15 @@ # random displacements. 
random_state = np.random.RandomState(0) -sample_points = np.linspace(0, 1) -data = np.array([np.sin((sample_points + random_state.randn()) +grid_points = np.linspace(0, 1) +data = np.array([np.sin((grid_points + random_state.randn()) * 2 * np.pi) for _ in range(5)]) ############################################################################## # The FDataGrid class is used for datasets containing discretized functions # that are measured at the same points. -fd = FDataGrid(data, sample_points, +fd = FDataGrid(data, grid_points, dataset_name='Sinusoidal curves', argument_names=['t'], coordinate_names=['x(t)']) diff --git a/examples/plot_interpolation.py b/examples/plot_interpolation.py index 13bd3d17b..9f3e03352 100644 --- a/examples/plot_interpolation.py +++ b/examples/plot_interpolation.py @@ -114,10 +114,10 @@ X, Y, Z = axes3d.get_test_data(1.2) data_matrix = [Z.T] -sample_points = [X[0, :], Y[:, 0]] +grid_points = [X[0, :], Y[:, 0]] -fd = skfda.FDataGrid(data_matrix, sample_points) +fd = skfda.FDataGrid(data_matrix, grid_points) fig = fd.plot() fd.scatter(fig=fig) diff --git a/examples/plot_radius_neighbors_classification.py b/examples/plot_radius_neighbors_classification.py index 4c07289d1..57dd64a00 100644 --- a/examples/plot_radius_neighbors_classification.py +++ b/examples/plot_radius_neighbors_classification.py @@ -11,13 +11,14 @@ # sphinx_gallery_thumbnail_number = 2 +import skfda +from skfda.misc.metrics import pairwise_distance, lp_distance +from skfda.ml.classification import RadiusNeighborsClassifier + from sklearn.model_selection import train_test_split import matplotlib.pyplot as plt import numpy as np -import skfda -from skfda.misc.metrics import pairwise_distance, lp_distance -from skfda.ml.classification import RadiusNeighborsClassifier ############################################################################## @@ -76,7 +77,7 @@ lower = sample - radius upper = sample + radius fig.axes[0].fill_between( - sample.sample_points[0], lower.data_matrix.flatten(), + sample.grid_points[0], lower.data_matrix.flatten(), upper.data_matrix[0].flatten(), alpha=.25, color='C1') @@ -95,7 +96,7 @@ fig = X_train[distances <= radius].plot(color='C0') sample.plot(fig=fig, color='red', linewidth=3) fig.axes[0].fill_between( - sample.sample_points[0], lower.data_matrix.flatten(), + sample.grid_points[0], lower.data_matrix.flatten(), upper.data_matrix[0].flatten(), alpha=.25, color='C1') diff --git a/examples/plot_representation.py b/examples/plot_representation.py index 1763c4887..fb40be899 100644 --- a/examples/plot_representation.py +++ b/examples/plot_representation.py @@ -33,7 +33,7 @@ ############################################################################## # This kind of representation is a discretized representation, in which the # measurement points are shared between samples. -print(fd.sample_points) +print(fd.grid_points) ############################################################################## # In this representation, the data can be arranged as a matrix. diff --git a/examples/plot_surface_boxplot.py b/examples/plot_surface_boxplot.py index d64dbb6a3..b411abec7 100644 --- a/examples/plot_surface_boxplot.py +++ b/examples/plot_surface_boxplot.py @@ -50,7 +50,7 @@ # We can plot now the extruded trajectories. 
fd_2 = FDataGrid(data_matrix=cube,
-                 sample_points=np.tile(fd.sample_points, (2, 1)),
+                 grid_points=np.tile(fd.grid_points, (2, 1)),
                  dataset_name="Extruded Brownian process")
 
 fd_2.plot()
diff --git a/skfda/_neighbors/base.py b/skfda/_neighbors/base.py
index 17f8b5287..36ee1af09 100644
--- a/skfda/_neighbors/base.py
+++ b/skfda/_neighbors/base.py
@@ -21,19 +21,19 @@ def _to_multivariate(fdatagrid):
 
     Returns:
         (np.array): Numpy array with size (n_samples, points), where
-            points = prod([len(d) for d in fdatagrid.sample_points]
+            points = prod([len(d) for d in fdatagrid.grid_points])
 
     """
     return fdatagrid.data_matrix.reshape(fdatagrid.n_samples, -1)
 
 
-def _from_multivariate(data_matrix, sample_points, shape, **kwargs):
+def _from_multivariate(data_matrix, grid_points, shape, **kwargs):
     r"""Constructs a FDatagrid from the data matrix flattened.
 
     Args:
         data_matrix (np.array): Data Matrix flattened as multivariate vector
             compatible with sklearn.
-        sample_points (array_like): List with sample points for each dimension.
+        grid_points (array_like): List with the grid points for each dimension.
         shape (tuple): Shape of the data_matrix.
         **kwargs: Named params to be passed to the FDataGrid constructor.
 
@@ -41,10 +41,10 @@ def _from_multivariate(data_matrix, sample_points, shape, **kwargs):
         (:class:`FDataGrid`): FDatagrid with the data.
 
     """
-    return FDataGrid(data_matrix.reshape(shape), sample_points, **kwargs)
+    return FDataGrid(data_matrix.reshape(shape), grid_points, **kwargs)
 
 
-def _to_multivariate_metric(metric, sample_points):
+def _to_multivariate_metric(metric, grid_points):
     r"""Transform a metric between FDatagrid in a sklearn compatible one.
 
     Given a metric between FDatagrids returns a compatible metric used to
@@ -53,7 +53,7 @@ def _to_multivariate_metric(metric, sample_points):
     Args:
         metric (pyfunc): Metric of the module `mics.metrics`. Must accept
             two FDataGrids and return a float representing the distance.
-        sample_points (array_like): Array of arrays with the sample points of
+        grid_points (array_like): Array of arrays with the grid points of
             the FDataGrids.
Returns: @@ -82,12 +82,12 @@ def _to_multivariate_metric(metric, sample_points): """ # Shape -> (n_samples = 1, domain_dims...., image_dimension (-1)) - shape = [1] + [len(axis) for axis in sample_points] + [-1] + shape = [1] + [len(axis) for axis in grid_points] + [-1] def multivariate_metric(x, y, _check=False, **kwargs): - return metric(_from_multivariate(x, sample_points, shape), - _from_multivariate(y, sample_points, shape), + return metric(_from_multivariate(x, grid_points, shape), + _from_multivariate(y, grid_points, shape), _check=_check, **kwargs) return multivariate_metric @@ -156,7 +156,7 @@ def fit(self, X, y=None): self.estimator_ = self._init_estimator(self.metric) self.estimator_.fit(X, y) else: - self._sample_points = X.sample_points + self._grid_points = X.grid_points self._shape = X.data_matrix.shape[1:] if not self.multivariate_metric: @@ -167,7 +167,7 @@ def fit(self, X, y=None): metric = self.metric sklearn_metric = _to_multivariate_metric(metric, - self._sample_points) + self._grid_points) else: sklearn_metric = self.metric @@ -493,7 +493,7 @@ def _functional_fit(self, X, y): self.estimator_ = self._init_estimator(self.metric) self.estimator_.fit(X) else: - self._sample_points = X.sample_points + self._grid_points = X.grid_points self._shape = X.data_matrix.shape[1:] if not self.multivariate_metric: @@ -504,7 +504,7 @@ def _functional_fit(self, X, y): metric = self.metric sklearn_metric = _to_multivariate_metric(metric, - self._sample_points) + self._grid_points) else: sklearn_metric = self.metric @@ -780,7 +780,7 @@ def _functional_score(self, X, y, sample_weight=None): sum_u = np.sum(data_u, axis=0) sum_v = np.sum(data_v, axis=0) - int_u = simps(sum_u, x=u.sample_points[0]) - int_v = simps(sum_v, x=v.sample_points[0]) + int_u = simps(sum_u, x=u.grid_points[0]) + int_v = simps(sum_v, x=v.grid_points[0]) return 1 - int_u / int_v diff --git a/skfda/_neighbors/outlier.py b/skfda/_neighbors/outlier.py index 9b844575d..efe45108d 100644 --- a/skfda/_neighbors/outlier.py +++ b/skfda/_neighbors/outlier.py @@ -277,7 +277,7 @@ def fit_predict(self, X, y=None): self.estimator_ = self._init_estimator(self.metric) res = self.estimator_.fit_predict(X, y) else: - self._sample_points = X.sample_points + self._grid_points = X.grid_points self._shape = X.data_matrix.shape[1:] if not self.multivariate_metric: @@ -287,7 +287,7 @@ def fit_predict(self, X, y=None): else: metric = self.metric sklearn_metric = _to_multivariate_metric(metric, - self._sample_points) + self._grid_points) else: sklearn_metric = self.metric diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index 333d74360..4da22d8e0 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -72,9 +72,9 @@ def _to_grid(X, y, eval_points=None): X = X.to_grid(eval_points) y = y.to_grid(eval_points) elif x_is_grid and not y_is_grid: - y = y.to_grid(X.sample_points[0]) + y = y.to_grid(X.grid_points[0]) elif not x_is_grid and y_is_grid: - X = X.to_grid(y.sample_points[0]) + X = X.to_grid(y.grid_points[0]) elif not x_is_grid and not y_is_grid: X = X.to_grid() y = y.to_grid() diff --git a/skfda/datasets/_real_datasets.py b/skfda/datasets/_real_datasets.py index f89ea15f9..97a0462b0 100644 --- a/skfda/datasets/_real_datasets.py +++ b/skfda/datasets/_real_datasets.py @@ -1,6 +1,5 @@ -import warnings - import rdata +import warnings import numpy as np @@ -21,7 +20,7 @@ def fdata_constructor(obj, attrs): names = obj["names"] return FDataGrid(data_matrix=obj["data"], - sample_points=obj["argvals"], + 
grid_points=obj["argvals"], domain_range=obj["rangeval"], dataset_name=names['main'][0], argument_names=(names['xlab'][0],), @@ -35,22 +34,22 @@ def functional_constructor(obj, attrs): target = np.array(obj['labels']).ravel() dataf = obj['dataf'] - sample_points_set = {a for o in dataf for a in o["args"]} + grid_points_set = {a for o in dataf for a in o["args"]} - args_init = min(sample_points_set) - args_end = max(sample_points_set) + args_init = min(grid_points_set) + args_end = max(grid_points_set) - sample_points = np.arange(args_init, - args_end + 1) + grid_points = np.arange(args_init, + args_end + 1) - data_matrix = np.zeros(shape=(len(dataf), len(sample_points))) + data_matrix = np.zeros(shape=(len(dataf), len(grid_points))) for num_sample, o in enumerate(dataf): for t, x in zip(o["args"], o["vals"]): data_matrix[num_sample, t - args_init] = x return (FDataGrid(data_matrix=data_matrix, - sample_points=sample_points, + grid_points=grid_points, domain_range=(args_init, args_end), dataset_name=name[0], argument_names=(args_label[0],), @@ -112,9 +111,9 @@ def ucr_to_fdatagrid(data): data = np.transpose(data, axes=(0, 2, 1)) - sample_points = range(data.shape[1]) + grid_points = range(data.shape[1]) - return FDataGrid(data, sample_points=sample_points) + return FDataGrid(data, grid_points=grid_points) dataset['data'] = ucr_to_fdatagrid(dataset['data']) del dataset['feature_names'] @@ -220,7 +219,7 @@ def fetch_phoneme(return_X_y: bool = False): speaker = data["speaker"].values curves = FDataGrid(data_matrix=curve_data.values, - sample_points=np.linspace(0, 8, 256), + grid_points=np.linspace(0, 8, 256), domain_range=[0, 8], dataset_name="Phoneme", argument_names=("frequency (kHz)",), @@ -275,7 +274,7 @@ def fetch_growth(return_X_y: bool = False): males = data["hgtm"].T curves = FDataGrid(data_matrix=np.concatenate((males, females), axis=0), - sample_points=ages, + grid_points=ages, dataset_name="Berkeley Growth Study", argument_names=("age",), coordinate_names=("height",)) @@ -470,7 +469,7 @@ def fetch_weather(return_X_y: bool = False): temp_prec_daily = np.transpose(weather_daily[:, :, 0:2], axes=(1, 0, 2)) curves = FDataGrid(data_matrix=temp_prec_daily, - sample_points=np.arange(0, 365) + 0.5, + grid_points=np.arange(0, 365) + 0.5, domain_range=(0, 365), dataset_name="Canadian Weather", argument_names=("day",), @@ -601,7 +600,7 @@ def fetch_octane(return_X_y: bool = False): # "wavelengths ranging from 1102nm to 1552nm with measurements every two # nm."" - sample_points = np.linspace(1102, 1552, 226) + grid_points = np.linspace(1102, 1552, 226) # "The octane data set contains six outliers (25, 26, 36–39) to which # alcohol was added". 
@@ -609,7 +608,7 @@ def fetch_octane(return_X_y: bool = False): target[24] = target[25] = target[35:39] = 1 # Outliers 1 curves = FDataGrid(data, - sample_points=sample_points, + grid_points=grid_points, dataset_name="Octane", argument_names=("wavelength (nm)",), coordinate_names=("absorbances",)) @@ -654,10 +653,10 @@ def fetch_gait(return_X_y: bool = False): data_matrix = np.asarray(data) data_matrix = np.transpose(data_matrix, axes=(1, 0, 2)) - sample_points = np.asarray(data.coords.get('dim_0'), np.float64) + grid_points = np.asarray(data.coords.get('dim_0'), np.float64) curves = FDataGrid(data_matrix=data_matrix, - sample_points=sample_points, + grid_points=grid_points, dataset_name="GAIT", argument_names=("Time (proportion of gait cycle)",), coordinate_names=("Hip angle (degrees)", diff --git a/skfda/datasets/_samples_generators.py b/skfda/datasets/_samples_generators.py index 0771f8a87..fd038c1ea 100644 --- a/skfda/datasets/_samples_generators.py +++ b/skfda/datasets/_samples_generators.py @@ -12,7 +12,7 @@ def make_gaussian(n_samples: int = 100, *, - sample_points, + grid_points, domain_range=None, mean=0, cov=None, noise: float = 0., random_state=None): @@ -20,7 +20,7 @@ def make_gaussian(n_samples: int = 100, *, Args: n_samples: The total number of trajectories. - sample_points: Sample points for the evaluation grid of the + grid_points: Sample points for the evaluation grid of the Gaussian field. mean: The mean function of the random field. Can be a callable accepting a vector with the locations, or a vector with @@ -46,7 +46,7 @@ def make_gaussian(n_samples: int = 100, *, if cov is None: cov = covariances.Brownian() - input_points = _cartesian_product(sample_points) + input_points = _cartesian_product(grid_points) covariance = covariances._execute_covariance( cov, input_points, input_points) @@ -64,9 +64,9 @@ def make_gaussian(n_samples: int = 100, *, mu.ravel(), covariance, n_samples) data_matrix = data_matrix.reshape( - [n_samples] + [len(t) for t in sample_points] + [-1]) + [n_samples] + [len(t) for t in grid_points] + [-1]) - return FDataGrid(sample_points=sample_points, data_matrix=data_matrix, + return FDataGrid(grid_points=grid_points, data_matrix=data_matrix, domain_range=domain_range) @@ -104,7 +104,7 @@ def make_gaussian_process(n_samples: int = 100, n_features: int = 100, *, t = np.linspace(start, stop, n_features) return make_gaussian(n_samples=n_samples, - sample_points=[t], + grid_points=[t], mean=mean, cov=cov, noise=noise, random_state=random_state) @@ -161,7 +161,7 @@ def make_sinusoidal_process(n_samples: int = 15, n_features: int = 100, *, y = alpha @ np.sin((2 * np.pi / period) * t + phi) + error - return FDataGrid(sample_points=t, data_matrix=y) + return FDataGrid(grid_points=t, data_matrix=y) def make_multimodal_landmarks(n_samples: int = 15, *, n_modes: int = 1, @@ -280,12 +280,12 @@ def make_multimodal_samples(n_samples: int = 15, *, n_modes: int = 1, axis = np.linspace(start, stop, points_per_dim) if dim_domain == 1: - sample_points = axis + grid_points = axis evaluation_grid = axis else: - sample_points = np.repeat(axis[:, np.newaxis], dim_domain, axis=1).T + grid_points = np.repeat(axis[:, np.newaxis], dim_domain, axis=1).T - meshgrid = np.meshgrid(*sample_points) + meshgrid = np.meshgrid(*grid_points) evaluation_grid = np.empty(meshgrid[0].shape + (dim_domain,)) @@ -312,7 +312,7 @@ def make_multimodal_samples(n_samples: int = 15, *, n_modes: int = 1, data_matrix += random_state.normal(0, noise, size=data_matrix.shape) - return 
FDataGrid(sample_points=sample_points, data_matrix=data_matrix) + return FDataGrid(grid_points=grid_points, data_matrix=data_matrix) def make_random_warping(n_samples: int = 15, n_features: int = 100, *, @@ -399,7 +399,7 @@ def make_random_warping(n_samples: int = 15, n_features: int = 100, *, # Creation of FDataGrid in the corresponding domain data_matrix = scipy.integrate.cumtrapz(v, dx=1. / n_features, initial=0, axis=0) - warping = FDataGrid(data_matrix.T, sample_points=time[:, 0]) + warping = FDataGrid(data_matrix.T, grid_points=time[:, 0]) warping = normalize_warping(warping, domain_range=(start, stop)) warping.interpolation = SplineInterpolation(interpolation_order=3, monotone=True) diff --git a/skfda/exploratory/depth/_depth.py b/skfda/exploratory/depth/_depth.py index db1c86898..3c7833e79 100644 --- a/skfda/exploratory/depth/_depth.py +++ b/skfda/exploratory/depth/_depth.py @@ -73,8 +73,8 @@ def _rank_samples(fdatagrid): ... [0.5, 0.5, 1, 2, 1.5, 1], ... [-1, -1, -0.5, 1, 1, 0.5], ... [-0.5, -0.5, -0.5, -1, -1, -1]] - >>> sample_points = [0, 2, 4, 6, 8, 10] - >>> fd = skfda.FDataGrid(data_matrix, sample_points) + >>> grid_points = [0, 2, 4, 6, 8, 10] + >>> fd = skfda.FDataGrid(data_matrix, grid_points) >>> _rank_samples(fd) array([[ 4., 4., 4., 4., 4., 4.], [ 3., 3., 3., 3., 3., 3.], @@ -87,8 +87,8 @@ def _rank_samples(fdatagrid): ... [[4], [0.4], [5]]], ... [[[2], [0.5], [2]], ... [[3], [0.6], [3]]]] - >>> sample_points = [[2, 4], [3, 6, 8]] - >>> fd = skfda.FDataGrid(data_matrix, sample_points) + >>> grid_points = [[2, 4], [3, 6, 8]] + >>> fd = skfda.FDataGrid(data_matrix, grid_points) >>> _rank_samples(fd) array([[[ 1., 2., 1.], [ 2., 1., 2.]], @@ -137,8 +137,8 @@ def band_depth(fdatagrid, *, pointwise=False): ... [0.5, 0.5, 1, 2, 1.5, 1], ... [-1, -1, -0.5, 1, 1, 0.5], ... [-0.5, -0.5, -0.5, -1, -1, -1]] - >>> sample_points = [0, 2, 4, 6, 8, 10] - >>> fd = skfda.FDataGrid(data_matrix, sample_points) + >>> grid_points = [0, 2, 4, 6, 8, 10] + >>> fd = skfda.FDataGrid(data_matrix, grid_points) >>> band_depth(fd) array([ 0.5 , 0.83333333, 0.5 , 0.5 ]) @@ -187,8 +187,8 @@ def modified_band_depth(fdatagrid, *, pointwise=False): ... [0.5, 0.5, 1, 2, 1.5, 1], ... [-1, -1, -0.5, 1, 1, 0.5], ... [-0.5, -0.5, -0.5, -1, -1, -1]] - >>> sample_points = [0, 2, 4, 6, 8, 10] - >>> fd = skfda.FDataGrid(data_matrix, sample_points) + >>> grid_points = [0, 2, 4, 6, 8, 10] + >>> fd = skfda.FDataGrid(data_matrix, grid_points) >>> depth = modified_band_depth(fd) >>> depth.round(2) array([ 0.5 , 0.83, 0.72, 0.67]) @@ -215,7 +215,7 @@ def modified_band_depth(fdatagrid, *, pointwise=False): return depth_pointwise else: npoints_sample = reduce(lambda x, y: x * len(y), - fdatagrid.sample_points, 1) + fdatagrid.grid_points, 1) proportion = match.sum(axis=axis) / npoints_sample depth = (proportion + fdatagrid.n_samples - 1) / nchoose2 @@ -285,8 +285,8 @@ def fraiman_muniz_depth(fdatagrid, *, pointwise=False): ... [0.5, 0.5, 1, 2, 1.5, 1], ... [-1, -1, -0.5, 1, 1, 0.5], ... 
[-0.5, -0.5, -0.5, -1, -1, -1]] - >>> sample_points = [0, 2, 4, 6, 8, 10] - >>> fd = skfda.FDataGrid(data_matrix, sample_points) + >>> grid_points = [0, 2, 4, 6, 8, 10] + >>> fd = skfda.FDataGrid(data_matrix, grid_points) >>> fraiman_muniz_depth(fd) array([ 0.5 , 0.75 , 0.925, 0.875]) @@ -308,7 +308,7 @@ def fraiman_muniz_depth(fdatagrid, *, pointwise=False): pointwise_depth = np.array([ 1 - abs(0.5 - _cumulative_distribution( fdatagrid.data_matrix[:, i, 0]) - ) for i in range(len(fdatagrid.sample_points[0]))]).T + ) for i in range(len(fdatagrid.grid_points[0]))]).T if pointwise: return pointwise_depth @@ -318,7 +318,7 @@ def fraiman_muniz_depth(fdatagrid, *, pointwise=False): - fdatagrid.domain_range[0][0]) depth = (scipy.integrate.simps(pointwise_depth, - fdatagrid.sample_points[0]) + fdatagrid.grid_points[0]) / interval_len) return depth diff --git a/skfda/exploratory/outliers/_directional_outlyingness.py b/skfda/exploratory/outliers/_directional_outlyingness.py index 50624bb7f..0887e3f5a 100644 --- a/skfda/exploratory/outliers/_directional_outlyingness.py +++ b/skfda/exploratory/outliers/_directional_outlyingness.py @@ -1,3 +1,4 @@ +from skfda.exploratory.depth.multivariate import projection_depth import typing from numpy import linalg as la @@ -8,7 +9,6 @@ from sklearn.covariance import MinCovDet import numpy as np -from skfda.exploratory.depth.multivariate import projection_depth from ... import FDataGrid @@ -100,8 +100,8 @@ def directional_outlyingness_stats( ... [0.5, 0.5, 1, 2, 1.5, 1], ... [-1, -1, -0.5, 1, 1, 0.5], ... [-0.5, -0.5, -0.5, -1, -1, -1]] - >>> sample_points = [0, 2, 4, 6, 8, 10] - >>> fd = FDataGrid(data_matrix, sample_points) + >>> grid_points = [0, 2, 4, 6, 8, 10] + >>> fd = FDataGrid(data_matrix, grid_points) >>> stats = directional_outlyingness_stats(fd) >>> stats.directional_outlyingness array([[[ 0.89932101], @@ -151,7 +151,7 @@ def directional_outlyingness_stats( raise NotImplementedError("Only support 1 dimension on the domain.") if (pointwise_weights is not None and - (len(pointwise_weights) != len(fdatagrid.sample_points[0]) or + (len(pointwise_weights) != len(fdatagrid.grid_points[0]) or pointwise_weights.sum() != 1)): raise ValueError( "There must be a weight in pointwise_weights for each recorded " @@ -159,7 +159,7 @@ def directional_outlyingness_stats( if pointwise_weights is None: pointwise_weights = np.ones( - len(fdatagrid.sample_points[0])) / ( + len(fdatagrid.grid_points[0])) / ( fdatagrid.domain_range[0][1] - fdatagrid.domain_range[0][0]) depth_pointwise = depth_method(fdatagrid, pointwise=True) @@ -190,7 +190,7 @@ def directional_outlyingness_stats( assert weighted_dir_outlyingness.shape == dir_outlyingness.shape mean_dir_outlyingness = scipy.integrate.simps(weighted_dir_outlyingness, - fdatagrid.sample_points[0], + fdatagrid.grid_points[0], axis=1) assert mean_dir_outlyingness.shape == ( fdatagrid.n_samples, fdatagrid.dim_codomain) @@ -200,7 +200,7 @@ def directional_outlyingness_stats( mean_dir_outlyingness[:, np.newaxis, :], axis=-1)) weighted_norm = norm * pointwise_weights variation_dir_outlyingness = scipy.integrate.simps( - weighted_norm, fdatagrid.sample_points[0], + weighted_norm, fdatagrid.grid_points[0], axis=1) assert variation_dir_outlyingness.shape == (fdatagrid.n_samples,) @@ -289,8 +289,8 @@ class DirectionalOutlierDetector(BaseEstimator, OutlierMixin): ... [0.5, 0.5, 1, 2, 1.5, 1], ... [-1, -1, -0.5, 1, 1, 0.5], ... 
[-0.5, -0.5, -0.5, -1, -1, -1]] - >>> sample_points = [0, 2, 4, 6, 8, 10] - >>> fd = skfda.FDataGrid(data_matrix, sample_points) + >>> grid_points = [0, 2, 4, 6, 8, 10] + >>> fd = skfda.FDataGrid(data_matrix, grid_points) >>> out_detector = DirectionalOutlierDetector() >>> out_detector.fit_predict(fd) array([1, 1, 1, 1]) diff --git a/skfda/exploratory/outliers/_iqr.py b/skfda/exploratory/outliers/_iqr.py index d48d41cf1..632906f01 100644 --- a/skfda/exploratory/outliers/_iqr.py +++ b/skfda/exploratory/outliers/_iqr.py @@ -24,8 +24,8 @@ class IQROutlierDetector(BaseEstimator, OutlierMixin): ... [0.5, 0.5, 1, 2, 1.5, 1], ... [-1, -1, -0.5, 1, 1, 0.5], ... [-0.5, -0.5, -0.5, -1, -1, -1]] - >>> sample_points = [0, 2, 4, 6, 8, 10] - >>> fd = skfda.FDataGrid(data_matrix, sample_points) + >>> grid_points = [0, 2, 4, 6, 8, 10] + >>> fd = skfda.FDataGrid(data_matrix, grid_points) >>> out_detector = IQROutlierDetector() >>> out_detector.fit_predict(fd) array([-1, 1, 1, -1]) diff --git a/skfda/exploratory/visualization/_boxplot.py b/skfda/exploratory/visualization/_boxplot.py index 6b1ce495e..c79b15735 100644 --- a/skfda/exploratory/visualization/_boxplot.py +++ b/skfda/exploratory/visualization/_boxplot.py @@ -114,17 +114,17 @@ class Boxplot(FDataBoxplot): Attributes: fdatagrid (FDataGrid): Object containing the data. - median (array, (fdatagrid.dim_codomain, nsample_points)): contains + median (array, (fdatagrid.dim_codomain, ngrid_points)): contains the median/s. - central_envelope (array, (fdatagrid.dim_codomain, 2, nsample_points)): + central_envelope (array, (fdatagrid.dim_codomain, 2, ngrid_points)): contains the central envelope/s. non_outlying_envelope (array, (fdatagrid.dim_codomain, 2, - nsample_points)): + ngrid_points)): contains the non-outlying envelope/s. colormap (matplotlib.colors.LinearSegmentedColormap): Colormap from which the colors to represent the central regions are selected. envelopes (array, (fdatagrid.dim_codomain * ncentral_regions, 2, - nsample_points)): contains the region envelopes. + ngrid_points)): contains the region envelopes. outliers (array, (fdatagrid.dim_codomain, fdatagrid.n_samples)): contains the outliers. barcol (string): Color of the envelopes and vertical lines. @@ -159,8 +159,8 @@ class Boxplot(FDataBoxplot): ... [0.5, 0.5, 1, 2, 1.5, 1], ... [-1, -1, -0.5, 1, 1, 0.5], ... [-0.5, -0.5, -0.5, -1, -1, -1]] - >>> sample_points = [0, 2, 4, 6, 8, 10] - >>> fd = FDataGrid(data_matrix, sample_points, dataset_name="dataset", + >>> grid_points = [0, 2, 4, 6, 8, 10] + >>> fd = FDataGrid(data_matrix, grid_points, dataset_name="dataset", ... argument_names=["x_label"], ... coordinate_names=["y_label"]) >>> Boxplot(fd) @@ -190,7 +190,7 @@ class Boxplot(FDataBoxplot): [-1. ], [-1. ], [-1. 
]]]), - sample_points=(array([ 0., 2., 4., 6., 8., 10.]),), + grid_points=(array([ 0., 2., 4., 6., 8., 10.]),), domain_range=((0.0, 10.0),), dataset_name='dataset', argument_names=('x_label',), @@ -389,35 +389,35 @@ def plot(self, chart=None, *, fig=None, axes=None, # Outliers for o in outliers: - axes[m].plot(o.sample_points[0], + axes[m].plot(o.grid_points[0], o.data_matrix[0, :, m], color=self.outliercol, linestyle='--', zorder=1) for i in range(len(self._prob)): # central regions - axes[m].fill_between(self.fdatagrid.sample_points[0], + axes[m].fill_between(self.fdatagrid.grid_points[0], self.envelopes[i][0][..., m], self.envelopes[i][1][..., m], facecolor=color[i], zorder=var_zorder) # outlying envelope - axes[m].plot(self.fdatagrid.sample_points[0], + axes[m].plot(self.fdatagrid.grid_points[0], self.non_outlying_envelope[0][..., m], - self.fdatagrid.sample_points[0], + self.fdatagrid.grid_points[0], self.non_outlying_envelope[1][..., m], color=self.barcol, zorder=4) # central envelope - axes[m].plot(self.fdatagrid.sample_points[0], + axes[m].plot(self.fdatagrid.grid_points[0], self.central_envelope[0][..., m], - self.fdatagrid.sample_points[0], + self.fdatagrid.grid_points[0], self.central_envelope[1][..., m], color=self.barcol, zorder=4) # vertical lines index = math.ceil(self.fdatagrid.ncol / 2) - x = self.fdatagrid.sample_points[0][index] + x = self.fdatagrid.grid_points[0][index] axes[m].plot([x, x], [self.non_outlying_envelope[0][..., m][index], self.central_envelope[0][..., m][index]], @@ -429,7 +429,7 @@ def plot(self, chart=None, *, fig=None, axes=None, color=self.barcol, zorder=4) # median sample - axes[m].plot(self.fdatagrid.sample_points[0], self.median[..., m], + axes[m].plot(self.fdatagrid.grid_points[0], self.median[..., m], color=self.mediancol, zorder=5) _set_labels(self.fdatagrid, fig, axes) @@ -497,8 +497,8 @@ class SurfaceBoxplot(FDataBoxplot): ... [[4], [0.4], [5]]], ... [[[2], [0.5], [2]], ... [[3], [0.6], [3]]]] - >>> sample_points = [[2, 4], [3, 6, 8]] - >>> fd = FDataGrid(data_matrix, sample_points, dataset_name="dataset", + >>> grid_points = [[2, 4], [3, 6, 8]] + >>> fd = FDataGrid(data_matrix, grid_points, dataset_name="dataset", ... argument_names=["x1_label", "x2_label"], ... coordinate_names=["y_label"]) >>> SurfaceBoxplot(fd) @@ -516,7 +516,7 @@ class SurfaceBoxplot(FDataBoxplot): [[ 3. ], [ 0.6], [ 3. ]]]]), - sample_points=(array([ 2., 4.]), array([ 3., 6., 8.])), + grid_points=(array([ 2., 4.]), array([ 3., 6., 8.])), domain_range=((2.0, 4.0), (3.0, 8.0)), dataset_name='dataset', argument_names=('x1_label', 'x2_label'), @@ -671,9 +671,9 @@ def plot(self, chart=None, *, fig=None, axes=None, fig, axes = _set_figure_layout_for_fdata( self.fdatagrid, fig, axes, n_rows, n_cols) - x = self.fdatagrid.sample_points[0] + x = self.fdatagrid.grid_points[0] lx = len(x) - y = self.fdatagrid.sample_points[1] + y = self.fdatagrid.grid_points[1] ly = len(y) X, Y = np.meshgrid(x, y) diff --git a/skfda/exploratory/visualization/_magnitude_shape_plot.py b/skfda/exploratory/visualization/_magnitude_shape_plot.py index 6992e4e87..faddef6f8 100644 --- a/skfda/exploratory/visualization/_magnitude_shape_plot.py +++ b/skfda/exploratory/visualization/_magnitude_shape_plot.py @@ -116,8 +116,8 @@ class MagnitudeShapePlot: ... [0.5, 0.5, 1, 2, 1.5, 1], ... [-1, -1, -0.5, 1, 1, 0.5], ... [-0.5, -0.5, -0.5, -1, -1, -1]] - >>> sample_points = [ 0., 2., 4., 6., 8., 10.] - >>> fd = skfda.FDataGrid(data_matrix, sample_points) + >>> grid_points = [ 0., 2., 4., 6., 8., 10.] 
+ >>> fd = skfda.FDataGrid(data_matrix, grid_points) >>> MagnitudeShapePlot(fd) MagnitudeShapePlot( FDataGrid=FDataGrid( @@ -145,7 +145,7 @@ class MagnitudeShapePlot: [-1. ], [-1. ], [-1. ]]]), - sample_points=(array([ 0., 2., 4., 6., 8., 10.]),), + grid_points=(array([ 0., 2., 4., 6., 8., 10.]),), domain_range=((0.0, 10.0),), ...), depth_method=projection_depth, diff --git a/skfda/exploratory/visualization/clustering.py b/skfda/exploratory/visualization/clustering.py index c945e02ef..21af30129 100644 --- a/skfda/exploratory/visualization/clustering.py +++ b/skfda/exploratory/visualization/clustering.py @@ -176,12 +176,12 @@ def _plot_clusters(estimator, fdata, *, chart=None, fig=None, axes=None, for j in range(fdata.dim_codomain): for i in range(fdata.n_samples): - axes[j].plot(fdata.sample_points[0], + axes[j].plot(fdata.grid_points[0], fdata.data_matrix[i, :, j], c=colors_by_cluster[i], label=sample_labels[i]) for i in range(estimator.n_clusters): - axes[j].plot(fdata.sample_points[0], + axes[j].plot(fdata.grid_points[0], estimator.cluster_centers_.data_matrix[i, :, j], c=center_colors[i], label=center_labels[i], diff --git a/skfda/exploratory/visualization/representation.py b/skfda/exploratory/visualization/representation.py index 3389a343f..209356e5a 100644 --- a/skfda/exploratory/visualization/representation.py +++ b/skfda/exploratory/visualization/representation.py @@ -209,7 +209,7 @@ def plot_graph(fdata, chart=None, *, fig=None, axes=None, return fig -def plot_scatter(fdata, chart=None, *, sample_points=None, +def plot_scatter(fdata, chart=None, *, grid_points=None, fig=None, axes=None, n_rows=None, n_cols=None, domain_range=None, group=None, group_colors=None, group_names=None, @@ -222,7 +222,7 @@ def plot_scatter(fdata, chart=None, *, sample_points=None, with the graphs are plotted or axis over where the graphs are plotted. If None and ax is also None, the figure is initialized. - sample_points (ndarray): points to plot. + grid_points (ndarray): points to plot. fig (figure object, optional): figure over with the graphs are plotted in case ax is not specified. If None and ax is also None, the figure is initialized. 
@@ -267,14 +267,14 @@ def plot_scatter(fdata, chart=None, *, sample_points=None, evaluated_points = None - if sample_points is None: + if grid_points is None: # This can only be done for FDataGrid - sample_points = fdata.sample_points + grid_points = fdata.grid_points evaluated_points = fdata.data_matrix if evaluated_points is None: evaluated_points = fdata( - sample_points, grid=True) + grid_points, grid=True) fig, axes = _get_figure_and_axes(chart, fig, axes) fig, axes = _set_figure_layout_for_fdata(fdata, fig, axes, n_rows, n_cols) @@ -297,14 +297,14 @@ def plot_scatter(fdata, chart=None, *, sample_points=None, if sample_colors is not None: color_dict["color"] = sample_colors[j] - axes[i].scatter(sample_points[0], + axes[i].scatter(grid_points[0], evaluated_points[j, ..., i].T, **color_dict, **kwargs) else: - X = fdata.sample_points[0] - Y = fdata.sample_points[1] + X = fdata.grid_points[0] + Y = fdata.grid_points[1] X, Y = np.meshgrid(X, Y) color_dict = {} diff --git a/skfda/inference/anova/anova_oneway.py b/skfda/inference/anova/anova_oneway.py index d806b7fd1..f25bede87 100644 --- a/skfda/inference/anova/anova_oneway.py +++ b/skfda/inference/anova/anova_oneway.py @@ -56,7 +56,7 @@ def v_sample_stat(fd, weights, p=2): >>> x1 = t * (1 - t) ** 5 >>> x2 = t ** 2 * (1 - t) ** 4 >>> x3 = t ** 3 * (1 - t) ** 3 - >>> fd = FDataGrid([x1, x2, x3], sample_points=t) + >>> fd = FDataGrid([x1, x2, x3], grid_points=t) >>> weights = [10, 20, 30] Finally the value of the statistic is calculated: @@ -129,7 +129,7 @@ def v_asymptotic_stat(fd, weights, p=2): >>> x1 = t * (1 - t) ** 5 >>> x2 = t ** 2 * (1 - t) ** 4 >>> x3 = t ** 3 * (1 - t) ** 3 - >>> fd = FDataGrid([x1, x2, x3], sample_points=t) + >>> fd = FDataGrid([x1, x2, x3], grid_points=t) >>> weights = [10, 20, 30] Finally the value of the statistic is calculated: @@ -304,7 +304,7 @@ def oneway_anova(*args, n_reps=2000, return_dist=False, random_state=None, if isinstance(fd_groups[0], FDataGrid): # Creating list with all the sample points - list_sample = [fd.sample_points[0].tolist() for fd in fd_groups] + list_sample = [fd.grid_points[0].tolist() for fd in fd_groups] # Checking that the all the entries in the list are the same if not list_sample.count(list_sample[0]) == len(list_sample): raise ValueError("All FDataGrid passed must have the same sample " diff --git a/skfda/misc/_math.py b/skfda/misc/_math.py index fbf9b5af9..51b1edda5 100644 --- a/skfda/misc/_math.py +++ b/skfda/misc/_math.py @@ -256,13 +256,13 @@ def inner_product(arg1, arg2, **kwargs): @inner_product.register def inner_product_fdatagrid(arg1: FDataGrid, arg2: FDataGrid): - if not np.array_equal(arg1.sample_points, - arg2.sample_points): + if not np.array_equal(arg1.grid_points, + arg2.grid_points): raise ValueError("Sample points for both objects must be equal") integrand = arg1.data_matrix * arg2.data_matrix - for s in arg1.sample_points: + for s in arg1.grid_points: integrand = scipy.integrate.simps(integrand, x=s, axis=1) diff --git a/skfda/misc/metrics.py b/skfda/misc/metrics.py index 18188c40a..a8b181923 100644 --- a/skfda/misc/metrics.py +++ b/skfda/misc/metrics.py @@ -43,20 +43,20 @@ def _cast_to_grid(fdata1, fdata2, eval_points=None, _check=True, **kwargs): fdata2 = fdata2.to_grid(eval_points) elif not isinstance(fdata1, FDataGrid) and isinstance(fdata2, FDataGrid): - fdata1 = fdata1.to_grid(fdata2.sample_points[0]) + fdata1 = fdata1.to_grid(fdata2.grid_points[0]) elif not isinstance(fdata2, FDataGrid) and isinstance(fdata1, FDataGrid): - fdata2 = 
fdata2.to_grid(fdata1.sample_points[0]) + fdata2 = fdata2.to_grid(fdata1.grid_points[0]) elif (not isinstance(fdata1, FDataGrid) and not isinstance(fdata2, FDataGrid)): domain = fdata1.domain_range[0] - sample_points = np.linspace(*domain) - fdata1 = fdata1.to_grid(sample_points) - fdata2 = fdata2.to_grid(sample_points) + grid_points = np.linspace(*domain) + fdata1 = fdata1.to_grid(grid_points) + fdata2 = fdata2.to_grid(grid_points) - elif not np.array_equal(fdata1.sample_points, - fdata2.sample_points): + elif not np.array_equal(fdata1.grid_points, + fdata2.grid_points): raise ValueError("Sample points for both objects must be equal or" "a new list evaluation points must be specified") @@ -252,7 +252,7 @@ def lp_norm(fdata, p=2, p2=None): # Computes the norm, approximating the integral with Simpson's # rule. res = scipy.integrate.simps(data_matrix[..., 0] ** p, - x=fdata.sample_points) ** (1 / p) + x=fdata.grid_points) ** (1 / p) else: # Needed to perform surface integration @@ -356,12 +356,12 @@ def fisher_rao_distance(fdata1, fdata2, *, eval_points=None, _check=True): _check=_check) # Both should have the same sample points - eval_points_normalized = _normalize_scale(fdata1.sample_points[0]) + eval_points_normalized = _normalize_scale(fdata1.grid_points[0]) # Calculate the corresponding srsf and normalize to (0,1) - fdata1 = fdata1.copy(sample_points=eval_points_normalized, + fdata1 = fdata1.copy(grid_points=eval_points_normalized, domain_range=(0, 1)) - fdata2 = fdata2.copy(sample_points=eval_points_normalized, + fdata2 = fdata2.copy(grid_points=eval_points_normalized, domain_range=(0, 1)) srsf = SRSF(initial_value=0) @@ -426,12 +426,12 @@ def amplitude_distance(fdata1, fdata2, *, lam=0., eval_points=None, _check=_check) # Both should have the same sample points - eval_points_normalized = _normalize_scale(fdata1.sample_points[0]) + eval_points_normalized = _normalize_scale(fdata1.grid_points[0]) # Calculate the corresponding srsf and normalize to (0,1) - fdata1 = fdata1.copy(sample_points=eval_points_normalized, + fdata1 = fdata1.copy(grid_points=eval_points_normalized, domain_range=(0, 1)) - fdata2 = fdata2.copy(sample_points=eval_points_normalized, + fdata2 = fdata2.copy(grid_points=eval_points_normalized, domain_range=(0, 1)) elastic_registration = ElasticRegistration( @@ -506,12 +506,12 @@ def phase_distance(fdata1, fdata2, *, lam=0., eval_points=None, _check=True, _check=_check) # Rescale in (0,1) - eval_points_normalized = _normalize_scale(fdata1.sample_points[0]) + eval_points_normalized = _normalize_scale(fdata1.grid_points[0]) # Calculate the corresponding srsf and normalize to (0,1) - fdata1 = fdata1.copy(sample_points=eval_points_normalized, + fdata1 = fdata1.copy(grid_points=eval_points_normalized, domain_range=(0, 1)) - fdata2 = fdata2.copy(sample_points=eval_points_normalized, + fdata2 = fdata2.copy(grid_points=eval_points_normalized, domain_range=(0, 1)) elastic_registration = ElasticRegistration( @@ -588,7 +588,7 @@ def warping_distance(warping1, warping2, *, eval_points=None, _check=True): product = np.multiply(srsf_warping1, srsf_warping2, out=srsf_warping1) - d = scipy.integrate.simps(product, x=warping1.sample_points[0]) + d = scipy.integrate.simps(product, x=warping1.grid_points[0]) d = np.clip(d, -1, 1) return np.arccos(d) diff --git a/skfda/misc/operators/_linear_differential_operator.py b/skfda/misc/operators/_linear_differential_operator.py index 06a67d817..1009a4417 100644 --- a/skfda/misc/operators/_linear_differential_operator.py +++ 
b/skfda/misc/operators/_linear_differential_operator.py @@ -564,14 +564,14 @@ def fdatagrid_penalty_matrix_optimized( basis: FDataGrid): evaluated_basis = sum( - w(basis.sample_points[0]) * - basis.derivative(order=i)(basis.sample_points[0]) + w(basis.grid_points[0]) * + basis.derivative(order=i)(basis.grid_points[0]) for i, w in enumerate(linear_operator.weights)) indices = np.triu_indices(basis.n_samples) product = evaluated_basis[indices[0]] * evaluated_basis[indices[1]] - triang_vec = scipy.integrate.simps(product[..., 0], x=basis.sample_points) + triang_vec = scipy.integrate.simps(product[..., 0], x=basis.grid_points) matrix = np.empty((basis.n_samples, basis.n_samples)) diff --git a/skfda/ml/clustering/kmeans.py b/skfda/ml/clustering/kmeans.py index 7edacfe02..3692fedd1 100644 --- a/skfda/ml/clustering/kmeans.py +++ b/skfda/ml/clustering/kmeans.py @@ -458,8 +458,8 @@ class KMeans(BaseKMeans): ... [0.5, 0.5, 1, 2, 1.5, 1], ... [-1, -1, -0.5, 1, 1, 0.5], ... [-0.5, -0.5, -0.5, -1, -1, -1]] - >>> sample_points = [0, 2, 4, 6, 8, 10] - >>> fd = skfda.FDataGrid(data_matrix, sample_points) + >>> grid_points = [0, 2, 4, 6, 8, 10] + >>> fd = skfda.FDataGrid(data_matrix, grid_points) >>> kmeans = skfda.ml.clustering.KMeans(random_state=0) >>> kmeans.fit(fd) KMeans(...) @@ -641,8 +641,8 @@ class FuzzyCMeans(BaseKMeans): >>> data_matrix = [[[1, 0.3], [2, 0.4], [3, 0.5], [4, 0.6]], ... [[2, 0.5], [3, 0.6], [4, 0.7], [5, 0.7]], ... [[3, 0.2], [4, 0.3], [5, 0.4], [6, 0.5]]] - >>> sample_points = [2, 4, 6, 8] - >>> fd = skfda.FDataGrid(data_matrix, sample_points) + >>> grid_points = [2, 4, 6, 8] + >>> fd = skfda.FDataGrid(data_matrix, grid_points) >>> fuzzy_kmeans = skfda.ml.clustering.FuzzyCMeans(random_state=0) >>> fuzzy_kmeans.fit(fd) FuzzyCMeans(...) diff --git a/skfda/preprocessing/dim_reduction/projection/_fpca.py b/skfda/preprocessing/dim_reduction/projection/_fpca.py index 37fe82ed2..76346b24a 100644 --- a/skfda/preprocessing/dim_reduction/projection/_fpca.py +++ b/skfda/preprocessing/dim_reduction/projection/_fpca.py @@ -56,8 +56,8 @@ class FPCA(BaseEstimator, TransformerMixin): several equivalent possibilities. >>> data_matrix = np.array([[1.0, 0.0], [0.0, 2.0]]) - >>> sample_points = [0, 1] - >>> fd = FDataGrid(data_matrix, sample_points) + >>> grid_points = [0, 1] + >>> fd = FDataGrid(data_matrix, grid_points) >>> basis = skfda.representation.basis.Monomial((0,1), n_basis=2) >>> basis_fd = fd.to_basis(basis) >>> fpca_basis = FPCA(2) @@ -70,8 +70,8 @@ class FPCA(BaseEstimator, TransformerMixin): possibilities. >>> data_matrix = np.array([[1.0, 0.0], [0.0, 2.0]]) - >>> sample_points = [0, 1] - >>> fd = FDataGrid(data_matrix, sample_points) + >>> grid_points = [0, 1] + >>> fd = FDataGrid(data_matrix, grid_points) >>> fpca_grid = FPCA(2) >>> fpca_grid = fpca_grid.fit(fd) @@ -284,15 +284,15 @@ def _fit_grid(self, X: FDataGrid, y=None): # establish weights for each point of discretization if not self.weights: - # sample_points is a list with one array in the 1D case + # grid_points is a list with one array in the 1D case # in trapezoidal rule, suppose \deltax_k = x_k - x_{k-1}, the weight # vector is as follows: [\deltax_1/2, \deltax_1/2 + \deltax_2/2, # \deltax_2/2 + \deltax_3/2, ... 
, \deltax_n/2] - differences = np.diff(X.sample_points[0]) + differences = np.diff(X.grid_points[0]) differences = np.concatenate(((0,), differences, (0,))) self.weights = (differences[:-1] + differences[1:]) / 2 elif callable(self.weights): - self.weights = self.weights(X.sample_points[0]) + self.weights = self.weights(X.grid_points[0]) # if its a FDataGrid then we need to reduce the dimension to 1-D # array if isinstance(self.weights, FDataGrid): @@ -302,7 +302,7 @@ def _fit_grid(self, X: FDataGrid, y=None): basis = FDataGrid( data_matrix=np.identity(n_points_discretization), - sample_points=X.sample_points + grid_points=X.grid_points ) regularization_matrix = compute_penalty_matrix( diff --git a/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py b/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py index acdbb0dfd..afd9119cc 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py @@ -153,14 +153,14 @@ class RKHSVariableSelection(sklearn.base.BaseEstimator, ... n_features_to_select=3) >>> _ = rkvs.fit(X, y) >>> point_mask = rkvs.get_support() - >>> points = X.sample_points[0][point_mask] + >>> points = X.grid_points[0][point_mask] >>> np.allclose(points, [0.25, 0.5, 0.75], rtol=1e-2) True Apply the learned dimensionality reduction >>> X_dimred = rkvs.transform(X) - >>> len(X.sample_points[0]) + >>> len(X.grid_points[0]) 1000 >>> X_dimred.shape (10000, 3) diff --git a/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py index db0e0ff92..ff45b7e2d 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py @@ -150,14 +150,14 @@ class MaximaHunting(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin): ... 
local_maxima_selector=local_maxima_selector) >>> _ = mh.fit(X, y) >>> point_mask = mh.get_support() - >>> points = X.sample_points[0][point_mask] + >>> points = X.grid_points[0][point_mask] >>> np.allclose(points, [0.5], rtol=0.1) True Apply the learned dimensionality reduction >>> X_dimred = mh.transform(X) - >>> len(X.sample_points[0]) + >>> len(X.grid_points[0]) 100 >>> X_dimred.shape (10000, 1) diff --git a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py index 1b4a83793..efa21536a 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py @@ -214,7 +214,7 @@ def begin(self, X, y): if self.fit_hyperparameters: import GPy - T = X.sample_points[0] + T = X.grid_points[0] X_copy = np.copy(X.data_matrix[..., 0]) y = np.ravel(y) @@ -270,7 +270,7 @@ def conditioned(self, X, t_0, **kwargs): def conditional_mean(self, X, selected_index): - T = X.sample_points[0] + T = X.grid_points[0] t_0 = T[selected_index] @@ -420,7 +420,7 @@ def begin(self, X: FDataGrid, Y): X_copy[Y == class_label, :] -= mean self.cov_matrix_ = np.cov(X_copy, rowvar=False) - self.t_ = np.ravel(X.sample_points) + self.t_ = np.ravel(X.grid_points) self.gaussian_correction_ = GaussianCorrection( cov=self.cov) @@ -723,7 +723,7 @@ def _rec_maxima_hunting_gen_no_copy( correction = UniformCorrection() if mask is None: - mask = np.zeros([len(t) for t in X.sample_points], dtype=bool) + mask = np.zeros([len(t) for t in X.grid_points], dtype=bool) if redundancy_condition is None: redundancy_condition = DependenceThresholdRedundancy() @@ -782,8 +782,8 @@ def _rec_maxima_hunting_gen_no_copy( correction = correction.conditioned( X=X.data_matrix, - T=X.sample_points[0], - t_0=X.sample_points[0][t_max_index]) + T=X.grid_points[0], + t_0=X.grid_points[0][t_max_index]) first_pass = False @@ -867,14 +867,14 @@ class RecursiveMaximaHunting( >>> rmh = variable_selection.RecursiveMaximaHunting() >>> _ = rmh.fit(X, y) >>> point_mask = rmh.get_support() - >>> points = X.sample_points[0][point_mask] + >>> points = X.grid_points[0][point_mask] >>> np.allclose(points, [0.25, 0.5, 0.75], rtol=1e-1) True Apply the learned dimensionality reduction >>> X_dimred = rmh.transform(X) - >>> len(X.sample_points[0]) + >>> len(X.grid_points[0]) 100 >>> X_dimred.shape (10000, 3) diff --git a/skfda/preprocessing/registration/_landmark_registration.py b/skfda/preprocessing/registration/_landmark_registration.py index 5e8a96208..40a666a9c 100644 --- a/skfda/preprocessing/registration/_landmark_registration.py +++ b/skfda/preprocessing/registration/_landmark_registration.py @@ -238,7 +238,7 @@ def landmark_registration_warping(fd, landmarks, *, location=None, data_matrix[:, 1:-1] = landmarks if location is None: - sample_points = np.mean(data_matrix, axis=0) + grid_points = np.mean(data_matrix, axis=0) elif n_landmarks != len(location): @@ -246,20 +246,20 @@ def landmark_registration_warping(fd, landmarks, *, location=None, f"the number of landmarks ({len(location)}) != " f"({n_landmarks})") else: - sample_points = np.empty(n_landmarks + 2) - sample_points[0] = fd.domain_range[0][0] - sample_points[-1] = fd.domain_range[0][1] - sample_points[1:-1] = location + grid_points = np.empty(n_landmarks + 2) + grid_points[0] = fd.domain_range[0][0] + grid_points[-1] = fd.domain_range[0][1] + grid_points[1:-1] = location interpolation = 
SplineInterpolation(interpolation_order=3, monotone=True) warping = FDataGrid(data_matrix=data_matrix, - sample_points=sample_points, + grid_points=grid_points, interpolation=interpolation, extrapolation='bounds') try: - warping_points = fd.sample_points + warping_points = fd.grid_points except AttributeError: warping_points = [np.linspace(*domain, 201) for domain in fd.domain_range] diff --git a/skfda/preprocessing/registration/_shift_registration.py b/skfda/preprocessing/registration/_shift_registration.py index 237165b8d..602e0ec77 100644 --- a/skfda/preprocessing/registration/_shift_registration.py +++ b/skfda/preprocessing/registration/_shift_registration.py @@ -67,7 +67,7 @@ class ShiftRegistration(RegistrationTransformer): functions are evaluated to obtain the discrete representation of the object to integrate. If None is passed it calls numpy.linspace in FDataBasis and uses the - `sample_points` in FDataGrids. + `grid_points` in FDataGrids. Attributes: template_ (FData): Template :math:`\mu` learned during the fitting @@ -169,7 +169,7 @@ def _compute_deltas(self, fd, template): if self.output_points is None: try: - output_points = fd.sample_points[0] + output_points = fd.grid_points[0] nfine = len(output_points) except AttributeError: nfine = max(fd.n_basis * constants.BASIS_MIN_FACTOR + 1, @@ -248,7 +248,7 @@ def _compute_deltas(self, fd, template): elif template == "fixed" and self.restrict_domain: tfine_aux = template_points_aux[domain] elif callable(template): # Callable - fd_x = FDataGrid(x, sample_points=output_points) + fd_x = FDataGrid(x, grid_points=output_points) fd_tfine = template(fd_x) tfine_aux = fd_tfine.data_matrix.ravel() @@ -274,7 +274,7 @@ def _compute_deltas(self, fd, template): else: # Stores the template in an FDataGrid - template = FDataGrid(tfine_aux, sample_points=output_points) + template = FDataGrid(tfine_aux, grid_points=output_points) return delta, template diff --git a/skfda/preprocessing/registration/_warping.py b/skfda/preprocessing/registration/_warping.py index 90c5391ca..b2f4a222f 100644 --- a/skfda/preprocessing/registration/_warping.py +++ b/skfda/preprocessing/registration/_warping.py @@ -75,7 +75,7 @@ def invert_warping(fdatagrid, *, output_points=None): check_is_univariate(fdatagrid) if output_points is None: - output_points = fdatagrid.sample_points[0] + output_points = fdatagrid.grid_points[0] y = fdatagrid(output_points)[..., 0] @@ -84,7 +84,7 @@ def invert_warping(fdatagrid, *, output_points=None): for i in range(fdatagrid.n_samples): data_matrix[i] = PchipInterpolator(y[i], output_points)(output_points) - return fdatagrid.copy(data_matrix=data_matrix, sample_points=output_points) + return fdatagrid.copy(data_matrix=data_matrix, grid_points=output_points) def _normalize_scale(t, a=0, b=1): @@ -130,7 +130,7 @@ def normalize_warping(warping, domain_range=None): domain_range = warping.domain_range[0] data_matrix = _normalize_scale(warping.data_matrix[..., 0], *domain_range) - sample_points = _normalize_scale(warping.sample_points[0], *domain_range) + grid_points = _normalize_scale(warping.grid_points[0], *domain_range) - return warping.copy(data_matrix=data_matrix, sample_points=sample_points, + return warping.copy(data_matrix=data_matrix, grid_points=grid_points, domain_range=domain_range) diff --git a/skfda/preprocessing/registration/elastic.py b/skfda/preprocessing/registration/elastic.py index a514dd7e9..5dd16b34e 100644 --- a/skfda/preprocessing/registration/elastic.py +++ b/skfda/preprocessing/registration/elastic.py @@ -154,7 +154,7 
@@ def transform(self, X: FDataGrid, y=None): check_is_univariate(X) if self.output_points is None: - output_points = X.sample_points[0] + output_points = X.grid_points[0] else: output_points = self.output_points @@ -174,7 +174,7 @@ def transform(self, X: FDataGrid, y=None): a = X.domain_range[0][0] self.initial_value_ = X(a).reshape(X.n_samples, 1, X.dim_codomain) - return X.copy(data_matrix=data_matrix, sample_points=output_points) + return X.copy(data_matrix=data_matrix, grid_points=output_points) def inverse_transform(self, X: FDataGrid, y=None): r"""Computes the inverse SRSF transform. @@ -218,7 +218,7 @@ def inverse_transform(self, X: FDataGrid, y=None): "transformation.") if self.output_points is None: - output_points = X.sample_points[0] + output_points = X.grid_points[0] else: output_points = self.output_points @@ -235,7 +235,7 @@ def inverse_transform(self, X: FDataGrid, y=None): else: f_data_matrix += self.initial_value - return X.copy(data_matrix=f_data_matrix, sample_points=output_points) + return X.copy(data_matrix=f_data_matrix, grid_points=output_points) def _elastic_alignment_array(template_data, q_data, @@ -427,7 +427,7 @@ def transform(self, X: FDataGrid, y=None): # Points of discretization if self.output_points is None: - output_points = fdatagrid_srsf.sample_points[0] + output_points = fdatagrid_srsf.grid_points[0] else: output_points = self.output_points @@ -564,15 +564,15 @@ def warping_mean(warping, *, max_iter=100, tol=1e-6, step_size=.3): arXiv:1103.3817v2. """ - eval_points = warping.sample_points[0] + eval_points = warping.grid_points[0] original_eval_points = eval_points # Rescale warping to (0, 1) - if warping.sample_points[0][0] != 0 or warping.sample_points[0][-1] != 1: + if warping.grid_points[0][0] != 0 or warping.grid_points[0][-1] != 1: eval_points = _normalize_scale(eval_points) warping = FDataGrid(_normalize_scale(warping.data_matrix[..., 0]), - _normalize_scale(warping.sample_points[0])) + _normalize_scale(warping.grid_points[0])) # Compute srsf of warpings and their mean srsf = SRSF(output_points=eval_points, initial_value=0) @@ -630,7 +630,7 @@ def warping_mean(warping, *, max_iter=100, tol=1e-6, step_size=.3): monotone_interpolation = SplineInterpolation(interpolation_order=3, monotone=True) - mean = FDataGrid([warping_mean], sample_points=original_eval_points, + mean = FDataGrid([warping_mean], grid_points=original_eval_points, interpolation=monotone_interpolation) return mean @@ -695,7 +695,7 @@ def elastic_mean(fdatagrid, *, penalty=0., center=True, max_iter=20, tol=1e-3, srsf_transformer = SRSF(initial_value=0) fdatagrid_srsf = srsf_transformer.fit_transform(fdatagrid) - eval_points = fdatagrid.sample_points[0] + eval_points = fdatagrid.grid_points[0] eval_points_normalized = _normalize_scale(eval_points) y_scale = eval_points[-1] - eval_points[0] @@ -704,7 +704,7 @@ def elastic_mean(fdatagrid, *, penalty=0., center=True, max_iter=20, tol=1e-3, # Discretisation points fdatagrid_normalized = FDataGrid(fdatagrid(eval_points) / y_scale, - sample_points=eval_points_normalized) + grid_points=eval_points_normalized) srsf = fdatagrid_srsf(eval_points)[..., 0] @@ -724,7 +724,7 @@ def elastic_mean(fdatagrid, *, penalty=0., center=True, max_iter=20, tol=1e-3, gammas = _elastic_alignment_array( mu, srsf, eval_points_normalized, penalty, grid_dim) - gammas = FDataGrid(gammas, sample_points=eval_points_normalized, + gammas = FDataGrid(gammas, grid_points=eval_points_normalized, interpolation=interpolation) fdatagrid_normalized = 
fdatagrid_normalized.compose(gammas) @@ -754,7 +754,7 @@ def elastic_mean(fdatagrid, *, penalty=0., center=True, max_iter=20, tol=1e-3, # Karcher mean orbit in space L2/Gamma karcher_mean = srsf_transformer.inverse_transform( - fdatagrid.copy(data_matrix=[mu], sample_points=eval_points, + fdatagrid.copy(data_matrix=[mu], grid_points=eval_points, sample_names=("Karcher mean",))) if center: @@ -765,7 +765,7 @@ def elastic_mean(fdatagrid, *, penalty=0., center=True, max_iter=20, tol=1e-3, mean_normalized.data_matrix[..., 0], a=eval_points[0], b=eval_points[-1]), - sample_points=eval_points) + grid_points=eval_points) gamma_inverse = invert_warping(gamma_mean) diff --git a/skfda/preprocessing/registration/validation.py b/skfda/preprocessing/registration/validation.py index 38870cdaa..fc27ee72f 100644 --- a/skfda/preprocessing/registration/validation.py +++ b/skfda/preprocessing/registration/validation.py @@ -285,7 +285,7 @@ def score_function(self, X, y, *, warping=None): # Creates the mesh to discretize the functions if self.eval_points is None: try: - eval_points = y.sample_points[0] + eval_points = y.grid_points[0] except AttributeError: nfine = max(y.basis.n_basis * 10 + 1, 201) diff --git a/skfda/preprocessing/smoothing/_basis.py b/skfda/preprocessing/smoothing/_basis.py index 5bdad52b2..a27662c8f 100644 --- a/skfda/preprocessing/smoothing/_basis.py +++ b/skfda/preprocessing/smoothing/_basis.py @@ -124,7 +124,7 @@ def transform(self, estimator, X, y=None): else: # The matrix is cached return X.copy(data_matrix=self.hat_matrix() @ X.data_matrix, - sample_points=estimator.output_points_) + grid_points=estimator.output_points_) class BasisSmoother(_LinearSmoother): @@ -201,7 +201,7 @@ class BasisSmoother(_LinearSmoother): >>> x array([ 3., 3., 1., 1., 3.]) - >>> fd = skfda.FDataGrid(data_matrix=x, sample_points=t) + >>> fd = skfda.FDataGrid(data_matrix=x, grid_points=t) >>> basis = skfda.representation.basis.Fourier((0, 1), n_basis=3) >>> smoother = skfda.preprocessing.smoothing.BasisSmoother( ... basis, method='cholesky') @@ -216,7 +216,7 @@ class BasisSmoother(_LinearSmoother): However, the parameter ``return_basis`` can be used to return the data in basis form, by default, without extra smoothing: - >>> fd = skfda.FDataGrid(data_matrix=x, sample_points=t) + >>> fd = skfda.FDataGrid(data_matrix=x, grid_points=t) >>> basis = skfda.representation.basis.Fourier((0, 1), n_basis=3) >>> smoother = skfda.preprocessing.smoothing.BasisSmoother( ... basis, method='cholesky', return_basis=True) @@ -248,7 +248,7 @@ class BasisSmoother(_LinearSmoother): >>> from skfda.misc.regularization import TikhonovRegularization >>> from skfda.misc.operators import LinearDifferentialOperator >>> - >>> fd = skfda.FDataGrid(data_matrix=x, sample_points=t) + >>> fd = skfda.FDataGrid(data_matrix=x, grid_points=t) >>> basis = skfda.representation.basis.Fourier((0, 1), n_basis=3) >>> smoother = skfda.preprocessing.smoothing.BasisSmoother( ... basis, method='cholesky', @@ -259,7 +259,7 @@ class BasisSmoother(_LinearSmoother): >>> fd_basis.coefficients.round(2) array([[ 2.04, 0.51, 0.55]]) - >>> fd = skfda.FDataGrid(data_matrix=x, sample_points=t) + >>> fd = skfda.FDataGrid(data_matrix=x, grid_points=t) >>> basis = skfda.representation.basis.Fourier((0, 1), n_basis=3) >>> smoother = skfda.preprocessing.smoothing.BasisSmoother( ... 
basis, method='qr', @@ -270,7 +270,7 @@ class BasisSmoother(_LinearSmoother): >>> fd_basis.coefficients.round(2) array([[ 2.04, 0.51, 0.55]]) - >>> fd = skfda.FDataGrid(data_matrix=x, sample_points=t) + >>> fd = skfda.FDataGrid(data_matrix=x, grid_points=t) >>> basis = skfda.representation.basis.Fourier((0, 1), n_basis=3) >>> smoother = skfda.preprocessing.smoothing.BasisSmoother( ... basis, method='matrix', @@ -374,7 +374,7 @@ def fit(self, X: FDataGrid, y=None): """ - self.input_points_ = X.sample_points + self.input_points_ = X.grid_points self.output_points_ = (self.output_points if self.output_points is not None else self.input_points_) @@ -399,7 +399,7 @@ def fit_transform(self, X: FDataGrid, y=None): """ from ...misc.regularization import compute_penalty_matrix - self.input_points_ = X.sample_points + self.input_points_ = X.grid_points self.output_points_ = (self.output_points if self.output_points is not None else self.input_points_) @@ -471,7 +471,7 @@ def fit_transform(self, X: FDataGrid, y=None): if self.return_basis: return fdatabasis else: - return fdatabasis.to_grid(sample_points=self.output_points_) + return fdatabasis.to_grid(grid_points=self.output_points_) return self @@ -488,7 +488,7 @@ def transform(self, X: FDataGrid, y=None): """ assert all([all(i == s) - for i, s in zip(self.input_points_, X.sample_points)]) + for i, s in zip(self.input_points_, X.grid_points)]) method = self._method_function() diff --git a/skfda/preprocessing/smoothing/_linear.py b/skfda/preprocessing/smoothing/_linear.py index a7882be75..0729290dd 100644 --- a/skfda/preprocessing/smoothing/_linear.py +++ b/skfda/preprocessing/smoothing/_linear.py @@ -80,7 +80,7 @@ def fit(self, X: FDataGrid, y=None): """ _check_r_to_r(X) - self.input_points_ = X.sample_points[0] + self.input_points_ = X.grid_points[0] self.output_points_ = (self.output_points if self.output_points is not None else self.input_points_) @@ -102,11 +102,11 @@ def transform(self, X: FDataGrid, y=None): """ - assert all(self.input_points_ == X.sample_points[0]) + assert all(self.input_points_ == X.grid_points[0]) # The matrix is cached return X.copy(data_matrix=self.hat_matrix() @ X.data_matrix, - sample_points=self.output_points_) + grid_points=self.output_points_) def score(self, X, y): """Returns the generalized cross validation (GCV) score. diff --git a/skfda/preprocessing/smoothing/kernel_smoothers.py b/skfda/preprocessing/smoothing/kernel_smoothers.py index a35c587f5..263496fa7 100644 --- a/skfda/preprocessing/smoothing/kernel_smoothers.py +++ b/skfda/preprocessing/smoothing/kernel_smoothers.py @@ -107,7 +107,7 @@ class NadarayaWatsonSmoother(_LinearKernelSmoother): Examples: >>> from skfda import FDataGrid - >>> fd = FDataGrid(sample_points=[1, 2, 4, 5, 7], + >>> fd = FDataGrid(grid_points=[1, 2, 4, 5, 7], ... data_matrix=[[1, 2, 3, 4, 5]]) >>> smoother = NadarayaWatsonSmoother(smoothing_parameter=3.5) >>> fd_smoothed = smoother.fit_transform(fd) @@ -209,7 +209,7 @@ class LocalLinearRegressionSmoother(_LinearKernelSmoother): Examples: >>> from skfda import FDataGrid - >>> fd = FDataGrid(sample_points=[1, 2, 4, 5, 7], + >>> fd = FDataGrid(grid_points=[1, 2, 4, 5, 7], ... data_matrix=[[1, 2, 3, 4, 5]]) >>> smoother = LocalLinearRegressionSmoother(smoothing_parameter=3.5) >>> fd_smoothed = smoother.fit_transform(fd) @@ -298,7 +298,7 @@ class KNeighborsSmoother(_LinearKernelSmoother): Examples: >>> from skfda import FDataGrid - >>> fd = FDataGrid(sample_points=[1, 2, 4, 5, 7], + >>> fd = FDataGrid(grid_points=[1, 2, 4, 5, 7], ... 
data_matrix=[[1, 2, 3, 4, 5]]) >>> smoother = KNeighborsSmoother(smoothing_parameter=2) >>> fd_smoothed = smoother.fit_transform(fd) @@ -318,7 +318,7 @@ class KNeighborsSmoother(_LinearKernelSmoother): In case there are two points at the same distance it will take both. - >>> fd = FDataGrid(sample_points=[1, 2, 3, 5, 7], + >>> fd = FDataGrid(grid_points=[1, 2, 3, 5, 7], ... data_matrix=[[1, 2, 3, 4, 5]]) >>> smoother = KNeighborsSmoother(smoothing_parameter=2) >>> fd_smoothed = smoother.fit_transform(fd) diff --git a/skfda/preprocessing/smoothing/validation.py b/skfda/preprocessing/smoothing/validation.py index 3b54305cb..a01080707 100644 --- a/skfda/preprocessing/smoothing/validation.py +++ b/skfda/preprocessing/smoothing/validation.py @@ -189,7 +189,7 @@ class SmoothingParameterSearch(GridSearchCV): [ 0.67], [ 1.67], [ 2.5 ]]]), - sample_points=(array([-2., -1., 0., 1., 2.]),), + grid_points=(array([-2., -1., 0., 1., 2.]),), domain_range=((-2.0, 2.0),), ...) diff --git a/skfda/representation/_evaluation_trasformer.py b/skfda/representation/_evaluation_trasformer.py index feadd24f8..c3921f1e4 100644 --- a/skfda/representation/_evaluation_trasformer.py +++ b/skfda/representation/_evaluation_trasformer.py @@ -37,8 +37,8 @@ class EvaluationTransformer(BaseEstimator, TransformerMixin): representing a function :math:`f : \mathbb{R}\longmapsto\mathbb{R}`. >>> data_matrix = [[1, 2], [2, 3]] - >>> sample_points = [2, 4] - >>> fd = FDataGrid(data_matrix, sample_points) + >>> grid_points = [2, 4] + >>> fd = FDataGrid(data_matrix, grid_points) >>> >>> transformer = EvaluationTransformer() >>> transformer.fit_transform(fd) @@ -49,8 +49,8 @@ class EvaluationTransformer(BaseEstimator, TransformerMixin): representing a function :math:`f : \mathbb{R}\longmapsto\mathbb{R}^2`. >>> data_matrix = [[[1, 0.3], [2, 0.4]], [[2, 0.5], [3, 0.6]]] - >>> sample_points = [2, 4] - >>> fd = FDataGrid(data_matrix, sample_points) + >>> grid_points = [2, 4] + >>> fd = FDataGrid(data_matrix, grid_points) >>> >>> transformer = EvaluationTransformer() >>> transformer.fit_transform(fd) @@ -61,8 +61,8 @@ class EvaluationTransformer(BaseEstimator, TransformerMixin): representing a function :math:`f : \mathbb{R}^2\longmapsto\mathbb{R}`. >>> data_matrix = [[[1, 0.3], [2, 0.4]], [[2, 0.5], [3, 0.6]]] - >>> sample_points = [[2, 4], [3, 6]] - >>> fd = FDataGrid(data_matrix, sample_points) + >>> grid_points = [[2, 4], [3, 6]] + >>> fd = FDataGrid(data_matrix, grid_points) >>> >>> transformer = EvaluationTransformer() >>> transformer.fit_transform(fd) diff --git a/skfda/representation/_functional_data.py b/skfda/representation/_functional_data.py index f7a809663..b37c36145 100644 --- a/skfda/representation/_functional_data.py +++ b/skfda/representation/_functional_data.py @@ -576,11 +576,11 @@ def mean(self, *, axis=None, dtype=None, out=None, keepdims=False, / self.n_samples) @abstractmethod - def to_grid(self, sample_points=None): + def to_grid(self, grid_points=None): """Return the discrete representation of the object. Args: - sample_points (array_like, optional): Points per axis + grid_points (array_like, optional): Points per axis where the function is going to be evaluated. 
Returns: diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py index 8cae42c01..c6040752e 100644 --- a/skfda/representation/basis/_fdatabasis.py +++ b/skfda/representation/basis/_fdatabasis.py @@ -2,6 +2,7 @@ import copy import numbers from typing import Any +import warnings import pandas.api.extensions @@ -118,7 +119,9 @@ def __init__(self, basis, coefficients, *, dataset_label=None, sample_names=sample_names) @classmethod - def from_data(cls, data_matrix, sample_points, basis, + def from_data(cls, data_matrix, *, basis, + grid_points=None, + sample_points=None, method='cholesky'): r"""Transform raw data to a smooth functional form. @@ -156,7 +159,7 @@ def from_data(cls, data_matrix, sample_points, basis, data_matrix (array_like): List or matrix containing the observations. If a matrix each row represents a single functional datum and the columns the different observations. - sample_points (array_like): Values of the domain where the previous + grid_points (array_like): Values of the domain where the previous data were taken. basis: (Basis): Basis used. method (str): Algorithm used for calculating the coefficients using @@ -177,7 +180,7 @@ def from_data(cls, data_matrix, sample_points, basis, >>> from skfda.representation.basis import FDataBasis, Fourier >>> basis = Fourier((0, 1), n_basis=3) - >>> fd = FDataBasis.from_data(x, t, basis) + >>> fd = FDataBasis.from_data(x, grid_points=t, basis=basis) >>> fd.coefficients.round(2) array([[ 2. , 0.71, 0.71]]) @@ -193,6 +196,12 @@ def from_data(cls, data_matrix, sample_points, basis, """ from ..grid import FDataGrid + if sample_points is not None: + warnings.warn("Parameter sample_points is deprecated. Use the " + "parameter grid_points instead.", + DeprecationWarning) + grid_points = sample_points + # n is the samples # m is the observations # k is the number of elements of the basis @@ -200,7 +209,7 @@ def from_data(cls, data_matrix, sample_points, basis, # Each sample in a column (m x n) data_matrix = np.atleast_2d(data_matrix) - fd = FDataGrid(data_matrix=data_matrix, sample_points=sample_points) + fd = FDataGrid(data_matrix=data_matrix, grid_points=grid_points) return fd.to_basis(basis=basis, method=method) @@ -317,8 +326,8 @@ def shift(self, shifts, *, restrict_domain=False, extrapolation=None, domain_range[1] + shifts)) return FDataBasis.from_data(self.evaluate(eval_points), - eval_points + shifts, - _basis, **kwargs) + grid_points=eval_points + shifts, + basis=_basis, **kwargs) elif len(shifts) != self.n_samples: raise ValueError(f"shifts vector ({len(shifts)}) must have the " @@ -347,8 +356,8 @@ def shift(self, shifts, *, restrict_domain=False, extrapolation=None, _basis = self.basis.rescale(domain) - return FDataBasis.from_data(_data_matrix, eval_points, - _basis, **kwargs) + return FDataBasis.from_data(_data_matrix, grid_points=eval_points, + basis=_basis, **kwargs) def derivative(self, *, order=1): r"""Differentiate a FDataBasis object. @@ -465,11 +474,11 @@ def cov(self, eval_points=None): """ return self.to_grid(eval_points).cov() - def to_grid(self, sample_points=None): + def to_grid(self, grid_points=None, *, sample_points=None): """Return the discrete representation of the object. Args: - sample_points (array_like, optional): Points per axis where the + grid_points (array_like, optional): Points per axis where the functions are evaluated. 
If none are passed it calls numpy.linspace with bounds equal to the ones defined in self.domain_range and the number of points the maximum @@ -492,20 +501,25 @@ def to_grid(self, sample_points=None): [[ 1.], [ 2.], [ 5.]]]), - sample_points=(array([ 0., 1., 2.]),), + grid_points=(array([ 0., 1., 2.]),), domain_range=((0, 5),), ...) """ + if sample_points is not None: + warnings.warn("Parameter sample_points is deprecated. Use the " + "parameter grid_points instead.", + DeprecationWarning) + grid_points = sample_points - if sample_points is None: + if grid_points is None: npoints = max(constants.N_POINTS_FINE_MESH, constants.BASIS_MIN_FACTOR * self.n_basis) - sample_points = [np.linspace(*r, npoints) - for r in self.domain_range] + grid_points = [np.linspace(*r, npoints) + for r in self.domain_range] - return grid.FDataGrid(self.evaluate(sample_points, grid=True), - sample_points=sample_points, + return grid.FDataGrid(self.evaluate(grid_points, grid=True), + grid_points=grid_points, domain_range=self.domain_range) def to_basis(self, basis, eval_points=None, **kwargs): diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py index 782ce4ca3..9280ee8c0 100644 --- a/skfda/representation/grid.py +++ b/skfda/representation/grid.py @@ -9,6 +9,7 @@ import copy import numbers from typing import Any +import warnings import findiff import pandas.api.extensions @@ -37,7 +38,7 @@ class FDataGrid(FData): data_matrix (numpy.ndarray): a matrix where each entry of the first axis contains the values of a functional datum evaluated at the points of discretisation. - sample_points (numpy.ndarray): 2 dimension matrix where each row + grid_points (numpy.ndarray): 2 dimension matrix where each row contains the points of dicretisation for each axis of data_matrix. domain_range (numpy.ndarray): 2 dimension matrix where each row contains the bounds of the interval in which the functional data @@ -60,8 +61,8 @@ class FDataGrid(FData): with 3 discretization points. >>> data_matrix = [[1, 2, 3], [4, 5, 6]] - >>> sample_points = [2, 4, 5] - >>> FDataGrid(data_matrix, sample_points) + >>> grid_points = [2, 4, 5] + >>> FDataGrid(data_matrix, grid_points) FDataGrid( array([[[ 1.], [ 2.], @@ -70,17 +71,17 @@ class FDataGrid(FData): [[ 4.], [ 5.], [ 6.]]]), - sample_points=(array([ 2., 4., 5.]),), + grid_points=(array([ 2., 4., 5.]),), domain_range=((2.0, 5.0),), ...) The number of columns of data_matrix have to be the length of - sample_points. + grid_points. >>> FDataGrid(np.array([1,2,4,5,8]), range(6)) Traceback (most recent call last): .... - ValueError: Incorrect dimension in data_matrix and sample_points... + ValueError: Incorrect dimension in data_matrix and grid_points... FDataGrid support higher dimensional data both in the domain and image. @@ -88,8 +89,8 @@ class FDataGrid(FData): representing a function :math:`f : \mathbb{R}\longmapsto\mathbb{R}^2`. >>> data_matrix = [[[1, 0.3], [2, 0.4]], [[2, 0.5], [3, 0.6]]] - >>> sample_points = [2, 4] - >>> fd = FDataGrid(data_matrix, sample_points) + >>> grid_points = [2, 4] + >>> fd = FDataGrid(data_matrix, grid_points) >>> fd.dim_domain, fd.dim_codomain (1, 2) @@ -97,8 +98,8 @@ class FDataGrid(FData): representing a function :math:`f : \mathbb{R}^2\longmapsto\mathbb{R}`. 
>>> data_matrix = [[[1, 0.3], [2, 0.4]], [[2, 0.5], [3, 0.6]]] - >>> sample_points = [[2, 4], [3,6]] - >>> fd = FDataGrid(data_matrix, sample_points) + >>> grid_points = [[2, 4], [3,6]] + >>> fd = FDataGrid(data_matrix, grid_points) >>> fd.dim_domain, fd.dim_codomain (2, 1) @@ -135,7 +136,9 @@ def __len__(self): """Return the number of coordinates.""" return self._fdatagrid.dim_codomain - def __init__(self, data_matrix, sample_points=None, + def __init__(self, data_matrix, grid_points=None, + *, + sample_points=None, domain_range=None, dataset_label=None, dataset_name=None, @@ -150,7 +153,7 @@ def __init__(self, data_matrix, sample_points=None, data_matrix (array_like): a matrix where each row contains the values of a functional datum evaluated at the points of discretisation. - sample_points (array_like, optional): an array containing the + grid_points (array_like, optional): an array containing the points of discretisation where values have been recorded or a list of lists with each of the list containing the points of dicretisation for each axis. @@ -165,35 +168,41 @@ def __init__(self, data_matrix, sample_points=None, of the number of dimensions of the domain plus the number of dimensions of the image. """ + if sample_points is not None: + warnings.warn("Parameter sample_points is deprecated. Use the " + "parameter grid_points instead.", + DeprecationWarning) + grid_points = sample_points + self.data_matrix = _int_to_real(np.atleast_2d(data_matrix)) - if sample_points is None: - self.sample_points = _tuple_of_arrays( + if grid_points is None: + self.grid_points = _tuple_of_arrays( [np.linspace(0., 1., self.data_matrix.shape[i]) for i in range(1, self.data_matrix.ndim)]) else: - # Check that the dimension of the data matches the sample_points + # Check that the dimension of the data matches the grid_points # list - self.sample_points = _tuple_of_arrays(sample_points) + self.grid_points = _tuple_of_arrays(grid_points) data_shape = self.data_matrix.shape[1: 1 + self.dim_domain] - sample_points_shape = [len(i) for i in self.sample_points] + grid_points_shape = [len(i) for i in self.grid_points] - if not np.array_equal(data_shape, sample_points_shape): + if not np.array_equal(data_shape, grid_points_shape): raise ValueError("Incorrect dimension in data_matrix and " - "sample_points. Data has shape {} and sample " + "grid_points. Data has shape {} and grid " "points have shape {}" - .format(data_shape, sample_points_shape)) + .format(data_shape, grid_points_shape)) self._sample_range = np.array( - [(s[0], s[-1]) for s in self.sample_points]) + [(s[0], s[-1]) for s in self.grid_points]) if domain_range is None: domain_range = self.sample_range # Default value for domain_range is a list of tuples with - # the first and last element of each list ofthe sample_points. + # the first and last element of each list of the grid_points. 
self._domain_range = _domain_range(domain_range) @@ -201,8 +210,8 @@ def __init__(self, data_matrix, sample_points=None, raise ValueError("Incorrect shape of domain_range.") for i in range(self.dim_domain): - if (self._domain_range[i][0] > self.sample_points[i][0] - or self._domain_range[i][-1] < self.sample_points[i] + if (self._domain_range[i][0] > self.grid_points[i][0] + or self._domain_range[i][-1] < self.grid_points[i] [-1]): raise ValueError("Sample points must be within the domain " "range.") @@ -237,6 +246,13 @@ def round(self, decimals=0): """ return self.copy(data_matrix=self.data_matrix.round(decimals)) + @property + def sample_points(self): + warnings.warn("Parameter sample_points is deprecated. Use the " + "parameter grid_points instead.", + DeprecationWarning) + return self.grid_points + @property def dim_domain(self): """Return number of dimensions of the domain. @@ -245,7 +261,7 @@ def dim_domain(self): int: Number of dimensions of the domain. """ - return len(self.sample_points) + return len(self.grid_points) @property def dim_codomain(self): @@ -401,7 +417,7 @@ def derivative(self, *, order=1): [ 1.5], [ 2. ], [ 4. ]]]), - sample_points=(array([ 0., 1., 2., 3., 4.]),), + grid_points=(array([ 0., 1., 2., 3., 4.]),), domain_range=((0.0, 4.0),), ...) @@ -415,7 +431,7 @@ def derivative(self, *, order=1): [-1.], [ 2.], [ 5.]]]), - sample_points=(array([ 0., 1., 2., 3., 4.]),), + grid_points=(array([ 0., 1., 2., 3., 4.]),), domain_range=((0.0, 4.0),), ...) @@ -426,7 +442,7 @@ def derivative(self, *, order=1): operator = findiff.FinDiff(*[(1 + i, p, o) for i, (p, o) in enumerate( - zip(self.sample_points, order_list))]) + zip(self.grid_points, order_list))]) data_matrix = operator(self.data_matrix.astype(float)) if self.dataset_name: @@ -443,7 +459,7 @@ def derivative(self, *, order=1): def __check_same_dimensions(self, other): if self.data_matrix.shape[1:-1] != other.data_matrix.shape[1:-1]: raise ValueError("Error in columns dimensions") - if not np.array_equal(self.sample_points, other.sample_points): + if not np.array_equal(self.grid_points, other.grid_points): raise ValueError("Sample points for both objects must be equal") def sum(self, *, axis=None, out=None, keepdims=False, skipna=False, @@ -513,8 +529,8 @@ def cov(self): return self.copy(data_matrix=np.cov(self.data_matrix[..., 0], rowvar=False)[np.newaxis, ...], - sample_points=[self.sample_points[0], - self.sample_points[0]], + grid_points=[self.grid_points[0], + self.grid_points[0]], domain_range=[self.domain_range[0], self.domain_range[0]], dataset_name=dataset_name, @@ -542,10 +558,10 @@ def equals(self, other): if not np.array_equal(self.data_matrix, other.data_matrix): return False - if len(self.sample_points) != len(other.sample_points): + if len(self.grid_points) != len(other.grid_points): return False - for a, b in zip(self.sample_points, other.sample_points): + for a, b in zip(self.grid_points, other.grid_points): if not np.array_equal(a, b): return False @@ -718,7 +734,7 @@ def concatenate(self, *others, as_coordinates=False): [ 7.], [ 9.], [ 2.]]]), - sample_points=(array([ 0., 1., 2., 3., 4.]),), + grid_points=(array([ 0., 1., 2., 3., 4.]),), domain_range=((0.0, 4.0),), ...) 
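The hunks above keep ``sample_points`` usable as a deprecated, keyword-only
alias of ``grid_points``. A minimal sketch of the resulting behavior,
assuming a scikit-fda build that includes this patch (``warnings`` and
``numpy`` are used only for the check itself):

    import warnings

    import numpy as np
    import skfda

    fd_new = skfda.FDataGrid(data_matrix=[[1, 2, 3]],
                             grid_points=[0, 0.5, 1])

    # The old spelling still works, but emits a DeprecationWarning.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        fd_old = skfda.FDataGrid(data_matrix=[[1, 2, 3]],
                                 sample_points=[0, 0.5, 1])
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)

    # Both spellings yield the same discretization grid.
    np.testing.assert_array_equal(fd_new.grid_points[0],
                                  fd_old.grid_points[0])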
@@ -728,7 +744,7 @@ def concatenate(self, *others, as_coordinates=False): for other in others: self.__check_same_dimensions(other) - elif not all([np.array_equal(self.sample_points, other.sample_points) + elif not all([np.array_equal(self.grid_points, other.grid_points) for other in others]): raise ValueError("All the FDataGrids must be sampled in the same " "sample points.") @@ -834,17 +850,25 @@ def to_basis(self, basis, **kwargs): return smoother.fit_transform(self) - def to_grid(self, sample_points=None): + def to_grid(self, grid_points=None, *, sample_points=None): - if sample_points is None: - sample_points = self.sample_points + if sample_points is not None: + warnings.warn("Parameter sample_points is deprecated. Use the " + "parameter grid_points instead.", + DeprecationWarning) + grid_points = sample_points - return self.copy(data_matrix=self.evaluate(sample_points, grid=True), - sample_points=sample_points) + if grid_points is None: + grid_points = self.grid_points + + return self.copy(data_matrix=self.evaluate(grid_points, grid=True), + grid_points=grid_points) def copy(self, *, deep=False, # For Pandas compatibility - data_matrix=None, sample_points=None, + data_matrix=None, + grid_points=None, + sample_points=None, domain_range=None, dataset_name=None, argument_names=None, @@ -859,13 +883,19 @@ def copy(self, *, """ + if sample_points is not None: + warnings.warn("Parameter sample_points is deprecated. Use the " + "parameter grid_points instead.", + DeprecationWarning) + grid_points = sample_points + if data_matrix is None: # The data matrix won't be writeable data_matrix = self.data_matrix - if sample_points is None: + if grid_points is None: # Sample points won`t be writeable - sample_points = self.sample_points + grid_points = self.grid_points if domain_range is None: domain_range = copy.deepcopy(self.domain_range) @@ -891,7 +921,7 @@ def copy(self, *, if interpolation is None: interpolation = self.interpolation - return FDataGrid(data_matrix, sample_points=sample_points, + return FDataGrid(data_matrix, grid_points=grid_points, domain_range=domain_range, dataset_name=dataset_name, argument_names=argument_names, @@ -918,7 +948,7 @@ def shift(self, shifts, *, restrict_domain=False, extrapolation=None, eval_points (array_like, optional): Set of points where the functions are evaluated to obtain the discrete representation of the object to operate. If an empty list the - current sample_points are used to unificate the domain of the + current grid_points are used to unificate the domain of the shifted data. 
Returns: @@ -940,10 +970,10 @@ def shift(self, shifts, *, restrict_domain=False, extrapolation=None, # Column vector with shapes shifts = np.atleast_2d(shifts).T - sample_points = self.sample_points + shifts + grid_points = self.grid_points + shifts domain_range = self.domain_range + shifts - return self.copy(sample_points=sample_points, + return self.copy(grid_points=grid_points, domain_range=domain_range) if shifts.shape[0] != self.n_samples: raise ValueError(f"shifts vector ({shifts.shape[0]}) must have the" @@ -951,7 +981,7 @@ def shift(self, shifts, *, restrict_domain=False, extrapolation=None, f"({self.n_samples})") if eval_points is None: - eval_points = self.sample_points + eval_points = self.grid_points else: eval_points = np.atleast_2d(eval_points) @@ -989,7 +1019,7 @@ def shift(self, shifts, *, restrict_domain=False, extrapolation=None, aligned=False, grid=True) - return self.copy(data_matrix=data_matrix, sample_points=eval_points, + return self.copy(data_matrix=data_matrix, grid_points=eval_points, domain_range=domain) def compose(self, fd, *, eval_points=None): @@ -1016,7 +1046,7 @@ def compose(self, fd, *, eval_points=None): if fd.dim_domain == 1: if eval_points is None: try: - eval_points = fd.sample_points[0] + eval_points = fd.grid_points[0] except AttributeError: eval_points = np.linspace(*fd.domain_range[0], constants.N_POINTS_COARSE_MESH) @@ -1026,7 +1056,7 @@ def compose(self, fd, *, eval_points=None): aligned=False) else: if eval_points is None: - eval_points = fd.sample_points + eval_points = fd.grid_points grid_transformation = fd(eval_points, grid=True) @@ -1045,14 +1075,14 @@ def compose(self, fd, *, eval_points=None): aligned=False) return self.copy(data_matrix=data_matrix, - sample_points=eval_points, + grid_points=eval_points, domain_range=fd.domain_range, argument_names=fd.argument_names) def __str__(self): """Return str(self).""" return ('Data set: ' + str(self.data_matrix) - + '\nsample_points: ' + str(self.sample_points) + + '\ngrid_points: ' + str(self.grid_points) + '\ntime range: ' + str(self.domain_range)) def __repr__(self): @@ -1060,7 +1090,7 @@ def __repr__(self): return (f"FDataGrid(" f"\n{repr(self.data_matrix)}," - f"\nsample_points={repr(self.sample_points)}," + f"\ngrid_points={repr(self.grid_points)}," f"\ndomain_range={repr(self.domain_range)}," f"\ndataset_name={repr(self.dataset_name)}," f"\nargument_names={repr(self.argument_names)}," @@ -1085,7 +1115,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): for i in inputs: if isinstance(i, FDataGrid) and not np.array_equal( - i.sample_points, self.sample_points): + i.grid_points, self.grid_points): return NotImplemented new_inputs = [i.data_matrix if isinstance(i, FDataGrid) @@ -1121,7 +1151,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): def dtype(self): """The dtype for this extension array, FDataGridDType""" return FDataGridDType( - sample_points=self.sample_points, + grid_points=self.grid_points, domain_range=self.domain_range, dim_codomain=self.dim_codomain) @@ -1131,7 +1161,7 @@ def nbytes(self) -> int: The number of bytes needed to store this object in memory. 
""" return self.data_matrix.nbytes + sum( - p.nbytes for p in self.sample_points) + p.nbytes for p in self.grid_points) def isna(self): """ @@ -1153,14 +1183,14 @@ class FDataGridDType(pandas.api.extensions.ExtensionDtype): type = FDataGrid na_value = pandas.NA - def __init__(self, sample_points, dim_codomain, domain_range=None) -> None: - sample_points = _tuple_of_arrays(sample_points) + def __init__(self, grid_points, dim_codomain, domain_range=None) -> None: + grid_points = _tuple_of_arrays(grid_points) - self.sample_points = tuple(tuple(s) for s in sample_points) + self.grid_points = tuple(tuple(s) for s in grid_points) if domain_range is None: domain_range = np.array( - [(s[0], s[-1]) for s in self.sample_points]) + [(s[0], s[-1]) for s in self.grid_points]) self.domain_range = _domain_range(domain_range) self.dim_codomain = dim_codomain @@ -1172,13 +1202,13 @@ def construct_array_type(cls): def _na_repr(self) -> FDataGrid: shape = ((1,) - + tuple(len(s) for s in self.sample_points) + + tuple(len(s) for s in self.grid_points) + (self.dim_codomain,)) data_matrix = np.full(shape=shape, fill_value=np.NaN) return FDataGrid( - sample_points=self.sample_points, + grid_points=self.grid_points, domain_range=self.domain_range, data_matrix=data_matrix) @@ -1198,8 +1228,8 @@ def __eq__(self, other: Any) -> bool: return (isinstance(other, FDataGridDType) and self.dim_codomain == other.dim_codomain and self.domain_range == other.domain_range - and self.sample_points == other.sample_points) + and self.grid_points == other.grid_points) def __hash__(self) -> int: - return hash((self.sample_points, + return hash((self.grid_points, self.domain_range, self.dim_codomain)) diff --git a/skfda/representation/interpolation.py b/skfda/representation/interpolation.py index 0b5a40f91..0384b5dd0 100644 --- a/skfda/representation/interpolation.py +++ b/skfda/representation/interpolation.py @@ -125,19 +125,19 @@ def __init__(self, fdatagrid, if self.monotone and self.interpolation_order == 1: monotone = False - sample_points = fdatagrid.sample_points[0] + grid_points = fdatagrid.grid_points[0] if monotone: def constructor(data): """Constructs an unidimensional cubic monotone interpolation""" - return PchipInterpolator(sample_points, data) + return PchipInterpolator(grid_points, data) else: def constructor(data): """Constructs an unidimensional interpolation""" return UnivariateSpline( - sample_points, data, + grid_points, data, s=self.smoothness_parameter, k=self.interpolation_order) @@ -216,8 +216,8 @@ def __init__(self, fdatagrid, for i in range(fdatagrid.n_samples): for j in range(fdatagrid.dim_codomain): self.splines[i, j] = RectBivariateSpline( - fdatagrid.sample_points[0], - fdatagrid.sample_points[1], + fdatagrid.grid_points[0], + fdatagrid.grid_points[1], fdatagrid.data_matrix[i, :, :, j], kx=kx, ky=ky, s=self.smoothness_parameter) @@ -245,7 +245,7 @@ class _SplineListND(_SplineList): RegularGridInterpolator. Args: - sample_points (np.ndarray): Sample points of the fdatagrid. + grid_points (np.ndarray): Sample points of the fdatagrid. data_matrix (np.ndarray): Data matrix of the fdatagrid. k (integer): Order of the spline interpolations. 
@@ -287,7 +287,7 @@ def __init__(self, fdatagrid, for i in range(fdatagrid.n_samples): for j in range(fdatagrid.dim_codomain): self.splines[i, j] = RegularGridInterpolator( - fdatagrid.sample_points, fdatagrid.data_matrix[i, ..., j], + fdatagrid.grid_points, fdatagrid.data_matrix[i, ..., j], method, False) def _evaluate_one(self, spl, t, derivative=0): diff --git a/tests/test_basis.py b/tests/test_basis.py index c12d5af1d..0f9ef2d31 100644 --- a/tests/test_basis.py +++ b/tests/test_basis.py @@ -18,7 +18,8 @@ def test_from_data_cholesky(self): x = np.sin(2 * np.pi * t) + np.cos(2 * np.pi * t) basis = BSpline((0, 1), n_basis=5) np.testing.assert_array_almost_equal( - FDataBasis.from_data(x, t, basis, method='cholesky' + FDataBasis.from_data(x, grid_points=t, basis=basis, + method='cholesky' ).coefficients.round(2), np.array([[1., 2.78, -3., -0.78, 1.]]) ) @@ -28,7 +29,8 @@ def test_from_data_qr(self): x = np.sin(2 * np.pi * t) + np.cos(2 * np.pi * t) basis = BSpline((0, 1), n_basis=5) np.testing.assert_array_almost_equal( - FDataBasis.from_data(x, t, basis, method='qr' + FDataBasis.from_data(x, grid_points=t, basis=basis, + method='qr' ).coefficients.round(2), np.array([[1., 2.78, -3., -0.78, 1.]]) ) diff --git a/tests/test_clustering.py b/tests/test_clustering.py index 3bdc5bbbd..2499237dd 100644 --- a/tests/test_clustering.py +++ b/tests/test_clustering.py @@ -13,10 +13,10 @@ def test_kmeans_univariate(self): data_matrix = [[1, 1, 2, 3, 2.5, 2], [0.5, 0.5, 1, 2, 1.5, 1], [-1, -1, -0.5, 1, 1, 0.5], [-0.5, -0.5, -0.5, -1, -1, -1]] - sample_points = [0, 2, 4, 6, 8, 10] - fd = FDataGrid(data_matrix, sample_points) + grid_points = [0, 2, 4, 6, 8, 10] + fd = FDataGrid(data_matrix, grid_points) init = np.array([[0, 0, 0, 0, 0, 0], [2, 1, -1, 0.5, 0, -0.5]]) - init_fd = FDataGrid(init, sample_points) + init_fd = FDataGrid(init, grid_points) kmeans = KMeans(init=init_fd) kmeans.fit(fd) np.testing.assert_array_equal(kmeans.predict(fd), @@ -29,7 +29,7 @@ def test_kmeans_univariate(self): centers = FDataGrid(data_matrix=np.array( [[0.16666667, 0.16666667, 0.83333333, 2., 1.66666667, 1.16666667], [-0.5, -0.5, -0.5, -1., -1., -1.]]), - sample_points=sample_points) + grid_points=grid_points) np.testing.assert_array_almost_equal( kmeans.cluster_centers_.data_matrix, centers.data_matrix) @@ -40,8 +40,8 @@ def test_kmeans_univariate(self): # data_matrix = [[[1, 0.3], [2, 0.4], [3, 0.5], [4, 0.6]], # [[2, 0.5], [3, 0.6], [4, 0.7], [5, 0.7]], # [[3, 0.2], [4, 0.3], [5, 0.4], [6, 0.5]]] - # sample_points = [2, 4, 6, 8] - # fd = FDataGrid(data_matrix, sample_points) + # grid_points = [2, 4, 6, 8] + # fd = FDataGrid(data_matrix, grid_points) # kmeans = KMeans() # kmeans.fit(fd) # np.testing.assert_array_equal(kmeans.predict(fd), @@ -58,7 +58,7 @@ def test_kmeans_univariate(self): # centers = FDataGrid(data_matrix=np.array( # [[[3, 0.2], [4, 0.3], [5, 0.4], [6, 0.5]], # [[1.5, 0.4], [2.5, 0.5], [3.5, 0.6], [4.5, 0.65]]]), - # sample_points=sample_points) + # grid_points=grid_points) # np.testing.assert_allclose(kmeans.cluster_centers_.data_matrix, # centers.data_matrix) # np.testing.assert_allclose(kmeans.score(fd), np.array([-3., -0.1075])) @@ -68,8 +68,8 @@ def test_fuzzy_kmeans_univariate(self): data_matrix = [[1, 1, 2, 3, 2.5, 2], [0.5, 0.5, 1, 2, 1.5, 1], [-1, -1, -0.5, 1, 1, 0.5], [-0.5, -0.5, -0.5, -1, -1, -1]] - sample_points = [0, 2, 4, 6, 8, 10] - fd = FDataGrid(data_matrix, sample_points) + grid_points = [0, 2, 4, 6, 8, 10] + fd = FDataGrid(data_matrix, grid_points) fuzzy_kmeans = FuzzyCMeans() 
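        # Fit the fuzzy c-means model on the four sample curves; the
        # assertion below checks the rounded predictions.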
fuzzy_kmeans.fit(fd) np.testing.assert_array_equal(fuzzy_kmeans.predict(fd).round(3), @@ -95,11 +95,11 @@ def test_fuzzy_kmeans_univariate(self): # data_matrix = [[[1, 0.3], [2, 0.4], [3, 0.5], [4, 0.6]], # [[2, 0.5], [3, 0.6], [4, 0.7], [5, 0.7]], # [[3, 0.2], [4, 0.3], [5, 0.4], [6, 0.5]]] - # sample_points = [2, 4, 6, 8] - # fd = FDataGrid(data_matrix, sample_points) + # grid_points = [2, 4, 6, 8] + # fd = FDataGrid(data_matrix, grid_points) # init = np.array([[[3, 0], [5, 0], [2, 0], [4, 0]], # [[0, 0], [0, 1], [0, 0], [0, 1]]]) - # init_fd = FDataGrid(init, sample_points) + # init_fd = FDataGrid(init, grid_points) # fuzzy_kmeans = FuzzyKMeans(init=init_fd) # fuzzy_kmeans.fit(fd) # np.testing.assert_array_equal(fuzzy_kmeans.predict(fd), @@ -118,7 +118,7 @@ def test_fuzzy_kmeans_univariate(self): # [24., 0.78333333]]])) # centers = FDataGrid(data_matrix=np.array( # [[[2, 0], [3, 0], [4, 0], [5, 0]], - # [[1, 0], [2, 0], [3, 0], [4, 0]]]), sample_points=sample_points) + # [[1, 0], [2, 0], [3, 0], [4, 0]]]), grid_points=grid_points) # np.testing.assert_allclose(fuzzy_kmeans.cluster_centers_.data_matrix, # centers.data_matrix) # np.testing.assert_allclose(fuzzy_kmeans.score(fd), np.array( diff --git a/tests/test_elastic.py b/tests/test_elastic.py index ea980d882..1671cd5a4 100644 --- a/tests/test_elastic.py +++ b/tests/test_elastic.py @@ -62,7 +62,7 @@ def test_from_srsf_with_output_points(self): # Checks SRSF conversion srsf_transformer = SRSF( initial_value=0, - output_points=self.dummy_sample.sample_points[0]) + output_points=self.dummy_sample.grid_points[0]) srsf = srsf_transformer.inverse_transform(self.dummy_sample) data_matrix = [[[0.], [-0.23449228], [-0.83464009], diff --git a/tests/test_fdata_boxplot.py b/tests/test_fdata_boxplot.py index 63e011730..b5fa0e045 100644 --- a/tests/test_fdata_boxplot.py +++ b/tests/test_fdata_boxplot.py @@ -1,9 +1,9 @@ -import unittest - -import numpy as np from skfda import FDataGrid from skfda.exploratory.depth import fraiman_muniz_depth from skfda.exploratory.visualization import Boxplot, SurfaceBoxplot +import unittest + +import numpy as np class TestBoxplot(unittest.TestCase): @@ -13,8 +13,8 @@ def test_fdboxplot_univariate(self): [0.5, 0.5, 1, 2, 1.5, 1], [-1, -1, -0.5, 1, 1, 0.5], [-0.5, -0.5, -0.5, -1, -1, -1]] - sample_points = [0, 2, 4, 6, 8, 10] - fd = FDataGrid(data_matrix, sample_points) + grid_points = [0, 2, 4, 6, 8, 10] + fd = FDataGrid(data_matrix, grid_points) fdataBoxplot = Boxplot(fd, depth_method=fraiman_muniz_depth) np.testing.assert_array_equal( fdataBoxplot.median.ravel(), diff --git a/tests/test_fpca.py b/tests/test_fpca.py index 98f3c499f..a5d69b287 100644 --- a/tests/test_fpca.py +++ b/tests/test_fpca.py @@ -36,13 +36,13 @@ def test_discretized_fpca_fit_attributes(self): # check that if n_components is bigger than the number of samples then # an exception should be thrown - fd = FDataGrid([[0.5], [0.1]], sample_points=[0]) + fd = FDataGrid([[0.5], [0.1]], grid_points=[0]) with self.assertRaises(AttributeError): fpca.fit(fd) # check that n_components must be smaller than the number of attributes # in the FDataGrid object - fd = FDataGrid([[0.9], [0.7], [0.5]], sample_points=[0]) + fd = FDataGrid([[0.9], [0.7], [0.5]], grid_points=[0]) with self.assertRaises(AttributeError): fpca.fit(fd) diff --git a/tests/test_grid.py b/tests/test_grid.py index 028524c85..a0daa09e4 100644 --- a/tests/test_grid.py +++ b/tests/test_grid.py @@ -19,7 +19,7 @@ def test_init(self): np.array([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6]])) 
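        # No grid_points were given, so FDataGrid defaults to an
        # equispaced grid on [0, 1]; the next assertions verify this.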
np.testing.assert_array_equal(fd.sample_range, [(0, 1)]) np.testing.assert_array_equal( - fd.sample_points, np.array([[0., 0.25, 0.5, 0.75, 1.]])) + fd.grid_points, np.array([[0., 0.25, 0.5, 0.75, 1.]])) def test_copy_equals(self): fd = FDataGrid([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6]]) @@ -33,7 +33,7 @@ def test_mean(self): np.array([1.5, 2.5, 3.5, 4.5, 5.5])) np.testing.assert_array_equal(fd.sample_range, [(0, 1)]) np.testing.assert_array_equal( - fd.sample_points, + fd.grid_points, np.array([[0., 0.25, 0.5, 0.75, 1.]])) def test_gmean(self): @@ -45,7 +45,7 @@ def test_gmean(self): np.array([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6]]))) np.testing.assert_array_equal(fd.sample_range, [(0, 1)]) np.testing.assert_array_equal( - fd.sample_points, + fd.grid_points, np.array([[0., 0.25, 0.5, 0.75, 1.]])) def test_slice(self): @@ -177,9 +177,9 @@ def test_composition(self): X, Y, Z = axes3d.get_test_data(1.2) data_matrix = [Z.T] - sample_points = [X[0, :], Y[:, 0]] + grid_points = [X[0, :], Y[:, 0]] - g = FDataGrid(data_matrix, sample_points) + g = FDataGrid(data_matrix, grid_points) self.assertEqual(g.dim_domain, 2) self.assertEqual(g.dim_codomain, 1) @@ -210,9 +210,9 @@ def setUp(self): ] ]) - sample_points = [[0, 1], [0, 1]] + grid_points = [[0, 1], [0, 1]] - fd = FDataGrid(data_matrix, sample_points=sample_points) + fd = FDataGrid(data_matrix, grid_points=grid_points) self.assertEqual(fd.n_samples, 2) self.assertEqual(fd.dim_domain, 2) self.assertEqual(fd.dim_codomain, 3) diff --git a/tests/test_interpolation.py b/tests/test_interpolation.py index 170a6204f..be9f89e4b 100644 --- a/tests/test_interpolation.py +++ b/tests/test_interpolation.py @@ -23,7 +23,7 @@ def setUp(self): def test_evaluation_linear_simple(self): """Test basic usage of evaluation""" - f = FDataGrid(self.data_matrix_1_1, sample_points=np.arange(10)) + f = FDataGrid(self.data_matrix_1_1, grid_points=np.arange(10)) # Test interpolation in nodes np.testing.assert_array_almost_equal( @@ -38,7 +38,7 @@ def test_evaluation_linear_simple(self): def test_evaluation_linear_point(self): """Test the evaluation of a single point""" - f = FDataGrid(self.data_matrix_1_1, sample_points=np.arange(10)) + f = FDataGrid(self.data_matrix_1_1, grid_points=np.arange(10)) # Test a single point np.testing.assert_array_almost_equal(f(5.3).round(1), @@ -51,7 +51,7 @@ def test_evaluation_linear_point(self): def test_evaluation_linear_grid(self): """Test grid evaluation. 
With domain dimension = 1""" - f = FDataGrid(self.data_matrix_1_1, sample_points=np.arange(10)) + f = FDataGrid(self.data_matrix_1_1, grid_points=np.arange(10)) # Test interpolation in nodes np.testing.assert_array_almost_equal(f(np.arange(10))[..., 0], @@ -74,7 +74,7 @@ def test_evaluation_linear_grid(self): def test_evaluation_linear_composed(self): - f = FDataGrid(self.data_matrix_1_1, sample_points=np.arange(10)) + f = FDataGrid(self.data_matrix_1_1, grid_points=np.arange(10)) # Evaluate (x**2, (9-x)**2) in (1,8) np.testing.assert_array_almost_equal(f([[1], [8]], @@ -96,7 +96,7 @@ def test_evaluation_linear_composed(self): def test_evaluation_cubic_simple(self): """Test basic usage of evaluation""" - f = FDataGrid(self.data_matrix_1_1, sample_points=np.arange(10), + f = FDataGrid(self.data_matrix_1_1, grid_points=np.arange(10), interpolation=SplineInterpolation(3)) # Test interpolation in nodes @@ -112,7 +112,7 @@ def test_evaluation_cubic_simple(self): def test_evaluation_cubic_point(self): """Test the evaluation of a single point""" - f = FDataGrid(self.data_matrix_1_1, sample_points=np.arange(10), + f = FDataGrid(self.data_matrix_1_1, grid_points=np.arange(10), interpolation=SplineInterpolation(3)) # Test a single point @@ -127,7 +127,7 @@ def test_evaluation_cubic_point(self): def test_evaluation_cubic_grid(self): """Test grid evaluation. With domain dimension = 1""" - f = FDataGrid(self.data_matrix_1_1, sample_points=np.arange(10), + f = FDataGrid(self.data_matrix_1_1, grid_points=np.arange(10), interpolation=SplineInterpolation(3)) t = [0.5, 1.5, 2.5] @@ -148,7 +148,7 @@ def test_evaluation_cubic_grid(self): def test_evaluation_cubic_composed(self): - f = FDataGrid(self.data_matrix_1_1, sample_points=np.arange(10), + f = FDataGrid(self.data_matrix_1_1, grid_points=np.arange(10), interpolation=SplineInterpolation(3)) # Evaluate (x**2, (9-x)**2) in (1,8) @@ -174,7 +174,7 @@ def test_evaluation_nodes(self): for degree in range(1, 6): interpolation = SplineInterpolation(degree) - f = FDataGrid(self.data_matrix_1_1, sample_points=np.arange(10), + f = FDataGrid(self.data_matrix_1_1, grid_points=np.arange(10), interpolation=interpolation) # Test interpolation in nodes @@ -186,13 +186,13 @@ def test_error_degree(self): with np.testing.assert_raises(ValueError): interpolation = SplineInterpolation(7) - f = FDataGrid(self.data_matrix_1_1, sample_points=np.arange(10), + f = FDataGrid(self.data_matrix_1_1, grid_points=np.arange(10), interpolation=interpolation) f(1) with np.testing.assert_raises(ValueError): interpolation = SplineInterpolation(0) - f = FDataGrid(self.data_matrix_1_1, sample_points=np.arange(10), + f = FDataGrid(self.data_matrix_1_1, grid_points=np.arange(10), interpolation=interpolation) f(1) @@ -221,7 +221,7 @@ def setUp(self): def test_evaluation_simple(self): """Test basic usage of evaluation""" - f = FDataGrid(self.data_matrix_1_n, sample_points=np.arange(10), + f = FDataGrid(self.data_matrix_1_n, grid_points=np.arange(10), interpolation=self.interpolation) # Test interpolation in nodes @@ -240,7 +240,7 @@ def test_evaluation_simple(self): def test_evaluation_point(self): """Test the evaluation of a single point""" - f = FDataGrid(self.data_matrix_1_n, sample_points=np.arange(10), + f = FDataGrid(self.data_matrix_1_n, grid_points=np.arange(10), interpolation=self.interpolation) # Test a single point @@ -253,7 +253,7 @@ def test_evaluation_point(self): def test_evaluation_grid(self): """Test grid evaluation. 
With domain dimension = 1""" - f = FDataGrid(self.data_matrix_1_n, sample_points=np.arange(10), + f = FDataGrid(self.data_matrix_1_n, grid_points=np.arange(10), interpolation=SplineInterpolation(2)) t = [1.5, 2.5, 3.5] @@ -275,7 +275,7 @@ def test_evaluation_grid(self): def test_evaluation_composed(self): - f = FDataGrid(self.data_matrix_1_n, sample_points=self.t, + f = FDataGrid(self.data_matrix_1_n, grid_points=self.t, interpolation=self.interpolation) # Evaluate (x**2, (9-x)**2) in (1,8) @@ -292,7 +292,7 @@ def test_evaluation_nodes(self): for degree in range(1, 6): interpolation = SplineInterpolation(degree) - f = FDataGrid(self.data_matrix_1_n, sample_points=np.arange(10), + f = FDataGrid(self.data_matrix_1_n, grid_points=np.arange(10), interpolation=interpolation) # Test interpolation in nodes @@ -317,12 +317,12 @@ def coordinate_function(*args): return np.sum(domain_indexes) for dim_domain in range(1, 6): - sample_points = [np.array([0, 1]) for _ in range(dim_domain)] + grid_points = [np.array([0, 1]) for _ in range(dim_domain)] data_matrix = np.fromfunction( function=coordinate_function, shape=(n_samples,) + (2,) * dim_domain + (dim_codomain,)) - f = FDataGrid(data_matrix, sample_points=sample_points) + f = FDataGrid(data_matrix, grid_points=grid_points) evaluation = f([[0.] * dim_domain, [0.5] * dim_domain, [1.] * dim_domain]) diff --git a/tests/test_math.py b/tests/test_math.py index a53729c8c..31789f836 100644 --- a/tests/test_math.py +++ b/tests/test_math.py @@ -22,10 +22,10 @@ def f(x, y, z): data_matrix = f(x2, y2, z2) - sample_points = [t, 2 * t, 3 * t] + grid_points = [t, 2 * t, 3 * t] fd = skfda.FDataGrid( - data_matrix[np.newaxis, ...], sample_points=sample_points) + data_matrix[np.newaxis, ...], grid_points=grid_points) basis = Tensor([Monomial(n_basis=5, domain_range=(0, 1)), Monomial(n_basis=5, domain_range=(0, 2)), @@ -52,10 +52,10 @@ def g(y): data_matrix = np.array([np.array([f(t), g(t)]).T]) - sample_points = [t] + grid_points = [t] fd = skfda.FDataGrid( - data_matrix, sample_points=sample_points) + data_matrix, grid_points=grid_points) basis = VectorValued([Monomial(n_basis=5), Monomial(n_basis=5)]) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index e95371f1b..9c38b5db7 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -13,9 +13,9 @@ class TestLpMetrics(unittest.TestCase): def setUp(self): - sample_points = [1, 2, 3, 4, 5] + grid_points = [1, 2, 3, 4, 5] self.fd = FDataGrid([[2, 3, 4, 5, 6], [1, 4, 9, 16, 25]], - sample_points=sample_points) + grid_points=grid_points) basis = Monomial(n_basis=3, domain_range=(1, 5)) self.fd_basis = FDataBasis(basis, [[1, 1, 0], [0, 0, 1]]) self.fd_curve = self.fd.concatenate(self.fd, as_coordinates=True) @@ -55,17 +55,17 @@ def test_lp_error_dimensions(self): lp_distance(self.fd_surface, self.fd_curve) def test_lp_error_domain_ranges(self): - sample_points = [2, 3, 4, 5, 6] + grid_points = [2, 3, 4, 5, 6] fd2 = FDataGrid([[2, 3, 4, 5, 6], [1, 4, 9, 16, 25]], - sample_points=sample_points) + grid_points=grid_points) with np.testing.assert_raises(ValueError): lp_distance(self.fd, fd2) - def test_lp_error_sample_points(self): - sample_points = [1, 2, 4, 4.3, 5] + def test_lp_error_grid_points(self): + grid_points = [1, 2, 4, 4.3, 5] fd2 = FDataGrid([[2, 3, 4, 5, 6], [1, 4, 9, 16, 25]], - sample_points=sample_points) + grid_points=grid_points) with np.testing.assert_raises(ValueError): lp_distance(self.fd, fd2) diff --git a/tests/test_oneway_anova.py b/tests/test_oneway_anova.py index 31eed81b7..a08af91b4 
100644 --- a/tests/test_oneway_anova.py +++ b/tests/test_oneway_anova.py @@ -39,7 +39,7 @@ def test_v_stats(self): m1 = [1 for _ in range(n_features)] m2 = [2 for _ in range(n_features)] m3 = [3 for _ in range(n_features)] - fd = FDataGrid([m1, m2, m3], sample_points=t) + fd = FDataGrid([m1, m2, m3], grid_points=t) self.assertEqual(v_sample_stat(fd, weights), 7.0) self.assertAlmostEqual(v_sample_stat(fd.to_basis(Fourier(n_basis=5)), weights), 7.0) diff --git a/tests/test_outliers.py b/tests/test_outliers.py index a46abce5d..b1806f1bb 100644 --- a/tests/test_outliers.py +++ b/tests/test_outliers.py @@ -1,10 +1,10 @@ -import unittest - -import numpy as np from skfda import FDataGrid from skfda.exploratory.depth import modified_band_depth from skfda.exploratory.outliers import DirectionalOutlierDetector from skfda.exploratory.outliers import directional_outlyingness_stats +import unittest + +import numpy as np class TestsDirectionalOutlyingness(unittest.TestCase): @@ -13,8 +13,8 @@ def test_directional_outlyingness(self): data_matrix = [[[0.3], [0.4], [0.5], [0.6]], [[0.5], [0.6], [0.7], [0.7]], [[0.2], [0.3], [0.4], [0.5]]] - sample_points = [2, 4, 6, 8] - fd = FDataGrid(data_matrix, sample_points) + grid_points = [2, 4, 6, 8] + fd = FDataGrid(data_matrix, grid_points) stats = directional_outlyingness_stats( fd, depth_method=modified_band_depth) np.testing.assert_allclose(stats.directional_outlyingness, @@ -46,8 +46,8 @@ def test_asymptotic_formula(self): [0.5, 0.5, 1, 2, 1.5, 1], [-1, -1, -0.5, 1, 1, 0.5], [-0.5, -0.5, -0.5, -1, -1, -1]] - sample_points = [0, 2, 4, 6, 8, 10] - fd = FDataGrid(data_matrix, sample_points) + grid_points = [0, 2, 4, 6, 8, 10] + fd = FDataGrid(data_matrix, grid_points) out_detector = DirectionalOutlierDetector( _force_asymptotic=True) prediction = out_detector.fit_predict(fd) diff --git a/tests/test_pandas_fdatagrid.py b/tests/test_pandas_fdatagrid.py index 73ba3df22..b459f13fb 100644 --- a/tests/test_pandas_fdatagrid.py +++ b/tests/test_pandas_fdatagrid.py @@ -16,7 +16,7 @@ def dtype(): """A fixture providing the ExtensionDtype to validate.""" return skfda.representation.grid.FDataGridDType( - sample_points=[ + grid_points=[ np.arange(10), np.arange(10) / 10], dim_codomain=3 @@ -32,11 +32,11 @@ def data(): """ data_matrix = np.arange(1, 100 * 10 * 10 * 3 + 1).reshape(100, 10, 10, 3) - sample_points = [ + grid_points = [ np.arange(10), np.arange(10) / 10] - return skfda.FDataGrid(data_matrix, sample_points=sample_points) + return skfda.FDataGrid(data_matrix, grid_points=grid_points) @pytest.fixture @@ -45,11 +45,11 @@ def data_for_twos(): data_matrix = np.full( 100 * 10 * 10 * 3, fill_value=2).reshape(100, 10, 10, 3) - sample_points = [ + grid_points = [ np.arange(10), np.arange(10) / 10] - return skfda.FDataGrid(data_matrix, sample_points=sample_points) + return skfda.FDataGrid(data_matrix, grid_points=grid_points) @pytest.fixture @@ -59,11 +59,11 @@ def data_missing(): data_matrix = np.arange( 2 * 10 * 10 * 3, dtype=np.float_).reshape(2, 10, 10, 3) data_matrix[0, ...] 
= np.NaN - sample_points = [ + grid_points = [ np.arange(10), np.arange(10) / 10] - return skfda.FDataGrid(data_matrix, sample_points=sample_points) + return skfda.FDataGrid(data_matrix, grid_points=grid_points) @pytest.fixture(params=["data", "data_missing"]) diff --git a/tests/test_recursive_maxima_hunting.py b/tests/test_recursive_maxima_hunting.py index 1834d3902..3f73c4201 100644 --- a/tests/test_recursive_maxima_hunting.py +++ b/tests/test_recursive_maxima_hunting.py @@ -38,7 +38,7 @@ def mean_1(t): stopping_condition=stopping_condition) _ = rmh.fit(X, y) point_mask = rmh.get_support() - points = X.sample_points[0][point_mask] + points = X.grid_points[0][point_mask] np.testing.assert_allclose(points, [0.25, 0.5, 0.75], rtol=1e-1) diff --git a/tests/test_registration.py b/tests/test_registration.py index 8c455bdca..62781ee4b 100644 --- a/tests/test_registration.py +++ b/tests/test_registration.py @@ -48,7 +48,7 @@ def test_standard_normalize_warping(self): # Test new domain range (0, 1) np.testing.assert_array_equal(normalized.domain_range, [(0, 1)]) - np.testing.assert_array_almost_equal(normalized.sample_points[0], + np.testing.assert_array_almost_equal(normalized.grid_points[0], np.linspace(0, 1, 50)) np.testing.assert_array_almost_equal( @@ -65,7 +65,7 @@ def test_standard_normalize_warping_default_value(self): # Test new domain range (0, 1) np.testing.assert_array_equal(normalized.domain_range, [(-1, 1)]) - np.testing.assert_array_almost_equal(normalized.sample_points[0], + np.testing.assert_array_almost_equal(normalized.grid_points[0], np.linspace(-1, 1, 50)) np.testing.assert_array_almost_equal( @@ -84,7 +84,7 @@ def test_normalize_warping(self): # Test new domain range (0, 1) np.testing.assert_array_equal(normalized.domain_range, [domain]) - np.testing.assert_array_almost_equal(normalized.sample_points[0], + np.testing.assert_array_almost_equal(normalized.grid_points[0], np.linspace(*domain, 50)) np.testing.assert_array_equal(normalized(a)[..., 0], [[a], [a]]) @@ -340,7 +340,7 @@ def test_amplitude_phase_score(self): np.testing.assert_allclose(score, 0.972095, rtol=1e-6) def test_amplitude_phase_score_with_output_points(self): - eval_points = self.X.sample_points[0] + eval_points = self.X.grid_points[0] scorer = AmplitudePhaseDecomposition(eval_points=eval_points) score = scorer(self.shift_registration, self.X) np.testing.assert_allclose(score, 0.972095, rtol=1e-6) diff --git a/tests/test_smoothing.py b/tests/test_smoothing.py index 076ca5ed9..1061bc02a 100644 --- a/tests/test_smoothing.py +++ b/tests/test_smoothing.py @@ -78,7 +78,7 @@ def test_cholesky(self): t = np.linspace(0, 1, 5) x = np.sin(2 * np.pi * t) + np.cos(2 * np.pi * t) basis = BSpline((0, 1), n_basis=5) - fd = FDataGrid(data_matrix=x, sample_points=t) + fd = FDataGrid(data_matrix=x, grid_points=t) smoother = smoothing.BasisSmoother( basis=basis, smoothing_parameter=10, @@ -96,7 +96,7 @@ def test_qr(self): t = np.linspace(0, 1, 5) x = np.sin(2 * np.pi * t) + np.cos(2 * np.pi * t) basis = BSpline((0, 1), n_basis=5) - fd = FDataGrid(data_matrix=x, sample_points=t) + fd = FDataGrid(data_matrix=x, grid_points=t) smoother = smoothing.BasisSmoother( basis=basis, smoothing_parameter=10, @@ -116,7 +116,7 @@ def test_monomial_smoothing(self): t = np.linspace(0, 1, 5) x = np.sin(2 * np.pi * t) + np.cos(2 * np.pi * t) basis = Monomial(n_basis=4) - fd = FDataGrid(data_matrix=x, sample_points=t) + fd = FDataGrid(data_matrix=x, grid_points=t) smoother = smoothing.BasisSmoother( basis=basis, smoothing_parameter=1, From 
d4090262e88fd28c9aae3483ac0ad5c58ab8ca3f Mon Sep 17 00:00:00 2001 From: vnmabus Date: Tue, 13 Oct 2020 00:55:30 +0200 Subject: [PATCH 060/210] Added integrated depth. --- skfda/exploratory/depth/_depth.py | 127 +++++++++++++++--------- skfda/exploratory/depth/multivariate.py | 92 ++++++++++++++++- 2 files changed, 173 insertions(+), 46 deletions(-) diff --git a/skfda/exploratory/depth/_depth.py b/skfda/exploratory/depth/_depth.py index 3c7833e79..d4c2e0849 100644 --- a/skfda/exploratory/depth/_depth.py +++ b/skfda/exploratory/depth/_depth.py @@ -3,6 +3,7 @@ This module includes different methods to order functional data, from the center (larger values) outwards(smaller ones).""" +import abc from functools import reduce import math @@ -11,11 +12,91 @@ import numpy as np +from . import multivariate + __author__ = "Amanda Hernando Bernabé" __email__ = "amanda.hernando@estudiante.uam.es" +class FunctionalDepth(multivariate.Depth): + """ + Abstract class representing a functional depth function. + + Usually it will accept a distribution in the initializer. + + """ + pass + + +def _cumulative_distribution(column): + """Calculates the cumulative distribution function of the values passed to + the function and evaluates it at each point. + + Args: + column (numpy.darray): Array containing the values over which the + distribution function is calculated. + + Returns: + numpy.darray: Array containing the evaluation at each point of the + distribution function. + + Examples: + >>> _cumulative_distribution(np.array([1, 4, 5, 1, 2, 2, 4, 1, 1, 3])) + array([ 0.4, 0.9, 1. , 0.4, 0.6, 0.6, 0.9, 0.4, 0.4, 0.7]) + + """ + if len(column.shape) != 1: + raise ValueError("Only supported 1 dimensional arrays.") + _, indexes, counts = np.unique(column, return_inverse=True, + return_counts=True) + count_cumulative = np.cumsum(counts) / len(column) + return count_cumulative[indexes].reshape(column.shape) + + +class IntegratedDepth(FunctionalDepth): + """ + Functional depth as the integral of a multivariate depth. + + """ + + def __init__(self, distribution, *, + multivariate_depth=multivariate._UnivariateFraimanMuniz): + if distribution.dim_domain > 1 or distribution.dim_codomain > 1: + raise ValueError("Currently multivariate data is not allowed") + + self._domain_range = distribution.domain_range + self._grid_points = distribution.grid_points + self._multivariate_depth = multivariate_depth(distribution.data_matrix) + + def __call__(self, data_points, pointwise=False): + if data_points.dim_domain > 1 or data_points.dim_codomain > 1: + raise ValueError("Currently multivariate data is not allowed") + + pointwise_depth = self._multivariate_depth(data_points.data_matrix) + + if pointwise: + return pointwise_depth + else: + + interval_len = (self._domain_range[0][1] + - self._domain_range[0][0]) + + depth = (scipy.integrate.simps(pointwise_depth, + self._grid_points[0]) + / interval_len) + + return depth + + @property + def max(self): + return self._multivariate_depth.max + + @property + def min(self): + return self._multivariate_depth.min + + def outlyingness_to_depth(outlyingness, *, supreme=None): r"""Convert outlyingness function to depth function. @@ -222,31 +303,6 @@ def modified_band_depth(fdatagrid, *, pointwise=False): return depth -def _cumulative_distribution(column): - """Calculates the cumulative distribution function of the values passed to - the function and evaluates it at each point. - - Args: - column (numpy.darray): Array containing the values over which the - distribution function is calculated. 
- - Returns: - numpy.darray: Array containing the evaluation at each point of the - distribution function. - - Examples: - >>> _cumulative_distribution(np.array([1, 4, 5, 1, 2, 2, 4, 1, 1, 3])) - array([ 0.4, 0.9, 1. , 0.4, 0.6, 0.6, 0.9, 0.4, 0.4, 0.7]) - - """ - if len(column.shape) != 1: - raise ValueError("Only supported 1 dimensional arrays.") - _, indexes, counts = np.unique(column, return_inverse=True, - return_counts=True) - count_cumulative = np.cumsum(counts) / len(column) - return count_cumulative[indexes].reshape(column.shape) - - def fraiman_muniz_depth(fdatagrid, *, pointwise=False): r"""Implementation of Fraiman and Muniz (FM) Depth for functional data. @@ -302,23 +358,4 @@ def fraiman_muniz_depth(fdatagrid, *, pointwise=False): """ - if fdatagrid.dim_domain > 1 or fdatagrid.dim_codomain > 1: - raise ValueError("Currently multivariate data is not allowed") - - pointwise_depth = np.array([ - 1 - abs(0.5 - _cumulative_distribution( - fdatagrid.data_matrix[:, i, 0]) - ) for i in range(len(fdatagrid.grid_points[0]))]).T - - if pointwise: - return pointwise_depth - else: - - interval_len = (fdatagrid.domain_range[0][1] - - fdatagrid.domain_range[0][0]) - - depth = (scipy.integrate.simps(pointwise_depth, - fdatagrid.grid_points[0]) - / interval_len) - - return depth + return IntegratedDepth(fdatagrid)(fdatagrid, pointwise=pointwise) diff --git a/skfda/exploratory/depth/multivariate.py b/skfda/exploratory/depth/multivariate.py index 2fb9f6a2e..31b99b536 100644 --- a/skfda/exploratory/depth/multivariate.py +++ b/skfda/exploratory/depth/multivariate.py @@ -1,8 +1,97 @@ +import abc + import scipy.stats import numpy as np -from . import outlyingness_to_depth + +class Depth(abc.ABC): + """ + Abstract class representing a depth function. + + Usually it will accept a distribution in the initializer. + + """ + + @abc.abstractmethod + def __init__(self, distribution): + pass + + @abc.abstractmethod + def __call__(self, data_points): + """ + Evaluate the depth over a different set of points. + + """ + pass + + @property + def max(self): + """ + Maximum (or supremum if there is no maximum) of the depth values. + + """ + return 1 + + @property + def min(self): + """ + Minimum (or infimum if there is no maximum) of the depth values. + + """ + return 0 + + +def _cumulative_one_dim(array, values): + searched_index = np.searchsorted(array, values, side='right') + + return searched_index / len(array) + + +_cumulative_distribution_ordered = np.vectorize( + _cumulative_one_dim, + signature='(n),(m)->(m)') + + +def _cumulative_distribution(column): + """Calculates the cumulative distribution function of the values passed to + the function and evaluates it at each point. + + Args: + column (numpy.darray): Array containing the values over which the + distribution function is calculated. + + Returns: + numpy.darray: Array containing the evaluation at each point of the + distribution function. + + Examples: + >>> _cumulative_distribution(np.array([1, 4, 5, 1, 2, 2, 4, 1, 1, 3])) + array([ 0.4, 0.9, 1. , 0.4, 0.6, 0.6, 0.9, 0.4, 0.4, 0.7]) + + """ + return _cumulative_distribution_ordered(np.sort(column), column) + + +class _UnivariateFraimanMuniz(Depth): + """ + Univariate depth used to compute the Fraiman an Muniz depth. 
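+
+    For a value x, the depth is 1 - |1/2 - F(x)|, where F is the
+    empirical distribution function of the data passed at construction.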
+ """ + + def __init__(self, distribution): + self.sorted_values = np.sort(distribution, axis=0) + + def __call__(self, data_points): + cum_dist = _cumulative_distribution_ordered( + np.moveaxis(self.sorted_values, 0, -1), + np.moveaxis(data_points, 0, -1)) + + assert cum_dist.shape[-2] == 1 + return 1 - np.abs(0.5 - np.moveaxis(cum_dist, -1, 0)[..., 0]) + + @property + def min(self): + return 1 / 2 def _stagel_donoho_outlyingness(X, *, pointwise=False): @@ -27,6 +116,7 @@ def projection_depth(X, *, pointwise=False): The projection depth is the depth function associated with the Stagel-Donoho outlyingness. """ + from . import outlyingness_to_depth depth = outlyingness_to_depth(_stagel_donoho_outlyingness) From b730955f9ac8c3d6f1e8636601a174c64056bcb7 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Tue, 13 Oct 2020 17:55:44 +0200 Subject: [PATCH 061/210] Use Sklearn API for depths. --- skfda/exploratory/depth/_depth.py | 25 +++++---- skfda/exploratory/depth/multivariate.py | 68 +++++++++++++++++++++---- 2 files changed, 72 insertions(+), 21 deletions(-) diff --git a/skfda/exploratory/depth/_depth.py b/skfda/exploratory/depth/_depth.py index d4c2e0849..a7a6a79bd 100644 --- a/skfda/exploratory/depth/_depth.py +++ b/skfda/exploratory/depth/_depth.py @@ -60,20 +60,24 @@ class IntegratedDepth(FunctionalDepth): """ - def __init__(self, distribution, *, - multivariate_depth=multivariate._UnivariateFraimanMuniz): - if distribution.dim_domain > 1 or distribution.dim_codomain > 1: + def __init__(self, *, + multivariate_depth=multivariate._UnivariateFraimanMuniz()): + self._multivariate_depth = multivariate_depth + + def fit(self, X, y=None): + if X.dim_domain > 1 or X.dim_codomain > 1: raise ValueError("Currently multivariate data is not allowed") - self._domain_range = distribution.domain_range - self._grid_points = distribution.grid_points - self._multivariate_depth = multivariate_depth(distribution.data_matrix) + self._domain_range = X.domain_range + self._grid_points = X.grid_points + self._multivariate_depth.fit(X.data_matrix) + return self - def __call__(self, data_points, pointwise=False): - if data_points.dim_domain > 1 or data_points.dim_codomain > 1: + def predict(self, X, *, pointwise=False): + if X.dim_domain > 1 or X.dim_codomain > 1: raise ValueError("Currently multivariate data is not allowed") - pointwise_depth = self._multivariate_depth(data_points.data_matrix) + pointwise_depth = self._multivariate_depth.predict(X.data_matrix) if pointwise: return pointwise_depth @@ -358,4 +362,5 @@ def fraiman_muniz_depth(fdatagrid, *, pointwise=False): """ - return IntegratedDepth(fdatagrid)(fdatagrid, pointwise=pointwise) + return IntegratedDepth().fit(fdatagrid).predict( + fdatagrid, pointwise=pointwise) diff --git a/skfda/exploratory/depth/multivariate.py b/skfda/exploratory/depth/multivariate.py index 31b99b536..994287cb7 100644 --- a/skfda/exploratory/depth/multivariate.py +++ b/skfda/exploratory/depth/multivariate.py @@ -1,11 +1,12 @@ import abc import scipy.stats +import sklearn import numpy as np -class Depth(abc.ABC): +class Depth(abc.ABC, sklearn.base.BaseEstimator): """ Abstract class representing a depth function. @@ -13,18 +14,61 @@ class Depth(abc.ABC): """ - @abc.abstractmethod - def __init__(self, distribution): - pass + def fit(self, X, y=None): + """ + Learn the distribution from the observations. + + Args: + X: Functional dataset from which the distribution of the data is + inferred. + y: Unused. Kept only for convention. + + Returns: + self: Fitted estimator. 
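+
+        The base implementation stores nothing, so stateless depths can
+        use it unchanged; subclasses override it to learn from ``X``.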
+ + """ + return self @abc.abstractmethod - def __call__(self, data_points): + def predict(self, X): """ - Evaluate the depth over a different set of points. + Compute the depth inside the learned distribution. + + Args: + X: Points whose depth is going to be evaluated. """ pass + def fit_predict(self, X, y=None): + """ + Compute the depth of each observation with respect to the whole + dataset. + + Args: + X: Dataset. + y: Unused. Kept only for convention. + + """ + return self.fit(X).predict(X) + + def __call__(self, X, distribution=None): + """ + Allows the depth to be used as a function. + + Args: + X: Points whose depth is going to be evaluated. + distribution: Functional dataset from which the distribution of + the data is inferred. If ``None`` it is the same as ``X``. + + """ + copy = sklearn.base.clone(self) + + if distribution is None: + return copy.fit_predict(X) + else: + return copy.fit(distribution).predict(X) + @property def max(self): """ @@ -36,7 +80,7 @@ def max(self): @property def min(self): """ - Minimum (or infimum if there is no maximum) of the depth values. + Minimum (or infimum if there is no maximum) of the depth values. """ return 0 @@ -76,15 +120,17 @@ def _cumulative_distribution(column): class _UnivariateFraimanMuniz(Depth): """ Univariate depth used to compute the Fraiman an Muniz depth. + """ - def __init__(self, distribution): - self.sorted_values = np.sort(distribution, axis=0) + def fit(self, X, y=None): + self.sorted_values = np.sort(X, axis=0) + return self - def __call__(self, data_points): + def predict(self, X): cum_dist = _cumulative_distribution_ordered( np.moveaxis(self.sorted_values, 0, -1), - np.moveaxis(data_points, 0, -1)) + np.moveaxis(X, 0, -1)) assert cum_dist.shape[-2] == 1 return 1 - np.abs(0.5 - np.moveaxis(cum_dist, -1, 0)[..., 0]) From e5cab83386ab86858d59440f891e5985075d9aec Mon Sep 17 00:00:00 2001 From: vnmabus Date: Sat, 17 Oct 2020 18:29:01 +0200 Subject: [PATCH 062/210] Implement modified_band_depth as integrated depth. Fixes bug in modified_band_depth when some values are the same. --- skfda/_utils/_utils.py | 18 ++++++ skfda/exploratory/depth/_depth.py | 43 +++++--------- skfda/exploratory/depth/multivariate.py | 76 +++++++++++++++++++++---- tests/test_magnitude_shape.py | 75 ++++++++++++------------ 4 files changed, 134 insertions(+), 78 deletions(-) diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index 4da22d8e0..f3fc2a9de 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -1,5 +1,6 @@ """Module with generic methods""" +from builtins import getattr import functools import numbers import types @@ -422,6 +423,23 @@ def _int_to_real(array): return array + 0.0 +def _integral_fdata(function, operation=None): + + integrand = function + + if operation is not None: + try: + integrand = operation(function) + except TypeError: + def integrand(x): return operation(function(x)[:, 0, :]) + + integral = nquad_vec( + integrand, + function.domain_range) + + return integral + + def _check_array_key(array, key): """ Checks a getitem key. 
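
The __call__ convenience added above lets a depth object be used as a plain function. A minimal sketch, using the private univariate depth defined earlier in that module purely for illustration, on toy arrays:

    import numpy as np
    from skfda.exploratory.depth.multivariate import _UnivariateFraimanMuniz

    distribution = np.array([[1.], [2.], [3.], [4.], [5.]])
    points = np.array([[3.], [5.]])

    depth = _UnivariateFraimanMuniz()
    depth(points, distribution=distribution)  # clone, fit, then predict
    depth(distribution)                       # equivalent to fit_predict
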
diff --git a/skfda/exploratory/depth/_depth.py b/skfda/exploratory/depth/_depth.py index a7a6a79bd..692d6ff9c 100644 --- a/skfda/exploratory/depth/_depth.py +++ b/skfda/exploratory/depth/_depth.py @@ -65,8 +65,6 @@ def __init__(self, *, self._multivariate_depth = multivariate_depth def fit(self, X, y=None): - if X.dim_domain > 1 or X.dim_codomain > 1: - raise ValueError("Currently multivariate data is not allowed") self._domain_range = X.domain_range self._grid_points = X.grid_points @@ -74,8 +72,6 @@ def fit(self, X, y=None): return self def predict(self, X, *, pointwise=False): - if X.dim_domain > 1 or X.dim_codomain > 1: - raise ValueError("Currently multivariate data is not allowed") pointwise_depth = self._multivariate_depth.predict(X.data_matrix) @@ -86,11 +82,16 @@ def predict(self, X, *, pointwise=False): interval_len = (self._domain_range[0][1] - self._domain_range[0][0]) - depth = (scipy.integrate.simps(pointwise_depth, - self._grid_points[0]) - / interval_len) + integrand = pointwise_depth - return depth + for d, s in zip(X.domain_range, X.grid_points): + integrand = scipy.integrate.simps(integrand, + x=s, + axis=1) + interval_len = d[1] - d[0] + integrand /= interval_len + + return integrand @property def max(self): @@ -276,7 +277,7 @@ def modified_band_depth(fdatagrid, *, pointwise=False): >>> fd = skfda.FDataGrid(data_matrix, grid_points) >>> depth = modified_band_depth(fd) >>> depth.round(2) - array([ 0.5 , 0.83, 0.72, 0.67]) + array([ 0.5 , 0.83, 0.73, 0.67]) >>> pointwise = modified_band_depth(fd, pointwise = True) >>> pointwise.round(2) array([[ 0.5 , 0.5 , 0.5 , 0.5 , 0.5 , 0.5 ], @@ -285,26 +286,10 @@ def modified_band_depth(fdatagrid, *, pointwise=False): [ 0.83, 0.83, 0.83, 0.5 , 0.5 , 0.5 ]]) """ - n = fdatagrid.n_samples - nchoose2 = n * (n - 1) / 2 - - ranks = _rank_samples(fdatagrid) - n_samples_above = fdatagrid.n_samples - ranks - n_samples_below = ranks - 1 - match = n_samples_above * n_samples_below - axis = tuple(range(1, fdatagrid.dim_domain + 1)) - - if pointwise: - depth_pointwise = (match + fdatagrid.n_samples - 1) / nchoose2 - - return depth_pointwise - else: - npoints_sample = reduce(lambda x, y: x * len(y), - fdatagrid.grid_points, 1) - proportion = match.sum(axis=axis) / npoints_sample - depth = (proportion + fdatagrid.n_samples - 1) / nchoose2 - - return depth + return IntegratedDepth( + multivariate_depth=multivariate.SimplicialDepth()).fit( + fdatagrid).predict( + fdatagrid, pointwise=pointwise) def fraiman_muniz_depth(fdatagrid, *, pointwise=False): diff --git a/skfda/exploratory/depth/multivariate.py b/skfda/exploratory/depth/multivariate.py index 994287cb7..3152b6198 100644 --- a/skfda/exploratory/depth/multivariate.py +++ b/skfda/exploratory/depth/multivariate.py @@ -1,4 +1,5 @@ import abc +from scipy.special import comb import scipy.stats import sklearn @@ -86,15 +87,20 @@ def min(self): return 0 -def _cumulative_one_dim(array, values): - searched_index = np.searchsorted(array, values, side='right') +def _searchsorted_one_dim(array, values, *, side='left'): + searched_index = np.searchsorted(array, values, side=side) - return searched_index / len(array) + return searched_index -_cumulative_distribution_ordered = np.vectorize( - _cumulative_one_dim, - signature='(n),(m)->(m)') +_searchsorted_vectorized = np.vectorize( + _searchsorted_one_dim, + signature='(n),(m),()->(m)', + excluded='side') + + +def _searchsorted_ordered(array, values, *, side='left'): + return _searchsorted_vectorized(array, values, side=side) def 
_cumulative_distribution(column): @@ -114,7 +120,8 @@ def _cumulative_distribution(column): array([ 0.4, 0.9, 1. , 0.4, 0.6, 0.6, 0.9, 0.4, 0.4, 0.7]) """ - return _cumulative_distribution_ordered(np.sort(column), column) + return _searchsorted_ordered(np.sort(column), column, + side='right') / len(column) class _UnivariateFraimanMuniz(Depth): @@ -124,13 +131,13 @@ class _UnivariateFraimanMuniz(Depth): """ def fit(self, X, y=None): - self.sorted_values = np.sort(X, axis=0) + self._sorted_values = np.sort(X, axis=0) return self def predict(self, X): - cum_dist = _cumulative_distribution_ordered( - np.moveaxis(self.sorted_values, 0, -1), - np.moveaxis(X, 0, -1)) + cum_dist = _searchsorted_ordered( + np.moveaxis(self._sorted_values, 0, -1), + np.moveaxis(X, 0, -1), side='right') / len(self._sorted_values) assert cum_dist.shape[-2] == 1 return 1 - np.abs(0.5 - np.moveaxis(cum_dist, -1, 0)[..., 0]) @@ -140,6 +147,53 @@ def min(self): return 1 / 2 +class SimplicialDepth(Depth): + """ + Simplicial depth. + + """ + + def fit(self, X, y=None): + self._dim = X.shape[-1] + + if self._dim == 1: + self.sorted_values = np.sort(X, axis=0) + else: + raise NotImplementedError("SimplicialDepth is currently only " + "implemented for one-dimensional data.") + + return self + + def predict(self, X): + + assert self._dim == X.shape[-1] + + if self._dim == 1: + positions_left = _searchsorted_ordered( + np.moveaxis(self.sorted_values, 0, -1), + np.moveaxis(X, 0, -1)) + + positions_left = np.moveaxis(positions_left, -1, 0)[..., 0] + + positions_right = _searchsorted_ordered( + np.moveaxis(self.sorted_values, 0, -1), + np.moveaxis(X, 0, -1), side='right') + + positions_right = np.moveaxis(positions_right, -1, 0)[..., 0] + + num_strictly_below = positions_left + num_strictly_above = len(self.sorted_values) - positions_right + + total_pairs = comb(len(self.sorted_values), 2) + + return (total_pairs - comb(num_strictly_below, 2) + - comb(num_strictly_above, 2)) / total_pairs + + @property + def min(self): + return 1 / 2 + + def _stagel_donoho_outlyingness(X, *, pointwise=False): if pointwise is False: diff --git a/tests/test_magnitude_shape.py b/tests/test_magnitude_shape.py index 4911af0f0..45667a0f6 100644 --- a/tests/test_magnitude_shape.py +++ b/tests/test_magnitude_shape.py @@ -15,46 +15,45 @@ def test_magnitude_shape_plot(self): msplot = MagnitudeShapePlot( fd_temperatures, depth_method=modified_band_depth) np.testing.assert_allclose(msplot.points, - np.array([[0.25839562, 3.14995827], - [1.3774155, 0.91556716], - [0.94389069, 2.74940766], - [2.10767177, 7.22065509], - [0.82331252, 0.8250163], - [1.22912089, 0.2194518], - [-2.65530111, 0.9666511], - [0.15784599, 0.99960958], - [-0.43631897, 0.66055387], - [0.70501476, 0.66301126], - [0.72895263, 0.33074653], - [3.47490723, 12.5630275], - [3.14674773, 13.81447167], - [3.51793514, 10.46943904], - [3.94435195, 15.24142224], - [-0.48353674, 0.50215652], - [0.64316089, 6.81513544], - [-0.82957845, 0.80903798], - [-3.4617439, 1.10389229], - [0.2218012, 1.76299192], - [-0.54253359, 0.94968438], - [-1.70841274, 0.61708188], - [-0.44040451, 0.77602089], - [0.13813459, 1.02279698], - [7.57827303, 40.70985885], - [7.55791925, 35.94093086], - [7.10977399, 45.84310211], - [0.05730784, 1.75335899], - [1.52672644, 8.82803475], - [-1.48288999, 0.22412958], - [-2.84526533, 4.49585828], - [-2.41633786, 1.46528758], - [-5.87118328, 5.34300766], - [-5.42854833, 5.1694065], - [-16.34459211, 0.9397118]] - )) + np.array([[0.2112587, 3.0322570], + [1.2823448, 0.8272850], + [0.8646544, 
1.8619370], + [1.9862512, 5.5287354], + [0.7534918, 0.7203502], + [1.1325291, 0.2808455], + [-2.650529, 0.9702889], + [0.1434387, 0.9159834], + [-0.402844, 0.6413531], + [0.6354411, 0.6934311], + [0.5727553, 0.4628254], + [3.0524899, 8.8008899], + [2.7355803, 10.338497], + [3.1179374, 7.0686220], + [3.4944047, 11.479432], + [-0.402532, 0.5253690], + [0.5782190, 5.5400704], + [-0.839887, 0.7350041], + [-3.456470, 1.1156415], + [0.2260207, 1.5071672], + [-0.561562, 0.8836978], + [-1.690263, 0.6392155], + [-0.385394, 0.7401909], + [0.1467050, 0.9090058], + [7.1811993, 39.003407], + [6.8943132, 30.968126], + [6.6227164, 41.448548], + [0.0726709, 1.5960063], + [1.4450617, 8.7183435], + [-1.459836, 0.2719813], + [-2.824349, 4.5729382], + [-2.390462, 1.5464775], + [-5.869571, 5.3517279], + [-5.426019, 5.1817219], + [-16.34459, 0.9397117]]), rtol=1e-5) np.testing.assert_array_almost_equal(msplot.outliers, np.array( - [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, - 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1])) From a2067a6f1a3be74600c2c9277002a855d08d2eba Mon Sep 17 00:00:00 2001 From: vnmabus Date: Sat, 17 Oct 2020 19:02:44 +0200 Subject: [PATCH 063/210] Remove integration auxiliary function. --- skfda/_utils/_utils.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index f3fc2a9de..9241dea3b 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -423,23 +423,6 @@ def _int_to_real(array): return array + 0.0 -def _integral_fdata(function, operation=None): - - integrand = function - - if operation is not None: - try: - integrand = operation(function) - except TypeError: - def integrand(x): return operation(function(x)[:, 0, :]) - - integral = nquad_vec( - integrand, - function.domain_range) - - return integral - - def _check_array_key(array, key): """ Checks a getitem key. From 819fabdb567f432ff75900a98d53e78eb0609965 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Sat, 17 Oct 2020 20:20:12 +0200 Subject: [PATCH 064/210] Add tests for equal samples. --- skfda/exploratory/depth/__init__.py | 6 ++++-- skfda/exploratory/depth/_depth.py | 20 +++++++++++-------- tests/test_depth.py | 30 +++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 10 deletions(-) create mode 100644 tests/test_depth.py diff --git a/skfda/exploratory/depth/__init__.py b/skfda/exploratory/depth/__init__.py index 78e552ff0..1c4359880 100644 --- a/skfda/exploratory/depth/__init__.py +++ b/skfda/exploratory/depth/__init__.py @@ -1,5 +1,7 @@ -from ._depth import (band_depth, +from . import multivariate +from ._depth import (IntegratedDepth, + ModifiedBandDepth, + band_depth, modified_band_depth, fraiman_muniz_depth, outlyingness_to_depth) -from . 
import multivariate diff --git a/skfda/exploratory/depth/_depth.py b/skfda/exploratory/depth/_depth.py index 692d6ff9c..a6f3921aa 100644 --- a/skfda/exploratory/depth/_depth.py +++ b/skfda/exploratory/depth/_depth.py @@ -62,18 +62,18 @@ class IntegratedDepth(FunctionalDepth): def __init__(self, *, multivariate_depth=multivariate._UnivariateFraimanMuniz()): - self._multivariate_depth = multivariate_depth + self.multivariate_depth = multivariate_depth def fit(self, X, y=None): self._domain_range = X.domain_range self._grid_points = X.grid_points - self._multivariate_depth.fit(X.data_matrix) + self.multivariate_depth.fit(X.data_matrix) return self def predict(self, X, *, pointwise=False): - pointwise_depth = self._multivariate_depth.predict(X.data_matrix) + pointwise_depth = self.multivariate_depth.predict(X.data_matrix) if pointwise: return pointwise_depth @@ -95,11 +95,17 @@ def predict(self, X, *, pointwise=False): @property def max(self): - return self._multivariate_depth.max + return self.multivariate_depth.max @property def min(self): - return self._multivariate_depth.min + return self.multivariate_depth.min + + +class ModifiedBandDepth(IntegratedDepth): + + def __init__(self): + super().__init__(multivariate_depth=multivariate.SimplicialDepth()) def outlyingness_to_depth(outlyingness, *, supreme=None): @@ -286,9 +292,7 @@ def modified_band_depth(fdatagrid, *, pointwise=False): [ 0.83, 0.83, 0.83, 0.5 , 0.5 , 0.5 ]]) """ - return IntegratedDepth( - multivariate_depth=multivariate.SimplicialDepth()).fit( - fdatagrid).predict( + return ModifiedBandDepth().fit(fdatagrid).predict( fdatagrid, pointwise=pointwise) diff --git a/tests/test_depth.py b/tests/test_depth.py new file mode 100644 index 000000000..fe746d792 --- /dev/null +++ b/tests/test_depth.py @@ -0,0 +1,30 @@ +import skfda +from skfda.exploratory.depth import IntegratedDepth, ModifiedBandDepth +import unittest +import numpy as np + + +class TestsDepthSameCurves(unittest.TestCase): + + def setUp(self): + data_matrix = [[1, 2, 3, 4], + [1, 2, 3, 4], + [1, 2, 3, 4], + [1, 2, 3, 4], + [1, 2, 3, 4]] + + self.fd = skfda.FDataGrid(data_matrix) + + def test_integrated_equal(self): + + depth = IntegratedDepth() + + np.testing.assert_almost_equal( + depth(self.fd), [0.5, 0.5, 0.5, 0.5, 0.5]) + + def test_modified_band_depth_equal(self): + + depth = ModifiedBandDepth() + + np.testing.assert_almost_equal( + depth(self.fd), [1, 1, 1, 1, 1]) From cbd5cef55ff4ca892da86a7d045132e0c5791723 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Sun, 18 Oct 2020 20:31:06 +0200 Subject: [PATCH 065/210] Band depth. --- skfda/exploratory/depth/__init__.py | 1 + skfda/exploratory/depth/_depth.py | 131 +++++++++------------------- 2 files changed, 40 insertions(+), 92 deletions(-) diff --git a/skfda/exploratory/depth/__init__.py b/skfda/exploratory/depth/__init__.py index 1c4359880..e45c7564e 100644 --- a/skfda/exploratory/depth/__init__.py +++ b/skfda/exploratory/depth/__init__.py @@ -1,6 +1,7 @@ from . 
import multivariate from ._depth import (IntegratedDepth, ModifiedBandDepth, + BandDepth, band_depth, modified_band_depth, fraiman_muniz_depth, diff --git a/skfda/exploratory/depth/_depth.py b/skfda/exploratory/depth/_depth.py index a6f3921aa..f4fffa7b3 100644 --- a/skfda/exploratory/depth/_depth.py +++ b/skfda/exploratory/depth/_depth.py @@ -3,8 +3,7 @@ This module includes different methods to order functional data, from the center (larger values) outwards(smaller ones).""" -import abc -from functools import reduce +import itertools import math import scipy.integrate @@ -29,31 +28,6 @@ class FunctionalDepth(multivariate.Depth): pass -def _cumulative_distribution(column): - """Calculates the cumulative distribution function of the values passed to - the function and evaluates it at each point. - - Args: - column (numpy.darray): Array containing the values over which the - distribution function is calculated. - - Returns: - numpy.darray: Array containing the evaluation at each point of the - distribution function. - - Examples: - >>> _cumulative_distribution(np.array([1, 4, 5, 1, 2, 2, 4, 1, 1, 3])) - array([ 0.4, 0.9, 1. , 0.4, 0.6, 0.6, 0.9, 0.4, 0.4, 0.7]) - - """ - if len(column.shape) != 1: - raise ValueError("Only supported 1 dimensional arrays.") - _, indexes, counts = np.unique(column, return_inverse=True, - return_counts=True) - count_cumulative = np.cumsum(counts) / len(column) - return count_cumulative[indexes].reshape(column.shape) - - class IntegratedDepth(FunctionalDepth): """ Functional depth as the integral of a multivariate depth. @@ -108,6 +82,42 @@ def __init__(self): super().__init__(multivariate_depth=multivariate.SimplicialDepth()) +class BandDepth(FunctionalDepth): + """ + Functional depth as the integral of a multivariate depth. + + """ + + def fit(self, X, y=None): + + if X.dim_codomain != 1: + raise NotImplementedError("Band depth not implemented for vector " + "valued functions") + + self._distribution = X + return self + + def predict(self, X, *, pointwise=False): + + num_in = 0 + n_total = 0 + + for f1, f2 in itertools.combinations(self._distribution, 2): + between_range_1 = (f1.data_matrix <= X.data_matrix) & ( + X.data_matrix <= f2.data_matrix) + + between_range_2 = (f2.data_matrix <= X.data_matrix) & ( + X.data_matrix <= f1.data_matrix) + + between_range = between_range_1 | between_range_2 + + num_in += np.all(between_range, + axis=tuple(range(1, X.data_matrix.ndim))) + n_total += 1 + + return num_in / n_total + + def outlyingness_to_depth(outlyingness, *, supreme=None): r"""Convert outlyingness function to depth function. @@ -147,60 +157,6 @@ def depth(*args, **kwargs): return depth -def _rank_samples(fdatagrid): - """Ranks the he samples in the FDataGrid at each point of discretisation. - - Args: - fdatagrid (FDataGrid): Object whose samples are ranked. - - Returns: - numpy.darray: Array containing the ranks of the sample points. - - Examples: - Univariate setting: - - >>> import skfda - >>> - >>> data_matrix = [[1, 1, 2, 3, 2.5, 2], - ... [0.5, 0.5, 1, 2, 1.5, 1], - ... [-1, -1, -0.5, 1, 1, 0.5], - ... [-0.5, -0.5, -0.5, -1, -1, -1]] - >>> grid_points = [0, 2, 4, 6, 8, 10] - >>> fd = skfda.FDataGrid(data_matrix, grid_points) - >>> _rank_samples(fd) - array([[ 4., 4., 4., 4., 4., 4.], - [ 3., 3., 3., 3., 3., 3.], - [ 1., 1., 2., 2., 2., 2.], - [ 2., 2., 2., 1., 1., 1.]]) - - Several input dimensions: - - >>> data_matrix = [[[[1], [0.7], [1]], - ... [[4], [0.4], [5]]], - ... [[[2], [0.5], [2]], - ... 
[[3], [0.6], [3]]]] - >>> grid_points = [[2, 4], [3, 6, 8]] - >>> fd = skfda.FDataGrid(data_matrix, grid_points) - >>> _rank_samples(fd) - array([[[ 1., 2., 1.], - [ 2., 1., 2.]], - [[ 2., 1., 2.], - [ 1., 2., 1.]]]) - - - - """ - if fdatagrid.dim_codomain > 1: - raise ValueError("Currently multivariate data is not allowed") - - ranks = np.zeros(fdatagrid.data_matrix.shape[:-1]) - - for index, _ in np.ndenumerate(ranks[0]): - ranks[(slice(None),) + index] = rankdata( - fdatagrid.data_matrix[(slice(None),) + index + (0,)], method='max') - return ranks - - def band_depth(fdatagrid, *, pointwise=False): """Implementation of Band Depth for functional data. @@ -238,17 +194,8 @@ def band_depth(fdatagrid, *, pointwise=False): if pointwise: return modified_band_depth(fdatagrid, pointwise) else: - n = fdatagrid.n_samples - nchoose2 = n * (n - 1) / 2 - - ranks = _rank_samples(fdatagrid) - axis = tuple(range(1, fdatagrid.dim_domain + 1)) - n_samples_above = fdatagrid.n_samples - np.amax(ranks, axis=axis) - n_samples_below = np.amin(ranks, axis=axis) - 1 - depth = ((n_samples_below * n_samples_above + fdatagrid.n_samples - 1) - / nchoose2) - - return depth + return BandDepth().fit(fdatagrid).predict( + fdatagrid, pointwise=pointwise) def modified_band_depth(fdatagrid, *, pointwise=False): From 1fafd4876c3211570146cfed69b4f783902bedcb Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 19 Oct 2020 00:59:55 +0200 Subject: [PATCH 066/210] Changed ProjectionDepth to class. --- skfda/exploratory/depth/__init__.py | 3 +- skfda/exploratory/depth/_depth.py | 39 ----- skfda/exploratory/depth/multivariate.py | 146 +++++++++++++++--- .../outliers/_directional_outlyingness.py | 18 +-- .../visualization/_magnitude_shape_plot.py | 18 +-- tests/test_magnitude_shape.py | 2 +- tests/test_outliers.py | 2 +- 7 files changed, 142 insertions(+), 86 deletions(-) diff --git a/skfda/exploratory/depth/__init__.py b/skfda/exploratory/depth/__init__.py index e45c7564e..497b7460c 100644 --- a/skfda/exploratory/depth/__init__.py +++ b/skfda/exploratory/depth/__init__.py @@ -4,5 +4,4 @@ BandDepth, band_depth, modified_band_depth, - fraiman_muniz_depth, - outlyingness_to_depth) + fraiman_muniz_depth) diff --git a/skfda/exploratory/depth/_depth.py b/skfda/exploratory/depth/_depth.py index f4fffa7b3..3dfddf80c 100644 --- a/skfda/exploratory/depth/_depth.py +++ b/skfda/exploratory/depth/_depth.py @@ -118,45 +118,6 @@ def predict(self, X, *, pointwise=False): return num_in / n_total -def outlyingness_to_depth(outlyingness, *, supreme=None): - r"""Convert outlyingness function to depth function. - - An outlyingness function :math:`O(x)` can be converted to a depth - function as - - .. math:: - D(x) = \frac{1}{1 + O(x)} - - if :math:`O(x)` is unbounded or as - - .. math:: - D(x) = 1 - \frac{O(x)}{\sup O(x)} - - if :math:`O(x)` is bounded ([Se06]_). - - Args: - outlyingness (Callable): Outlyingness function. - supreme (float, optional): Supreme value of the outlyingness function. - - Returns: - Callable: The corresponding depth function. - - References: - .. [Se06] Serfling, R. (2006). Depth functions in nonparametric - multivariate inference. DIMACS Series in Discrete Mathematics and - Theoretical Computer Science, 72, 1. 
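
A quick usage sketch of the class-based band depth added in the previous commit (toy data and expected values taken from the doctest above):

    import skfda
    from skfda.exploratory.depth import BandDepth

    fd = skfda.FDataGrid([[1, 1, 2, 3, 2.5, 2],
                          [0.5, 0.5, 1, 2, 1.5, 1],
                          [-1, -1, -0.5, 1, 1, 0.5],
                          [-0.5, -0.5, -0.5, -1, -1, -1]],
                         [0, 2, 4, 6, 8, 10])

    BandDepth().fit(fd).predict(fd)  # array([ 0.5, 0.83333333, 0.5, 0.5 ])
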
- """ - - if supreme is None or math.isinf(supreme): - def depth(*args, **kwargs): - return 1 / (1 + outlyingness(*args, **kwargs)) - else: - def depth(*args, **kwargs): - return 1 - outlyingness(*args, **kwargs) / supreme - - return depth - - def band_depth(fdatagrid, *, pointwise=False): """Implementation of Band Depth for functional data. diff --git a/skfda/exploratory/depth/multivariate.py b/skfda/exploratory/depth/multivariate.py index 3152b6198..abe5b5743 100644 --- a/skfda/exploratory/depth/multivariate.py +++ b/skfda/exploratory/depth/multivariate.py @@ -1,4 +1,5 @@ import abc +import math from scipy.special import comb import scipy.stats @@ -7,11 +8,9 @@ import numpy as np -class Depth(abc.ABC, sklearn.base.BaseEstimator): +class _DepthOrOutlyingness(abc.ABC, sklearn.base.BaseEstimator): """ - Abstract class representing a depth function. - - Usually it will accept a distribution in the initializer. + Abstract class representing a depth or outlyingness function. """ @@ -33,7 +32,7 @@ def fit(self, X, y=None): @abc.abstractmethod def predict(self, X): """ - Compute the depth inside the learned distribution. + Compute the depth or outlyingness inside the learned distribution. Args: X: Points whose depth is going to be evaluated. @@ -43,8 +42,8 @@ def predict(self, X): def fit_predict(self, X, y=None): """ - Compute the depth of each observation with respect to the whole - dataset. + Compute the depth or outlyingness of each observation with respect to + the whole dataset. Args: X: Dataset. @@ -55,7 +54,7 @@ def fit_predict(self, X, y=None): def __call__(self, X, distribution=None): """ - Allows the depth to be used as a function. + Allows the depth or outlyingness to be used as a function. Args: X: Points whose depth is going to be evaluated. @@ -73,7 +72,8 @@ def __call__(self, X, distribution=None): @property def max(self): """ - Maximum (or supremum if there is no maximum) of the depth values. + Maximum (or supremum if there is no maximum) of the possibly predicted + values. """ return 1 @@ -81,12 +81,29 @@ def max(self): @property def min(self): """ - Minimum (or infimum if there is no maximum) of the depth values. + Minimum (or infimum if there is no maximum) of the possibly predicted + values. """ return 0 +class Depth(_DepthOrOutlyingness): + """ + Abstract class representing a depth function. + + """ + pass + + +class Outlyingness(_DepthOrOutlyingness): + """ + Abstract class representing an outlyingness function. + + """ + pass + + def _searchsorted_one_dim(array, values, *, side='left'): searched_index = np.searchsorted(array, values, side=side) @@ -194,20 +211,103 @@ def min(self): return 1 / 2 -def _stagel_donoho_outlyingness(X, *, pointwise=False): +class OutlyingnessBasedDepth(Depth): + r""" + Computes depth based on an outlyingness measure. + + An outlyingness function :math:`O(x)` can be converted to a depth + function as + + .. math:: + D(x) = \frac{1}{1 + O(x)} - if pointwise is False: - raise NotImplementedError("Only implemented pointwise") + if :math:`O(x)` is unbounded or as - if X.dim_codomain == 1: - # Special case, can be computed exactly - m = X.data_matrix[..., 0] + .. math:: + D(x) = 1 - \frac{O(x)}{\sup O(x)} - return (np.abs(m - np.median(m, axis=0)) / - scipy.stats.median_abs_deviation(m, axis=0, scale=1 / 1.4826)) + if :math:`O(x)` is bounded ([Se06]_). If the infimum value of the + outlyiness function is not zero, it is subtracted beforehand. 
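
A small numeric sketch of this conversion, with hypothetical outlyingness values and an assumed range of [1, 3]:

    import numpy as np

    o = np.array([1.0, 2.0, 3.0])  # hypothetical outlyingness values
    o_min, o_max = 1.0, 3.0        # infimum and supremum of the measure

    depth_bounded = 1 - (o - o_min) / (o_max - o_min)  # bounded case
    depth_unbounded = 1 / (1 + (o - o_min))            # if o_max were inf
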
- else: - raise NotImplementedError("Only implemented for one dimension") + Args: + outlyingness (Outlyingness): Outlyingness object. + + References: + .. [Se06] Serfling, R. (2006). Depth functions in nonparametric + multivariate inference. DIMACS Series in Discrete Mathematics and + Theoretical Computer Science, 72, 1. + + """ + + def __init__(self, outlyingness): + self.outlyingness = outlyingness + + def fit(self, X, y=None): + self.outlyingness.fit(X) + + return self + + def predict(self, X): + outlyingness_values = self.outlyingness.predict(X) + + min_val = self.outlyingness.min + max_val = self.outlyingness.max + + if(math.isinf(max_val)): + return 1 / (1 + outlyingness_values - min_val) + else: + return 1 - (outlyingness_values - min_val) / (max_val - min_val) + + +class StahelDonohoOutlyingness(Outlyingness): + r""" + Computes Stahel-Donoho outlyingness. + + Stahel-Donoho outlyingness is defined as + + .. math:: + \sup_{\|u\|=1} \frac{|u^T x - \text{Med}(u^T X))|}{\text{MAD}(u^TX)} + + where :math:`\text{X}` is a sample with distribution :math:`F`, + :math:`\text{Med}` is the median and :math:`\text{MAD}` is the + median absolute deviation. + + """ + + def fit(self, X, y=None): + + dim = X.shape[-1] + + if dim == 1: + self._location = np.median(X, axis=0) + self._scale = scipy.stats.median_abs_deviation( + X, axis=0, scale=1 / 1.4826) + else: + raise NotImplementedError("Only implemented for one dimension") + + return self + + def predict(self, X): + + dim = X.shape[-1] + + if dim == 1: + # Special case, can be computed exactly + return (np.abs(X - self._location) / + self._scale)[..., 0] + + else: + raise NotImplementedError("Only implemented for one dimension") + + @property + def max(self): + return np.inf + + +class ProjectionDepth(OutlyingnessBasedDepth): + + def __init__(self): + super().__init__(outlyingness=StahelDonohoOutlyingness()) def projection_depth(X, *, pointwise=False): @@ -216,8 +316,4 @@ def projection_depth(X, *, pointwise=False): The projection depth is the depth function associated with the Stagel-Donoho outlyingness. """ - from . import outlyingness_to_depth - - depth = outlyingness_to_depth(_stagel_donoho_outlyingness) - - return depth(X, pointwise=pointwise) + return ProjectionDepth().fit_predict(X.data_matrix) diff --git a/skfda/exploratory/outliers/_directional_outlyingness.py b/skfda/exploratory/outliers/_directional_outlyingness.py index 0887e3f5a..b20c645ef 100644 --- a/skfda/exploratory/outliers/_directional_outlyingness.py +++ b/skfda/exploratory/outliers/_directional_outlyingness.py @@ -22,7 +22,7 @@ class DirectionalOutlyingnessStats(typing.NamedTuple): def directional_outlyingness_stats( fdatagrid: FDataGrid, *, - depth_method=projection_depth, + multivariate_depth=projection_depth, pointwise_weights=None) -> DirectionalOutlyingnessStats: r"""Computes the directional outlyingness of the functional data. @@ -83,9 +83,9 @@ def directional_outlyingness_stats( Args: fdatagrid (FDataGrid): Object containing the samples to be ordered according to the directional outlyingness. - depth_method (:ref:`depth measure `, optional): Method - used to order the data. Defaults to :func:`modified band depth - `. + multivariate_depth (:ref:`depth measure `, optional): + Method used to order the data. Defaults to :func:`modified band + depth `. pointwise_weights (array_like, optional): an array containing the weights of each point of discretisation where values have been recorded. 
Defaults to the same weight for each of the points: @@ -162,7 +162,7 @@ def directional_outlyingness_stats( len(fdatagrid.grid_points[0])) / ( fdatagrid.domain_range[0][1] - fdatagrid.domain_range[0][0]) - depth_pointwise = depth_method(fdatagrid, pointwise=True) + depth_pointwise = multivariate_depth(fdatagrid, pointwise=True) assert depth_pointwise.shape == fdatagrid.data_matrix.shape[:-1] # Obtaining the pointwise median sample Z, to calculate @@ -253,7 +253,7 @@ class DirectionalOutlierDetector(BaseEstimator, OutlierMixin): detecting outliers under a normal distribution. Parameters: - depth_method (:ref:`depth measure `, optional): + multivariate_depth (:ref:`depth measure `, optional): Method used to order the data. Defaults to :func:`projection depth `. pointwise_weights (array_like, optional): an array containing the @@ -303,7 +303,7 @@ class DirectionalOutlierDetector(BaseEstimator, OutlierMixin): """ def __init__( - self, *, depth_method=projection_depth, + self, *, multivariate_depth=projection_depth, pointwise_weights=None, assume_centered=False, support_fraction=None, @@ -311,7 +311,7 @@ def __init__( random_state=0, alpha=0.993, _force_asymptotic=False): - self.depth_method = depth_method + self.multivariate_depth = multivariate_depth self.pointwise_weights = pointwise_weights self.assume_centered = assume_centered self.support_fraction = support_fraction @@ -324,7 +324,7 @@ def _compute_points(self, X): # The depths of the samples are calculated giving them an ordering. *_, mean_dir_outl, variation_dir_outl = directional_outlyingness_stats( X, - depth_method=self.depth_method, + multivariate_depth=self.multivariate_depth, pointwise_weights=self.pointwise_weights) points = np.concatenate((mean_dir_outl, diff --git a/skfda/exploratory/visualization/_magnitude_shape_plot.py b/skfda/exploratory/visualization/_magnitude_shape_plot.py index faddef6f8..1fc8e7f7b 100644 --- a/skfda/exploratory/visualization/_magnitude_shape_plot.py +++ b/skfda/exploratory/visualization/_magnitude_shape_plot.py @@ -38,7 +38,7 @@ class MagnitudeShapePlot: Args: fdatagrid (FDataGrid): Object containing the data. - depth_method (:ref:`depth measure `, optional): + multivariate_depth (:ref:`depth measure `, optional): Method used to order the data. Defaults to :func:`projection depth `. pointwise_weights (array_like, optional): an array containing the @@ -69,9 +69,9 @@ class MagnitudeShapePlot: Attributes: fdatagrid (FDataGrid): Object to be visualized. - depth_method (:ref:`depth measure `, optional): Method - used to order the data. Defaults to :func:`modified band depth - `. + multivariate_depth (:ref:`depth measure `, optional): + Method used to order the data. Defaults to :func:`modified band + depth `. pointwise_weights (array_like, optional): an array containing the weights of each points of discretisation where values have been recorded. @@ -148,7 +148,7 @@ class MagnitudeShapePlot: grid_points=(array([ 0., 2., 4., 6., 8., 10.]),), domain_range=((0.0, 10.0),), ...), - depth_method=projection_depth, + multivariate_depth=projection_depth, pointwise_weights=None, alpha=0.993, points=array([[ 1.12415127, 0.05813094], @@ -177,7 +177,7 @@ def __init__(self, fdatagrid, **kwargs): Args: fdatagrid (FDataGrid): Object containing the data. - depth_method (:ref:`depth measure `, optional): + multivariate_depth (:ref:`depth measure `, optional): Method used to order the data. Defaults to :func:`projection depth `. 
pointwise_weights (array_like, optional): an array containing the @@ -231,8 +231,8 @@ def fdatagrid(self): return self._fdatagrid @property - def depth_method(self): - return self.outlier_detector.depth_method + def multivariate_depth(self): + return self.outlier_detector.multivariate_depth @property def pointwise_weights(self): @@ -317,7 +317,7 @@ def __repr__(self): """Return repr(self).""" return (f"MagnitudeShapePlot(" f"\nFDataGrid={repr(self.fdatagrid)}," - f"\ndepth_method={self.depth_method.__name__}," + f"\nmultivariate_depth={self.multivariate_depth.__name__}," f"\npointwise_weights={repr(self.pointwise_weights)}," f"\nalpha={repr(self.alpha)}," f"\npoints={repr(self.points)}," diff --git a/tests/test_magnitude_shape.py b/tests/test_magnitude_shape.py index 45667a0f6..2061cc128 100644 --- a/tests/test_magnitude_shape.py +++ b/tests/test_magnitude_shape.py @@ -13,7 +13,7 @@ def test_magnitude_shape_plot(self): fd = fetch_weather()["data"] fd_temperatures = fd.coordinates[0] msplot = MagnitudeShapePlot( - fd_temperatures, depth_method=modified_band_depth) + fd_temperatures, multivariate_depth=modified_band_depth) np.testing.assert_allclose(msplot.points, np.array([[0.2112587, 3.0322570], [1.2823448, 0.8272850], diff --git a/tests/test_outliers.py b/tests/test_outliers.py index b1806f1bb..32687a5e1 100644 --- a/tests/test_outliers.py +++ b/tests/test_outliers.py @@ -16,7 +16,7 @@ def test_directional_outlyingness(self): grid_points = [2, 4, 6, 8] fd = FDataGrid(data_matrix, grid_points) stats = directional_outlyingness_stats( - fd, depth_method=modified_band_depth) + fd, multivariate_depth=modified_band_depth) np.testing.assert_allclose(stats.directional_outlyingness, np.array([[[0.], [0.], From 334208d5da71b835b2397d4c98d8213709593fad Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 19 Oct 2020 01:24:10 +0200 Subject: [PATCH 067/210] Fix MAD in Stahel-Donoho outlyingness. --- skfda/exploratory/depth/multivariate.py | 23 +++++++++- .../outliers/_directional_outlyingness.py | 46 +++++++++---------- .../visualization/_magnitude_shape_plot.py | 6 +-- 3 files changed, 47 insertions(+), 28 deletions(-) diff --git a/skfda/exploratory/depth/multivariate.py b/skfda/exploratory/depth/multivariate.py index abe5b5743..7f8fefe4a 100644 --- a/skfda/exploratory/depth/multivariate.py +++ b/skfda/exploratory/depth/multivariate.py @@ -270,7 +270,13 @@ class StahelDonohoOutlyingness(Outlyingness): where :math:`\text{X}` is a sample with distribution :math:`F`, :math:`\text{Med}` is the median and :math:`\text{MAD}` is the - median absolute deviation. + median absolute deviation ([ZuCuHe04]_). + + References: + + .. [ZuCuHe04] Zuo, Y., Cui, H., & He, X. (2004). On the Stahel-Donoho + estimator and depth-weighted means of multivariate data. Annals of + Statistics, 32(1), 167–188. https://doi.org/10.1214/aos/1079120132 """ @@ -281,7 +287,7 @@ def fit(self, X, y=None): if dim == 1: self._location = np.median(X, axis=0) self._scale = scipy.stats.median_abs_deviation( - X, axis=0, scale=1 / 1.4826) + X, axis=0) else: raise NotImplementedError("Only implemented for one dimension") @@ -305,6 +311,19 @@ def max(self): class ProjectionDepth(OutlyingnessBasedDepth): + r""" + Computes Projection depth. + + It is defined as the depth induced by the Stahel-Donoho outlyingness + ([ZuCuHe04]_). + + References: + + .. [ZuCuHe04] Zuo, Y., Cui, H., & He, X. (2004). On the Stahel-Donoho + estimator and depth-weighted means of multivariate data. Annals of + Statistics, 32(1), 167–188. 
https://doi.org/10.1214/aos/1079120132 + + """ def __init__(self): super().__init__(outlyingness=StahelDonohoOutlyingness()) diff --git a/skfda/exploratory/outliers/_directional_outlyingness.py b/skfda/exploratory/outliers/_directional_outlyingness.py index b20c645ef..fb4e66526 100644 --- a/skfda/exploratory/outliers/_directional_outlyingness.py +++ b/skfda/exploratory/outliers/_directional_outlyingness.py @@ -104,42 +104,42 @@ def directional_outlyingness_stats( >>> fd = FDataGrid(data_matrix, grid_points) >>> stats = directional_outlyingness_stats(fd) >>> stats.directional_outlyingness - array([[[ 0.89932101], - [ 0.89932101], - [ 1.57381177], - [ 1.01173614], - [ 1.12415127], - [ 1.12415127]], + array([[[ 1.33333333], + [ 1.33333333], + [ 2.33333333], + [ 1.5 ], + [ 1.66666667], + [ 1.66666667]], [[ 0. ], [ 0. ], [ 0. ], [ 0. ], [ 0. ], [ 0. ]], - [[-0.89932101], - [-0.89932101], - [-0.67449076], - [-0.33724538], - [-0.22483025], - [-0.22483025]], - [[-0.44966051], - [-0.44966051], - [-0.67449076], - [-1.6862269 ], - [-2.02347228], - [-1.57381177]]]) + [[-1.33333333], + [-1.33333333], + [-1. ], + [-0.5 ], + [-0.33333333], + [-0.33333333]], + [[-0.66666667], + [-0.66666667], + [-1. ], + [-2.5 ], + [-3. ], + [-2.33333333]]]) >>> stats.functional_directional_outlyingness - array([ 2.99742218, 2.93929124, 3.01966359, 3.36873005]) + array([ 6.58864198, 6.4608642 , 6.63753086, 7.40481481]) >>> stats.mean_directional_outlyingness - array([[ 1.12415127], + array([[ 1.66666667], [ 0. ], - [-0.53959261], - [-1.17661166]]) + [-0.8 ], + [-1.74444444]]) >>> stats.variation_directional_outlyingness - array([ 0.05813094, 0. , 0.08037234, 0.4294388 ]) + array([ 0.12777778, 0. , 0.17666667, 0.94395062]) References: Dai, Wenlin, and Genton, Marc G. "Directional outlyingness for diff --git a/skfda/exploratory/visualization/_magnitude_shape_plot.py b/skfda/exploratory/visualization/_magnitude_shape_plot.py index 1fc8e7f7b..be5119935 100644 --- a/skfda/exploratory/visualization/_magnitude_shape_plot.py +++ b/skfda/exploratory/visualization/_magnitude_shape_plot.py @@ -151,10 +151,10 @@ class MagnitudeShapePlot: multivariate_depth=projection_depth, pointwise_weights=None, alpha=0.993, - points=array([[ 1.12415127, 0.05813094], + points=array([[ 1.66666667, 0.12777778], [ 0. , 0. ], - [-0.53959261, 0.08037234], - [-1.17661166, 0.4294388 ]]), + [-0.8 , 0.17666667], + [-1.74444444, 0.94395062]]), outliers=array([False, False, False, False]), colormap=seismic, color=0.2, From b40ff0a4231434a3a9cb1d46d2d12d36129c2e27 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 19 Oct 2020 02:41:56 +0200 Subject: [PATCH 068/210] Replaced all depth functions by classes. Some examples still missing. 
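
A migration sketch for downstream code, on toy data:

    import skfda
    from skfda.exploratory.depth import ModifiedBandDepth

    fd = skfda.FDataGrid([[1, 2, 3], [2, 3, 4], [0, 1, 2]], [0, 1, 2])

    # before: values = modified_band_depth(fd)
    values = ModifiedBandDepth()(fd)
    # or, explicitly: ModifiedBandDepth().fit(fd).predict(fd)
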
--- examples/plot_boxplot.py | 20 +- skfda/exploratory/depth/__init__.py | 5 +- skfda/exploratory/depth/_depth.py | 252 +++++++----------- skfda/exploratory/depth/multivariate.py | 19 +- .../outliers/_directional_outlyingness.py | 16 +- skfda/exploratory/outliers/_iqr.py | 4 +- skfda/exploratory/stats/_stats.py | 10 +- skfda/exploratory/visualization/_boxplot.py | 18 +- .../visualization/_magnitude_shape_plot.py | 24 +- tests/test_fdata_boxplot.py | 4 +- tests/test_magnitude_shape.py | 4 +- tests/test_outliers.py | 4 +- 12 files changed, 151 insertions(+), 229 deletions(-) diff --git a/examples/plot_boxplot.py b/examples/plot_boxplot.py index 2130824d2..03a2abf0a 100644 --- a/examples/plot_boxplot.py +++ b/examples/plot_boxplot.py @@ -11,12 +11,13 @@ # sphinx_gallery_thumbnail_number = 2 -import matplotlib.pyplot as plt -import numpy as np from skfda import datasets -from skfda.exploratory.depth import band_depth, fraiman_muniz_depth +from skfda.exploratory.depth import ModifiedBandDepth, IntegratedDepth from skfda.exploratory.visualization import Boxplot +import matplotlib.pyplot as plt +import numpy as np + ############################################################################## # First, the Canadian Weather dataset is downloaded from the package 'fda' in @@ -77,20 +78,21 @@ ############################################################################## # The curves pointed as outliers are are those curves with significantly lower # values than the rest. This is the expected result due to the depth measure -# used, :func:`~skfda.exploratory.depth.fraiman_muniz_depth`, which ranks +# used, :func:`~skfda.exploratory.depth.IntegratedDepth`, which ranks # the samples according to their magnitude. # # The :class:`~skfda.exploratory.visualization.Boxplot` object admits any # :ref:`depth measure ` defined or customized by the user. Now -# the call is done with the :func:`~skfda.exploratory.depth.band_depth` and -# the factor is reduced in order to designate some samples as outliers +# the call is done with the :class:`~skfda.exploratory.depth.ModifiedBandDepth` +# and the factor is reduced in order to designate some samples as outliers # (otherwise, with this measure and the default factor, none of the curves are # pointed out as outliers). We can see that the outliers detected belong to # the Pacific and Arctic climates which are less common to find in Canada. As # a consequence, this measure detects better shape outliers compared to the # previous one. -fdBoxplot = Boxplot(fd_temperatures, depth_method=band_depth, factor=0.4) +fdBoxplot = Boxplot( + fd_temperatures, depth_method=ModifiedBandDepth(), factor=0.4) fdBoxplot.show_full_outliers = True fdBoxplot.plot() @@ -101,10 +103,10 @@ # 50% one. # # In the following instantiation, the -# :func:`~skfda.exploratory.depth.fraiman_muniz_depth` is used and the 25% and +# :func:`~skfda.exploratory.depth.IntegratedDepth` is used and the 25% and # 75% central regions are specified. -fdBoxplot = Boxplot(fd_temperatures, depth_method=fraiman_muniz_depth, +fdBoxplot = Boxplot(fd_temperatures, depth_method=IntegratedDepth(), prob=[0.75, 0.5, 0.25]) fdBoxplot.plot() diff --git a/skfda/exploratory/depth/__init__.py b/skfda/exploratory/depth/__init__.py index 497b7460c..092e8bcc7 100644 --- a/skfda/exploratory/depth/__init__.py +++ b/skfda/exploratory/depth/__init__.py @@ -1,7 +1,4 @@ from . 
import multivariate from ._depth import (IntegratedDepth, ModifiedBandDepth, - BandDepth, - band_depth, - modified_band_depth, - fraiman_muniz_depth) + BandDepth) diff --git a/skfda/exploratory/depth/_depth.py b/skfda/exploratory/depth/_depth.py index 3dfddf80c..b14f36a59 100644 --- a/skfda/exploratory/depth/_depth.py +++ b/skfda/exploratory/depth/_depth.py @@ -4,10 +4,8 @@ from the center (larger values) outwards(smaller ones).""" import itertools -import math import scipy.integrate -from scipy.stats import rankdata import numpy as np @@ -32,6 +30,35 @@ class IntegratedDepth(FunctionalDepth): """ Functional depth as the integral of a multivariate depth. + This type of depth was introduced by Fraiman and Muniz ([FrMu01]_). + + Args: + multivariate_depth (Depth): Multivariate depth to integrate. + By default it is the one used in [FrMu01]_, that is, + + .. math:: + D(x) = 1 - \left\lvert \frac{1}{2}- F(x)\right\rvert + + Examples: + + >>> import skfda + >>> + >>> data_matrix = [[1, 1, 2, 3, 2.5, 2], + ... [0.5, 0.5, 1, 2, 1.5, 1], + ... [-1, -1, -0.5, 1, 1, 0.5], + ... [-0.5, -0.5, -0.5, -1, -1, -1]] + >>> grid_points = [0, 2, 4, 6, 8, 10] + >>> fd = skfda.FDataGrid(data_matrix, grid_points) + >>> depth = skfda.exploratory.depth.IntegratedDepth() + >>> depth(fd) + array([ 0.5 , 0.75 , 0.925, 0.875]) + + References: + + .. [FrMu01] Fraiman, R., & Muniz, G. (2001). Trimmed means for functional + data. Test, 10(2), 419–440. https://doi.org/10.1007/BF02595706 + + """ def __init__(self, *, @@ -45,27 +72,23 @@ def fit(self, X, y=None): self.multivariate_depth.fit(X.data_matrix) return self - def predict(self, X, *, pointwise=False): + def predict(self, X): pointwise_depth = self.multivariate_depth.predict(X.data_matrix) - if pointwise: - return pointwise_depth - else: - - interval_len = (self._domain_range[0][1] - - self._domain_range[0][0]) + interval_len = (self._domain_range[0][1] + - self._domain_range[0][0]) - integrand = pointwise_depth + integrand = pointwise_depth - for d, s in zip(X.domain_range, X.grid_points): - integrand = scipy.integrate.simps(integrand, - x=s, - axis=1) - interval_len = d[1] - d[0] - integrand /= interval_len + for d, s in zip(X.domain_range, X.grid_points): + integrand = scipy.integrate.simps(integrand, + x=s, + axis=1) + interval_len = d[1] - d[0] + integrand /= interval_len - return integrand + return integrand @property def max(self): @@ -77,66 +100,14 @@ def min(self): class ModifiedBandDepth(IntegratedDepth): - - def __init__(self): - super().__init__(multivariate_depth=multivariate.SimplicialDepth()) - - -class BandDepth(FunctionalDepth): """ - Functional depth as the integral of a multivariate depth. - - """ - - def fit(self, X, y=None): + Implementation of Modified Band Depth for functional data. - if X.dim_codomain != 1: - raise NotImplementedError("Band depth not implemented for vector " - "valued functions") - - self._distribution = X - return self - - def predict(self, X, *, pointwise=False): - - num_in = 0 - n_total = 0 - - for f1, f2 in itertools.combinations(self._distribution, 2): - between_range_1 = (f1.data_matrix <= X.data_matrix) & ( - X.data_matrix <= f2.data_matrix) - - between_range_2 = (f2.data_matrix <= X.data_matrix) & ( - X.data_matrix <= f1.data_matrix) - - between_range = between_range_1 | between_range_2 - - num_in += np.all(between_range, - axis=tuple(range(1, X.data_matrix.ndim))) - n_total += 1 - - return num_in / n_total - - -def band_depth(fdatagrid, *, pointwise=False): - """Implementation of Band Depth for functional data. 
- - The band depth of each sample is obtained by computing the fraction of the - bands determined by two sample curves containing the whole graph of the - first one. In the case the fdatagrid domain dimension is 2, instead of - curves, surfaces determine the bands. In larger dimensions, the hyperplanes - determine the bands. - - Args: - fdatagrid (FDataGrid): Object over whose samples the band depth is - going to be calculated. - pointwise (boolean, optional): Indicates if the pointwise depth is - returned instead. Defaults to False. - - Returns: - depth (numpy.darray): Array containing the band depth of the samples, - or the band depth of the samples at each point of discretization - if pointwise equals to True. + The band depth of each sample is obtained by computing the fraction of time + its graph is contained in the bands determined by two sample curves. + In the case the fdatagrid domain dimension is 2, instead of curves, + surfaces determine the bands. In larger dimensions, the hyperplanes + determine the bands. This method was originally defined in [LoRo09]_. Examples: @@ -148,36 +119,31 @@ def band_depth(fdatagrid, *, pointwise=False): ... [-0.5, -0.5, -0.5, -1, -1, -1]] >>> grid_points = [0, 2, 4, 6, 8, 10] >>> fd = skfda.FDataGrid(data_matrix, grid_points) - >>> band_depth(fd) - array([ 0.5 , 0.83333333, 0.5 , 0.5 ]) + >>> depth = skfda.exploratory.depth.ModifiedBandDepth() + >>> values = depth(fd) + >>> values.round(2) + array([ 0.5 , 0.83, 0.73, 0.67]) + References: + .. [LoRo09] López-Pintado, S., & Romo, J. (2009). On the Concept of + Depth for Functional Data. Journal of the American Statistical + Association, 104(486), 718–734. + https://doi.org/10.1198/jasa.2009.0108 """ - if pointwise: - return modified_band_depth(fdatagrid, pointwise) - else: - return BandDepth().fit(fdatagrid).predict( - fdatagrid, pointwise=pointwise) + def __init__(self): + super().__init__(multivariate_depth=multivariate.SimplicialDepth()) -def modified_band_depth(fdatagrid, *, pointwise=False): - """Implementation of Modified Band Depth for functional data. - The band depth of each sample is obtained by computing the fraction of time - its graph is contained in the bands determined by two sample curves. - In the case the fdatagrid domain dimension is 2, instead of curves, - surfaces determine the bands. In larger dimensions, the hyperplanes - determine the bands. - - Args: - fdatagrid (FDataGrid): Object over whose samples the modified band - depth is going to be calculated. - pointwise (boolean, optional): Indicates if the pointwise depth is - returned instead. Defaults to False. +class BandDepth(FunctionalDepth): + """ + Implementation of Band Depth for functional data. - Returns: - depth (numpy.darray): Array containing the modified band depth of the - samples, or the modified band depth of the samples at each point - of discretization if pointwise equals to True. + The band depth of each sample is obtained by computing the fraction of the + bands determined by two sample curves containing the whole graph of the + first one. In the case the fdatagrid domain dimension is 2, instead of + curves, surfaces determine the bands. In larger dimensions, the hyperplanes + determine the bands. This method was originally defined in [LoRo09]_. Examples: @@ -189,75 +155,43 @@ def modified_band_depth(fdatagrid, *, pointwise=False): ... 
[-0.5, -0.5, -0.5, -1, -1, -1]] >>> grid_points = [0, 2, 4, 6, 8, 10] >>> fd = skfda.FDataGrid(data_matrix, grid_points) - >>> depth = modified_band_depth(fd) - >>> depth.round(2) - array([ 0.5 , 0.83, 0.73, 0.67]) - >>> pointwise = modified_band_depth(fd, pointwise = True) - >>> pointwise.round(2) - array([[ 0.5 , 0.5 , 0.5 , 0.5 , 0.5 , 0.5 ], - [ 0.83, 0.83, 0.83, 0.83, 0.83, 0.83], - [ 0.5 , 0.5 , 0.83, 0.83, 0.83, 0.83], - [ 0.83, 0.83, 0.83, 0.5 , 0.5 , 0.5 ]]) - - """ - return ModifiedBandDepth().fit(fdatagrid).predict( - fdatagrid, pointwise=pointwise) - - -def fraiman_muniz_depth(fdatagrid, *, pointwise=False): - r"""Implementation of Fraiman and Muniz (FM) Depth for functional data. + >>> depth = skfda.exploratory.depth.BandDepth() + >>> depth(fd) + array([ 0.5 , 0.83333333, 0.5 , 0.5 ]) - Each column is considered as the samples of an aleatory variable. - The univariate depth of each of the samples of each column is calculated - as follows: + References: + .. [LoRo09] López-Pintado, S., & Romo, J. (2009). On the Concept of + Depth for Functional Data. Journal of the American Statistical + Association, 104(486), 718–734. + https://doi.org/10.1198/jasa.2009.0108 - .. math:: - D(x) = 1 - \left\lvert \frac{1}{2}- F(x)\right\rvert + """ - Where :math:`F` stands for the marginal univariate distribution function of - each column. + def fit(self, X, y=None): - The depth of a sample is the result of integrating the previously computed - depth for each of its points and normalizing dividing by the length of - the interval. + if X.dim_codomain != 1: + raise NotImplementedError("Band depth not implemented for vector " + "valued functions") - Args: - fdatagrid (FDataGrid): Object over whose samples the FM depth is going - to be calculated. - pointwise (boolean, optional): Indicates if the pointwise depth is - returned instead. Defaults to False. + self._distribution = X + return self - Returns: - depth (numpy.darray): Array containing the Fraiman-Muniz depth of the - samples, or the Fraiman-Muniz of the samples at each point - of discretization if pointwise equals to True. + def predict(self, X): - Examples: - Currently, this depth function can only be used - for univariate functional data: + num_in = 0 + n_total = 0 - >>> import skfda - >>> - >>> data_matrix = [[1, 1, 2, 3, 2.5, 2], - ... [0.5, 0.5, 1, 2, 1.5, 1], - ... [-1, -1, -0.5, 1, 1, 0.5], - ... [-0.5, -0.5, -0.5, -1, -1, -1]] - >>> grid_points = [0, 2, 4, 6, 8, 10] - >>> fd = skfda.FDataGrid(data_matrix, grid_points) - >>> fraiman_muniz_depth(fd) - array([ 0.5 , 0.75 , 0.925, 0.875]) + for f1, f2 in itertools.combinations(self._distribution, 2): + between_range_1 = (f1.data_matrix <= X.data_matrix) & ( + X.data_matrix <= f2.data_matrix) - You can use ``pointwise`` to obtain the pointwise depth, - before the integral is applied. + between_range_2 = (f2.data_matrix <= X.data_matrix) & ( + X.data_matrix <= f1.data_matrix) - >>> pointwise = fraiman_muniz_depth(fd, pointwise = True) - >>> pointwise - array([[ 0.5 , 0.5 , 0.5 , 0.5 , 0.5 , 0.5 ], - [ 0.75, 0.75, 0.75, 0.75, 0.75, 0.75], - [ 0.75, 0.75, 1. , 1. , 1. , 1. ], - [ 1. , 1. , 1. 
, 0.75, 0.75, 0.75]]) + between_range = between_range_1 | between_range_2 + num_in += np.all(between_range, + axis=tuple(range(1, X.data_matrix.ndim))) + n_total += 1 - """ - return IntegratedDepth().fit(fdatagrid).predict( - fdatagrid, pointwise=pointwise) + return num_in / n_total diff --git a/skfda/exploratory/depth/multivariate.py b/skfda/exploratory/depth/multivariate.py index 7f8fefe4a..46a7e5de3 100644 --- a/skfda/exploratory/depth/multivariate.py +++ b/skfda/exploratory/depth/multivariate.py @@ -145,6 +145,16 @@ class _UnivariateFraimanMuniz(Depth): """ Univariate depth used to compute the Fraiman an Muniz depth. + Each column is considered as the samples of an aleatory variable. + The univariate depth of each of the samples of each column is calculated + as follows: + + .. math:: + D(x) = 1 - \left\lvert \frac{1}{2}- F(x)\right\rvert + + Where :math:`F` stands for the marginal univariate distribution function of + each column. + """ def fit(self, X, y=None): @@ -327,12 +337,3 @@ class ProjectionDepth(OutlyingnessBasedDepth): def __init__(self): super().__init__(outlyingness=StahelDonohoOutlyingness()) - - -def projection_depth(X, *, pointwise=False): - """Returns the projection depth. - - The projection depth is the depth function associated with the - Stagel-Donoho outlyingness. - """ - return ProjectionDepth().fit_predict(X.data_matrix) diff --git a/skfda/exploratory/outliers/_directional_outlyingness.py b/skfda/exploratory/outliers/_directional_outlyingness.py index fb4e66526..c6f9d8a74 100644 --- a/skfda/exploratory/outliers/_directional_outlyingness.py +++ b/skfda/exploratory/outliers/_directional_outlyingness.py @@ -1,4 +1,4 @@ -from skfda.exploratory.depth.multivariate import projection_depth +from skfda.exploratory.depth.multivariate import ProjectionDepth import typing from numpy import linalg as la @@ -22,7 +22,7 @@ class DirectionalOutlyingnessStats(typing.NamedTuple): def directional_outlyingness_stats( fdatagrid: FDataGrid, *, - multivariate_depth=projection_depth, + multivariate_depth=ProjectionDepth(), pointwise_weights=None) -> DirectionalOutlyingnessStats: r"""Computes the directional outlyingness of the functional data. @@ -84,8 +84,8 @@ def directional_outlyingness_stats( fdatagrid (FDataGrid): Object containing the samples to be ordered according to the directional outlyingness. multivariate_depth (:ref:`depth measure `, optional): - Method used to order the data. Defaults to :func:`modified band - depth `. + Method used to order the data. Defaults to :func:`projection + depth `. pointwise_weights (array_like, optional): an array containing the weights of each point of discretisation where values have been recorded. Defaults to the same weight for each of the points: @@ -162,7 +162,7 @@ def directional_outlyingness_stats( len(fdatagrid.grid_points[0])) / ( fdatagrid.domain_range[0][1] - fdatagrid.domain_range[0][0]) - depth_pointwise = multivariate_depth(fdatagrid, pointwise=True) + depth_pointwise = multivariate_depth(fdatagrid.data_matrix) assert depth_pointwise.shape == fdatagrid.data_matrix.shape[:-1] # Obtaining the pointwise median sample Z, to calculate @@ -254,8 +254,8 @@ class DirectionalOutlierDetector(BaseEstimator, OutlierMixin): Parameters: multivariate_depth (:ref:`depth measure `, optional): - Method used to order the data. Defaults to :func:`projection - depth `. + Method used to order the data. Defaults to :class:`projection + depth `. 
pointwise_weights (array_like, optional): an array containing the weights of each points of discretisati on where values have been recorded. @@ -303,7 +303,7 @@ class DirectionalOutlierDetector(BaseEstimator, OutlierMixin): """ def __init__( - self, *, multivariate_depth=projection_depth, + self, *, multivariate_depth=ProjectionDepth(), pointwise_weights=None, assume_centered=False, support_fraction=None, diff --git a/skfda/exploratory/outliers/_iqr.py b/skfda/exploratory/outliers/_iqr.py index 632906f01..98e74f43a 100644 --- a/skfda/exploratory/outliers/_iqr.py +++ b/skfda/exploratory/outliers/_iqr.py @@ -1,7 +1,7 @@ from sklearn.base import BaseEstimator, OutlierMixin from . import _envelopes -from ..depth import modified_band_depth +from ..depth import ModifiedBandDepth class IQROutlierDetector(BaseEstimator, OutlierMixin): @@ -32,7 +32,7 @@ class IQROutlierDetector(BaseEstimator, OutlierMixin): """ - def __init__(self, *, depth_method=modified_band_depth, factor=1.5): + def __init__(self, *, depth_method=ModifiedBandDepth(), factor=1.5): self.depth_method = depth_method self.factor = factor diff --git a/skfda/exploratory/stats/_stats.py b/skfda/exploratory/stats/_stats.py index 5f0aafc34..9fe2d3ce1 100644 --- a/skfda/exploratory/stats/_stats.py +++ b/skfda/exploratory/stats/_stats.py @@ -1,6 +1,6 @@ """Functional data descriptive statistics. """ -from ..depth import modified_band_depth +from ..depth import ModifiedBandDepth def mean(fdata): @@ -69,7 +69,7 @@ def cov(fdatagrid): return fdatagrid.cov() -def depth_based_median(fdatagrid, depth_method=modified_band_depth): +def depth_based_median(fdatagrid, depth_method=ModifiedBandDepth()): """Compute the median based on a depth measure. The depth based median is the deepest curve given a certain @@ -80,7 +80,7 @@ def depth_based_median(fdatagrid, depth_method=modified_band_depth): functional variable. depth_method (:ref:`depth measure `, optional): Method used to order the data. Defaults to :func:`modified - band depth `. + band depth `. Returns: FDataGrid: object containing the computed depth_based median. @@ -95,7 +95,7 @@ def depth_based_median(fdatagrid, depth_method=modified_band_depth): def trim_mean(fdatagrid, proportiontocut, - depth_method=modified_band_depth): + depth_method=ModifiedBandDepth()): """Compute the trimmed means based on a depth measure. The trimmed means consists in computing the mean function without a @@ -114,7 +114,7 @@ def trim_mean(fdatagrid, dataset. depth_method (:ref:`depth measure `, optional): Method used to order the data. Defaults to :func:`modified - band depth `. + band depth `. Returns: FDataGrid: object containing the computed trimmed mean. diff --git a/skfda/exploratory/visualization/_boxplot.py b/skfda/exploratory/visualization/_boxplot.py index c79b15735..1c0abaa5a 100644 --- a/skfda/exploratory/visualization/_boxplot.py +++ b/skfda/exploratory/visualization/_boxplot.py @@ -12,7 +12,7 @@ import matplotlib.pyplot as plt import numpy as np -from ..depth import modified_band_depth +from ..depth import ModifiedBandDepth from ..outliers import _envelopes from ._utils import (_figure_to_svg, _get_figure_and_axes, _set_figure_layout_for_fdata, _set_labels) @@ -104,7 +104,7 @@ class Boxplot(FDataBoxplot): depth_method (:ref:`depth measure `, optional): Method used to order the data. Defaults to :func:`modified band depth - `. + `. prob (list of float, optional): List with float numbers (in the range from 1 to 0) that indicate which central regions to represent. 
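
As a usage sketch of the class-based depth_method parameters introduced above (assuming depth_based_median and trim_mean are re-exported from the stats package, as elsewhere in the library):

    import skfda
    from skfda.exploratory.depth import ModifiedBandDepth
    from skfda.exploratory.stats import depth_based_median, trim_mean

    fd = skfda.datasets.make_gaussian_process(n_samples=20, random_state=0)

    median = depth_based_median(fd, depth_method=ModifiedBandDepth())
    trimmed = trim_mean(fd, 0.1, depth_method=ModifiedBandDepth())
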
@@ -246,7 +246,7 @@ class Boxplot(FDataBoxplot):
 
     """
 
-    def __init__(self, fdatagrid, depth_method=modified_band_depth, prob=[0.5],
+    def __init__(self, fdatagrid, depth_method=ModifiedBandDepth(), prob=[0.5],
                  factor=1.5):
         """Initialization of the Boxplot class.
 
@@ -255,7 +255,7 @@ def __init__(self, fdatagrid, depth_method=modified_band_depth, prob=[0.5],
             depth_method (:ref:`depth measure `, optional):
                 Method used to order the data. Defaults to :func:`modified
                 band depth
-                `.
+                `.
             prob (list of float, optional): List with float numbers (in the
                 range from 1 to 0) that indicate which central regions to
                 represent.
@@ -465,8 +465,8 @@ class SurfaceBoxplot(FDataBoxplot):
         fdatagrid (FDataGrid): Object containing the data.
         method (:ref:`depth measure `, optional): Method
-            used to order the data. Defaults to :func:`modified band depth
-            `.
+            used to order the data. Defaults to :class:`modified band depth
+            `.
         prob (list of float, optional): List with float numbers (in the
             range from 1 to 0) that indicate which central regions to
             represent.
@@ -562,14 +562,14 @@ class SurfaceBoxplot(FDataBoxplot):
 
     """
 
-    def __init__(self, fdatagrid, method=modified_band_depth, factor=1.5):
+    def __init__(self, fdatagrid, method=ModifiedBandDepth(), factor=1.5):
         """Initialization of the functional boxplot.
 
         Args:
             fdatagrid (FDataGrid): Object containing the data.
             method (:ref:`depth measure `, optional): Method
-                used to order the data. Defaults to :func:`modified band depth
-                `.
+                used to order the data. Defaults to :class:`modified band depth
+                `.
             prob (list of float, optional): List with float numbers (in the
                 range from 1 to 0) that indicate which central regions to
                 represent.
diff --git a/skfda/exploratory/visualization/_magnitude_shape_plot.py b/skfda/exploratory/visualization/_magnitude_shape_plot.py
index be5119935..9e3e5e4d7 100644
--- a/skfda/exploratory/visualization/_magnitude_shape_plot.py
+++ b/skfda/exploratory/visualization/_magnitude_shape_plot.py
@@ -11,7 +11,6 @@
 import matplotlib.pyplot as plt
 import numpy as np
 
-from ..depth import modified_band_depth
 from ..outliers import DirectionalOutlierDetector
 from ._utils import _figure_to_svg, _get_figure_and_axes, _set_figure_layout
 
@@ -39,8 +38,8 @@ class MagnitudeShapePlot:
 
         fdatagrid (FDataGrid): Object containing the data.
         multivariate_depth (:ref:`depth measure `, optional):
-            Method used to order the data. Defaults to :func:`projection
-            depth `.
+            Method used to order the data. Defaults to :class:`projection
+            depth `.
         pointwise_weights (array_like, optional): an array containing the
             weights of each point of discretisation where values have been
             recorded.
@@ -68,16 +67,6 @@ class MagnitudeShapePlot:
 
     Attributes:
 
-        fdatagrid (FDataGrid): Object to be visualized.
-        multivariate_depth (:ref:`depth measure `, optional):
-            Method used to order the data. Defaults to :func:`modified band
-            depth `.
-        pointwise_weights (array_like, optional): an array containing the
-            weights of each points of discretisation where values have been
-            recorded.
-        alpha(float, optional): Denotes the quantile to choose the cutoff
-            value for detecting outliers Defaults to 0.993, which is used
-            in the classical boxplot.
         points(numpy.ndarray): 2-dimensional matrix where each row contains
             the points plotted in the graph.
         outliers (1-D array, (fdatagrid.n_samples,)): Contains 1 or 0 to denote
@@ -111,7 +100,6 @@ class MagnitudeShapePlot:
 
     Example:
 
         >>> import skfda
-        >>> from skfda.exploratory.depth import modified_band_depth
        >>> data_matrix = [[1, 1, 2, 3, 2.5, 2],
        ... 
[0.5, 0.5, 1, 2, 1.5, 1],
        ...                [-1, -1, -0.5, 1, 1, 0.5],
@@ -148,7 +136,7 @@ class MagnitudeShapePlot:
             grid_points=(array([ 0., 2., 4., 6., 8., 10.]),),
             domain_range=((0.0, 10.0),),
             ...),
-            multivariate_depth=projection_depth,
+            multivariate_depth=ProjectionDepth(),
             pointwise_weights=None,
             alpha=0.993,
             points=array([[ 1.66666667, 0.12777778],
@@ -178,8 +166,8 @@ def __init__(self, fdatagrid, **kwargs):
 
         Args:
             fdatagrid (FDataGrid): Object containing the data.
             multivariate_depth (:ref:`depth measure `, optional):
-                Method used to order the data. Defaults to :func:`projection
-                depth `.
+                Method used to order the data. Defaults to :class:`projection
+                depth `.
             pointwise_weights (array_like, optional): an array containing the
                 weights of each point of discretisation where values have
                 been recorded.
@@ -317,7 +305,7 @@ def __repr__(self):
         """Return repr(self)."""
         return (f"MagnitudeShapePlot("
                 f"\nFDataGrid={repr(self.fdatagrid)},"
-                f"\nmultivariate_depth={self.multivariate_depth.__name__},"
+                f"\nmultivariate_depth={self.multivariate_depth},"
                 f"\npointwise_weights={repr(self.pointwise_weights)},"
                 f"\nalpha={repr(self.alpha)},"
                 f"\npoints={repr(self.points)},"
diff --git a/tests/test_fdata_boxplot.py b/tests/test_fdata_boxplot.py
index b5fa0e045..ee39bd7cf 100644
--- a/tests/test_fdata_boxplot.py
+++ b/tests/test_fdata_boxplot.py
@@ -1,5 +1,5 @@
 from skfda import FDataGrid
-from skfda.exploratory.depth import fraiman_muniz_depth
+from skfda.exploratory.depth import IntegratedDepth
 from skfda.exploratory.visualization import Boxplot, SurfaceBoxplot
 import unittest
 
@@ -15,7 +15,7 @@ def test_fdboxplot_univariate(self):
                        [-0.5, -0.5, -0.5, -1, -1, -1]]
         grid_points = [0, 2, 4, 6, 8, 10]
         fd = FDataGrid(data_matrix, grid_points)
-        fdataBoxplot = Boxplot(fd, depth_method=fraiman_muniz_depth)
+        fdataBoxplot = Boxplot(fd, depth_method=IntegratedDepth())
         np.testing.assert_array_equal(
             fdataBoxplot.median.ravel(),
             np.array([-1., -1., -0.5, 1., 1., 0.5]))
diff --git a/tests/test_magnitude_shape.py b/tests/test_magnitude_shape.py
index 2061cc128..bd701e72e 100644
--- a/tests/test_magnitude_shape.py
+++ b/tests/test_magnitude_shape.py
@@ -1,6 +1,6 @@
 from skfda import FDataGrid
 from skfda.datasets import fetch_weather
-from skfda.exploratory.depth import modified_band_depth
+from skfda.exploratory.depth.multivariate import SimplicialDepth
 from skfda.exploratory.visualization import MagnitudeShapePlot
 import unittest
 
@@ -13,7 +13,7 @@ def test_magnitude_shape_plot(self):
         fd = fetch_weather()["data"]
         fd_temperatures = fd.coordinates[0]
         msplot = MagnitudeShapePlot(
-            fd_temperatures, multivariate_depth=modified_band_depth)
+            fd_temperatures, multivariate_depth=SimplicialDepth())
         np.testing.assert_allclose(msplot.points,
                                    np.array([[0.2112587, 3.0322570],
                                              [1.2823448, 0.8272850],
diff --git a/tests/test_outliers.py b/tests/test_outliers.py
index 32687a5e1..575b96d9f 100644
--- a/tests/test_outliers.py
+++ b/tests/test_outliers.py
@@ -1,5 +1,5 @@
 from skfda import FDataGrid
-from skfda.exploratory.depth import modified_band_depth
+from skfda.exploratory.depth.multivariate import SimplicialDepth
 from skfda.exploratory.outliers import DirectionalOutlierDetector
 from skfda.exploratory.outliers import directional_outlyingness_stats
 import unittest
 
@@ -16,7 +16,7 @@ def test_directional_outlyingness(self):
         grid_points = [2, 4, 6, 8]
         fd = FDataGrid(data_matrix, grid_points)
         stats = directional_outlyingness_stats(
-            fd, multivariate_depth=modified_band_depth)
+            fd, multivariate_depth=SimplicialDepth())
np.testing.assert_allclose(stats.directional_outlyingness,
                                   np.array([[[0.],
                                              [0.],

From d576ab62e1bd9522e864fd7dedda46248bd59743 Mon Sep 17 00:00:00 2001
From: vnmabus 
Date: Wed, 21 Oct 2020 15:21:28 +0200
Subject: [PATCH 069/210] Update depth docs.

---
 docs/modules/exploratory/depth.rst      | 96 ++++++++++++++++-------
 examples/plot_magnitude_shape.py        | 16 +++--
 skfda/exploratory/depth/__init__.py     |  1 +
 skfda/exploratory/depth/_depth.py       | 51 ++++++-------
 skfda/exploratory/depth/multivariate.py | 46 +++++++-----
 5 files changed, 133 insertions(+), 77 deletions(-)

diff --git a/docs/modules/exploratory/depth.rst b/docs/modules/exploratory/depth.rst
index ded94532f..dac591d89 100644
--- a/docs/modules/exploratory/depth.rst
+++ b/docs/modules/exploratory/depth.rst
@@ -1,48 +1,96 @@
-Depth Measures
-==============
+Depth and outlyingness measures
+===============================
 
-Functions to order functional data.
+Depth and outlyingness functions are related concepts proposed to order the
+observations of a dataset, to extend the concepts of median and trimmed
+statistics to multivariate and functional data, and to detect outliers.
 
-Each sample of the dataset is assigned a number between 0 and 1.
-Larger values correspond to more centered samples and smaller ones to those samples more outward.
+Depth
+-----
 
 .. _depth-measures:
 
+Depth measures are functions that assign, to each possible observation, a
+value measuring how deep that observation is inside a given distribution
+(usually the distribution is approximated by a dataset).
+This function has its maximum value towards a "center" of the distribution,
+called the median of the depth.
+This allows an extension of the concept of median to multivariate or
+functional data.
+These functions also provide a natural order of the data, which is required to
+apply methods such as the boxplot or the trimmed mean.
+
+The interface of a depth function is given by the following class:
+
 .. autosummary::
    :toctree: autosummary
 
-   skfda.exploratory.depth.band_depth
-   skfda.exploratory.depth.modified_band_depth
-   skfda.exploratory.depth.fraiman_muniz_depth
+   skfda.exploratory.depth.Depth
 
-The possibility of obtaining the ordering of each point of the sample (compared to the other samples)
-is given if a parameter is specified in the functions.
+The following classes implement depth functions for functional data:
 
-All of them support multivariate functional data, with more than one dimension on the image and
-on the domain.
+.. autosummary::
+   :toctree: autosummary
 
-Outlyingness conversion to depth
---------------------------------
+   skfda.exploratory.depth.IntegratedDepth
+   skfda.exploratory.depth.BandDepth
+   skfda.exploratory.depth.ModifiedBandDepth
 
-The concepts of depth and outlyingness are (inversely) related. A deeper datum is less likely an outlier. Conversely,
-a datum with very low depth is possibly an outlier. In order to convert an outlying measure to a depth measure
-the following convenience function is provided.
+Most of them support multivariate functional data, with more than one dimension
+on the domain and on the codomain.
+
+Multivariate depths
+^^^^^^^^^^^^^^^^^^^
+
+Some utilities, such as the
+:class:`~skfda.exploratory.visualization.MagnitudeShapePlot`, require computing
+a non-functional (multivariate) depth pointwise.
+Moreover, some functional depths, such as the
+:class:`integrated depth ` are defined
+using multivariate depths.
+Thus we also provide some multivariate depth functions:
 
 .. 
autosummary::
   :toctree: autosummary
 
-   skfda.exploratory.depth.outlyingness_to_depth
-
-Multivariate depths
--------------------
+   skfda.exploratory.depth.multivariate.ProjectionDepth
+   skfda.exploratory.depth.multivariate.SimplicialDepth
 
-Some utilities, such as the :class:`~skfda.exploratory.visualization.MagnitudeShapePlot` require computing a non-functional
-(multivariate) depth pointwise. Thus we also provide some multivariate depth functions.
+Outlyingness
+------------
+
+The concepts of depth and outlyingness are (inversely) related.
+A deeper datum is less likely an outlier.
+Conversely, a datum with very low depth is possibly an outlier.
+The following interface (which is very similar to the one used for depths) is
+used to define an outlyingness measure:
+
+.. autosummary::
+   :toctree: autosummary
+
+   skfda.exploratory.depth.Outlyingness
+
+Multivariate outlyingness
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+We provide the classical Stahel-Donoho outlyingness measure for the
+multivariate data case:
 
 .. autosummary::
    :toctree: autosummary
 
-   skfda.exploratory.depth.multivariate.projection_depth
+   skfda.exploratory.depth.multivariate.StahelDonohoOutlyingness
 
+Conversion
+----------
+
+As depth and outlyingness are closely related, there are ways to convert one
+into the other.
+The following class defines a depth based on an outlyingness measure.
+
+.. autosummary::
+   :toctree: autosummary
+
+   skfda.exploratory.depth.OutlyingnessBasedDepth
diff --git a/examples/plot_magnitude_shape.py b/examples/plot_magnitude_shape.py
index bda1e6f04..68555c74c 100644
--- a/examples/plot_magnitude_shape.py
+++ b/examples/plot_magnitude_shape.py
@@ -10,12 +10,14 @@
 # sphinx_gallery_thumbnail_number = 2
 
-import matplotlib.pyplot as plt
-import numpy as np
 from skfda import datasets
-from skfda.exploratory.depth import fraiman_muniz_depth, modified_band_depth
+from skfda.exploratory.depth import IntegratedDepth
+from skfda.exploratory.depth.multivariate import SimplicialDepth
 from skfda.exploratory.visualization import MagnitudeShapePlot
 
+import matplotlib.pyplot as plt
+import numpy as np
+
 
 ##############################################################################
 # First, the Canadian Weather dataset is downloaded from the package 'fda' in
@@ -49,7 +51,7 @@
 # (which is 'seismic' and can be customized), are assigned.
 
 msplot = MagnitudeShapePlot(fdatagrid=fd_temperatures,
-                            depth_method=modified_band_depth)
+                            multivariate_depth=SimplicialDepth())
 
 color = 0.3
 outliercol = 0.7
@@ -76,12 +78,12 @@
 # outliers but in the MS-Plot, they appear further left from the central
 # points. This behaviour can be modified by specifying the parameter alpha.
 #
-# Now we use the pointwise
-# :func:`~skfda.exploratory.depth_measures.fraiman_muniz_depth` in the
+# Now we use the default multivariate depth from
+# :func:`~skfda.exploratory.depth.IntegratedDepth` in the
 # MS-Plot. 
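+# (Assuming the default documented in ``_depth.py`` below, this attribute is
+# the univariate Fraiman-Muniz depth applied pointwise; as a rough sketch, it
+# could also be evaluated directly on the raw values, mirroring how
+# ``directional_outlyingness_stats`` calls it, e.g.
+# ``IntegratedDepth().multivariate_depth(fd_temperatures.data_matrix)``.)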
msplot = MagnitudeShapePlot(fdatagrid=fd_temperatures, - depth_method=fraiman_muniz_depth) + multivariate_depth=IntegratedDepth().multivariate_depth) msplot.color = color msplot.outliercol = outliercol diff --git a/skfda/exploratory/depth/__init__.py b/skfda/exploratory/depth/__init__.py index 092e8bcc7..b1eb7aff0 100644 --- a/skfda/exploratory/depth/__init__.py +++ b/skfda/exploratory/depth/__init__.py @@ -2,3 +2,4 @@ from ._depth import (IntegratedDepth, ModifiedBandDepth, BandDepth) +from .multivariate import Depth, Outlyingness, OutlyingnessBasedDepth diff --git a/skfda/exploratory/depth/_depth.py b/skfda/exploratory/depth/_depth.py index b14f36a59..46bd3d2b2 100644 --- a/skfda/exploratory/depth/_depth.py +++ b/skfda/exploratory/depth/_depth.py @@ -10,31 +10,20 @@ import numpy as np from . import multivariate +from .multivariate import Depth __author__ = "Amanda Hernando Bernabé" __email__ = "amanda.hernando@estudiante.uam.es" -class FunctionalDepth(multivariate.Depth): - """ - Abstract class representing a functional depth function. - - Usually it will accept a distribution in the initializer. - - """ - pass - - -class IntegratedDepth(FunctionalDepth): - """ +class IntegratedDepth(Depth): + r""" Functional depth as the integral of a multivariate depth. - This type of depth was introduced by Fraiman and Muniz ([FrMu01]_). - Args: multivariate_depth (Depth): Multivariate depth to integrate. - By default it is the one used in [FrMu01]_, that is, + By default it is the one used by Fraiman and Muniz, that is, .. math:: D(x) = 1 - \left\lvert \frac{1}{2}- F(x)\right\rvert @@ -55,8 +44,8 @@ class IntegratedDepth(FunctionalDepth): References: - .. [FrMu01] Fraiman, R., & Muniz, G. (2001). Trimmed means for functional - data. Test, 10(2), 419–440. https://doi.org/10.1007/BF02595706 + Fraiman, R., & Muniz, G. (2001). Trimmed means for functional + data. Test, 10(2), 419–440. https://doi.org/10.1007/BF02595706 """ @@ -100,14 +89,14 @@ def min(self): class ModifiedBandDepth(IntegratedDepth): - """ + r""" Implementation of Modified Band Depth for functional data. The band depth of each sample is obtained by computing the fraction of time its graph is contained in the bands determined by two sample curves. In the case the fdatagrid domain dimension is 2, instead of curves, surfaces determine the bands. In larger dimensions, the hyperplanes - determine the bands. This method was originally defined in [LoRo09]_. + determine the bands. Examples: @@ -125,25 +114,26 @@ class ModifiedBandDepth(IntegratedDepth): array([ 0.5 , 0.83, 0.73, 0.67]) References: - .. [LoRo09] López-Pintado, S., & Romo, J. (2009). On the Concept of - Depth for Functional Data. Journal of the American Statistical - Association, 104(486), 718–734. - https://doi.org/10.1198/jasa.2009.0108 + + López-Pintado, S., & Romo, J. (2009). On the Concept of + Depth for Functional Data. Journal of the American Statistical + Association, 104(486), 718–734. + https://doi.org/10.1198/jasa.2009.0108 """ def __init__(self): super().__init__(multivariate_depth=multivariate.SimplicialDepth()) -class BandDepth(FunctionalDepth): - """ +class BandDepth(Depth): + r""" Implementation of Band Depth for functional data. The band depth of each sample is obtained by computing the fraction of the bands determined by two sample curves containing the whole graph of the first one. In the case the fdatagrid domain dimension is 2, instead of curves, surfaces determine the bands. In larger dimensions, the hyperplanes - determine the bands. 
This method was originally defined in [LoRo09]_.
+    determine the bands.
 
     Examples:
 
@@ -160,10 +150,11 @@ class BandDepth(Depth):
         array([ 0.5 , 0.83333333, 0.5 , 0.5 ])
 
     References:
-        .. [LoRo09] López-Pintado, S., & Romo, J. (2009). On the Concept of
-            Depth for Functional Data. Journal of the American Statistical
-            Association, 104(486), 718–734.
-            https://doi.org/10.1198/jasa.2009.0108
+
+        López-Pintado, S., & Romo, J. (2009). On the Concept of
+        Depth for Functional Data. Journal of the American Statistical
+        Association, 104(486), 718–734.
+        https://doi.org/10.1198/jasa.2009.0108
 
     """
 
diff --git a/skfda/exploratory/depth/multivariate.py b/skfda/exploratory/depth/multivariate.py
index 46a7e5de3..e607bb881 100644
--- a/skfda/exploratory/depth/multivariate.py
+++ b/skfda/exploratory/depth/multivariate.py
@@ -52,7 +52,7 @@ def fit_predict(self, X, y=None):
 
         """
         return self.fit(X).predict(X)
 
-    def __call__(self, X, distribution=None):
+    def __call__(self, X, *, distribution=None):
         """
         Allows the depth or outlyingness to be used as a function.
 
@@ -142,7 +142,7 @@ class _UnivariateFraimanMuniz(Depth):
-    """
+    r"""
     Univariate depth used to compute the Fraiman and Muniz depth.
 
@@ -175,9 +175,19 @@ def min(self):
 
 class SimplicialDepth(Depth):
-    """
+    r"""
     Simplicial depth.
 
+    The simplicial depth of a point :math:`x` in :math:`\mathbb{R}^p` given a
+    distribution :math:`F` is the probability that a random simplex with its
+    :math:`p + 1` points sampled from :math:`F` contains :math:`x`.
+
+    References:
+
+        Liu, R. Y. (1990). On a Notion of Data Depth Based on Random
+        Simplices. The Annals of Statistics, 18(1), 405–414.
+
+
     """
 
     def fit(self, X, y=None):
@@ -236,16 +246,17 @@ class OutlyingnessBasedDepth(Depth):
     .. math::
         D(x) = 1 - \frac{O(x)}{\sup O(x)}
 
-    if :math:`O(x)` is bounded ([Se06]_). If the infimum value of the
+    if :math:`O(x)` is bounded. If the infimum value of the
     outlyingness function is not zero, it is subtracted beforehand.
 
     Args:
         outlyingness (Outlyingness): Outlyingness object.
 
     References:
-        .. [Se06] Serfling, R. (2006). Depth functions in nonparametric
-            multivariate inference. DIMACS Series in Discrete Mathematics and
-            Theoretical Computer Science, 72, 1.
+
+        Serfling, R. (2006). Depth functions in nonparametric
+        multivariate inference. DIMACS Series in Discrete Mathematics and
+        Theoretical Computer Science, 72, 1.
 
     """
 
@@ -280,13 +291,13 @@ class StahelDonohoOutlyingness(Outlyingness):
 
     where :math:`\text{X}` is a sample with distribution :math:`F`,
     :math:`\text{Med}` is the median and :math:`\text{MAD}` is the
-    median absolute deviation ([ZuCuHe04]_).
+    median absolute deviation.
 
     References:
 
-        .. [ZuCuHe04] Zuo, Y., Cui, H., & He, X. (2004). On the Stahel-Donoho
-        estimator and depth-weighted means of multivariate data. Annals of
-        Statistics, 32(1), 167–188. https://doi.org/10.1214/aos/1079120132
+        Zuo, Y., Cui, H., & He, X. (2004). On the Stahel-Donoho
+        estimator and depth-weighted means of multivariate data. Annals of
+        Statistics, 32(1), 167–188. https://doi.org/10.1214/aos/1079120132
 
     """
 
@@ -324,14 +335,17 @@ class ProjectionDepth(OutlyingnessBasedDepth):
     r"""
     Computes Projection depth.
 
-    It is defined as the depth induced by the Stahel-Donoho outlyingness
-    ([ZuCuHe04]_).
+    It is defined as the depth induced by the
+    :class:`Stahel-Donoho outlyingness `.
+
+    See also:
+        :class:`StahelDonohoOutlyingness`: Stahel-Donoho outlyingness.
 
     References:
 
-        .. 
[ZuCuHe04] Zuo, Y., Cui, H., & He, X. (2004). On the Stahel-Donoho - estimator and depth-weighted means of multivariate data. Annals of - Statistics, 32(1), 167–188. https://doi.org/10.1214/aos/1079120132 + Zuo, Y., Cui, H., & He, X. (2004). On the Stahel-Donoho + estimator and depth-weighted means of multivariate data. Annals of + Statistics, 32(1), 167–188. https://doi.org/10.1214/aos/1079120132 """ From 2a14ca8a6df5894b8e378c5524f947bfaa09968c Mon Sep 17 00:00:00 2001 From: VNMabus Date: Fri, 23 Oct 2020 13:04:49 +0200 Subject: [PATCH 070/210] Use optimum_reparam from fdasrsf. --- skfda/preprocessing/registration/elastic.py | 22 ++++++--------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/skfda/preprocessing/registration/elastic.py b/skfda/preprocessing/registration/elastic.py index 5dd16b34e..0a8ff421e 100644 --- a/skfda/preprocessing/registration/elastic.py +++ b/skfda/preprocessing/registration/elastic.py @@ -1,6 +1,5 @@ -import optimum_reparam - +from fdasrsf.utility_functions import optimum_reparam import scipy.integrate from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.validation import check_is_fitted @@ -258,20 +257,11 @@ def _elastic_alignment_array(template_data, q_data, the functions aligned to the template(s). """ - # Select cython function - if template_data.ndim == 1 and q_data.ndim == 1: - reparam = optimum_reparam.coptimum_reparam - - elif template_data.ndim == 1: - reparam = optimum_reparam.coptimum_reparam_n - - else: - reparam = optimum_reparam.coptimum_reparam_n2 - - return reparam(np.ascontiguousarray(template_data.T), - np.ascontiguousarray(eval_points), - np.ascontiguousarray(q_data.T), - penalty, grid_dim).T + return optimum_reparam(np.ascontiguousarray(template_data.T), + np.ascontiguousarray(eval_points), + np.ascontiguousarray(q_data.T), + method="DP2", + lam=penalty).T class ElasticRegistration(RegistrationTransformer): From f07fb3d1c581ac71c6c641c0398b4321ea859184 Mon Sep 17 00:00:00 2001 From: VNMabus Date: Fri, 23 Oct 2020 13:46:30 +0200 Subject: [PATCH 071/210] Remove compilation of C libraries. --- deps/fdasrsf/LICENSE | 22 -- deps/fdasrsf/README.rst | 33 --- deps/fdasrsf/dp_grid.c | 364 ------------------------------- deps/fdasrsf/dp_grid.h | 160 -------------- deps/fdasrsf/optimum_reparam.pyx | 164 -------------- pyproject.toml | 2 +- setup.py | 20 -- 7 files changed, 1 insertion(+), 764 deletions(-) delete mode 100644 deps/fdasrsf/LICENSE delete mode 100644 deps/fdasrsf/README.rst delete mode 100644 deps/fdasrsf/dp_grid.c delete mode 100644 deps/fdasrsf/dp_grid.h delete mode 100644 deps/fdasrsf/optimum_reparam.pyx diff --git a/deps/fdasrsf/LICENSE b/deps/fdasrsf/LICENSE deleted file mode 100644 index aaa7533ee..000000000 --- a/deps/fdasrsf/LICENSE +++ /dev/null @@ -1,22 +0,0 @@ -The ElasticFDA.jl package is licensed under the MIT "Expat" License: - -> Copyright (c) 2016: J. Derek Tucker. -> -> Permission is hereby granted, free of charge, to any person obtaining -> a copy of this software and associated documentation files (the -> "Software"), to deal in the Software without restriction, including -> without limitation the rights to use, copy, modify, merge, publish, -> distribute, sublicense, and/or sell copies of the Software, and to -> permit persons to whom the Software is furnished to do so, subject to -> the following conditions: -> -> The above copyright notice and this permission notice shall be -> included in all copies or substantial portions of the Software. 
-> -> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -> EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -> MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -> IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -> CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -> TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -> SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/deps/fdasrsf/README.rst b/deps/fdasrsf/README.rst deleted file mode 100644 index 2df9b70f7..000000000 --- a/deps/fdasrsf/README.rst +++ /dev/null @@ -1,33 +0,0 @@ -fdasrsf -======= - -C and Cython routines for functional data analysis using the square root -slope framework and curves using the square root velocity framework -which performs pair-wise and group-wise alignment as well as modelling -using functional component analysis and regression. - -The code has been extracted from the repositories `fdasrsf_python -`_ and `ElasticFDA.jl -`_ developed by J. Derek Tucker. -A copy of the original license can be found `here -`_. - - -References -========== -* Srivastava, A., Wu, W., Kurtek, S., Klassen, E. and Marron, J. S. (2011). - Registration of Functional Data Using Fisher-Rao Metric. - arXiv:1103.3817v2 [math.ST]. - -* Tucker, J. D., Wu, W. and Srivastava, A. (2013). Generative models for - functional data using phase and amplitude separation. Computational Statistics - and Data Analysis 61, 50-66. - -* Joshi, S.H., Srivastava, A., Klassen, E. and Jermyn, I. (2007). - A Novel Representation for Computing Geodesics Between n-Dimensional Elastic - Curves. IEEE Conference on computer Vision and Pattern Recognition (CVPR), - Minneapolis, MN. - -* Srivastava, A., Klassen, E., Joshi, S., Jermyn, I., (2011). Shape analysis - of elastic curves in euclidean spaces. Pattern Analysis and Machine - Intelligence, IEEE Transactions on 33 (7), 1415 –1428. diff --git a/deps/fdasrsf/dp_grid.c b/deps/fdasrsf/dp_grid.c deleted file mode 100644 index fd1b752d7..000000000 --- a/deps/fdasrsf/dp_grid.c +++ /dev/null @@ -1,364 +0,0 @@ - -#include "dp_grid.h" - -#include -#include - -/* Original code developed by J. Derek Tucker in ElasticFDA.jl. The following -* code is under the MIT license, a copy of the license it is included with it. -* -* 03/25/2019: Modified by Pablo Marcos . 
-*/ - - -#define TOL 1e-6 - - -void dp_optimum_reparam(double* Q1, double* T1, double* Q2, double* T2, - int m1, int n1, int n2, double* tv1, double* tv2, - int n1v, int n2v, double* G, double* T, double* size, - double lam1, int nbhd_dim) -{ - int* idxv1; - int* idxv2; - double* E; /* E[ntv1*j+i] = cost of best path to (tv1[i],tv2[j]) */ - int* P; /* P[ntv1*j+i] = predecessor of (tv1[i],tv2[j]) along best path */ - int * dp_nbhd; /* Indexes for grid points */ - int nbhd_count; /* Number of indexes */ - - idxv1 = malloc((n1v) * sizeof(*idxv1)); - idxv2 = malloc((n2v) * sizeof(*idxv2)); - E = malloc((n1v) * (n2v) * sizeof(*E)); - P = calloc((n1v) * (n2v), sizeof(*P)); - - /* indexes for gridpoints */ - dp_nbhd = dp_generate_nbhd(nbhd_dim, &nbhd_count); - - - dp_all_indexes(T1, n1, tv1, n1v, idxv1); - dp_all_indexes(T2, n2, tv2, n2v, idxv2); - - /* Compute cost of best path from (0,0) to every other grid point */ - dp_costs(Q1, T1, n1, Q2, T2, n2, - m1, tv1, idxv1, n1v, tv2, idxv2, n2v, E, P, lam1, - nbhd_count, dp_nbhd); - - /* Reconstruct best path from (0,0) to (1,1) */ - *size = dp_build_gamma(P, tv1, n1v, tv2, n2v, G, T); - - /* free allocated memory */ - free(dp_nbhd); - free(idxv1); - free(idxv2); - free(E); - free(P); -} - - -double dp_costs( - double* Q1, double* T1, int nsamps1, - double* Q2, double* T2, int nsamps2, - int dim, - double* tv1, int* idxv1, int ntv1, - double* tv2, int* idxv2, int ntv2, - double* E, int* P, double lam, int nbhd_count, int *dp_nbhd) -{ - int sr, sc; /* source row and column */ - int tr, tc; /* target row and column */ - double w, cand_cost; - int i; - - E[0] = 0.0; - for (i = 1; i < ntv1; E[i++] = INFINITY); - for (i = 1; i < ntv2; E[ntv1 * i++] = INFINITY); - - for (tr = 1; tr < ntv2; ++tr) { - for (tc = 1; tc < ntv1; ++tc) { - E[ntv1 * tr + tc] = INFINITY; - - for (i = 0; i < 2 * nbhd_count; i += 2) { - sr = tr - dp_nbhd[i]; - sc = tc - dp_nbhd[i + 1]; - - if (sr < 0 || sc < 0) continue; - - w = dp_edge_weight(Q1, T1, nsamps1, Q2, T2, nsamps2, dim, - tv1[sc], tv1[tc], tv2[sr], tv2[tr], - idxv1[sc], idxv2[sr], lam); - - cand_cost = E[ntv1 * sr + sc] + w; - if (cand_cost < E[ntv1 * tr + tc]) { - E[ntv1 * tr + tc] = cand_cost; - P[ntv1 * tr + tc] = ntv1 * sr + sc; - } - } - } - } - - return E[ntv1 * ntv2 - 1]; -} - - -double dp_edge_weight( - double* Q1, double* T1, int nsamps1, - double* Q2, double* T2, int nsamps2, - int dim, - double a, double b, - double c, double d, - int aidx, int cidx, double lam) -{ - double res = 0.0; - int Q1idx, Q2idx; - int Q1idxnext, Q2idxnext; - double t1, t2; - double t1next, t2next; - double t1nextcand1, t1nextcand2; - double slope, rslope; - double dq, dqi; - int i; - - Q1idx = aidx; /*dp_lookup( T1, nsamps1, a );*/ - Q2idx = cidx; /*dp_lookup( T2, nsamps2, c );*/ - - t1 = a; - t2 = c; - - slope = (d - c) / (b - a); - rslope = sqrt(slope); - - while (t1 < b && t2 < d) { - if (Q1idx > nsamps1 - 2 || Q2idx > nsamps2 - 2) break; - - /* Find endpoint of current interval */ - t1nextcand1 = T1[Q1idx + 1]; - t1nextcand2 = a + (T2[Q2idx + 1] - c) / slope; - - if (fabs(t1nextcand1 - t1nextcand2) < TOL) { - t1next = T1[Q1idx + 1]; - t2next = T2[Q2idx + 1]; - Q1idxnext = Q1idx + 1; - Q2idxnext = Q2idx + 1; - } - else if (t1nextcand1 < t1nextcand2) { - t1next = t1nextcand1; - t2next = c + slope * (t1next - a); - Q1idxnext = Q1idx + 1; - Q2idxnext = Q2idx; - } - else { - t1next = t1nextcand2; - t2next = T2[Q2idx + 1]; - Q1idxnext = Q1idx; - Q2idxnext = Q2idx + 1; - } - - if (t1next > b) t1next = b; - if (t2next > d) t2next = d; - - 
/* Get contribution for current interval */ - dq = 0.0; - for (i = 0; i < dim; ++i) { - /* Q1 and Q2 are column-major arrays! */ - dqi = Q1[Q1idx * dim + i] - rslope * Q2[Q2idx * dim + i]; - dq += dqi * dqi + lam * (1 - rslope) * (1 - rslope); - } - res += (t1next - t1) * dq; - - t1 = t1next; - t2 = t2next; - Q1idx = Q1idxnext; - Q2idx = Q2idxnext; - } - - return res; -} - - -int dp_build_gamma( - int* P, - double* tv1, int ntv1, - double* tv2, int ntv2, - double* G, double* T) -{ - int sr, sc; - int tr, tc; - int p, i; - int npts; /* result = length of Tg */ - - /* Dry run first, to determine length of Tg */ - npts = 1; - tr = ntv2 - 1; - tc = ntv1 - 1; - while (tr > 0 && tc > 0) { - p = P[tr * ntv1 + tc]; - tr = p / ntv1; - tc = p % ntv1; - ++npts; - } - - G[npts - 1] = tv2[ntv2 - 1]; - T[npts - 1] = tv1[ntv1 - 1]; - - tr = ntv2 - 1; - tc = ntv1 - 1; - i = npts - 2; - while (tr > 0 && tc > 0) { - p = P[tr * ntv1 + tc]; - sr = p / ntv1; - sc = p % ntv1; - - G[i] = tv2[sr]; - T[i] = tv1[sc]; - - tr = sr; - tc = sc; - --i; - } - - return npts; -} - - -int dp_lookup(double* T, int n, double t) -{ - int l, m, r; - - if (t < T[n - 1]) { - l = 0; - r = n; - m = (l + r) / 2; - - while (1) { - if (t >= T[m + 1]) - l = m; - else if (t < T[m]) - r = m; - else - break; - - m = (r + l) / 2; - } - - return m; - } - else { - return n - 2; - } -} - -void dp_all_indexes(double* p, int np, double* tv, int ntv, int* idxv) -{ - int i; - int pi = 0; - - for (i = 0; i < ntv; ++i) { - while (pi < np - 2 && tv[i] >= p[pi + 1]) ++pi; - idxv[i] = pi; - } -} - - -int gcd(int a, int b) { - /* Greatest common divisor. - * Computes the greates common divisor between a and b using the euclids - * algorithm. - */ - - int temp; - - /* Swap if b > a */ - if(b > a) { - temp = a; - a = b; - b = temp; - } - - /* Iterative Euclid's algorithm */ - while (b != 0) - { - a %= b; - temp = a; - a = b; - b = temp; - } - return a; -} - -int compute_nbhd_count_rec(int n, int *states) { - /* Computes the number of elements in the nbhd grid, wich is the number of - * elements in the set - * {(i,j) : gcd(i,j)=1 & 1 <= i,j <= n } - * - * This number corresponds with the OEIS A018805 sequence and can be computed - * using the following formula: - * - * a(n) = n^2 - Sum_{j=2..n} a(floor(n/j)) - */ - int an, j; - - if (states[n] != -1) { - return states[n]; - } - - an = n * n; - - for(j = 2; j <= n; j++) { - an -= compute_nbhd_count_rec(n / j, states); - } - - states[n] = an; - - return an; - -} - -int compute_nbhd_count(int n) { - /* Computes the number of elements in the nbhd grid, wich is the number of - * elements in the set - * {(i,j) : gcd(i,j)=1 & 1 <= i,j <= n } - * - * This number corresponds with the OEIS A018805 sequence and can be computed - * using the following formula: - * - * a(n) = n^2 - Sum_{j=2..n} a(floor(n/j)) - */ - - int *states; - int an, i; - - states = malloc((n + 1) * sizeof(*states)); - for(i = 0; i < n + 1; states[i++] = -1); - - an = compute_nbhd_count_rec(n, states); - - free(states); - - return an; -} - -int *dp_generate_nbhd(int nbhd_dim, int *nbhd_count) { - - int i, j, k = 0; - int *dp_nbhd; - - *nbhd_count = compute_nbhd_count(nbhd_dim) ; - - /* Allocate memory for the partition, using the exact amount of we can use - ~60% of memory that if we use nbhd_dim^2*/ - dp_nbhd = malloc(2 * (*nbhd_count) * sizeof(*dp_nbhd)); - - /* dp_nbhd = malloc(2 * nbhd_dim * nbhd_dim * sizeof(*dp_nbhd)); */ - - - for(i = 1; i <= nbhd_dim; i++) { - for(j = 1; j <= nbhd_dim; j++) { - /* If irreducible fraction add as a 
coordinate */ - if (gcd(i, j) == 1) { - dp_nbhd[k++] = i; - dp_nbhd[k++] = j; - } - } - } - - return dp_nbhd; -} diff --git a/deps/fdasrsf/dp_grid.h b/deps/fdasrsf/dp_grid.h deleted file mode 100644 index cfb61c625..000000000 --- a/deps/fdasrsf/dp_grid.h +++ /dev/null @@ -1,160 +0,0 @@ -#ifndef DP_GRID_H -#define DP_GRID_H 1 - -/* Original code developed by J. Derek Tucker in ElasticFDA.jl. The following -* code is under the MIT license, a copy of the license it is included with it. -* -* 03/25/2019: Modified by Pablo Marcos . -*/ - -/** - * Computes cost of best path from (0,0) to all other gridpoints. - * - * \param Q1 values of the first SRVF - * \param T1 changepoint parameters of the first SRVF - * \param nsamps1 the length of T1 - * \param Q2 values of the second SRVF - * \param T2 changepoint parameters of the second SRVF - * \param nsamps2 the length of T2 - * \param dim dimension of the ambient space - * \param tv1 the Q1 (column) parameter values for the DP grid - * \param idxv1 Q1 indexes for tv1, as computed by \c dp_all_indexes() - * \param ntv1 the length of tv1 - * \param tv2 the Q2 (row) parameter values for the DP grid - * \param idxv2 Q2 indexes for tv2, as computed by \c dp_all_indexes() - * \param ntv2 the length of tv2 - * \param E [output] on return, E[ntv2*i+j] holds the cost of the best - * path from (0,0) to (tv1[i],tv2[j]) in the grid. - * \param P [output] on return, P[ntv2*i+j] holds the predecessor of - * (tv1[i],tv2[j]). If predecessor is (tv1[k],tv2[l]), then - * P[ntv2*i+j] = k*ntv2+l. - * \return E[ntv1*ntv2-1], the cost of the best path from (tv1[0],tv2[0]) - * to (tv1[ntv1-1],tv2[ntv2-1]). - */ -void dp_optimum_reparam(double* Q1, double* T1, double* Q2, double* T2, - int m1, int n1, int n2, double* tv1, double* tv2, - int n1v, int n2v, double* G, double* T, - double* size, double lam1, int nbhd_dim); -/** - * Computes cost of best path from (0,0) to all other gridpoints. - * - * \param Q1 values of the first SRVF - * \param T1 changepoint parameters of the first SRVF - * \param nsamps1 the length of T1 - * \param Q2 values of the second SRVF - * \param T2 changepoint parameters of the second SRVF - * \param nsamps2 the length of T2 - * \param dim dimension of the ambient space - * \param tv1 the Q1 (column) parameter values for the DP grid - * \param idxv1 Q1 indexes for tv1, as computed by \c dp_all_indexes() - * \param ntv1 the length of tv1 - * \param tv2 the Q2 (row) parameter values for the DP grid - * \param idxv2 Q2 indexes for tv2, as computed by \c dp_all_indexes() - * \param ntv2 the length of tv2 - * \param E [output] on return, E[ntv2*i+j] holds the cost of the best - * path from (0,0) to (tv1[i],tv2[j]) in the grid. - * \param P [output] on return, P[ntv2*i+j] holds the predecessor of - * (tv1[i],tv2[j]). If predecessor is (tv1[k],tv2[l]), then - * P[ntv2*i+j] = k*ntv2+l. - * \return E[ntv1*ntv2-1], the cost of the best path from (tv1[0],tv2[0]) - * to (tv1[ntv1-1],tv2[ntv2-1]). - */ -double dp_costs( - double* Q1, double* T1, int nsamps1, - double* Q2, double* T2, int nsamps2, - int dim, - double* tv1, int* idxv1, int ntv1, - double* tv2, int* idxv2, int ntv2, - double* E, int* P, double lam, int nbhd_count, int *dp_nbhd); - -/** - * Computes the weight of the edge from (a,c) to (b,d) in the DP grid. 
- * - * \param Q1 values of the first SRVF - * \param T1 changepoint parameters of the first SRVF - * \param nsamps1 the length of T1 - * \param Q2 values of the second SRVF - * \param T2 changepoint parameters of the second SRVF - * \param nsamps2 the length of T2 - * \param dim dimension of the ambient space - * \param a source Q1 parameter - * \param b target Q1 parameter - * \param c source Q2 parameter - * \param d target Q2 parameter - * \param aidx index such that Q1[aidx] <= a < Q1[aidx+1] - * \param cidx index such that Q2[cidx] <= c < Q2[cidx+1] - */ -double dp_edge_weight( - double* Q1, double* T1, int nsamps1, - double* Q2, double* T2, int nsamps2, - int dim, - double a, double b, - double c, double d, - int aidx, int cidx, double lam); - - -/** - * Given predecessor table P, builds the piecewise-linear reparametrization - * function gamma. - * - * G and T must already be allocated with size max(ntv1,ntv2). The actual - * number of points on gamma will be the return value. - * - * \param P P[ntv2*i+j] holds the predecessor of (tv1[i],tv2[j]). If - * predecessor is (tv1[k],tv2[l]), then P[ntv2*i+j] = k*ntv2+l. - * \param tv1 the Q1 (column) parameter values for the DP grid - * \param ntv1 the length of tv1 - * \param tv2 the Q2 (row) parameter values for the DP grid - * \param ntv2 the length of tv2 - * \param G [output] reparametrization function values - * \param T [output] reparametrization changepoint parameters - * \return the length of G (same as length of T). - */ -int dp_build_gamma( - int* P, - double* tv1, int ntv1, - double* tv2, int ntv2, - double* G, double* T); - -/** - * Given t in [0,1], return the integer i such that t lies in the interval - * [T[i],T[i+1]) (or returns n-2 if t==T[n-1]). - * - * \param T an increasing sequence - * \param n the length of T - * \param t the parameter value to lookup ( T[0] <= t <= T[n-1] ). - * \return the integer i such that t lies in the interval [T[i],T[i+1]) - * (or n-2 if t==T[n-1]). - */ -int dp_lookup(double* T, int n, double t); - -/** - * 1-D table lookup for a sorted array of query points. - * - * Given a partition p and an increasing sequence of numbers tv between - * p[0] and p[np-1], computes the sequence of indexes idxv such that for - * i=0,...,ntv-1, p[idxv[i]] <= tv[i] < p[idxv[i]+1]. If tv[i]==p[np-1], - * then idxv[i] will be set to np-2. - * - * \param p an increasing sequence (the table) - * \param np the length of \a p - * \param tv an increasing sequence (the query points) - * \param ntv the length of \a tv - * \param idxv [output] pre-allocated array of \a ntv ints to hold result - */ -void dp_all_indexes(double* p, int np, double* tv, int ntv, int* idxv); - -/** - * Generates a list with the indexes of the grid to be explored. Used internally - * in the dp algorithm. - * - * The grid will be composed with the pairs (i, j), i,j < nbhd_dim and - * gcd(i, j) = 1. - * - * \param nbhd_dim dimension of the grid - * \param nbhd_count pointer where the number of indexes will be stored - * \return the list of indexes - */ -int *dp_generate_nbhd(int nbhd_dim, int *nbhd_count); - -#endif /* DP_GRID_H */ diff --git a/deps/fdasrsf/optimum_reparam.pyx b/deps/fdasrsf/optimum_reparam.pyx deleted file mode 100644 index 577efe47b..000000000 --- a/deps/fdasrsf/optimum_reparam.pyx +++ /dev/null @@ -1,164 +0,0 @@ -#!python -#cython: language_level=3 - - -# Original code developed by J. Derek Tucker. -# 03/25/2019: Modified by Pablo Marcos . 
- - -import numpy as np -from numpy.linalg import norm - -cimport numpy as np -from cpython cimport array - -cdef extern from "dp_grid.h": - void dp_optimum_reparam(double *Q1, double *T1, double *Q2, double *T2, - int m1, int n1, int n2, double *tv1, double *tv2, - int n1v, int n2v, double *G, double *T, - double *size, double lam1, int nbhd_dim) - - - -def coptimum_reparam_n(np.ndarray[double, ndim=1, mode="c"] mq, - np.ndarray[double, ndim=1, mode="c"] time, - np.ndarray[double, ndim=2, mode="c"] q, - lam1=0.0, nbhd_dim=7): - """ - cython interface calculates the warping to align a set of srfs q to a - single srsf mq. - - :param mq: vector of size N samples of first SRSF - :param time: vector of size N describing the sample points - :param q: numpy ndarray of shape (M,N) of N srsfs with M samples - :param lam1: controls the amount of elasticity (default = 0.0) - - :rtype numpy ndarray - :return gam: describing the warping functions used to align columns of - q with mq - - """ - cdef int M, N, n1 - cdef double lam - mq = mq / norm(mq) - M, N = q.shape[0], q.shape[1] - n1 = 1 - lam = lam1 - cdef np.ndarray[double, ndim=1, mode="c"] G = np.zeros(M) - cdef np.ndarray[double, ndim=1, mode="c"] T = np.zeros(M) - cdef np.ndarray[double, ndim=1, mode="c"] qi = np.zeros(M) - cdef np.ndarray[double, ndim=1, mode="c"] size = np.zeros(1) - - gam = np.zeros((M, N)) - sizes = np.zeros(N, dtype=np.int32) - Go = np.zeros((M, N)) - To = np.zeros((M, N)) - for k in range(0, N): - qi = q[:, k] / norm(q[:, k]) - qi = np.ascontiguousarray(qi) - - dp_optimum_reparam(&mq[0], &time[0], &qi[0], &time[0], n1, M, M, - &time[0], &time[0], M, M, &G[0], &T[0], - &size[0], lam, nbhd_dim) - sizes[k] = np.int32(size) - Go[:, k] = G - To[:, k] = T - - for k in range(0, N): - gam0 = np.interp(time, To[0:sizes[k], k], Go[0:sizes[k], k]) - gam[:, k] = (gam0 - gam0[0]) / (gam0[-1] - gam0[0]) - - return gam - -def coptimum_reparam_n2(np.ndarray[double, ndim=2, mode="c"] q1, - np.ndarray[double, ndim=1, mode="c"] time, - np.ndarray[double, ndim=2, mode="c"] q2, - lam1=0.0, nbhd_dim=7): - """ - cython interface calculates the warping to align a set of srsfs q1 to - another set of srsfs q2 - - :param q1: numpy ndarray of shape (M,N) of M srsfs with N samples - :param time: vector of size N describing the sample points - :param q2: numpy ndarray of shape (M,N) of M srsfs with N samples - :param lam1: controls the amount of elasticity (default = 0.0) - - :rtype numpy ndarray - :return gam: describing the warping functions used to align columns of - q with mq - - """ - cdef int M, N, n1 - cdef double lam - - M, N = q1.shape[0], q1.shape[1] - n1 = 1 - lam = lam1 - cdef np.ndarray[double, ndim=1, mode="c"] G = np.zeros(M) - cdef np.ndarray[double, ndim=1, mode="c"] T = np.zeros(M) - cdef np.ndarray[double, ndim=1, mode="c"] q1i = np.zeros(M) - cdef np.ndarray[double, ndim=1, mode="c"] q2i = np.zeros(M) - cdef np.ndarray[double, ndim=1, mode="c"] size = np.zeros(1) - - gam = np.zeros((M, N)) - sizes = np.zeros(N, dtype=np.int32) - Go = np.zeros((M, N)) - To = np.zeros((M, N)) - for k in range(0, N): - q1i = q1[:, k] / norm(q1[:, k]) - q2i = q2[:, k] / norm(q2[:, k]) - q1i = np.ascontiguousarray(q1i) - q2i = np.ascontiguousarray(q2i) - - dp_optimum_reparam(&q1i[0], &time[0], &q2i[0], &time[0], n1, - M, M, &time[0], &time[0], M, M, &G[0], - &T[0], &size[0], lam, nbhd_dim) - sizes[k] = np.int32(size) - Go[:, k] = G - To[:, k] = T - - for k in range(0, N): - gam0 = np.interp(time, To[0:sizes[k], k], Go[0:sizes[k], k]) - gam[:, k] = (gam0 
- gam0[0]) / (gam0[-1] - gam0[0]) - - return gam - -def coptimum_reparam(np.ndarray[double, ndim=1, mode="c"] q1, - np.ndarray[double, ndim=1, mode="c"] time, - np.ndarray[double, ndim=1, mode="c"] q2, - lam1=0.0, nbhd_dim=7): - """ - cython interface for calculates the warping to align srsf q2 to q1 - - :param q1: vector of size N samples of first SRSF - :param time: vector of size N describing the sample points - :param q2: vector of size N samples of second SRSF - :param lam1: controls the amount of elasticity (default = 0.0) - - :rtype vector - :return gam: describing the warping function used to align q2 with q1 - """ - cdef int M, n1 - cdef double lam - M = q1.shape[0] - n1 = 1 - lam = lam1 - q1 = q1 / norm(q1) - q2 = q2 / norm(q2) - cdef np.ndarray[double, ndim=1, mode="c"] G = np.zeros(M) - cdef np.ndarray[double, ndim=1, mode="c"] T = np.zeros(M) - cdef np.ndarray[double, ndim=1, mode="c"] size = np.zeros(1) - - sizes = np.zeros(1, dtype=np.int32) - Go = np.zeros((M, 1)) - To = np.zeros((M, 1)) - dp_optimum_reparam(&q1[0], &time[0], &q2[0], &time[0], n1, M, M, - &time[0], &time[0], M, M, &G[0], - &T[0], &size[0], lam, nbhd_dim) - sizes = np.int32(size) - Go[:, 0] = G - To[:, 0] = T - gam0 = np.interp(time, To[0:sizes[0], 0], Go[0:sizes[0], 0]) - gam = (gam0 - gam0[0]) / (gam0[-1] - gam0[0]) - - return gam diff --git a/pyproject.toml b/pyproject.toml index 58be524db..486c41f3c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,3 @@ [build-system] # Minimum requirements for the build system to execute. -requires = ["setuptools", "wheel", "cython", "numpy"] \ No newline at end of file +requires = ["setuptools", "wheel"] \ No newline at end of file diff --git a/setup.py b/setup.py index f1a50bd45..7eac20be0 100644 --- a/setup.py +++ b/setup.py @@ -23,11 +23,7 @@ import os import sys -from Cython.Build import cythonize from setuptools import setup, find_packages -from setuptools.extension import Extension - -import numpy as np needs_pytest = {'pytest', 'test', 'ptr'}.intersection(sys.argv) @@ -39,21 +35,6 @@ 'VERSION'), 'r') as version_file: version = version_file.read().strip() -deps_path = 'deps' -fdasrsf_path = os.path.join(deps_path, 'fdasrsf') - - -extensions = [ - Extension(name='optimum_reparam', - sources=[ - os.path.join(fdasrsf_path, 'optimum_reparam.pyx'), - os.path.join(fdasrsf_path, 'dp_grid.c') - ], - include_dirs=[np.get_include()], - language='c', - ), -] - setup(name='scikit-fda', version=version, description=DOCLINES[1], @@ -61,7 +42,6 @@ url='https://fda.readthedocs.io', maintainer='Carlos Ramos Carreño', maintainer_email='vnmabus@gmail.com', - ext_modules=cythonize(extensions), include_package_data=True, platforms=['any'], license='BSD', From b2b74a8458203c6367f107a49effa91adeb7d77b Mon Sep 17 00:00:00 2001 From: VNMabus Date: Fri, 23 Oct 2020 15:10:06 +0200 Subject: [PATCH 072/210] Add dependency to setup.py. --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 7eac20be0..f11bb3a6d 100644 --- a/setup.py +++ b/setup.py @@ -62,6 +62,7 @@ install_requires=[ 'cython', 'dcor', + 'fdasrsf', 'findiff', 'matplotlib', 'mpldatacursor', From d85f5150fed4a72948db4a5733c3a1af3ad02943 Mon Sep 17 00:00:00 2001 From: "J. 
Derek Tucker" Date: Mon, 26 Oct 2020 12:00:17 -0600 Subject: [PATCH 073/210] update references and change default method --- README.rst | 2 +- docs/modules/preprocessing/registration.rst | 4 ++++ examples/plot_elastic_registration.py | 4 ++++ skfda/preprocessing/registration/elastic.py | 2 +- 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index aef40ddda..02e95cc84 100644 --- a/README.rst +++ b/README.rst @@ -22,7 +22,7 @@ Documentation The documentation is available at `fda.readthedocs.io/en/stable/ `_, which includes detailed information of the different modules, classes and methods of -the package, along with several examples showing different funcionalities. +the package, along with several examples showing different functionalities. The documentation of the latest version, corresponding with the develop version of the package, can be found at diff --git a/docs/modules/preprocessing/registration.rst b/docs/modules/preprocessing/registration.rst index abf7db45a..67ca0cb8b 100644 --- a/docs/modules/preprocessing/registration.rst +++ b/docs/modules/preprocessing/registration.rst @@ -121,6 +121,10 @@ References * Srivastava, Anuj & Klassen, Eric P. (2016). Functional and shape data analysis. Springer. +* Tucker, J. D., Wu, W. and Srivastava, A. (2013). Generative Models for + Functional Data using Phase and Amplitude Separation. Computational Statistics + and Data Analysis, Vol. 61, 50-66. + * J. S. Marron, James O. Ramsay, Laura M. Sangalli and Anuj Srivastava (2015). Functional Data Analysis of Amplitude and Phase Variation. Statistical Science 2015, Vol. 30, No. 4 diff --git a/examples/plot_elastic_registration.py b/examples/plot_elastic_registration.py index 1688126ad..c49be791f 100644 --- a/examples/plot_elastic_registration.py +++ b/examples/plot_elastic_registration.py @@ -94,6 +94,10 @@ # * Srivastava, Anuj & Klassen, Eric P. (2016). Functional and shape data # analysis. In *Functional Data and Elastic Registration* (pp. 73-122). # Springer. +# +# * Tucker, J. D., Wu, W. and Srivastava, A. (2013). Generative Models for +# Functional Data using Phase and Amplitude Separation. Computational Statistics +# and Data Analysis, Vol. 61, 50-66. # # * J. S. Marron, James O. Ramsay, Laura M. Sangalli and Anuj Srivastava # (2015). Functional Data Analysis of Amplitude and Phase Variation. diff --git a/skfda/preprocessing/registration/elastic.py b/skfda/preprocessing/registration/elastic.py index 0a8ff421e..0813d02e1 100644 --- a/skfda/preprocessing/registration/elastic.py +++ b/skfda/preprocessing/registration/elastic.py @@ -260,7 +260,7 @@ def _elastic_alignment_array(template_data, q_data, return optimum_reparam(np.ascontiguousarray(template_data.T), np.ascontiguousarray(eval_points), np.ascontiguousarray(q_data.T), - method="DP2", + method="DP", lam=penalty).T From 22c85588ed027e611bf6807e8087badd47d6073c Mon Sep 17 00:00:00 2001 From: "J. 
Derek Tukcer" Date: Mon, 26 Oct 2020 14:54:36 -0600 Subject: [PATCH 074/210] reset to DP2, per incoming changes to fdasrsf --- skfda/preprocessing/registration/elastic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/preprocessing/registration/elastic.py b/skfda/preprocessing/registration/elastic.py index 0813d02e1..0a8ff421e 100644 --- a/skfda/preprocessing/registration/elastic.py +++ b/skfda/preprocessing/registration/elastic.py @@ -260,7 +260,7 @@ def _elastic_alignment_array(template_data, q_data, return optimum_reparam(np.ascontiguousarray(template_data.T), np.ascontiguousarray(eval_points), np.ascontiguousarray(q_data.T), - method="DP", + method="DP2", lam=penalty).T From 0b5ff2f9e64d6cca97cf7570034b4481e86e898f Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 29 Oct 2020 12:03:26 +0100 Subject: [PATCH 075/210] Pass grid_dim parameter to fdasrsf package. --- README.rst | 1 + setup.py | 2 +- skfda/preprocessing/registration/elastic.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 02e95cc84..774935851 100644 --- a/README.rst +++ b/README.rst @@ -62,6 +62,7 @@ Requirements *scikit-fda* depends on the following packages: * `cython `_ - Python to C compiler +* `fdasrsf `_ - SRSF framework * `findiff `_ - Finite differences * `matplotlib `_ - Plotting with Python * `mpldatacursor `_ - Interactive data cursors for matplotlib diff --git a/setup.py b/setup.py index f11bb3a6d..a71366d6e 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,7 @@ install_requires=[ 'cython', 'dcor', - 'fdasrsf', + 'fdasrsf>=2.2.0', 'findiff', 'matplotlib', 'mpldatacursor', diff --git a/skfda/preprocessing/registration/elastic.py b/skfda/preprocessing/registration/elastic.py index 0a8ff421e..03e0de7a2 100644 --- a/skfda/preprocessing/registration/elastic.py +++ b/skfda/preprocessing/registration/elastic.py @@ -261,7 +261,7 @@ def _elastic_alignment_array(template_data, q_data, np.ascontiguousarray(eval_points), np.ascontiguousarray(q_data.T), method="DP2", - lam=penalty).T + lam=penalty, grid_dim=grid_dim).T class ElasticRegistration(RegistrationTransformer): From 3a3f60c0f8d53046a994ab4df37f4141e05df8f1 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 29 Oct 2020 17:37:56 +0100 Subject: [PATCH 076/210] Reduce memory usage in RMH test. --- .../variable_selection/recursive_maxima_hunting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py index efa21536a..be3b5f22d 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py @@ -842,7 +842,7 @@ class RecursiveMaximaHunting( We create trajectories from two classes, one with zero mean and the other with a peak-like mean. Both have Brownian covariance. - >>> n_samples = 10000 + >>> n_samples = 1000 >>> n_features = 100 >>> >>> def mean_1(t): @@ -877,7 +877,7 @@ class RecursiveMaximaHunting( >>> len(X.grid_points[0]) 100 >>> X_dimred.shape - (10000, 3) + (1000, 3) References: From c4484710168dfdb441d621e2e56bd7a201a44051 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 29 Oct 2020 18:56:03 +0100 Subject: [PATCH 077/210] Fix kernel smoothing example. 
--- examples/plot_kernel_smoothing.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/examples/plot_kernel_smoothing.py b/examples/plot_kernel_smoothing.py index 16ba7dcb4..a118be297 100644 --- a/examples/plot_kernel_smoothing.py +++ b/examples/plot_kernel_smoothing.py @@ -11,9 +11,10 @@ # Author: Miguel Carbajo Berrocal # License: MIT +import skfda + import matplotlib.pylab as plt import numpy as np -import skfda import skfda.preprocessing.smoothing.kernel_smoothers as ks import skfda.preprocessing.smoothing.validation as val @@ -37,33 +38,34 @@ # Here we show the general cross validation scores for different values of the # parameters given to the different smoothing methods. -param_values = np.linspace(start=2, stop=25, num=24) +param_values_knn = np.arange(1, 24, 2) +param_values_others = param_values_knn / 32 # Local linear regression kernel smoothing. llr = val.SmoothingParameterSearch( - ks.LocalLinearRegressionSmoother(), param_values) + ks.LocalLinearRegressionSmoother(), param_values_others) llr.fit(fd) llr_fd = llr.transform(fd) # Nadaraya-Watson kernel smoothing. nw = val.SmoothingParameterSearch( - ks.NadarayaWatsonSmoother(), param_values) + ks.NadarayaWatsonSmoother(), param_values_others) nw.fit(fd) nw_fd = nw.transform(fd) # K-nearest neighbours kernel smoothing. knn = val.SmoothingParameterSearch( - ks.KNeighborsSmoother(), param_values) + ks.KNeighborsSmoother(), param_values_knn) knn.fit(fd) knn_fd = knn.transform(fd) fig = plt.figure() ax = fig.add_subplot(1, 1, 1) -ax.plot(param_values, knn.cv_results_['mean_test_score'], +ax.plot(param_values_knn, knn.cv_results_['mean_test_score'], label='k-nearest neighbors') -ax.plot(param_values, llr.cv_results_['mean_test_score'], +ax.plot(param_values_knn, llr.cv_results_['mean_test_score'], label='local linear regression') -ax.plot(param_values, nw.cv_results_['mean_test_score'], +ax.plot(param_values_knn, nw.cv_results_['mean_test_score'], label='Nadaraya-Watson') ax.legend() fig @@ -115,8 +117,10 @@ # We can also appreciate the effects of undersmoothing and oversmoothing in # the following plots. -fd_us = ks.NadarayaWatsonSmoother(smoothing_parameter=2).fit_transform(fd[10]) -fd_os = ks.NadarayaWatsonSmoother(smoothing_parameter=15).fit_transform(fd[10]) +fd_us = ks.NadarayaWatsonSmoother( + smoothing_parameter=2 / 32).fit_transform(fd[10]) +fd_os = ks.NadarayaWatsonSmoother( + smoothing_parameter=15 / 32).fit_transform(fd[10]) ############################################################################## # Under-smoothed From d9d188da813e4d6b3ec5fc2ecfafe8713da6a8ae Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 2 Nov 2020 20:38:56 +0100 Subject: [PATCH 078/210] First version of glossary. 
---
 docs/glossary.rst                             | 77 +++++++++++++++++++
 docs/index.rst                                |  1 +
 docs/modules/exploratory/depth.rst            |  4 +-
 .../exploratory/visualization/boxplot.rst     |  8 +-
 docs/modules/representation/extrapolation.rst |  2 +-
 skfda/exploratory/depth/_depth.py             |  8 +-
 .../outliers/_directional_outlyingness.py     |  2 +-
 skfda/exploratory/visualization/_boxplot.py   |  9 ++-
 .../visualization/representation.py           |  5 +-
 skfda/misc/operators/_operators.py            |  2 +-
 .../registration/_landmark_registration.py    |  8 +-
 .../registration/_shift_registration.py       |  6 +-
 skfda/preprocessing/registration/_warping.py  |  2 +-
 .../representation/_evaluation_trasformer.py  |  4 +-
 skfda/representation/_functional_data.py      | 22 +++---
 skfda/representation/basis/_basis.py          |  4 +-
 skfda/representation/basis/_fdatabasis.py     |  5 --
 skfda/representation/basis/_vector_basis.py   |  5 +-
 skfda/representation/evaluator.py             |  7 +-
 skfda/representation/extrapolation.py         |  9 ++-
 skfda/representation/grid.py                  | 12 ---
 21 files changed, 134 insertions(+), 68 deletions(-)
 create mode 100644 docs/glossary.rst

diff --git a/docs/glossary.rst b/docs/glossary.rst
new file mode 100644
index 000000000..e0efe2ee0
--- /dev/null
+++ b/docs/glossary.rst
@@ -0,0 +1,77 @@
+.. _glossary:
+
+========================
+Glossary of Common Terms
+========================
+
+This glossary contains concepts and API elements specific to the functional
+data analysis setting or for the package ``scikit-fda``. If the term you
+are looking for is not listed here, it may be a more generally applicable term
+listed in the scikit-learn :ref:`sklearn:glossary`.
+
+General Concepts
+================
+
+.. glossary::
+
+    codomain
+        The set of allowed output values of a function.
+        Note that the set of actual output values, called the :term:`image`,
+        can be a strict subset of the :term:`codomain`.
+
+    curve
+    trajectory
+        A :term:`functional data object` whose domain and codomain are both
+        the set of real numbers (:math:`\mathbb{R}`).
+        Thus, its ``dim_domain`` and ``dim_codomain`` attributes shall both
+        be 1.
+
+    domain
+        The set of possible input values of a function.
+
+    FDA
+    Functional Data Analysis
+        The branch of statistics that deals with curves, surfaces or other
+        objects varying over a continuum (:term:`functional data objects`).
+
+    functional data
+        A collection of :term:`functional data objects`.
+        Usually represented by a :class:`FData` object.
+
+    functional data object
+    functional data objects
+    functional object
+    functional objects
+        An object of study of Functional Data Analysis.
+        It is a function or map between two sets, often subsets of powers of
+        :math:`\mathbb{R}`.
+        Usually represented by a :class:`FData` object of length 1, but in
+        some cases regular Python :term:`callables ` are
+        also accepted.
+
+    functional observation
+    functional observations
+        An observed :term:`functional data object`, usually represented as a
+        :class:`FData` object of length 1.
+
+    image
+        The set of actual output values that a function takes.
+        It must be a (not necessarily strict) subset of the :term:`codomain`.
+
+    multivariate functional data
+        Often used for :term:`functional data` where each
+        :term:`functional data object` is a :term:`vector-valued function`.
+
+    multivariate object
+    multivariate objects
+        An object of study of multivariate statistics.
+        It is a vector of possibly related variables, usually represented
+        as a :term:`sklearn:1d array`.
+
+    operator
+    operators
+        Function whose :term:`domain` is a set of functions. 
+ + vector-valued function + A :term:`functional data object` that outputs vectors, that is, its + :term:`codomain` has dimension greater than 1. diff --git a/docs/index.rst b/docs/index.rst index f451e872b..272a0438c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -21,6 +21,7 @@ Github you can find more information related to the development of the package. :titlesonly: apilist + glossary .. toctree:: diff --git a/docs/modules/exploratory/depth.rst b/docs/modules/exploratory/depth.rst index dac591d89..944165477 100644 --- a/docs/modules/exploratory/depth.rst +++ b/docs/modules/exploratory/depth.rst @@ -36,8 +36,8 @@ The following classes implement depth functions for functional data: skfda.exploratory.depth.BandDepth skfda.exploratory.depth.ModifiedBandDepth -Most of them support multivariate functional data, with more than one dimension -on the domain and on the codomain. +Most of them support functional data with more than one dimension +on the :term:`domain` and on the :term:`codomain`. Multivariate depths ^^^^^^^^^^^^^^^^^^^ diff --git a/docs/modules/exploratory/visualization/boxplot.rst b/docs/modules/exploratory/visualization/boxplot.rst index f5284e27b..9b7ff4a54 100644 --- a/docs/modules/exploratory/visualization/boxplot.rst +++ b/docs/modules/exploratory/visualization/boxplot.rst @@ -2,10 +2,10 @@ Functional Data Boxplot ======================= Classes to construct the functional data boxplot. Only supported for -functional data with domain dimension 1 or 2 and as many dimensions on -the image as required. +functional data with :term:`domain` dimension 1 or 2 and as many dimensions on +the :term:`codomain` as required. -If the dimension of the domain is 1, the following class must be used. +If the dimension of the :term:`domain` is 1, the following class must be used. See the :ref:`sphx_glr_auto_examples_plot_boxplot.py` example for detailed explanation. .. autosummary:: @@ -13,7 +13,7 @@ See the :ref:`sphx_glr_auto_examples_plot_boxplot.py` example for detailed expla skfda.exploratory.visualization.Boxplot -If the dimension of the domain is 2, this one. See the :ref:`sphx_glr_auto_examples_plot_surface_boxplot.py` +If the dimension of the :term:`domain` is 2, this one. See the :ref:`sphx_glr_auto_examples_plot_surface_boxplot.py` example for detailed explanation. .. autosummary:: diff --git a/docs/modules/representation/extrapolation.rst b/docs/modules/representation/extrapolation.rst index 0736e883a..23faa49bb 100644 --- a/docs/modules/representation/extrapolation.rst +++ b/docs/modules/representation/extrapolation.rst @@ -2,7 +2,7 @@ Extrapolation ============= This module contains the extrapolators used to evaluate points outside the -domain range of :class:`FDataBasis` or :class:`FDataGrid`. See +:term:`domain` range of :class:`FDataBasis` or :class:`FDataGrid`. See `Extrapolation Example <../auto_examples/plot_extrapolation.html>`_ for detailed explanation. diff --git a/skfda/exploratory/depth/_depth.py b/skfda/exploratory/depth/_depth.py index 46bd3d2b2..04e188413 100644 --- a/skfda/exploratory/depth/_depth.py +++ b/skfda/exploratory/depth/_depth.py @@ -94,7 +94,7 @@ class ModifiedBandDepth(IntegratedDepth): The band depth of each sample is obtained by computing the fraction of time its graph is contained in the bands determined by two sample curves. - In the case the fdatagrid domain dimension is 2, instead of curves, + In the case the fdatagrid :term:`domain` dimension is 2, instead of curves, surfaces determine the bands. 
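To make the :term:`domain` and :term:`codomain` terminology used throughout these documentation changes concrete, a small editorial illustration (not part of any patch; in some versions of the package the ``grid_points`` keyword is called ``sample_points``):

import numpy as np
from skfda import FDataGrid

# A curve: one observation at three points, with real-valued output.
curve = FDataGrid(data_matrix=[[0., 1., 4.]], grid_points=[0, 1, 2])
curve.dim_domain, curve.dim_codomain  # (1, 1)

# A vector-valued function: same domain, two coordinates in the codomain.
vector_valued = FDataGrid(data_matrix=np.zeros((1, 3, 2)),
                          grid_points=[0, 1, 2])
vector_valued.dim_codomain  # 2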
In larger dimensions, the hyperplanes determine the bands. @@ -131,9 +131,9 @@ class BandDepth(Depth): The band depth of each sample is obtained by computing the fraction of the bands determined by two sample curves containing the whole graph of the - first one. In the case the fdatagrid domain dimension is 2, instead of - curves, surfaces determine the bands. In larger dimensions, the hyperplanes - determine the bands. + first one. In the case the fdatagrid :term:`domain` dimension is 2, instead + of curves, surfaces determine the bands. In larger dimensions, the + hyperplanes determine the bands. Examples: diff --git a/skfda/exploratory/outliers/_directional_outlyingness.py b/skfda/exploratory/outliers/_directional_outlyingness.py index c6f9d8a74..6b00efc0c 100644 --- a/skfda/exploratory/outliers/_directional_outlyingness.py +++ b/skfda/exploratory/outliers/_directional_outlyingness.py @@ -70,7 +70,7 @@ def directional_outlyingness_stats( \left(\mathbf{X}(t), F_{\mathbf{X}(t)}\right)-\mathbf{MO}\left( \mathbf{X} , F_{\mathbf{X}}\right) \rVert^2 \cdot w(t) dt - where :math:`w(t)` a weight function defined on the domain of + where :math:`w(t)` a weight function defined on the :term:`domain` of :math:`\mathbf{X}`, :math:`I`. Then, the total functional outlyingness can be computed using these values: diff --git a/skfda/exploratory/visualization/_boxplot.py b/skfda/exploratory/visualization/_boxplot.py index 1c0abaa5a..191f888a7 100644 --- a/skfda/exploratory/visualization/_boxplot.py +++ b/skfda/exploratory/visualization/_boxplot.py @@ -26,7 +26,8 @@ class FDataBoxplot(ABC): """Abstract class inherited by the Boxplot and SurfaceBoxplot classes. It the data of the functional boxplot or surface boxplot of a FDataGrid - object, depending on the dimensions of the domain, 1 or 2 respectively. + object, depending on the dimensions of the :term:`domain`, 1 or 2 + respectively. It forces to both classes, Boxplot and SurfaceBoxplot to conain at least the median, central and outlying envelopes and a colormap for their @@ -89,7 +90,7 @@ class Boxplot(FDataBoxplot): Class implementing the functionl boxplot which is an informative exploratory tool for visualizing functional data, as well as its generalization, the enhanced functional boxplot. Only supports 1 - dimensional domain functional data. + dimensional :term:`domain` functional data. Based on the center outward ordering induced by a :ref:`depth measure ` for functional data, the descriptive statistics of a @@ -452,8 +453,8 @@ class SurfaceBoxplot(FDataBoxplot): Class implementing the surface boxplot. Analogously to the functional boxplot, it is an informative exploratory tool for visualizing functional - data with domain dimension 2. Nevertheless, it does not implement the - enhanced surface boxplot. + data with :term:`domain` dimension 2. Nevertheless, it does not implement + the enhanced surface boxplot. Based on the center outward ordering induced by a :ref:`depth measure ` diff --git a/skfda/exploratory/visualization/representation.py b/skfda/exploratory/visualization/representation.py index 209356e5a..7ccb90794 100644 --- a/skfda/exploratory/visualization/representation.py +++ b/skfda/exploratory/visualization/representation.py @@ -85,8 +85,9 @@ def plot_graph(fdata, chart=None, *, fig=None, axes=None, **kwargs): """Plot the FDatGrid object graph as hypersurfaces. - Plots each coordinate separately. If the domain is one dimensional, the - plots will be curves, and if it is two dimensional, they will be surfaces. 
+ Plots each coordinate separately. If the :term:`domain` is one dimensional, + the plots will be curves, and if it is two dimensional, they will be + surfaces. Args: chart (figure object, axe or list of axes, optional): figure over diff --git a/skfda/misc/operators/_operators.py b/skfda/misc/operators/_operators.py index 8d1b955d5..7781aaf46 100644 --- a/skfda/misc/operators/_operators.py +++ b/skfda/misc/operators/_operators.py @@ -5,7 +5,7 @@ class Operator(abc.ABC): """ - Abstract class for operators (functions whose domain are functions). + Abstract class for :term:`operators`. """ diff --git a/skfda/preprocessing/registration/_landmark_registration.py b/skfda/preprocessing/registration/_landmark_registration.py index 40a666a9c..8f54466a3 100644 --- a/skfda/preprocessing/registration/_landmark_registration.py +++ b/skfda/preprocessing/registration/_landmark_registration.py @@ -20,8 +20,8 @@ def landmark_shift_deltas(fd, landmarks, location=None): The function will calculate the corresponding :math:`\delta_i` shuch that :math:`t_i = t^* + \delta_i`. - This procedure will work independent of the dimension of the domain - and the image. + This procedure will work independent of the dimension of the + :term:`domain` and the :term:`codomain`. Args: fd (:class:`FData`): Functional data object. @@ -286,8 +286,8 @@ def landmark_registration(fd, landmarks, *, location=None, eval_points=None): eval_points (array_like, optional): Set of points where the functions are evaluated to obtain a discrete representation of the object. In case of objects with - multidimensional domain a list axis with points of evaluation - for each dimension. + multidimensional :term:`domain` a list axis with points of + evaluation for each dimension. Returns: :class:`FData`: FData with the functional data object registered. diff --git a/skfda/preprocessing/registration/_shift_registration.py b/skfda/preprocessing/registration/_shift_registration.py index 602e0ec77..f7c4c7a20 100644 --- a/skfda/preprocessing/registration/_shift_registration.py +++ b/skfda/preprocessing/registration/_shift_registration.py @@ -51,14 +51,14 @@ class ShiftRegistration(RegistrationTransformer): in each iteration. In [RaSi2005-7-9-1]_ is described in detail this procedure. Defaults to "mean". extrapolation (str or :class:`Extrapolation`, optional): Controls the - extrapolation mode for points outside the domain range. + extrapolation mode for points outside the :term:`domain` range. By default uses the method defined in the data to be transformed. See the `extrapolation` documentation to obtain more information. step_size (int or float, optional): Parameter to adjust the rate of convergence in the Newton-Raphson algorithm, see [RaSi2005-7-9-1]_. Defaults to 1. - restrict_domain (bool, optional): If True restricts the domain to avoid - evaluate points outside the domain using extrapolation, in which + restrict_domain (bool, optional): If True restricts the :term:`domain` + to avoid the need of using extrapolation, in which case only the fit_transform method will be available, as training and transformation must be done together. Defaults to False. 
initial (str or array_like, optional): Array with an initial estimation diff --git a/skfda/preprocessing/registration/_warping.py b/skfda/preprocessing/registration/_warping.py index b2f4a222f..35fc7ba86 100644 --- a/skfda/preprocessing/registration/_warping.py +++ b/skfda/preprocessing/registration/_warping.py @@ -110,7 +110,7 @@ def _normalize_scale(t, a=0, b=1): def normalize_warping(warping, domain_range=None): - r"""Rescale a warping to normalize their domain. + r"""Rescale a warping to normalize their :term:`domain`. Given a set of warpings :math:`\gamma_i:[a,b]\rightarrow [a,b]` it is used an affine traslation to change the domain of the transformation to diff --git a/skfda/representation/_evaluation_trasformer.py b/skfda/representation/_evaluation_trasformer.py index c3921f1e4..b91444c31 100644 --- a/skfda/representation/_evaluation_trasformer.py +++ b/skfda/representation/_evaluation_trasformer.py @@ -13,8 +13,8 @@ class EvaluationTransformer(BaseEstimator, TransformerMixin): evaluated. If `None`, the functions must be `FDatagrid` objects and all points will be returned. extrapolation (str or Extrapolation, optional): Controls the - extrapolation mode for elements outside the domain range. By - default it is used the mode defined during the instance of the + extrapolation mode for elements outside the :term:`domain` range. + By default it is used the mode defined during the instance of the object. grid (bool, optional): Whether to evaluate the results on a grid spanned by the input arrays, or at points specified by the diff --git a/skfda/representation/_functional_data.py b/skfda/representation/_functional_data.py index b37c36145..dd63b1501 100644 --- a/skfda/representation/_functional_data.py +++ b/skfda/representation/_functional_data.py @@ -155,7 +155,7 @@ def n_samples(self): @property @abstractmethod def dim_domain(self): - """Return number of dimensions of the domain. + """Return number of dimensions of the :term:`domain`. Returns: int: Number of dimensions of the domain. @@ -166,7 +166,7 @@ def dim_domain(self): @property @abstractmethod def dim_codomain(self): - """Return number of dimensions of the codomain. + """Return number of dimensions of the :term:`codomain`. Returns: int: Number of dimensions of the codomain. @@ -204,7 +204,7 @@ def extrapolation(self, value): @property @abstractmethod def domain_range(self): - """Return the domain range of the object + """Return the :term:`domain` range of the object Returns: List of tuples with the ranges for each domain dimension. @@ -305,9 +305,9 @@ def evaluate(self, eval_points, *, derivative=0, extrapolation=None, Args: eval_points (array_like): List of points where the functions are evaluated. If ``grid`` is ``True``, a list of axes, one per - domain dimension, must be passed instead. If ``aligned`` is - ``True``, then a list of lists (of points or axes, as - explained) must be passed, with one list per sample. + :term:`domain` dimension, must be passed instead. If + ``aligned`` is ``True``, then a list of lists (of points or + axes, as explained) must be passed, with one list per sample. extrapolation (str or Extrapolation, optional): Controls the extrapolation mode for elements outside the domain range. By default it is used the mode defined during the instance of the @@ -507,11 +507,11 @@ def plot(self, *args, **kwargs): will be used 30 points per axis, wich makes a grid with 900 points. domain_range (tuple or list of tuples, optional): Range where the - function will be plotted. 
In objects with unidimensional domain - the domain range should be a tuple with the bounds of the - interval; in the case of surfaces a list with 2 tuples with - the ranges for each dimension. Default uses the domain range - of the functional object. + function will be plotted. In objects with unidimensional + :term:`domain` the domain range should be a tuple with the + bounds of the interval; in the case of surfaces a list with 2 + tuples with the ranges for each dimension. Default uses the + domain range of the functional object. group (list of int): contains integers from [0 to number of labels) indicating to which group each sample belongs to. Then, the samples with the same label are plotted in the same color. diff --git a/skfda/representation/basis/_basis.py b/skfda/representation/basis/_basis.py index f7e9c207f..f634b8a8b 100644 --- a/skfda/representation/basis/_basis.py +++ b/skfda/representation/basis/_basis.py @@ -196,8 +196,8 @@ def _coordinate(self, fdatabasis, key): return self._coordinate_nonfull(fdatabasis=fdatabasis, key=r_key) def rescale(self, domain_range=None): - r"""Return a copy of the basis with a new domain range, with the - corresponding values rescaled to the new bounds. + r"""Return a copy of the basis with a new :term:`domain` range, with + the corresponding values rescaled to the new bounds. Args: domain_range (tuple, optional): Definition of the interval diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py index c6040752e..e7151dc7b 100644 --- a/skfda/representation/basis/_fdatabasis.py +++ b/skfda/representation/basis/_fdatabasis.py @@ -215,19 +215,14 @@ def from_data(cls, data_matrix, *, basis, @property def n_samples(self): - """Return number of samples.""" return self.coefficients.shape[0] @property def dim_domain(self): - """Return number of dimensions of the domain.""" - return self.basis.dim_domain @property def dim_codomain(self): - """Return number of dimensions of the image.""" - return self.basis.dim_codomain @property diff --git a/skfda/representation/basis/_vector_basis.py b/skfda/representation/basis/_vector_basis.py index 1a9fa0fe7..839a8cbbe 100644 --- a/skfda/representation/basis/_vector_basis.py +++ b/skfda/representation/basis/_vector_basis.py @@ -9,9 +9,10 @@ class VectorValued(Basis): r"""Vector-valued basis. - Basis for vector-valued functions constructed from scalar-valued bases. + Basis for :term:`vector-valued functions ` + constructed from scalar-valued bases. - For each dimension in the codomain, it uses a scalar-valued basis + For each dimension in the :term:`codomain`, it uses a scalar-valued basis multiplying each basis by the corresponding unitary vector. Attributes: diff --git a/skfda/representation/evaluator.py b/skfda/representation/evaluator.py index 7cdd3a41e..3740ce987 100644 --- a/skfda/representation/evaluator.py +++ b/skfda/representation/evaluator.py @@ -8,9 +8,10 @@ class Evaluator(ABC): """Structure of an evaluator. An evaluator defines how to evaluate points of a functional object, it - can be used as extrapolator to evaluate points outside the domain range or - as interpolation in a :class:`FDataGrid`. The corresponding examples of - Interpolation and Extrapolation shows the basic usage of this class. + can be used as extrapolator to evaluate points outside the :term:`domain` + range or as interpolation in a :class:`FDataGrid`. The corresponding + examples of Interpolation and Extrapolation shows the basic usage of + this class. 
The evaluator is called internally by :func:`evaluate`. diff --git a/skfda/representation/extrapolation.py b/skfda/representation/extrapolation.py index 80aaec35a..fa6a057c3 100644 --- a/skfda/representation/extrapolation.py +++ b/skfda/representation/extrapolation.py @@ -1,6 +1,6 @@ """Module with the extrapolation methods. -Defines methods to evaluate points outside the domain range. +Defines methods to evaluate points outside the :term:`domain` range. """ @@ -10,7 +10,7 @@ class PeriodicExtrapolation(Evaluator): - """Extends the domain range periodically. + """Extends the :term:`domain` range periodically. Examples: @@ -57,7 +57,7 @@ def evaluate(self, fdata, eval_points, *, aligned=True): class BoundaryExtrapolation(Evaluator): - """Extends the domain range using the boundary values. + """Extends the :term:`domain` range using the boundary values. Examples: @@ -142,7 +142,8 @@ def evaluate(self, fdata, eval_points, *, aligned=True): class FillExtrapolation(Evaluator): - """Values outside the domain range will be filled with a fixed value. + """ + Values outside the :term:`domain` range will be filled with a fixed value. Examples: diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py index 9280ee8c0..e0fc2e186 100644 --- a/skfda/representation/grid.py +++ b/skfda/representation/grid.py @@ -255,22 +255,10 @@ def sample_points(self): @property def dim_domain(self): - """Return number of dimensions of the domain. - - Returns: - int: Number of dimensions of the domain. - - """ return len(self.grid_points) @property def dim_codomain(self): - """Return number of dimensions of the image. - - Returns: - int: Number of dimensions of the image. - - """ try: # The dimension of the image is the length of the array that can # be extracted from the data_matrix using all the dimensions of From f41994e197992d8d32d467385b64627984ba52fc Mon Sep 17 00:00:00 2001 From: vnmabus Date: Fri, 6 Nov 2020 16:25:30 +0100 Subject: [PATCH 079/210] Simplify glossary entries. --- docs/glossary.rst | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/docs/glossary.rst b/docs/glossary.rst index e0efe2ee0..502fb2829 100644 --- a/docs/glossary.rst +++ b/docs/glossary.rst @@ -36,23 +36,22 @@ General Concepts functional data A collection of :term:`functional data objects`. - Usually represented by a :class:`FData` object. + Represented by a :class:`~skfda.representation.FData` object. functional data object functional data objects functional object functional objects An object of study of Functional Data Analysis. - It is a function or map between two sets, often subsets of powers of - :math:`\mathbb{R}`. - Usually represented by a :class:`FData` object of length 1, but in - some cases regular Python :term:`callables <python:callable>` are - also accepted. + It is a function between :math:`\mathbb{R}^p` and :math:`\mathbb{R}^q`. + Usually represented by a :class:`~skfda.representation.FData` object of + length 1, but in some cases regular Python + :term:`callables <python:callable>` are also accepted. functional observation functional observations - An observed :term:`functional data object`, usually represented as a - :class:`FData` object of length 1. + An observed :term:`functional data object`, represented as a + :class:`~skfda.representation.FData` object of length 1. image The set of actual output values that a function takes. It must be a (not necessarily strict) subset of the :term:`codomain`. multivariate functional data Often used for :term:`functional data` where each :term:`functional data object` is a :term:`vector-valued function`. multivariate object multivariate objects An object of study of multivariate statistics.
- It is a vector of possibly related variables, usually represented + It is a vector of possibly related variables, represented as a :term:`sklearn:1d array`. operator From ff4521c554b58607877b4618601eaade48cddf18 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Sat, 14 Nov 2020 01:27:46 +0100 Subject: [PATCH 080/210] Modify Phoneme. --- skfda/datasets/_real_datasets.py | 34 +++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/skfda/datasets/_real_datasets.py b/skfda/datasets/_real_datasets.py index 97a0462b0..d3b82b4c5 100644 --- a/skfda/datasets/_real_datasets.py +++ b/skfda/datasets/_real_datasets.py @@ -1,7 +1,10 @@ import rdata import warnings +from sklearn.utils import Bunch + import numpy as np +import pandas as pd from .. import FDataGrid @@ -153,6 +156,7 @@ def _fetch_fda_usc(name): _param_descr = """ Args: return_X_y: Return only the data and target as a tuple. + as_frame: Return the data in a Pandas Dataframe or Series. """ _phoneme_descr = """ @@ -200,7 +204,7 @@ def _fetch_fda_usc(name): """ -def fetch_phoneme(return_X_y: bool = False): +def fetch_phoneme(return_X_y: bool = False, as_frame: bool = False): """ Load the phoneme dataset. @@ -225,16 +229,28 @@ def fetch_phoneme(return_X_y: bool = False): argument_names=("frequency (kHz)",), coordinate_names=("log-periodogram",)) + if as_frame: + frame = pd.DataFrame({"log-periodogram": curves, + "phoneme": sound}) + curves = frame.iloc[:, 0] + target = frame.iloc[:, 1] + meta = pd.Series(speaker, name="speaker") + else: + target = sound.codes + meta = np.array([speaker]).T + if return_X_y: - return curves, sound + return curves, target else: - return {"data": curves, - "target": sound.codes, - "target_names": sound.categories.tolist(), - "target_feature_names": ["sound"], - "meta": np.array([speaker]).T, - "meta_feature_names": ["speaker"], - "DESCR": DESCR} + return Bunch( + data=curves, + target=target, + frame=frame, + target_names=sound.categories.tolist(), + target_feature_names=["sound"], + meta=meta, + meta_feature_names=["speaker"], + DESCR=DESCR) if hasattr(fetch_phoneme, "__doc__"): # docstrings can be stripped off From 09734e8e6fc83a8477796b2db7b49c09bcde76f5 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Sat, 14 Nov 2020 03:25:22 +0100 Subject: [PATCH 081/210] Update Berkeley Growth. 
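These dataset updates all follow one pattern; a sketch of the intended ``as_frame`` consumption, shown with the phoneme fetcher modified above (illustrative only: the pandas containers are whatever the fetcher builds, and ``as_frame`` requires pandas):

import skfda

# Bunch interface: curves plus the integer-coded phoneme labels.
bunch = skfda.datasets.fetch_phoneme()
curves, target = bunch["data"], bunch["target"]

# New as_frame interface: curves and labels wrapped in pandas objects,
# with the speaker metadata exposed separately through bunch["meta"].
X, y = skfda.datasets.fetch_phoneme(return_X_y=True, as_frame=True)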
--- skfda/datasets/_real_datasets.py | 42 +++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/skfda/datasets/_real_datasets.py b/skfda/datasets/_real_datasets.py index d3b82b4c5..50ba9be9c 100644 --- a/skfda/datasets/_real_datasets.py +++ b/skfda/datasets/_real_datasets.py @@ -229,9 +229,13 @@ def fetch_phoneme(return_X_y: bool = False, as_frame: bool = False): argument_names=("frequency (kHz)",), coordinate_names=("log-periodogram",)) + curve_name = "log-periodogram" + target_name = "phoneme" + frame = None + if as_frame: - frame = pd.DataFrame({"log-periodogram": curves, - "phoneme": sound}) + frame = pd.DataFrame({curve_name: curves, + target_name: sound}) curves = frame.iloc[:, 0] target = frame.iloc[:, 1] meta = pd.Series(speaker, name="speaker") @@ -246,10 +250,11 @@ def fetch_phoneme(return_X_y: bool = False, as_frame: bool = False): data=curves, target=target, frame=frame, - target_names=sound.categories.tolist(), - target_feature_names=["sound"], + categories={target_name: sound.categories.tolist()}, + feature_names=[curve_name], + target_names=[target_name], meta=meta, - meta_feature_names=["speaker"], + meta_names=["speaker"], DESCR=DESCR) @@ -271,7 +276,7 @@ def fetch_phoneme(return_X_y: bool = False, as_frame: bool = False): """ -def fetch_growth(return_X_y: bool = False): +def fetch_growth(return_X_y: bool = False, as_frame: bool = False): """ Load the Berkeley Growth Study dataset. @@ -289,22 +294,35 @@ def fetch_growth(return_X_y: bool = False): females = data["hgtf"].T males = data["hgtm"].T + sex = np.array([0] * males.shape[0] + [1] * females.shape[0]) curves = FDataGrid(data_matrix=np.concatenate((males, females), axis=0), grid_points=ages, dataset_name="Berkeley Growth Study", argument_names=("age",), coordinate_names=("height",)) - sex = np.array([0] * males.shape[0] + [1] * females.shape[0]) + curve_name = "height" + target_name = "sex" + target_categories = ["male", "female"] + frame = None + + if as_frame: + sex = pd.Categorical.from_codes(sex, categories=target_categories) + frame = pd.DataFrame({curve_name: curves, + target_name: sex}) + curves = frame.iloc[:, 0] + sex = frame.iloc[:, 1] if return_X_y: return curves, sex else: - return {"data": curves, - "target": sex, - "target_names": ["male", "female"], - "target_feature_names": ["sex"], - "DESCR": DESCR} + return Bunch(data=curves, + target=sex, + frame=frame, + categories={target_name: target_categories}, + feature_names=[curve_name], + target_names=[target_name], + DESCR=DESCR) if hasattr(fetch_growth, "__doc__"): # docstrings can be stripped off From b525ed82cd8954c1ce8a723fe9414f852d84022b Mon Sep 17 00:00:00 2001 From: vnmabus Date: Sat, 14 Nov 2020 16:53:04 +0100 Subject: [PATCH 082/210] Update tecator. --- skfda/datasets/_real_datasets.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/skfda/datasets/_real_datasets.py b/skfda/datasets/_real_datasets.py index 50ba9be9c..eb361769a 100644 --- a/skfda/datasets/_real_datasets.py +++ b/skfda/datasets/_real_datasets.py @@ -366,7 +366,7 @@ def fetch_growth(return_X_y: bool = False, as_frame: bool = False): """ -def fetch_tecator(return_X_y: bool = False): +def fetch_tecator(return_X_y: bool = False, as_frame: bool = False): """ Load the Tecator dataset. 
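The growth fetcher now mirrors that pattern; a short usage sketch based on the Bunch fields assembled in the patch above:

import skfda

dataset = skfda.datasets.fetch_growth()
curves = dataset["data"]                # FDataGrid of heights
sex = dataset["target"]                 # 0/1 codes
names = dataset["categories"]["sex"]    # ['male', 'female']

# The codes and category names plug directly into plotting.
curves.plot(group=sex, group_names=names)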
@@ -381,16 +381,28 @@ def fetch_tecator(return_X_y: bool = False): data = raw_dataset["tecator"] curves = data['absorp.fdata'] - target = data['y'].values - target_feature_names = data['y'].columns.values.tolist() + target = data['y'] + feature_names = [curves.dataset_name] + target_names = target.columns.values.tolist() + + frame = None + + if as_frame: + curves = pd.Series(curves, name=curves.dataset_name) + frame = pd.concat([curves, target], axis=1) + else: + target = target.values if return_X_y: return curves, target else: - return {"data": curves, - "target": target, - "target_feature_names": target_feature_names, - "DESCR": DESCR} + return Bunch(data=curves, + target=target, + frame=frame, + categories={}, + feature_names=feature_names, + target_names=target_names, + DESCR=DESCR) if hasattr(fetch_tecator, "__doc__"): # docstrings can be stripped off From f07a3e8e39ecb468a150d3131e732dc270e31744 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Sat, 14 Nov 2020 17:41:59 +0100 Subject: [PATCH 083/210] Update medflies. --- skfda/datasets/_real_datasets.py | 37 +++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/skfda/datasets/_real_datasets.py b/skfda/datasets/_real_datasets.py index eb361769a..14d41d8f8 100644 --- a/skfda/datasets/_real_datasets.py +++ b/skfda/datasets/_real_datasets.py @@ -381,14 +381,14 @@ def fetch_tecator(return_X_y: bool = False, as_frame: bool = False): data = raw_dataset["tecator"] curves = data['absorp.fdata'] - target = data['y'] - feature_names = [curves.dataset_name] + target = data['y'].rename(columns=str.lower) + feature_name = curves.dataset_name.lower() target_names = target.columns.values.tolist() frame = None if as_frame: - curves = pd.Series(curves, name=curves.dataset_name) + curves = pd.Series(curves, name=feature_name) frame = pd.concat([curves, target], axis=1) else: target = target.values @@ -400,7 +400,7 @@ def fetch_tecator(return_X_y: bool = False, as_frame: bool = False): target=target, frame=frame, categories={}, - feature_names=feature_names, + feature_names=[feature_name], target_names=target_names, DESCR=DESCR) @@ -446,7 +446,7 @@ def fetch_tecator(return_X_y: bool = False, as_frame: bool = False): """ -def fetch_medflies(return_X_y: bool = False): +def fetch_medflies(return_X_y: bool = False, as_frame: bool = False): """ Load the Medflies dataset, where the flies are separated in two classes according to their longevity. 
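Analogously, a sketch of consuming the reworked tecator interface (field names as constructed in the hunks above; the frame is only assembled when ``as_frame=True``):

import skfda

bunch = skfda.datasets.fetch_tecator(as_frame=True)
frame = bunch["frame"]      # absorbance curves plus fat, water and protein

X, y = skfda.datasets.fetch_tecator(return_X_y=True, as_frame=True)
low_fat = y["fat"] < 20     # boolean mask over the chemical content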
@@ -464,18 +464,31 @@ def fetch_medflies(return_X_y: bool = False): curves = data[0] unique = np.unique(data[1], return_inverse=True) - target_names = [unique[0][1], unique[0][0]] + target_categories = [unique[0][1], unique[0][0]] target = 1 - unique[1] - target_feature_names = ["lifetime"] + curve_name = 'eggs' + target_name = "lifetime" + + frame = None + + if as_frame: + target = pd.Categorical.from_codes( + target, categories=target_categories) + frame = pd.DataFrame({curve_name: curves, + target_name: target}) + curves = frame.iloc[:, 0] + target = frame.iloc[:, 1] if return_X_y: return curves, target else: - return {"data": curves, - "target": target, - "target_names": target_names, - "target_feature_names": target_feature_names, - "DESCR": DESCR} + return Bunch(data=curves, + target=target, + frame=frame, + categories={target_name: target_categories}, + feature_names=[curve_name], + target_names=[target_name], + DESCR=DESCR) if hasattr(fetch_medflies, "__doc__"): # docstrings can be stripped off From b31c538c4a95adfd32b734397cc5f8dc96fdcd28 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Sun, 15 Nov 2020 02:58:47 +0100 Subject: [PATCH 084/210] Update gait. --- skfda/datasets/_real_datasets.py | 50 ++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/skfda/datasets/_real_datasets.py b/skfda/datasets/_real_datasets.py index 14d41d8f8..6995cf364 100644 --- a/skfda/datasets/_real_datasets.py +++ b/skfda/datasets/_real_datasets.py @@ -236,7 +236,7 @@ def fetch_phoneme(return_X_y: bool = False, as_frame: bool = False): if as_frame: frame = pd.DataFrame({curve_name: curves, target_name: sound}) - curves = frame.iloc[:, 0] + curves = frame.iloc[:, [0]] target = frame.iloc[:, 1] meta = pd.Series(speaker, name="speaker") else: @@ -310,7 +310,7 @@ def fetch_growth(return_X_y: bool = False, as_frame: bool = False): sex = pd.Categorical.from_codes(sex, categories=target_categories) frame = pd.DataFrame({curve_name: curves, target_name: sex}) - curves = frame.iloc[:, 0] + curves = frame.iloc[:, [0]] sex = frame.iloc[:, 1] if return_X_y: @@ -388,7 +388,7 @@ def fetch_tecator(return_X_y: bool = False, as_frame: bool = False): frame = None if as_frame: - curves = pd.Series(curves, name=feature_name) + curves = pd.DataFrame({feature_name: curves}) frame = pd.concat([curves, target], axis=1) else: target = target.values @@ -476,7 +476,7 @@ def fetch_medflies(return_X_y: bool = False, as_frame: bool = False): target, categories=target_categories) frame = pd.DataFrame({curve_name: curves, target_name: target}) - curves = frame.iloc[:, 0] + curves = frame.iloc[:, [0]] target = frame.iloc[:, 1] if return_X_y: @@ -634,9 +634,9 @@ def fetch_aemet(return_X_y: bool = False): Hubert, Mia. (2006). Robustness and Outlier Detection in Chemometrics. Critical Reviews in Analytical Chemistry. 36. 221-242. 10.1080/10408340600969403. - .. [HuRS2015] Hubert, Mia & Rousseeuw, Peter & Segaert, Pieter. (2015). - Multivariate functional outlier detection. Statistical Methods and - Applications. 24. 177-202. 10.1007/s10260-015-0297-8. + .. [HuRS2015] Hubert, Mia & Rousseeuw, Peter & Segaert, Pieter. + (2015). Multivariate functional outlier detection. Statistical + Methods and Applications. 24. 177-202. 10.1007/s10260-015-0297-8. 
""" @@ -650,7 +650,7 @@ def fetch_octane(return_X_y: bool = False): """ DESCR = _octane_descr - # octane file from mrfDepth R package + # octane file from mrfDepth R package raw_dataset = fetch_cran("octane", "mrfDepth", version="1.0.11") data = raw_dataset['octane'][..., 0].T @@ -685,7 +685,7 @@ def fetch_octane(return_X_y: bool = False): fetch_octane.__doc__ += _octane_descr + _param_descr _gait_descr = """ - Angles formed by the hip and knee of each of 39 children over each boy + Angles formed by the hip and knee of each of 39 children over each boy gait cycle. References: @@ -697,7 +697,7 @@ def fetch_octane(return_X_y: bool = False): """ -def fetch_gait(return_X_y: bool = False): +def fetch_gait(return_X_y: bool = False, as_frame: bool = False): """ Load the GAIT dataset. @@ -713,25 +713,33 @@ def fetch_gait(return_X_y: bool = False): data_matrix = np.asarray(data) data_matrix = np.transpose(data_matrix, axes=(1, 0, 2)) grid_points = np.asarray(data.coords.get('dim_0'), np.float64) + sample_names = np.asarray(data.coords.get('dim_1')) + feature_name = 'gait' curves = FDataGrid(data_matrix=data_matrix, grid_points=grid_points, - dataset_name="GAIT", - argument_names=("Time (proportion of gait cycle)",), - coordinate_names=("Hip angle (degrees)", - "Knee angle (degrees)")) + dataset_name=feature_name, + sample_names=sample_names, + argument_names=("time (proportion of gait cycle)",), + coordinate_names=("hip angle (degrees)", + "knee angle (degrees)")) - meta_names, meta = np.unique(np.asarray(data.coords.get('dim_1')), - return_inverse=True) + frame = None + + if as_frame: + curves = pd.DataFrame({feature_name: curves}) + frame = curves if return_X_y: return curves, None else: - return {"data": curves, - "meta": meta, - "meta_names": meta_names, - "meta_feature_names": ["boys"], - "DESCR": DESCR} + return Bunch(data=curves, + target=None, + frame=frame, + categories={}, + feature_names=[feature_name], + target_names=[], + DESCR=DESCR) if hasattr(fetch_gait, "__doc__"): # docstrings can be stripped off From 39c5e9c7d9898e959c197920de747c8b8050eda8 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Sun, 15 Nov 2020 18:16:25 +0100 Subject: [PATCH 085/210] Update octane. --- skfda/datasets/_real_datasets.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/skfda/datasets/_real_datasets.py b/skfda/datasets/_real_datasets.py index 6995cf364..3a81def14 100644 --- a/skfda/datasets/_real_datasets.py +++ b/skfda/datasets/_real_datasets.py @@ -641,7 +641,7 @@ def fetch_aemet(return_X_y: bool = False): """ -def fetch_octane(return_X_y: bool = False): +def fetch_octane(return_X_y: bool = False, as_frame: bool = False): """Load near infrared spectra of gasoline samples. This function fetchs the octane dataset from the R package 'mrfDepth' @@ -663,22 +663,36 @@ def fetch_octane(return_X_y: bool = False): # "The octane data set contains six outliers (25, 26, 36–39) to which # alcohol was added". 
- target = np.zeros(len(data), dtype=int) + target = np.zeros(len(data), dtype=np.bool_) target[24] = target[25] = target[35:39] = 1 # Outliers 1 + target_name = "is outlier" + + curve_name = "absorbances" curves = FDataGrid(data, grid_points=grid_points, - dataset_name="Octane", + dataset_name="octane", argument_names=("wavelength (nm)",), coordinate_names=("absorbances",)) + frame = None + + if as_frame: + frame = pd.DataFrame({curve_name: curves, + target_name: target}) + curves = frame.iloc[:, [0]] + target = frame.iloc[:, 1] + if return_X_y: return curves, target else: - return {"data": curves, - "target": target, - "target_names": ['inliner', 'outlier'], - "DESCR": DESCR} + return Bunch(data=curves, + target=target, + frame=frame, + categories={}, + feature_names=[curve_name], + target_names=[target_name], + DESCR=DESCR) if hasattr(fetch_octane, "__doc__"): # docstrings can be stripped off From 7d6f048f2017de2ac9a294f35180d66b5dd4b52a Mon Sep 17 00:00:00 2001 From: vnmabus Date: Sun, 15 Nov 2020 21:12:37 +0100 Subject: [PATCH 086/210] Update weather. --- skfda/datasets/_real_datasets.py | 69 ++++++++++++++++++++++++-------- 1 file changed, 52 insertions(+), 17 deletions(-) diff --git a/skfda/datasets/_real_datasets.py b/skfda/datasets/_real_datasets.py index 3a81def14..8494be264 100644 --- a/skfda/datasets/_real_datasets.py +++ b/skfda/datasets/_real_datasets.py @@ -507,7 +507,7 @@ def fetch_medflies(return_X_y: bool = False, as_frame: bool = False): """ -def fetch_weather(return_X_y: bool = False): +def fetch_weather(return_X_y: bool = False, as_frame: bool = False): """ Load the Canadian Weather dataset. @@ -531,30 +531,65 @@ def fetch_weather(return_X_y: bool = False): grid_points=np.arange(0, 365) + 0.5, domain_range=(0, 365), dataset_name="Canadian Weather", + sample_names=data["place"], argument_names=("day",), coordinate_names=("temperature (ºC)", "precipitation (mm.)")) - target_names, target = np.unique(data["region"], return_inverse=True) + curve_name = "daily averages" + target_name = "region" + target_categories, target = np.unique(data["region"], return_inverse=True) - if return_X_y: - return curves, target + frame = None + + if as_frame: + target = pd.Categorical.from_codes( + target, categories=target_categories) + frame = pd.DataFrame({ + curve_name: curves, + "place": data["place"], + "province": data["province"], + "latitude": np.asarray(data["coordinates"])[:, 0], + "longitude": np.asarray(data["coordinates"])[:, 1], + "index": data["geogindex"], + "monthly temperatures": np.asarray( + data["monthlyTemp"]).T.tolist(), + "monthly precipitation": np.asarray( + data["monthlyPrecip"]).T.tolist(), + target_name: target}) + X = frame.iloc[:, :-1] + target = frame.iloc[:, 1] + feature_names = list(X.columns.values) + + additional_dict = {} else: - return {"data": curves, - "target": target, - "target_names": target_names, - "target_feature_names": ["region"], - "meta": list(zip(data["place"], data["province"], - np.asarray(data["coordinates"])[0], - np.asarray(data["coordinates"])[1], + feature_names = [curve_name] + X = curves + meta = np.array(list(zip(data["place"], + data["province"], + np.asarray(data["coordinates"])[:, 0], + np.asarray(data["coordinates"])[:, 1], data["geogindex"], - np.asarray(data["monthlyTemp"]), - np.asarray(data["monthlyPrecip"]))), + np.asarray(data["monthlyTemp"]).T, + np.asarray(data["monthlyPrecip"]).T))) + meta_names = ["place", "province", "latitude", "longitude", + "index", "monthly temperatures", + "monthly precipitation"], - 
"meta_names": ["place", "province", "latitude", "longitude", - "ind", "monthlyTemp", "monthlyPrecip"], - "meta_feature_names": ["location"], - "DESCR": DESCR} + additional_dict = {"meta": meta, + "meta_names": meta_names} + + if return_X_y: + return X, target + else: + return Bunch(data=X, + target=target, + frame=frame, + categories={target_name: target_categories}, + feature_names=feature_names, + target_names=[target_name], + **additional_dict, + DESCR=DESCR) if hasattr(fetch_weather, "__doc__"): # docstrings can be stripped off From 91364cdaeaa450c4e4215e26d026ef50a26b3133 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Sun, 15 Nov 2020 23:48:18 +0100 Subject: [PATCH 087/210] Update aemet. --- skfda/datasets/_real_datasets.py | 46 +++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/skfda/datasets/_real_datasets.py b/skfda/datasets/_real_datasets.py index 8494be264..339246bb9 100644 --- a/skfda/datasets/_real_datasets.py +++ b/skfda/datasets/_real_datasets.py @@ -610,7 +610,7 @@ def fetch_weather(return_X_y: bool = False, as_frame: bool = False): """ -def fetch_aemet(return_X_y: bool = False): +def fetch_aemet(return_X_y: bool = False, as_frame: bool = False): """ Load the Spanish Weather dataset. @@ -629,22 +629,48 @@ def fetch_aemet(return_X_y: bool = False): data_matrix[:, :, 2] = data["wind.speed"].data_matrix[:, :, 0] curves = data["temp"].copy(data_matrix=data_matrix, - dataset_name="AEMET", + dataset_name="aemet", + sample_names=data["df"].iloc[:, 1], argument_names=("day",), coordinate_names=("temperature (ºC)", "logprecipitation", "wind speed (m/s)")) + curve_name = "daily averages" + df_names = ["index", "place", "province", "altitude", + "longitude", "latitude"] + df_indexes = np.array([0, 1, 2, 3, 6, 7]) + + frame = None + + if as_frame: + frame = pd.DataFrame({ + curve_name: curves, + **{n: data["df"].iloc[:, d] for (n, d) in zip(df_names, df_indexes)}}) + X = frame + feature_names = list(X.columns.values) + + additional_dict = {} + + else: + feature_names = [curve_name] + X = curves + meta = np.asarray(data["df"])[:, df_indexes] + meta_names = df_names + additional_dict = {"meta": meta, + "meta_names": meta_names} + if return_X_y: - return curves, None + return X, None else: - return {"data": curves, - "meta": np.asarray(data["df"])[:, - np.array([0, 1, 2, 3, 6, 7])], - "meta_names": ["ind", "place", "province", "altitude", - "longitude", "latitude"], - "meta_feature_names": ["location"], - "DESCR": DESCR} + return Bunch( + data=X, + target=None, + frame=frame, + categories={}, + feature_names=feature_names, + **additional_dict, + DESCR=DESCR) if hasattr(fetch_aemet, "__doc__"): # docstrings can be stripped off From d8760a92a5a809aab5aa6b93575b0f486202bbe3 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 16 Nov 2020 20:21:25 +0100 Subject: [PATCH 088/210] Fix documentation. --- examples/plot_boxplot.py | 8 ++--- examples/plot_clustering.py | 34 ++++++++------------- examples/plot_explore.py | 8 ++--- examples/plot_k_neighbors_classification.py | 16 +++++----- examples/plot_magnitude_shape.py | 9 +++--- skfda/datasets/_real_datasets.py | 2 +- 6 files changed, 34 insertions(+), 43 deletions(-) diff --git a/examples/plot_boxplot.py b/examples/plot_boxplot.py index 03a2abf0a..7e5832f1b 100644 --- a/examples/plot_boxplot.py +++ b/examples/plot_boxplot.py @@ -24,8 +24,8 @@ # CRAN. It contains a FDataGrid with daily temperatures and precipitations, # that is, it has a 2-dimensional image. 
We are interested only in the daily # average temperatures, so we will use the first coordinate. -dataset = datasets.fetch_weather() -fd = dataset["data"] +X, y = datasets.fetch_weather(return_X_y=True, as_frame=True) +fd = X.iloc[:, 0].values fd_temperatures = fd.coordinates[0] ############################################################################## @@ -35,11 +35,11 @@ # Each climate is assigned a color. Defaults to grey. colormap = plt.cm.get_cmap('seismic') -label_names = dataset["target_names"] +label_names = y.values.categories nlabels = len(label_names) label_colors = colormap(np.arange(nlabels) / (nlabels - 1)) -fd_temperatures.plot(group=dataset["target"], +fd_temperatures.plot(group=y.values.codes, group_colors=label_colors, group_names=label_names) diff --git a/examples/plot_clustering.py b/examples/plot_clustering.py index a4f87b57c..f83ca5323 100644 --- a/examples/plot_clustering.py +++ b/examples/plot_clustering.py @@ -12,22 +12,24 @@ # sphinx_gallery_thumbnail_number = 6 -import matplotlib.pyplot as plt -import numpy as np from skfda import datasets from skfda.exploratory.visualization.clustering import ( plot_clusters, plot_cluster_lines, plot_cluster_bars) from skfda.ml.clustering import KMeans, FuzzyCMeans +import matplotlib.pyplot as plt +import numpy as np + ############################################################################## # First, the Canadian Weather dataset is downloaded from the package 'fda' in # CRAN. It contains a FDataGrid with daily temperatures and precipitations, # that is, it has a 2-dimensional image. We are interested only in the daily # average temperatures, so we select the first coordinate function. -dataset = datasets.fetch_weather() -fd = dataset["data"] +X, y = datasets.fetch_weather(return_X_y=True, as_frame=True) +fd = X.iloc[:, 0].values fd_temperatures = fd.coordinates[0] +target = y.values # The desired FDataGrid only contains 10 random samples, so that the example # provides clearer plots. @@ -39,27 +41,15 @@ # according to the target. In this case, it includes the different climates to # which the weather stations belong to. -climate_by_sample = [dataset["target"][i] for i in indices_samples] - -# Note that the samples chosen belong to three of the four possible target -# groups. By coincidence, these three groups correspond to indices 1, 2, 3, -# that is why the indices (´climate_by_sample´) are decremented in 1. In case -# of reproducing the example with other ´indices_samples´ and the four groups -# are not present in the sample, changes should be made in order ´indexer´ -# contains numbers in the interval [0, n_target_groups) and at least, an -# occurrence of each one. -indexer = np.asarray(climate_by_sample) - 1 - -indices_target_groups = np.unique(climate_by_sample) -climates = dataset["target_names"][indices_target_groups] +climates = target[indices_samples].remove_unused_categories() # Assigning the color to each of the groups. 
colormap = plt.cm.get_cmap('tab20b') -n_climates = len(climates) +n_climates = len(climates.categories) climate_colors = colormap(np.arange(n_climates) / (n_climates - 1)) -fd.plot(group=indexer, group_colors=climate_colors, - group_names=climates) +fd.plot(group=climates.codes, group_names=climates.categories, + group_colors=climate_colors) ############################################################################## # The number of clusters is set with the number of climates, in order to see @@ -89,7 +79,7 @@ # Customization of cluster colors and labels in order to match the first image # of raw data. cluster_colors = climate_colors[np.array([0, 2, 1])] -cluster_labels = climates[np.array([0, 2, 1])] +cluster_labels = climates.categories[np.array([0, 2, 1])] plot_clusters(kmeans, fd, cluster_colors=cluster_colors, cluster_labels=cluster_labels) @@ -127,7 +117,7 @@ # to each of the samples in order to identify them. In this example, the # colors are the ones of the first plot, dividing the samples by climate. -colors_by_climate = colormap(indexer / (n_climates - 1)) +colors_by_climate = colormap(climates.codes / (n_climates - 1)) plot_cluster_lines(fuzzy_kmeans, fd, cluster_labels=cluster_labels, sample_colors=colors_by_climate) diff --git a/examples/plot_explore.py b/examples/plot_explore.py index 035d502b5..dbc6b9a94 100644 --- a/examples/plot_explore.py +++ b/examples/plot_explore.py @@ -22,11 +22,9 @@ # # In this example we only want to discriminate between meat with less than 20% # of fat, and meat with a higher fat content. -dataset = skfda.datasets.fetch_tecator() -fd = dataset['data'] -y = dataset['target'] -target_feature_names = dataset['target_feature_names'] -fat = y[:, np.asarray(target_feature_names) == 'Fat'].ravel() +X, y = skfda.datasets.fetch_tecator(return_X_y=True, as_frame=True) +fd = X.iloc[:, 0].values +fat = y['fat'].values ############################################################################## # We will now plot in red samples containing less than 20% of fat and in blue diff --git a/examples/plot_k_neighbors_classification.py b/examples/plot_k_neighbors_classification.py index 296273928..16491bc95 100644 --- a/examples/plot_k_neighbors_classification.py +++ b/examples/plot_k_neighbors_classification.py @@ -8,13 +8,14 @@ # Author: Pablo Marcos Manchón # License: MIT +import skfda +from skfda.ml.classification import KNeighborsClassifier + from sklearn.model_selection import (train_test_split, GridSearchCV, StratifiedShuffleSplit) import matplotlib.pyplot as plt import numpy as np -import skfda -from skfda.ml.classification import KNeighborsClassifier ############################################################################## @@ -31,13 +32,14 @@ # The following figure shows the growth curves grouped by sex. # # Loads dataset -data = skfda.datasets.fetch_growth() -X = data['data'] -y = data['target'] -class_names = data['target_names'] +X, y = skfda.datasets.fetch_growth(return_X_y=True, as_frame=True) +X = X.iloc[:, 0].values +y = y.values # Plot samples grouped by sex -X.plot(group=y, group_names=class_names) +X.plot(group=y.codes, group_names=y.categories) + +y = y.codes ############################################################################## diff --git a/examples/plot_magnitude_shape.py b/examples/plot_magnitude_shape.py index 68555c74c..9b4e19751 100644 --- a/examples/plot_magnitude_shape.py +++ b/examples/plot_magnitude_shape.py @@ -24,9 +24,10 @@ # CRAN. 
It contains a FDataGrid with daily temperatures and precipitations, # that is, it has a 2-dimensional image. We are interested only in the daily # average temperatures, so we extract the first coordinate. -dataset = datasets.fetch_weather() -fd = dataset["data"] +X, y = datasets.fetch_weather(return_X_y=True, as_frame=True) +fd = X.iloc[:, 0].values fd_temperatures = fd.coordinates[0] +target = y.values ############################################################################## # The data is plotted to show the curves we are working with. They are divided @@ -35,11 +36,11 @@ # Each climate is assigned a color. Defaults to grey. colormap = plt.cm.get_cmap('seismic') -label_names = dataset["target_names"] +label_names = target.categories nlabels = len(label_names) label_colors = colormap(np.arange(nlabels) / (nlabels - 1)) -fd_temperatures.plot(group=dataset["target"], +fd_temperatures.plot(group=target.codes, group_colors=label_colors, group_names=label_names) diff --git a/skfda/datasets/_real_datasets.py b/skfda/datasets/_real_datasets.py index 339246bb9..a0bcfb116 100644 --- a/skfda/datasets/_real_datasets.py +++ b/skfda/datasets/_real_datasets.py @@ -558,7 +558,7 @@ def fetch_weather(return_X_y: bool = False, as_frame: bool = False): data["monthlyPrecip"]).T.tolist(), target_name: target}) X = frame.iloc[:, :-1] - target = frame.iloc[:, 1] + target = frame.iloc[:, -1] feature_names = list(X.columns.values) additional_dict = {} From acf9d5738abe606239dddb1698e41bc3ad4e28b6 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Mon, 16 Nov 2020 21:05:06 +0100 Subject: [PATCH 089/210] Maximum depth classification --- skfda/ml/classification/__init__.py | 3 +- skfda/ml/classification/maximum_depth.py | 102 +++++++++++++++++++++++ 2 files changed, 103 insertions(+), 2 deletions(-) create mode 100644 skfda/ml/classification/maximum_depth.py diff --git a/skfda/ml/classification/__init__.py b/skfda/ml/classification/__init__.py index 7a2b9e3bb..5201231d0 100644 --- a/skfda/ml/classification/__init__.py +++ b/skfda/ml/classification/__init__.py @@ -1,4 +1,3 @@ - - from ..._neighbors import (KNeighborsClassifier, RadiusNeighborsClassifier, NearestCentroid) +from .maximum_depth import MaximumDepth diff --git a/skfda/ml/classification/maximum_depth.py b/skfda/ml/classification/maximum_depth.py new file mode 100644 index 000000000..2ddd4a218 new file mode 100644 index 000000000..7066dc27c --- /dev/null +++ b/skfda/ml/classification/maximum_depth.py @@ -0,0 +1,102 @@ +"""Maximum depth for supervised classification.""" +import numpy as np +import copy + +from sklearn.base import ClassifierMixin, BaseEstimator +from sklearn.preprocessing import LabelEncoder +from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted + +from skfda.exploratory.depth import * + +class MaximumDepth(BaseEstimator, ClassifierMixin): + """Maximum depth classifier for functional data. + + Test samples are classified to the class where they are deeper. + + Parameters + ---------- + depth_class : callable, (default + :class:`IntegratedDepth <skfda.exploratory.depth.IntegratedDepth>`) + The depth class to use when calculating the depth of a test + sample in a class. See the documentation of the depths module + for a list of available depths. By default it is the one used + by Fraiman and Muniz.
Examples -------- Firstly, we will create a toy dataset with 2 classes + + >>> from skfda.datasets import make_sinusoidal_process + >>> fd1 = make_sinusoidal_process(phase_std=.25, random_state=0) + >>> fd2 = make_sinusoidal_process(phase_mean=1.8, error_std=0., + ... phase_std=.25, random_state=0) + >>> fd = fd1.concatenate(fd2) + >>> y = 15*[0] + 15*[1] + + We will fit a Maximum depth classifier + + >>> from skfda.ml.classification import MaximumDepth + >>> depth = MaximumDepth() + >>> depth.fit(fd, y) + MaximumDepth() + + We can predict the class of new samples + + >>> depth.predict(fd[::2]) # Predict labels for even samples + array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) + + See also + -------- + :class:`~skfda.ml.classification.KNeighborsClassifier` + :class:`~skfda.ml.classification.RadiusNeighborsClassifier` + + + """ + + def __init__(self, depth_class=IntegratedDepth()): + """Initialize the classifier.""" + self.depth_class = depth_class + + def fit(self, X, y): + """Fit the model using X as training data and y as target values. + + Args: + X (:class:`FDataGrid`): FDataGrid with the training data. + y (array-like or sparse matrix): Target values of + shape = [n_samples] or [n_samples, n_outputs]. + + """ + check_classification_targets(y) + + le = LabelEncoder() + y_ind = le.fit_transform(y) + self.classes_ = classes = le.classes_ + n_classes = classes.size + if n_classes < 2: + raise ValueError(f'The number of classes has to be greater than' + f' one; got {n_classes} class') + + self.distributions_ = [] + for cur_class in range(0, n_classes): + distribution = self.depth_class.fit(X[y_ind == cur_class]) + self.distributions_.append(copy.deepcopy(distribution)) + + return self + + def predict(self, X): + """Predict the class labels for the provided data. + + Args: + X (:class:`FDataGrid`): FDataGrid with the test samples. + + Returns: + (np.array): y : array of shape [n_samples] or + [n_samples, n_outputs] with class labels for each data sample.
+ + """ + sklearn_check_is_fitted(self, 'distributions_') + + depths = [distribution.predict(X) for distribution in self.distributions_] + return np.array([self.classes_[i] for i in np.argmax(depths, axis=0)]) \ No newline at end of file From 4c83ce759586fb949bbaaafefa61b10dd7217160 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Mon, 16 Nov 2020 21:34:22 +0100 Subject: [PATCH 090/210] Add newline at the end of file --- skfda/ml/classification/maximum_depth.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/ml/classification/maximum_depth.py b/skfda/ml/classification/maximum_depth.py index 7066dc27c..6394c9075 100644 --- a/skfda/ml/classification/maximum_depth.py +++ b/skfda/ml/classification/maximum_depth.py @@ -99,4 +99,4 @@ def predict(self, X): sklearn_check_is_fitted(self, 'distributions_') depths = [distribution.predict(X) for distribution in self.distributions_] - return np.array([self.classes_[i] for i in np.argmax(depths, axis=0)]) \ No newline at end of file + return np.array([self.classes_[i] for i in np.argmax(depths, axis=0)]) From cebacbf64949d94d9483618bd87023c29c7896d2 Mon Sep 17 00:00:00 2001 From: pedrorponga <32200195+pedrorponga@users.noreply.github.com> Date: Tue, 17 Nov 2020 13:28:15 +0100 Subject: [PATCH 091/210] Update skfda/ml/classification/maximum_depth.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Ramos Carreño --- skfda/ml/classification/maximum_depth.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/ml/classification/maximum_depth.py b/skfda/ml/classification/maximum_depth.py index 6394c9075..b2783b3f7 100644 --- a/skfda/ml/classification/maximum_depth.py +++ b/skfda/ml/classification/maximum_depth.py @@ -36,7 +36,7 @@ class MaximumDepth(BaseEstimator, ClassifierMixin): We will fit a Maximum depth classifier >>> from skfda.ml.classification import MaximumDepth - >>> depth = MaximumDepth() + >>> clf= MaximumDepth() >>> depth.fit(fd, y) MaximumDepth() From dc50ee566578697be03ea96e4689aa169521c9e1 Mon Sep 17 00:00:00 2001 From: pedrorponga <32200195+pedrorponga@users.noreply.github.com> Date: Tue, 17 Nov 2020 13:28:35 +0100 Subject: [PATCH 092/210] Update skfda/ml/classification/maximum_depth.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Ramos Carreño --- skfda/ml/classification/maximum_depth.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/ml/classification/maximum_depth.py b/skfda/ml/classification/maximum_depth.py index b2783b3f7..da987601b 100644 --- a/skfda/ml/classification/maximum_depth.py +++ b/skfda/ml/classification/maximum_depth.py @@ -96,7 +96,7 @@ def predict(self, X): [n_samples, n_outputs] with class labels for each data sample. 
""" - sklearn_check_is_fitted(self, 'distributions_') + sklearn_check_is_fitted(self) depths = [distribution.predict(X) for distribution in self.distributions_] return np.array([self.classes_[i] for i in np.argmax(depths, axis=0)]) From de526637ceaf54364708f5312a2daeb96b4eb90a Mon Sep 17 00:00:00 2001 From: pedrorponga <32200195+pedrorponga@users.noreply.github.com> Date: Tue, 17 Nov 2020 13:30:09 +0100 Subject: [PATCH 093/210] Update skfda/ml/classification/maximum_depth.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Ramos Carreño --- skfda/ml/classification/maximum_depth.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/ml/classification/maximum_depth.py b/skfda/ml/classification/maximum_depth.py index da987601b..ab2c5375f 100644 --- a/skfda/ml/classification/maximum_depth.py +++ b/skfda/ml/classification/maximum_depth.py @@ -16,7 +16,7 @@ class MaximumDepth(BaseEstimator, ClassifierMixin): Parameters ---------- - depth_class : callable, (default + depth_method : callable, (default :class:`IntegratedDepth `) The depth class to use when calculating the depth of a test samples in a class. See the documentation of the depths module From 9eeca4a1a49be80595f2f2086dbe3b22c57feb12 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Tue, 17 Nov 2020 20:32:21 +0100 Subject: [PATCH 094/210] Update maximum_depth after comments --- skfda/ml/classification/maximum_depth.py | 43 ++++++++++++------------ 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/skfda/ml/classification/maximum_depth.py b/skfda/ml/classification/maximum_depth.py index ab2c5375f..572281c20 100644 --- a/skfda/ml/classification/maximum_depth.py +++ b/skfda/ml/classification/maximum_depth.py @@ -1,8 +1,7 @@ """Maximum depth for supervised classification.""" import numpy as np -import copy -from sklearn.base import ClassifierMixin, BaseEstimator +from sklearn.base import ClassifierMixin, BaseEstimator, clone from sklearn.preprocessing import LabelEncoder from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted @@ -24,38 +23,38 @@ class MaximumDepth(BaseEstimator, ClassifierMixin): by Fraiman and Muniz. Examples -------- - Firstly, we will create a toy dataset with 2 classes + Firstly, we will import and split the Berkeley Growth Study dataset - >>> from skfda.datasets import make_sinusoidal_process - >>> fd1 = make_sinusoidal_process(phase_std=.25, random_state=0) - >>> fd2 = make_sinusoidal_process(phase_mean=1.8, error_std=0., - ... phase_std=.25, random_state=0) - >>> fd = fd1.concatenate(fd2) - >>> y = 15*[0] + 15*[1] + >>> from skfda.datasets import fetch_growth + >>> from sklearn.model_selection import train_test_split + >>> dataset = fetch_growth() + >>> fd = dataset['data'] + >>> y = dataset['target'] + >>> X_train, X_test, y_train, y_test = train_test_split( + ... 
fd, y, test_size=0.25, stratify=y, random_state=0) We will fit a Maximum depth classifier >>> from skfda.ml.classification import MaximumDepth - >>> clf= MaximumDepth() - >>> depth.fit(fd, y) + >>> clf = MaximumDepth() + >>> clf.fit(X_train, y_train) MaximumDepth() We can predict the class of new samples - >>> depth.predict(fd[::2]) # Predict labels for even samples - array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) + >>> clf.predict(X_test) # Predict labels for even samples + array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, + 1, 1]) See also -------- - :class:`~skfda.ml.classification.KNeighborsClassifier` - :class:`~skfda.ml.classification.RadiusNeighborsClassifier` - + :class:`~skfda.ml.classification.DD-plot` """ - def __init__(self, depth_class=IntegratedDepth()): + def __init__(self, depth_method=IntegratedDepth()): """Initialize the classifier.""" - self.depth_class = depth_class + self.depth_method = depth_method def fit(self, X, y): """Fit the model using X as training data and y as target values. @@ -78,10 +77,10 @@ def fit(self, X, y): raise ValueError(f'The number of classes has to be greater than' f' one; got {n_classes} class') - self.distributions_ = [] + self.distributions_ = [None]*n_classes for cur_class in range(0, n_classes): - distribution = self.depth_class.fit(X[y_ind == cur_class]) - self.distributions_.append(copy.deepcopy(distribution)) + distribution = clone(self.depth_method).fit(X[y_ind == cur_class]) + self.distributions_[cur_class] = distribution return self @@ -99,4 +98,4 @@ def predict(self, X): sklearn_check_is_fitted(self) depths = [distribution.predict(X) for distribution in self.distributions_] - return np.array([self.classes_[i] for i in np.argmax(depths, axis=0)]) + return self.classes_[np.argmax(depths, axis=0)] From 6aedfd3b694bd443d9b84f835a273f913c767315 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Tue, 17 Nov 2020 20:34:29 +0100 Subject: [PATCH 095/210] Small typo --- skfda/ml/classification/maximum_depth.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/ml/classification/maximum_depth.py b/skfda/ml/classification/maximum_depth.py index 572281c20..9867b89b1 100644 --- a/skfda/ml/classification/maximum_depth.py +++ b/skfda/ml/classification/maximum_depth.py @@ -42,7 +42,7 @@ class MaximumDepth(BaseEstimator, ClassifierMixin): We can predict the class of new samples - >>> clf.predict(X_test) # Predict labels for even samples + >>> clf.predict(X_test) # Predict labels for test samples array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1]) From b696ee2c308da2911b193a70b6ef04832495baef Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Tue, 17 Nov 2020 21:37:09 +0100 Subject: [PATCH 096/210] Private function _fit_init for preprocessing --- skfda/_utils/__init__.py | 3 ++- skfda/_utils/_utils.py | 16 ++++++++++++++++ skfda/ml/classification/maximum_depth.py | 13 ++----------- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/skfda/_utils/__init__.py b/skfda/_utils/__init__.py index ae95c7872..26ceedeb8 100644 --- a/skfda/_utils/__init__.py +++ b/skfda/_utils/__init__.py @@ -7,4 +7,5 @@ _reshape_eval_points, _evaluate_grid, nquad_vec, _FDataCallable, _pairwise_commutative, - _domain_range, _check_array_key) + _domain_range, _check_array_key, + _fit_init) diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index 9241dea3b..c706b8487 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -447,3 +447,19 @@ def _check_estimator(estimator): 
instance = estimator() check_get_params_invariance(name, instance) check_set_params(name, instance) + +def _fit_init(y): + from sklearn.utils.multiclass import check_classification_targets + from sklearn.preprocessing import LabelEncoder + + check_classification_targets(y) + + le = LabelEncoder() + y_ind = le.fit_transform(y) + + classes = le.classes_ + n_classes = classes.size + if n_classes < 2: + raise ValueError(f'The number of classes has to be greater than' + f' one; got {n_classes} class') + return classes, n_classes, y_ind diff --git a/skfda/ml/classification/maximum_depth.py b/skfda/ml/classification/maximum_depth.py index 9867b89b1..045485908 100644 --- a/skfda/ml/classification/maximum_depth.py +++ b/skfda/ml/classification/maximum_depth.py @@ -2,11 +2,10 @@ import numpy as np from sklearn.base import ClassifierMixin, BaseEstimator, clone -from sklearn.preprocessing import LabelEncoder -from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted from skfda.exploratory.depth import * +from skfda._utils import _fit_init class MaximumDepth(BaseEstimator, ClassifierMixin): """Maximum depth classifier for functional data. @@ -67,15 +66,7 @@ def fit(self, X, y): shape = [n_samples] or [n_samples, n_outputs]. """ - check_classification_targets(y) - - le = LabelEncoder() - y_ind = le.fit_transform(y) - self.classes_ = classes = le.classes_ - n_classes = classes.size - if n_classes < 2: - raise ValueError(f'The number of classes has to be greater than' - f' one; got {n_classes} class') + self.classes_, n_classes, y_ind = _fit_init(y) self.distributions_ = [None]*n_classes for cur_class in range(0, n_classes): From 5e4bc6bcc24e618ea2becada883323be33c29aed Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 18 Nov 2020 00:43:52 +0100 Subject: [PATCH 097/210] _classifier_fit_init --- skfda/_utils/__init__.py | 2 +- skfda/_utils/_utils.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/skfda/_utils/__init__.py b/skfda/_utils/__init__.py index 26ceedeb8..365bce5ae 100644 --- a/skfda/_utils/__init__.py +++ b/skfda/_utils/__init__.py @@ -8,4 +8,4 @@ _evaluate_grid, nquad_vec, _FDataCallable, _pairwise_commutative, _domain_range, _check_array_key, - _fit_init) + _classifier_fit_init) diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index c706b8487..613595366 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -448,7 +448,7 @@ def _check_estimator(estimator): check_get_params_invariance(name, instance) check_set_params(name, instance) -def _fit_init(y): +def _classifier_fit_init(y): from sklearn.utils.multiclass import check_classification_targets from sklearn.preprocessing import LabelEncoder @@ -458,8 +458,8 @@ def _fit_init(y): y_ind = le.fit_transform(y) classes = le.classes_ - n_classes = classes.size - if n_classes < 2: + + if classes.size < 2: raise ValueError(f'The number of classes has to be greater than' - f' one; got {n_classes} class') - return classes, n_classes, y_ind + f' one; got {classes.size} class') + return classes, y_ind From 4a7ace67e9e70e3b87e3d094ce2d61f3290d352b Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 18 Nov 2020 00:44:29 +0100 Subject: [PATCH 098/210] Update maximum_depth.py --- skfda/ml/classification/maximum_depth.py | 30 ++++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/skfda/ml/classification/maximum_depth.py b/skfda/ml/classification/maximum_depth.py index 
045485908..bebc53c69 100644 --- a/skfda/ml/classification/maximum_depth.py +++ b/skfda/ml/classification/maximum_depth.py @@ -1,11 +1,12 @@ """Maximum depth for supervised classification.""" + import numpy as np from sklearn.base import ClassifierMixin, BaseEstimator, clone from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted from skfda.exploratory.depth import * -from skfda._utils import _fit_init +from skfda._utils import _classifier_fit_init class MaximumDepth(BaseEstimator, ClassifierMixin): """Maximum depth classifier for functional data. @@ -44,10 +45,14 @@ class MaximumDepth(BaseEstimator, ClassifierMixin): >>> clf.predict(X_test) # Predict labels for test samples array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1]) + + Finally, we calculate the mean accuracy for the test data + + >>> clf.score(X_test, y_test) + 0.7916666666666666 See also -------- - :class:`~skfda.ml.classification.DD-plot` """ @@ -59,19 +64,14 @@ def fit(self, X, y): """Fit the model using X as training data and y as target values. Args: - X (:class:`FDataGrid`, array_matrix): Training data. FDataGrid - with the training data or array matrix with shape - [n_samples, n_samples] if metric='precomputed'. - y (array-like or sparse matrix): Target values of - shape = [n_samples] or [n_samples, n_outputs]. + X (:class:`FDataGrid`): FDataGrid with the training data. + y (array-like): Target values of shape = [n_samples]. """ - self.classes_, n_classes, y_ind = _fit_init(y) - - self.distributions_ = [None]*n_classes - for cur_class in range(0, n_classes): - distribution = clone(self.depth_method).fit(X[y_ind == cur_class]) - self.distributions_[cur_class] = distribution + self.classes_, y_ind = _classifier_fit_init(y) + + self.distributions_ = [clone(self.depth_method).fit( + X[y_ind == cur_class]) for cur_class in range(self.classes_.size)] return self @@ -82,8 +82,8 @@ def predict(self, X): X (:class:`FDataGrid`): FDataGrid with the test samples. Returns: - (np.array): y : array of shape [n_samples] or - [n_samples, n_outputs] with class labels for each data sample. + y (np.array): array of shape [n_samples] with class labels + for each data sample. 
""" sklearn_check_is_fitted(self) From 7742a14ce2676db29c24be141014d6fefeaaf409 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 18 Nov 2020 00:52:33 +0100 Subject: [PATCH 099/210] Style --- skfda/_utils/_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index 613595366..7c56b0e30 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -448,6 +448,7 @@ def _check_estimator(estimator): check_get_params_invariance(name, instance) check_set_params(name, instance) + def _classifier_fit_init(y): from sklearn.utils.multiclass import check_classification_targets from sklearn.preprocessing import LabelEncoder From 98ef3cc83a099479d33f28f9311645459f06f321 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 18 Nov 2020 17:38:24 +0100 Subject: [PATCH 100/210] Rename _classifier_fit_init to _classifier_get_classes --- skfda/_utils/__init__.py | 2 +- skfda/_utils/_utils.py | 2 +- skfda/ml/classification/maximum_depth.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/skfda/_utils/__init__.py b/skfda/_utils/__init__.py index 365bce5ae..9b78fb479 100644 --- a/skfda/_utils/__init__.py +++ b/skfda/_utils/__init__.py @@ -8,4 +8,4 @@ _evaluate_grid, nquad_vec, _FDataCallable, _pairwise_commutative, _domain_range, _check_array_key, - _classifier_fit_init) + _classifier_get_classes) diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index 7c56b0e30..e18104784 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -449,7 +449,7 @@ def _check_estimator(estimator): check_set_params(name, instance) -def _classifier_fit_init(y): +def _classifier_get_classes(y): from sklearn.utils.multiclass import check_classification_targets from sklearn.preprocessing import LabelEncoder diff --git a/skfda/ml/classification/maximum_depth.py b/skfda/ml/classification/maximum_depth.py index bebc53c69..108694dec 100644 --- a/skfda/ml/classification/maximum_depth.py +++ b/skfda/ml/classification/maximum_depth.py @@ -6,7 +6,7 @@ from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted from skfda.exploratory.depth import * -from skfda._utils import _classifier_fit_init +from skfda._utils import _classifier_get_classes class MaximumDepth(BaseEstimator, ClassifierMixin): """Maximum depth classifier for functional data. @@ -68,7 +68,7 @@ def fit(self, X, y): y (array-like): Target values of shape = [n_samples]. """ - self.classes_, y_ind = _classifier_fit_init(y) + self.classes_, y_ind = _classifier_get_classes(y) self.distributions_ = [clone(self.depth_method).fit( X[y_ind == cur_class]) for cur_class in range(self.classes_.size)] From b8aa4f8ad9c4fa3ed87bfb7c1e3ba06651a5ff77 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 18 Nov 2020 18:11:03 +0100 Subject: [PATCH 101/210] Google docstrings --- skfda/ml/classification/maximum_depth.py | 77 ++++++++++++------------ 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/skfda/ml/classification/maximum_depth.py b/skfda/ml/classification/maximum_depth.py index 108694dec..3330a1b6c 100644 --- a/skfda/ml/classification/maximum_depth.py +++ b/skfda/ml/classification/maximum_depth.py @@ -13,46 +13,43 @@ class MaximumDepth(BaseEstimator, ClassifierMixin): Test samples are classified to the class where they are deeper. - Parameters - ---------- - depth_method : callable, (default - :class:`IntegratedDepth `) - The depth class to use when calculating the depth of a test - samples in a class. 
See the documentation of the depths module - for a list of available depths. By default it is the one used - by Fraiman and Muniz. - Examples - -------- - Firstly, we will import and split the Berkeley Growth Study dataset - - >>> from skfda.datasets import fetch_growth - >>> from sklearn.model_selection import train_test_split - >>> dataset = fetch_growth() - >>> fd = dataset['data'] - >>> y = dataset['target'] - >>> X_train, X_test, y_train, y_test = train_test_split( - ... fd, y, test_size=0.25, stratify=y, random_state=0) - - We will fit a Maximum depth classifier - - >>> from skfda.ml.classification import MaximumDepth - >>> clf = MaximumDepth() - >>> clf.fit(X_train, y_train) - MaximumDepth() - - We can predict the class of new samples - - >>> clf.predict(X_test) # Predict labels for test samples - array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, - 1, 1]) - - Finally, we calculate the mean accuracy for the test data - - >>> clf.score(X_test, y_test) - 0.7916666666666666 - - See also - -------- + Parameters: + depth_method (callable, (default + :class:`IntegratedDepth `)): + The depth class to use when calculating the depth of a test + samples in a class. See the documentation of the depths module + for a list of available depths. By default it is the one used + by Fraiman and Muniz. + Examples: + Firstly, we will import and split the Berkeley Growth Study dataset + + >>> from skfda.datasets import fetch_growth + >>> from sklearn.model_selection import train_test_split + >>> dataset = fetch_growth() + >>> fd = dataset['data'] + >>> y = dataset['target'] + >>> X_train, X_test, y_train, y_test = train_test_split( + ... fd, y, test_size=0.25, stratify=y, random_state=0) + + We will fit a Maximum depth classifier + + >>> from skfda.ml.classification import MaximumDepth + >>> clf = MaximumDepth() + >>> clf.fit(X_train, y_train) + MaximumDepth() + + We can predict the class of new samples + + >>> clf.predict(X_test) # Predict labels for test samples + array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, + 1, 1]) + + Finally, we calculate the mean accuracy for the test data + + >>> clf.score(X_test, y_test) + 0.7916666666666666 + + See also: """ From 1cba7fc904ddce7670d96d501541f6f6e3ad560d Mon Sep 17 00:00:00 2001 From: vnmabus Date: Wed, 18 Nov 2020 18:30:03 +0100 Subject: [PATCH 102/210] Fix SimplicialDepth min value. 
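The `min` property removed below pinned the lower bound of SimplicialDepth
at 1/2, but the pairwise formula kept as context in the hunk does not
support that: a point lying strictly below (or above) every sample makes
one of the `comb` terms equal to `total_pairs`, so the depth evaluates
to 0. A minimal univariate sketch, for intuition only (the helper name
`univariate_simplicial_depth` is made up here and is not part of the
codebase):

    from scipy.special import comb

    def univariate_simplicial_depth(x, sample):
        # Same formula as SimplicialDepth.predict, for a single point.
        n = len(sample)
        total_pairs = comb(n, 2)
        num_strictly_below = sum(s < x for s in sample)
        num_strictly_above = sum(s > x for s in sample)
        return (total_pairs
                - comb(num_strictly_below, 2)
                - comb(num_strictly_above, 2)) / total_pairs

    sample = [0.0, 1.0, 2.0, 3.0]
    print(univariate_simplicial_depth(-10.0, sample))  # 0.0, below the old bound
    print(univariate_simplicial_depth(1.5, sample))    # ~0.67, near the center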
--- skfda/exploratory/depth/multivariate.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/skfda/exploratory/depth/multivariate.py b/skfda/exploratory/depth/multivariate.py index e607bb881..3930526f2 100644 --- a/skfda/exploratory/depth/multivariate.py +++ b/skfda/exploratory/depth/multivariate.py @@ -226,10 +226,6 @@ def predict(self, X): return (total_pairs - comb(num_strictly_below, 2) - comb(num_strictly_above, 2)) / total_pairs - @property - def min(self): - return 1 / 2 - class OutlyingnessBasedDepth(Depth): r""" From 809f3ac96eb2b93c88ef2f63c084409cb1d80bc1 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 18 Nov 2020 21:18:35 +0100 Subject: [PATCH 103/210] Final changes to maximum_depth_classifier --- skfda/ml/classification/__init__.py | 2 +- ...m_depth.py => maximum_depth_classifier.py} | 28 +++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) rename skfda/ml/classification/{maximum_depth.py => maximum_depth_classifier.py} (77%) diff --git a/skfda/ml/classification/__init__.py b/skfda/ml/classification/__init__.py index 5201231d0..f6657e92a 100644 --- a/skfda/ml/classification/__init__.py +++ b/skfda/ml/classification/__init__.py @@ -1,3 +1,3 @@ from ..._neighbors import (KNeighborsClassifier, RadiusNeighborsClassifier, NearestCentroid) -from .maximum_depth import MaximumDepth +from .maximum_depth_classifier import MaximumDepthClassifier diff --git a/skfda/ml/classification/maximum_depth.py b/skfda/ml/classification/maximum_depth_classifier.py similarity index 77% rename from skfda/ml/classification/maximum_depth.py rename to skfda/ml/classification/maximum_depth_classifier.py index 3330a1b6c..dc9ad53e9 100644 --- a/skfda/ml/classification/maximum_depth.py +++ b/skfda/ml/classification/maximum_depth_classifier.py @@ -5,21 +5,20 @@ from sklearn.base import ClassifierMixin, BaseEstimator, clone from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted -from skfda.exploratory.depth import * -from skfda._utils import _classifier_get_classes +from ...exploratory.depth import Depth, ModifiedBandDepth +from ..._utils import _classifier_get_classes -class MaximumDepth(BaseEstimator, ClassifierMixin): +class MaximumDepthClassifier(BaseEstimator, ClassifierMixin): """Maximum depth classifier for functional data. Test samples are classified to the class where they are deeper. Parameters: - depth_method (callable, (default - :class:`IntegratedDepth `)): - The depth class to use when calculating the depth of a test + depth_method (Depth, default + :class:`ModifiedBandDepth `): + The depth class to use when calculating the depth of a test samples in a class. See the documentation of the depths module - for a list of available depths. By default it is the one used - by Fraiman and Muniz. + for a list of available depths. By default it is ModifiedBandDepth. 
Examples: Firstly, we will import and split the Berkeley Growth Study dataset @@ -33,27 +32,27 @@ class MaximumDepth(BaseEstimator, ClassifierMixin): We will fit a Maximum depth classifier - >>> from skfda.ml.classification import MaximumDepth - >>> clf = MaximumDepth() + >>> from skfda.ml.classification import MaximumDepthClassifier + >>> clf = MaximumDepthClassifier() >>> clf.fit(X_train, y_train) - MaximumDepth() + MaximumDepthClassifier() We can predict the class of new samples >>> clf.predict(X_test) # Predict labels for test samples - array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, + array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1]) Finally, we calculate the mean accuracy for the test data >>> clf.score(X_test, y_test) - 0.7916666666666666 + 0.875 See also: """ - def __init__(self, depth_method=IntegratedDepth()): + def __init__(self, depth_method: Depth = ModifiedBandDepth()): """Initialize the classifier.""" self.depth_method = depth_method @@ -86,4 +85,5 @@ def predict(self, X): sklearn_check_is_fitted(self) depths = [distribution.predict(X) for distribution in self.distributions_] + return self.classes_[np.argmax(depths, axis=0)] From 595e321b5912c6df13002dd93280a27f5379d5ac Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Thu, 19 Nov 2020 00:02:16 +0100 Subject: [PATCH 104/210] pycodestyle (Pep8) --- .../classification/maximum_depth_classifier.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/skfda/ml/classification/maximum_depth_classifier.py b/skfda/ml/classification/maximum_depth_classifier.py index dc9ad53e9..438467205 100644 --- a/skfda/ml/classification/maximum_depth_classifier.py +++ b/skfda/ml/classification/maximum_depth_classifier.py @@ -8,6 +8,7 @@ from ...exploratory.depth import Depth, ModifiedBandDepth from ..._utils import _classifier_get_classes + class MaximumDepthClassifier(BaseEstimator, ClassifierMixin): """Maximum depth classifier for functional data. @@ -40,9 +41,9 @@ class MaximumDepthClassifier(BaseEstimator, ClassifierMixin): We can predict the class of new samples >>> clf.predict(X_test) # Predict labels for test samples - array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, - 1, 1]) - + array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1]) + Finally, we calculate the mean accuracy for the test data >>> clf.score(X_test, y_test) @@ -65,12 +66,12 @@ def fit(self, X, y): """ self.classes_, y_ind = _classifier_get_classes(y) - + self.distributions_ = [clone(self.depth_method).fit( X[y_ind == cur_class]) for cur_class in range(self.classes_.size)] return self - + def predict(self, X): """Predict the class labels for the provided data. 
@@ -84,6 +85,7 @@ def predict(self, X): """ sklearn_check_is_fitted(self) - depths = [distribution.predict(X) for distribution in self.distributions_] - + depths = [distribution.predict(X) + for distribution in self.distributions_] + return self.classes_[np.argmax(depths, axis=0)] From 1ac4640e3667d1633a21a7b58a4c23f740d3bc05 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Thu, 19 Nov 2020 00:09:36 +0100 Subject: [PATCH 105/210] Typo --- skfda/ml/classification/maximum_depth_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/ml/classification/maximum_depth_classifier.py b/skfda/ml/classification/maximum_depth_classifier.py index 438467205..8474a1f74 100644 --- a/skfda/ml/classification/maximum_depth_classifier.py +++ b/skfda/ml/classification/maximum_depth_classifier.py @@ -18,7 +18,7 @@ class MaximumDepthClassifier(BaseEstimator, ClassifierMixin): depth_method (Depth, default :class:`ModifiedBandDepth `): The depth class to use when calculating the depth of a test - samples in a class. See the documentation of the depths module + sample in a class. See the documentation of the depths module for a list of available depths. By default it is ModifiedBandDepth. Examples: Firstly, we will import and split the Berkeley Growth Study dataset From 62b097e250909c0dcfd0bc429af593fd29574031 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Fri, 20 Nov 2020 12:56:45 +0100 Subject: [PATCH 106/210] pydocstyle (Pep257) --- skfda/ml/classification/maximum_depth_classifier.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/skfda/ml/classification/maximum_depth_classifier.py b/skfda/ml/classification/maximum_depth_classifier.py index 8474a1f74..fcd4b360e 100644 --- a/skfda/ml/classification/maximum_depth_classifier.py +++ b/skfda/ml/classification/maximum_depth_classifier.py @@ -49,8 +49,6 @@ class MaximumDepthClassifier(BaseEstimator, ClassifierMixin): >>> clf.score(X_test, y_test) 0.875 - See also: - """ def __init__(self, depth_method: Depth = ModifiedBandDepth()): From 8e5545a961569b82c17b3e3822b7c26d71d87939 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Tue, 24 Nov 2020 09:56:03 +0100 Subject: [PATCH 107/210] Optimize computation of inner product matrix. --- skfda/misc/_math.py | 67 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 16 deletions(-) diff --git a/skfda/misc/_math.py b/skfda/misc/_math.py index 51b1edda5..db4987356 100644 --- a/skfda/misc/_math.py +++ b/skfda/misc/_math.py @@ -144,7 +144,7 @@ def cumsum(fdatagrid): @multimethod.multidispatch -def inner_product(arg1, arg2, **kwargs): +def inner_product(arg1, arg2, *, matrix=False, **kwargs): r"""Return the usual (:math:`L_2`) inner product. 
Calculates the inner product between matching samples in two @@ -248,24 +248,31 @@ def inner_product(arg1, arg2, **kwargs): """ if callable(arg1): - return _inner_product_integrate(arg1, arg2) + return _inner_product_integrate(arg1, arg2, matrix=matrix) else: - return (arg1 * arg2).sum(axis=-1) + return (np.einsum('n...,m...->nm...', arg1, arg2).sum(axis=-1) + if matrix else (arg1 * arg2).sum(axis=-1)) @inner_product.register -def inner_product_fdatagrid(arg1: FDataGrid, arg2: FDataGrid): +def inner_product_fdatagrid(arg1: FDataGrid, arg2: FDataGrid, *, matrix=False): if not np.array_equal(arg1.grid_points, arg2.grid_points): raise ValueError("Sample points for both objects must be equal") - integrand = arg1.data_matrix * arg2.data_matrix + d1 = arg1.data_matrix + d2 = arg2.data_matrix - for s in arg1.grid_points: + if matrix: + integrand = np.einsum('n...,m...->nm...', d1, d2) + else: + integrand = d1 * d2 + + for s in arg1.grid_points[::-1]: integrand = scipy.integrate.simps(integrand, x=s, - axis=1) + axis=-2) return np.sum(integrand, axis=-1) @@ -277,6 +284,7 @@ def inner_product_fdatagrid(arg1: FDataGrid, arg2: FDataGrid): def inner_product_fdatabasis(arg1: Union[FDataBasis, Basis], arg2: Union[FDataBasis, Basis], *, + matrix=False, inner_product_matrix=None, force_numerical=False): @@ -300,7 +308,7 @@ def inner_product_fdatabasis(arg1: Union[FDataBasis, Basis], # it is usually worthwhile same_basis = arg1.basis == arg2.basis - # The number of operations is less usinf the matrix + # The number of operations is less using the matrix n_ops_best_with_matrix = max( arg1.n_samples, arg2.n_samples) > arg1.n_basis * arg2.n_basis @@ -312,24 +320,48 @@ def inner_product_fdatabasis(arg1: Union[FDataBasis, Basis], if inner_product_matrix is None: inner_product_matrix = arg1.basis.inner_product_matrix(arg2.basis) - return (arg1.coefficients @ - inner_product_matrix * - arg2.coefficients).sum(axis=-1) + coef1 = arg1.coefficients + coef2 = arg2.coefficients + + if matrix: + return np.einsum('nb,bc,mc->nm', + coef1, inner_product_matrix, coef2) + else: + return (coef1 @ + inner_product_matrix * + coef2).sum(axis=-1) + else: - return _inner_product_integrate(arg1, arg2) + return _inner_product_integrate(arg1, arg2, matrix=matrix) -def _inner_product_integrate(arg1, arg2): +def _inner_product_integrate(arg1, arg2, *, matrix=False): if not np.array_equal(arg1.domain_range, arg2.domain_range): raise ValueError("Domain range for both objects must be equal") + def integrand(*args): + f1 = arg1([*args])[:, 0, :] + f2 = arg2([*args])[:, 0, :] + + if matrix: + ret = np.einsum('n...,m...->nm...', f1, f2) + ret = ret.reshape((-1,) + ret.shape[2:]) + return ret + else: + return f1 * f2 + integral = nquad_vec( - lambda *args: arg1([*args])[:, 0, :] * arg2([*args])[:, 0, :], + integrand, arg1.domain_range) - return np.sum(integral, axis=-1) + summation = np.sum(integral, axis=-1) + + if matrix: + summation = summation.reshape((len(arg1), len(arg2))) + + return summation def inner_product_matrix(arg1, arg2=None, **kwargs): @@ -350,4 +382,7 @@ def inner_product_matrix(arg1, arg2=None, **kwargs): if isinstance(arg2, Basis): arg2 = arg2.to_basis() - return _pairwise_commutative(inner_product, arg1, arg2, **kwargs) + if arg2 is None: + arg2 = arg1 + + return inner_product(arg1, arg2, matrix=True, **kwargs) From 913bbef49798c6e01dbcba1aae687f00f35bfecb Mon Sep 17 00:00:00 2001 From: vnmabus Date: Tue, 24 Nov 2020 09:56:43 +0100 Subject: [PATCH 108/210] Add first version of geometric mean. 
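The geometric median added here is the point minimizing the sum of L2
distances to the samples. It is computed by a Weiszfeld-type fixed-point
iteration in which every iterate is a weighted combination of the samples,
so the whole loop can run on the Gram matrix of pairwise inner products
without building intermediate functional objects. A minimal sketch of the
same iteration on plain vectors, for intuition only (this simplified
version just stops when an iterate lands exactly on a sample, instead of
the zero-distance reweighting used in the implementation):

    import numpy as np

    def geometric_median_vectors(X, rtol=1e-5, atol=1e-8, max_iter=100):
        median = X.mean(axis=0)  # start from the ordinary mean
        for _ in range(max_iter):
            distances = np.linalg.norm(X - median, axis=1)
            if np.any(distances == 0):  # iterate hit a sample point
                return median
            weights = (1 / distances) / np.sum(1 / distances)
            new_median = weights @ X
            if np.allclose(median, new_median, rtol=rtol, atol=atol):
                return new_median
            median = new_median
        return median

    X = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [10.0, 10.0]])
    # The outlier drags the mean to (2.75, 2.75); the median stays
    # near the three clustered points.
    print(geometric_median_vectors(X))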
--- skfda/exploratory/stats/__init__.py | 3 ++- skfda/exploratory/stats/_stats.py | 30 +++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/skfda/exploratory/stats/__init__.py b/skfda/exploratory/stats/__init__.py index d81e0ade3..e795b513c 100644 --- a/skfda/exploratory/stats/__init__.py +++ b/skfda/exploratory/stats/__init__.py @@ -1 +1,2 @@ -from ._stats import mean, var, gmean, cov, depth_based_median, trim_mean +from ._stats import (mean, var, gmean, cov, + depth_based_median, trim_mean, geometric_median) diff --git a/skfda/exploratory/stats/_stats.py b/skfda/exploratory/stats/_stats.py index 9fe2d3ce1..b59e68f0e 100644 --- a/skfda/exploratory/stats/_stats.py +++ b/skfda/exploratory/stats/_stats.py @@ -1,5 +1,7 @@ """Functional data descriptive statistics. """ +import numpy as np + from ..depth import ModifiedBandDepth @@ -93,6 +95,34 @@ def depth_based_median(fdatagrid, depth_method=ModifiedBandDepth()): return fdatagrid[indices_descending_depth[0]] +def geometric_median(fdata, rtol=1.e-5, atol=1.e-8): + + from ...misc import inner_product_matrix + + gram = inner_product_matrix(fdata) + identity = np.eye(fdata.n_samples) + weights = np.full(fdata.n_samples, 1 / fdata.n_samples) + prod_matrix = identity - weights + distances = np.einsum('ln,nn,nl->l', prod_matrix.T, gram, prod_matrix)**0.5 + + while True: + zero_distances = (distances == 0) + n_zeros = np.sum(zero_distances) + weights_new = ((1 / distances) / np.sum(1 / distances) if n_zeros == 0 + else (1 / n_zeros) * zero_distances) + + if np.allclose(weights, weights_new, rtol=rtol, atol=atol): + return (fdata * weights_new).sum() + + prod_matrix = identity - weights_new + + np.einsum('ln,nn,nl->l', prod_matrix.T, gram, + prod_matrix, out=distances) + distances **= 0.5 + + weights[...] = weights_new + + def trim_mean(fdatagrid, proportiontocut, depth_method=ModifiedBandDepth()): From a56706b107abaa35e3bf13bd00a627ba3727e5e6 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Tue, 24 Nov 2020 18:37:55 +0100 Subject: [PATCH 109/210] Optimize inner product for FDataGrid. 
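The pairwise (`matrix=True`) path for FDataGrid no longer materializes the
full n x m integrand: the Simpson quadrature weights are folded into one
operand, and a single einsum then produces the whole matrix of inner
products. The weights themselves come from integrating each row of an
identity matrix, the same trick used in the diff below. A standalone
sketch on a one-dimensional grid, for illustration only (the grid and the
test functions here are made up):

    import numpy as np
    import scipy.integrate

    grid = np.linspace(0, 1, 101)
    # Composite Simpson weights: integrate each row of the identity.
    weights = scipy.integrate.simps(np.eye(len(grid)), x=grid)

    X = np.sin(np.outer(np.arange(1, 4), np.pi * grid))  # three functions
    Y = np.cos(np.outer(np.arange(1, 3), np.pi * grid))  # two functions

    # Whole matrix of L2 inner products in one contraction.
    gram = np.einsum('nj,j,mj->nm', X, weights, Y)
    print(gram.shape)  # (3, 2)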
--- skfda/misc/_math.py | 27 +++++++++++++++++++++------ tests/test_math.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 6 deletions(-) diff --git a/skfda/misc/_math.py b/skfda/misc/_math.py index db4987356..8af2af8c2 100644 --- a/skfda/misc/_math.py +++ b/skfda/misc/_math.py @@ -264,17 +264,32 @@ def inner_product_fdatagrid(arg1: FDataGrid, arg2: FDataGrid, *, matrix=False): d1 = arg1.data_matrix d2 = arg2.data_matrix + einsum_broadcast_list = (np.arange(d1.ndim - 1) + 2).tolist() + if matrix: - integrand = np.einsum('n...,m...->nm...', d1, d2) + + d1 = np.copy(d1) + + # Perform quadrature inside the einsum + for i, s in enumerate(arg1.grid_points[::-1]): + identity = np.eye(len(s)) + weights = scipy.integrate.simps(identity, x=s) + index = (slice(None),) + (np.newaxis,) * (i + 1) + d1 *= weights[index] + + return np.einsum(d1, [0] + einsum_broadcast_list, + d2, [1] + einsum_broadcast_list, + [0, 1]) + else: integrand = d1 * d2 - for s in arg1.grid_points[::-1]: - integrand = scipy.integrate.simps(integrand, - x=s, - axis=-2) + for s in arg1.grid_points[::-1]: + integrand = scipy.integrate.simps(integrand, + x=s, + axis=-2) - return np.sum(integrand, axis=-1) + return np.sum(integrand, axis=-1) @inner_product.register(FDataBasis, FDataBasis) diff --git a/tests/test_math.py b/tests/test_math.py index 31789f836..d86be7b40 100644 --- a/tests/test_math.py +++ b/tests/test_math.py @@ -1,6 +1,8 @@ import skfda +from skfda._utils import _pairwise_commutative from skfda.representation.basis import Monomial, Tensor, VectorValued import unittest + import numpy as np @@ -69,6 +71,32 @@ def g(y): np.testing.assert_allclose( skfda.misc.inner_product(fd_basis, fd_basis), res, rtol=1e-5) + def test_matrix(self): + + basis = skfda.representation.basis.BSpline(n_basis=12) + + X = skfda.datasets.make_gaussian_process( + n_samples=10, n_features=20, + cov=skfda.misc.covariances.Gaussian(), + random_state=0) + Y = skfda.datasets.make_gaussian_process( + n_samples=10, n_features=20, + cov=skfda.misc.covariances.Gaussian(), + random_state=1) + + X_basis = X.to_basis(basis) + Y_basis = Y.to_basis(basis) + + gram = skfda.misc.inner_product_matrix(X, Y) + gram_basis = skfda.misc.inner_product_matrix(X_basis, Y_basis) + + np.testing.assert_allclose(gram, gram_basis, rtol=1e-2) + + gram_pairwise = _pairwise_commutative( + skfda.misc.inner_product, X, Y) + + np.testing.assert_allclose(gram, gram_pairwise) + if __name__ == "__main__": #import sys;sys.argv = ['', 'Test.testName'] From 137424f8cdd4fd838eef506a7157ddb8e162182b Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 25 Nov 2020 00:33:48 +0100 Subject: [PATCH 110/210] Distance to trimmed means classifier --- skfda/ml/classification/DTM_classifier.py | 107 ++++++++++++++++++ skfda/ml/classification/__init__.py | 1 + .../maximum_depth_classifier.py | 2 + 3 files changed, 110 insertions(+) create mode 100644 skfda/ml/classification/DTM_classifier.py diff --git a/skfda/ml/classification/DTM_classifier.py b/skfda/ml/classification/DTM_classifier.py new file mode 100644 index 000000000..d4e9e5799 --- /dev/null +++ b/skfda/ml/classification/DTM_classifier.py @@ -0,0 +1,107 @@ +"""Distance to trimmed means (DTM) classification.""" + +import numpy as np + +from sklearn.base import ClassifierMixin, BaseEstimator, clone +from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted + +from ...exploratory.depth import Depth, ModifiedBandDepth +from ..._utils import _classifier_get_classes +from ...exploratory.stats import 
trim_mean
+from ...misc.metrics import lp_distance, pairwise_distance
+
+
+class DTMClassifier(BaseEstimator, ClassifierMixin):
+    """Distance to trimmed means (DTM) classification.
+
+    Test samples are classified to the class that minimizes the distance of
+    the observation to the trimmed mean of the group.
+
+    Parameters:
+        proportiontocut (float): Indicates the percentage of functions to
+            remove. It is not easy to determine as it varies from dataset to
+            dataset.
+        depth_method (Depth, default
+            :class:`ModifiedBandDepth `):
+            The depth class used to order the data. See the documentation of
+            the depths module for a list of available depths. By default it
+            is ModifiedBandDepth.
+        metric (function, default
+            :func:`lp_distance `):
+            Distance function between two functional objects. See the
+            documentation of the metrics module for a list of available
+            metrics.
+
+    Examples:
+        Firstly, we will import and split the Berkeley Growth Study dataset
+
+        >>> from skfda.datasets import fetch_growth
+        >>> from sklearn.model_selection import train_test_split
+        >>> dataset = fetch_growth()
+        >>> fd = dataset['data']
+        >>> y = dataset['target']
+        >>> X_train, X_test, y_train, y_test = train_test_split(
+        ...     fd, y, test_size=0.25, stratify=y, random_state=0)
+
+        We will fit a Distance to trimmed means classifier
+
+        >>> from skfda.ml.classification import DTMClassifier
+        >>> clf = DTMClassifier(proportiontocut=0.25)
+        >>> clf.fit(X_train, y_train)
+        DTMClassifier(...)
+
+        We can predict the class of new samples
+
+        >>> clf.predict(X_test) # Predict labels for test samples
+        array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
+               1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1])
+
+        Finally, we calculate the mean accuracy for the test data
+
+        >>> clf.score(X_test, y_test)
+        0.875
+
+    See also:
+        :class:`~skfda.ml.classification.MaximumDepthClassifier`
+    """
+
+    def __init__(self, proportiontocut,
+                 depth_method: Depth = ModifiedBandDepth(),
+                 metric=lp_distance):
+        """Initialize the classifier."""
+        self.proportiontocut = proportiontocut
+        self.depth_method = depth_method
+        self.metric = metric
+
+    def fit(self, X, y):
+        """Fit the model using X as training data and y as target values.
+
+        Args:
+            X (:class:`FDataGrid`): FDataGrid with the training data.
+            y (array-like): Target values of shape = [n_samples].
+        """
+        self.classes_, y_ind = _classifier_get_classes(y)
+
+        self.trim_means_ = [trim_mean(X[y_ind == cur_class],
+                                      self.proportiontocut,
+                                      self.depth_method)
+                            for cur_class in range(self.classes_.size)]
+
+        return self
+
+    def predict(self, X):
+        """Predict the class labels for the provided data.
+
+        Args:
+            X (:class:`FDataGrid`): FDataGrid with the test samples.
+
+        Returns:
+            y (np.array): array of shape [n_samples] with class labels
+                for each data sample.
+
+        """
+        sklearn_check_is_fitted(self)
+
+        distances = [self.metric(X, class_mean)
+                     for class_mean in self.trim_means_]
+
+        return self.classes_[np.argmin(distances, axis=0)]
diff --git a/skfda/ml/classification/__init__.py b/skfda/ml/classification/__init__.py
index f6657e92a..94d7c2bff 100644
--- a/skfda/ml/classification/__init__.py
+++ b/skfda/ml/classification/__init__.py
@@ -1,3 +1,4 @@
 from ..._neighbors import (KNeighborsClassifier, RadiusNeighborsClassifier,
                            NearestCentroid)
 from .maximum_depth_classifier import MaximumDepthClassifier
+from .DTM_classifier import DTMClassifier
diff --git a/skfda/ml/classification/maximum_depth_classifier.py b/skfda/ml/classification/maximum_depth_classifier.py
index fcd4b360e..5f4a482fc 100644
--- a/skfda/ml/classification/maximum_depth_classifier.py
+++ b/skfda/ml/classification/maximum_depth_classifier.py
@@ -49,6 +49,8 @@ class MaximumDepthClassifier(BaseEstimator, ClassifierMixin):
         >>> clf.score(X_test, y_test)
         0.875
 
+    See also:
+        :class:`~skfda.ml.classification.DTMClassifier`
     """
 
     def __init__(self, depth_method: Depth = ModifiedBandDepth()):

From ce3c79b0378d9b2193cd0547f37c979be03c2906 Mon Sep 17 00:00:00 2001
From: pedrorponga
Date: Wed, 25 Nov 2020 00:45:32 +0100
Subject: [PATCH 111/210] pydocstyle and google docstrings style

---
 skfda/_neighbors/classification.py | 457 ++++++++++++++---------------
 skfda/exploratory/stats/_stats.py  |   4 +-
 2 files changed, 218 insertions(+), 243 deletions(-)

diff --git a/skfda/_neighbors/classification.py b/skfda/_neighbors/classification.py
index 169fbf911..7493dd87e 100644
--- a/skfda/_neighbors/classification.py
+++ b/skfda/_neighbors/classification.py
@@ -16,112 +16,105 @@ class KNeighborsClassifier(NeighborsBase, NeighborsMixin, KNeighborsMixin,
                            ClassifierMixin, NeighborsClassifierMixin):
     """Classifier implementing the k-nearest neighbors vote.
 
-    Parameters
-    ----------
-    n_neighbors : int, optional (default = 5)
-        Number of neighbors to use by default for :meth:`kneighbors` queries.
-    weights : str or callable, optional (default = 'uniform')
-        weight function used in prediction. Possible values:
-
-        - 'uniform' : uniform weights. All points in each neighborhood
-          are weighted equally.
-        - 'distance' : weight points by the inverse of their distance.
-          in this case, closer neighbors of a query point will have a
-          greater influence than neighbors which are further away.
-        - [callable] : a user-defined function which accepts an
-          array of distances, and returns an array of the same shape
-          containing the weights.
-
-    algorithm : {'auto', 'ball_tree', 'brute'}, optional
-        Algorithm used to compute the nearest neighbors:
-
-        - 'ball_tree' will use :class:`sklearn.neighbors.BallTree`.
-        - 'brute' will use a brute-force search.
-        - 'auto' will attempt to decide the most appropriate algorithm based on
-          the values passed to :meth:`fit` method.
-
-    leaf_size : int, optional (default = 30)
-        Leaf size passed to BallTree or KDTree. This can affect the
-        speed of the construction and query, as well as the memory
-        required to store the tree. The optimal value depends on the
-        nature of the problem.
-    metric : string or callable, (default
-        :func:`lp_distance `)
-        the distance metric to use for the tree. The default metric is
-        the L2 distance. See the documentation of the metrics module
-        for a list of available metrics.
-    metric_params : dict, optional (default = None)
-        Additional keyword arguments for the metric function.
- n_jobs : int or None, optional (default=None) - The number of parallel jobs to run for neighbors search. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. - Doesn't affect :meth:`fit` method. - multivariate_metric : boolean, optional (default = False) - Indicates if the metric used is a sklearn distance between vectors (see - :class:`~sklearn.neighbors.DistanceMetric`) or a functional metric of - the module `skfda.misc.metrics` if ``False``. - - Examples - -------- - Firstly, we will create a toy dataset with 2 classes - - >>> from skfda.datasets import make_sinusoidal_process - >>> fd1 = make_sinusoidal_process(phase_std=.25, random_state=0) - >>> fd2 = make_sinusoidal_process(phase_mean=1.8, error_std=0., - ... phase_std=.25, random_state=0) - >>> fd = fd1.concatenate(fd2) - >>> y = 15*[0] + 15*[1] - - We will fit a K-Nearest Neighbors classifier - - >>> from skfda.ml.classification import KNeighborsClassifier - >>> neigh = KNeighborsClassifier() - >>> neigh.fit(fd, y) - KNeighborsClassifier(...) - - We can predict the class of new samples - - >>> neigh.predict(fd[::2]) # Predict labels for even samples - array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) - - And the estimated probabilities. - - >>> neigh.predict_proba(fd[0]) # Probabilities of sample 0 - array([[ 1., 0.]]) - - See also - -------- - :class:`~skfda.ml.classification.RadiusNeighborsClassifier` - :class:`~skfda.ml.classification.NearestCentroid` - :class:`~skfda.ml.regression.KNeighborsRegressor` - :class:`~skfda.ml.regression.RadiusNeighborsRegressor` - :class:`~skfda.ml.clustering.NearestNeighbors` - - - Notes - ----- - See Nearest Neighbors in the sklearn online documentation for a discussion - of the choice of ``algorithm`` and ``leaf_size``. - - This class wraps the sklearn classifier - `sklearn.neighbors.KNeighborsClassifier`. - - .. warning:: - Regarding the Nearest Neighbors algorithms, if it is found that two - neighbors, neighbor `k+1` and `k`, have identical distances - but different labels, the results will depend on the ordering of the - training data. - - https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm - + Parameters: + n_neighbors: int, optional (default = 5) + Number of neighbors to use by default for :meth:`kneighbors` queries. + weights: str or callable, optional (default = 'uniform') + weight function used in prediction. Possible values: + + - 'uniform': uniform weights. All points in each neighborhood + are weighted equally. + - 'distance': weight points by the inverse of their distance. + in this case, closer neighbors of a query point will have a + greater influence than neighbors which are further away. + - [callable]: a user-defined function which accepts an + array of distances, and returns an array of the same shape + containing the weights. + + algorithm: {'auto', 'ball_tree', 'brute'}, optional + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`sklearn.neighbors.BallTree`. + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm based on + the values passed to :meth:`fit` method. + + leaf_size: int, optional (default = 30) + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + metric: string or callable, (default + :func:`lp_distance `) + the distance metric to use for the tree. 
The default metric is + the L2 distance. See the documentation of the metrics module + for a list of available metrics. + metric_params: dict, optional (default = None) + Additional keyword arguments for the metric function. + n_jobs: int or None, optional (default=None) + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. + Doesn't affect :meth:`fit` method. + multivariate_metric: boolean, optional (default = False) + Indicates if the metric used is a sklearn distance between vectors (see + :class:`~sklearn.neighbors.DistanceMetric`) or a functional metric of + the module `skfda.misc.metrics` if ``False``. + + Examples: + Firstly, we will create a toy dataset with 2 classes + + >>> from skfda.datasets import make_sinusoidal_process + >>> fd1 = make_sinusoidal_process(phase_std=.25, random_state=0) + >>> fd2 = make_sinusoidal_process(phase_mean=1.8, error_std=0., + ... phase_std=.25, random_state=0) + >>> fd = fd1.concatenate(fd2) + >>> y = 15*[0] + 15*[1] + + We will fit a K-Nearest Neighbors classifier + + >>> from skfda.ml.classification import KNeighborsClassifier + >>> neigh = KNeighborsClassifier() + >>> neigh.fit(fd, y) + KNeighborsClassifier(...) + + We can predict the class of new samples + + >>> neigh.predict(fd[::2]) # Predict labels for even samples + array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) + + And the estimated probabilities. + + >>> neigh.predict_proba(fd[0]) # Probabilities of sample 0 + array([[ 1., 0.]]) + + See also: + :class:`~skfda.ml.classification.RadiusNeighborsClassifier` + :class:`~skfda.ml.classification.NearestCentroid` + :class:`~skfda.ml.regression.KNeighborsRegressor` + :class:`~skfda.ml.regression.RadiusNeighborsRegressor` + :class:`~skfda.ml.clustering.NearestNeighbors` + + Notes: + See Nearest Neighbors in the sklearn online documentation for a discussion + of the choice of ``algorithm`` and ``leaf_size``. + + This class wraps the sklearn classifier + `sklearn.neighbors.KNeighborsClassifier`. + + .. warning:: + Regarding the Nearest Neighbors algorithms, if it is found that two + neighbors, neighbor `k+1` and `k`, have identical distances + but different labels, the results will depend on the ordering of the + training data. + + https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ def __init__(self, n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, metric='l2', metric_params=None, n_jobs=1, multivariate_metric=False): """Initialize the classifier.""" - super().__init__(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, leaf_size=leaf_size, metric=metric, @@ -132,13 +125,12 @@ def _init_estimator(self, sklearn_metric): """Initialize the sklearn K neighbors estimator. Args: - sklearn_metric: (pyfunc or 'precomputed'): Metric compatible with + sklearn_metric (pyfunc or 'precomputed'): Metric compatible with sklearn API or matrix (n_samples, n_samples) with precomputed distances. Returns: Sklearn K Neighbors estimator initialized. - """ from sklearn.neighbors import (KNeighborsClassifier as _KNeighborsClassifier) @@ -157,11 +149,10 @@ def predict_proba(self, X): samples or array (n_query, n_indexed) if metric == 'precomputed'. Returns - p : array of shape = [n_samples, n_classes], or a list of n_outputs + p: array of shape = [n_samples, n_classes], or a list of n_outputs of such arrays if n_outputs > 1. The class probabilities of the input samples. Classes are ordered by lexicographic order. 
- """ self._check_is_fitted() @@ -175,105 +166,99 @@ class RadiusNeighborsClassifier(NeighborsBase, NeighborsMixin, NeighborsClassifierMixin): """Classifier implementing a vote among neighbors within a given radius. - Parameters - ---------- - radius : float, optional (default = 1.0) - Range of parameter space to use by default for :meth:`radius_neighbors` - queries. - weights : str or callable - weight function used in prediction. Possible values: - - - 'uniform' : uniform weights. All points in each neighborhood - are weighted equally. - - 'distance' : weight points by the inverse of their distance. - in this case, closer neighbors of a query point will have a - greater influence than neighbors which are further away. - - [callable] : a user-defined function which accepts an - array of distances, and returns an array of the same shape - containing the weights. - - Uniform weights are used by default. - algorithm : {'auto', 'ball_tree', 'brute'}, optional - Algorithm used to compute the nearest neighbors: - - - 'ball_tree' will use :class:`sklearn.neighbors.BallTree`. - - 'brute' will use a brute-force search. - - 'auto' will attempt to decide the most appropriate algorithm - based on the values passed to :meth:`fit` method. - - leaf_size : int, optional (default = 30) - Leaf size passed to BallTree. This can affect the - speed of the construction and query, as well as the memory - required to store the tree. The optimal value depends on the - nature of the problem. - metric : string or callable, (default - :func:`lp_distance `) - the distance metric to use for the tree. The default metric is - the L2 distance. See the documentation of the metrics module - for a list of available metrics. - outlier_label : int, optional (default = None) - Label, which is given for outlier samples (samples with no - neighbors on given radius). - If set to None, ValueError is raised, when outlier is detected. - metric_params : dict, optional (default = None) - Additional keyword arguments for the metric function. - n_jobs : int or None, optional (default=None) - The number of parallel jobs to run for neighbors search. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. - multivariate_metric : boolean, optional (default = False) - Indicates if the metric used is a sklearn distance between vectors (see - :class:`sklearn.neighbors.DistanceMetric`) or a functional metric of - the module :mod:`skfda.misc.metrics`. - Examples - -------- - Firstly, we will create a toy dataset with 2 classes. - - >>> from skfda.datasets import make_sinusoidal_process - >>> fd1 = make_sinusoidal_process(phase_std=.25, random_state=0) - >>> fd2 = make_sinusoidal_process(phase_mean=1.8, error_std=0., - ... phase_std=.25, random_state=0) - >>> fd = fd1.concatenate(fd2) - >>> y = 15*[0] + 15*[1] - - We will fit a Radius Nearest Neighbors classifier. - - >>> from skfda.ml.classification import RadiusNeighborsClassifier - >>> neigh = RadiusNeighborsClassifier(radius=.3) - >>> neigh.fit(fd, y) - RadiusNeighborsClassifier(...radius=0.3...) - - We can predict the class of new samples. 
- - >>> neigh.predict(fd[::2]) # Predict labels for even samples - array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) - - See also - -------- - :class:`~skfda.ml.classification.KNeighborsClassifier` - :class:`~skfda.ml.classification.NearestCentroid` - :class:`~skfda.ml.regression.KNeighborsRegressor` - :class:`~skfda.ml.regression.RadiusNeighborsRegressor` - :class:`~skfda.ml.clustering.NearestNeighbors` - - - Notes - ----- - See Nearest Neighbors in the sklearn online documentation for a discussion - of the choice of ``algorithm`` and ``leaf_size``. - - This class wraps the sklearn classifier - `sklearn.neighbors.RadiusNeighborsClassifier`. - - https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm - + Parameters: + radius: float, optional (default = 1.0) + Range of parameter space to use by default for :meth:`radius_neighbors` + queries. + weights: str or callable + weight function used in prediction. Possible values: + + - 'uniform': uniform weights. All points in each neighborhood + are weighted equally. + - 'distance': weight points by the inverse of their distance. + in this case, closer neighbors of a query point will have a + greater influence than neighbors which are further away. + - [callable]: a user-defined function which accepts an + array of distances, and returns an array of the same shape + containing the weights. + + Uniform weights are used by default. + algorithm: {'auto', 'ball_tree', 'brute'}, optional + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`sklearn.neighbors.BallTree`. + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + leaf_size: int, optional (default = 30) + Leaf size passed to BallTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + metric: string or callable, (default + :func:`lp_distance `) + the distance metric to use for the tree. The default metric is + the L2 distance. See the documentation of the metrics module + for a list of available metrics. + outlier_label: int, optional (default = None) + Label, which is given for outlier samples (samples with no + neighbors on given radius). + If set to None, ValueError is raised, when outlier is detected. + metric_params: dict, optional (default = None) + Additional keyword arguments for the metric function. + n_jobs: int or None, optional (default=None) + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. + multivariate_metric: boolean, optional (default = False) + Indicates if the metric used is a sklearn distance between vectors (see + :class:`sklearn.neighbors.DistanceMetric`) or a functional metric of + the module :mod:`skfda.misc.metrics`. + + Examples: + Firstly, we will create a toy dataset with 2 classes. + + >>> from skfda.datasets import make_sinusoidal_process + >>> fd1 = make_sinusoidal_process(phase_std=.25, random_state=0) + >>> fd2 = make_sinusoidal_process(phase_mean=1.8, error_std=0., + ... phase_std=.25, random_state=0) + >>> fd = fd1.concatenate(fd2) + >>> y = 15*[0] + 15*[1] + + We will fit a Radius Nearest Neighbors classifier. 
+ + >>> from skfda.ml.classification import RadiusNeighborsClassifier + >>> neigh = RadiusNeighborsClassifier(radius=.3) + >>> neigh.fit(fd, y) + RadiusNeighborsClassifier(...radius=0.3...) + + We can predict the class of new samples. + + >>> neigh.predict(fd[::2]) # Predict labels for even samples + array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) + + See also: + :class:`~skfda.ml.classification.KNeighborsClassifier` + :class:`~skfda.ml.classification.NearestCentroid` + :class:`~skfda.ml.regression.KNeighborsRegressor` + :class:`~skfda.ml.regression.RadiusNeighborsRegressor` + :class:`~skfda.ml.clustering.NearestNeighbors` + + Notes: + See Nearest Neighbors in the sklearn online documentation for a discussion + of the choice of ``algorithm`` and ``leaf_size``. + + This class wraps the sklearn classifier + `sklearn.neighbors.RadiusNeighborsClassifier`. + + https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ def __init__(self, radius=1.0, weights='uniform', algorithm='auto', leaf_size=30, metric='l2', metric_params=None, outlier_label=None, n_jobs=1, multivariate_metric=False): """Initialize the classifier.""" - super().__init__(radius=radius, weights=weights, algorithm=algorithm, leaf_size=leaf_size, metric=metric, metric_params=metric_params, n_jobs=n_jobs, @@ -291,7 +276,6 @@ def _init_estimator(self, sklearn_metric): Returns: Sklearn Radius Neighbors estimator initialized. - """ from sklearn.neighbors import (RadiusNeighborsClassifier as _RadiusNeighborsClassifier) @@ -309,9 +293,8 @@ class NearestCentroid(BaseEstimator, ClassifierMixin): Each class is represented by its centroid, with test samples classified to the class with the nearest centroid. - Parameters - ---------- - metric : callable, (default + Parameters: + metric: callable, (default :func:`lp_distance `) The metric to use when calculating distance between test samples and centroids. See the documentation of the metrics module @@ -326,42 +309,37 @@ class NearestCentroid(BaseEstimator, ClassifierMixin): The function must accept a :class:`FData` with the samples of one class and return a :class:`FData` object with only one sample representing the centroid. - Attributes - ---------- - centroids_ : :class:`FDataGrid` - FDatagrid containing the centroid of each class - Examples - -------- - Firstly, we will create a toy dataset with 2 classes - - >>> from skfda.datasets import make_sinusoidal_process - >>> fd1 = make_sinusoidal_process(phase_std=.25, random_state=0) - >>> fd2 = make_sinusoidal_process(phase_mean=1.8, error_std=0., - ... phase_std=.25, random_state=0) - >>> fd = fd1.concatenate(fd2) - >>> y = 15*[0] + 15*[1] - - We will fit a Nearest centroids classifier - - >>> from skfda.ml.classification import NearestCentroid - >>> neigh = NearestCentroid() - >>> neigh.fit(fd, y) - NearestCentroid(...) 
- - We can predict the class of new samples - - >>> neigh.predict(fd[::2]) # Predict labels for even samples - array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) - - See also - -------- - :class:`~skfda.ml.classification.KNeighborsClassifier` - :class:`~skfda.ml.classification.RadiusNeighborsClassifier` - :class:`~skfda.ml.regression.KNeighborsRegressor` - :class:`~skfda.ml.regression.RadiusNeighborsRegressor` - :class:`~skfda.ml.clustering.NearestNeighbors` - - + Attributes: + centroids_: :class:`FDataGrid` + FDatagrid containing the centroid of each class + Examples: + Firstly, we will create a toy dataset with 2 classes + + >>> from skfda.datasets import make_sinusoidal_process + >>> fd1 = make_sinusoidal_process(phase_std=.25, random_state=0) + >>> fd2 = make_sinusoidal_process(phase_mean=1.8, error_std=0., + ... phase_std=.25, random_state=0) + >>> fd = fd1.concatenate(fd2) + >>> y = 15*[0] + 15*[1] + + We will fit a Nearest centroids classifier + + >>> from skfda.ml.classification import NearestCentroid + >>> neigh = NearestCentroid() + >>> neigh.fit(fd, y) + NearestCentroid(...) + + We can predict the class of new samples + + >>> neigh.predict(fd[::2]) # Predict labels for even samples + array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) + + See also: + :class:`~skfda.ml.classification.KNeighborsClassifier` + :class:`~skfda.ml.classification.RadiusNeighborsClassifier` + :class:`~skfda.ml.regression.KNeighborsRegressor` + :class:`~skfda.ml.regression.RadiusNeighborsRegressor` + :class:`~skfda.ml.clustering.NearestNeighbors` """ def __init__(self, metric='l2', mean='mean'): @@ -378,7 +356,6 @@ def fit(self, X, y): [n_samples, n_samples] if metric='precomputed'. y (array-like or sparse matrix): Target values of shape = [n_samples] or [n_samples, n_outputs]. - """ if self.metric == 'precomputed': raise ValueError("Precomputed is not supported.") @@ -415,10 +392,8 @@ def predict(self, X): X (:class:`FDataGrid`): FDataGrid with the test samples. Returns: - - (np.array): y : array of shape [n_samples] or + y (np.array): array of shape [n_samples] or [n_samples, n_outputs] with class labels for each data sample. - """ sklearn_check_is_fitted(self, 'centroids_') diff --git a/skfda/exploratory/stats/_stats.py b/skfda/exploratory/stats/_stats.py index 9fe2d3ce1..9616840b0 100644 --- a/skfda/exploratory/stats/_stats.py +++ b/skfda/exploratory/stats/_stats.py @@ -1,5 +1,5 @@ -"""Functional data descriptive statistics. -""" +"""Functional data descriptive statistics.""" + from ..depth import ModifiedBandDepth From 8d85dd17b9db21f8356de407438d8e108cda693a Mon Sep 17 00:00:00 2001 From: pedrorponga <32200195+pedrorponga@users.noreply.github.com> Date: Wed, 25 Nov 2020 12:42:51 +0100 Subject: [PATCH 112/210] Update skfda/_neighbors/classification.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Ramos Carreño --- skfda/_neighbors/classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/_neighbors/classification.py b/skfda/_neighbors/classification.py index 7493dd87e..41fe57e4b 100644 --- a/skfda/_neighbors/classification.py +++ b/skfda/_neighbors/classification.py @@ -102,7 +102,7 @@ class KNeighborsClassifier(NeighborsBase, NeighborsMixin, KNeighborsMixin, This class wraps the sklearn classifier `sklearn.neighbors.KNeighborsClassifier`. - .. 
warning:: + Warning: Regarding the Nearest Neighbors algorithms, if it is found that two neighbors, neighbor `k+1` and `k`, have identical distances but different labels, the results will depend on the ordering of the From 8013c4d6350f7bb507ca478f236f6c853e7c09e8 Mon Sep 17 00:00:00 2001 From: pedrorponga <32200195+pedrorponga@users.noreply.github.com> Date: Wed, 25 Nov 2020 12:43:12 +0100 Subject: [PATCH 113/210] Update skfda/_neighbors/classification.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Ramos Carreño --- skfda/_neighbors/classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/_neighbors/classification.py b/skfda/_neighbors/classification.py index 41fe57e4b..f8b36ad90 100644 --- a/skfda/_neighbors/classification.py +++ b/skfda/_neighbors/classification.py @@ -148,7 +148,7 @@ def predict_proba(self, X): X (:class:`FDataGrid` or array-like): FDataGrid with the test samples or array (n_query, n_indexed) if metric == 'precomputed'. - Returns + Returns: p: array of shape = [n_samples, n_classes], or a list of n_outputs of such arrays if n_outputs > 1. The class probabilities of the input samples. Classes are From cae5faff666c0df8a0086eaadb06763e442f89c8 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 25 Nov 2020 23:33:39 +0100 Subject: [PATCH 114/210] Nearest Centroid and DTM --- skfda/_neighbors/classification.py | 85 +++++++++-------------- skfda/_utils/_utils.py | 2 +- skfda/misc/metrics.py | 54 +++++++++++--- skfda/ml/classification/DTM_classifier.py | 51 ++------------ 4 files changed, 86 insertions(+), 106 deletions(-) diff --git a/skfda/_neighbors/classification.py b/skfda/_neighbors/classification.py index f8b36ad90..97f6391d2 100644 --- a/skfda/_neighbors/classification.py +++ b/skfda/_neighbors/classification.py @@ -1,15 +1,13 @@ """Neighbor models for supervised classification.""" - from sklearn.base import ClassifierMixin, BaseEstimator -from sklearn.preprocessing import LabelEncoder -from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted -from ..exploratory.stats import mean as l2_mean -from ..misc.metrics import lp_distance, pairwise_distance +from ..exploratory.stats import mean +from ..misc.metrics import l2_distance, pairwise_distance from .base import (NeighborsBase, NeighborsMixin, KNeighborsMixin, NeighborsClassifierMixin, RadiusNeighborsMixin) +from .._utils import _classifier_get_classes class KNeighborsClassifier(NeighborsBase, NeighborsMixin, KNeighborsMixin, @@ -18,7 +16,8 @@ class KNeighborsClassifier(NeighborsBase, NeighborsMixin, KNeighborsMixin, Parameters: n_neighbors: int, optional (default = 5) - Number of neighbors to use by default for :meth:`kneighbors` queries. + Number of neighbors to use by default for :meth:`kneighbors` + queries. weights: str or callable, optional (default = 'uniform') weight function used in prediction. Possible values: @@ -36,8 +35,8 @@ class KNeighborsClassifier(NeighborsBase, NeighborsMixin, KNeighborsMixin, - 'ball_tree' will use :class:`sklearn.neighbors.BallTree`. - 'brute' will use a brute-force search. - - 'auto' will attempt to decide the most appropriate algorithm based on - the values passed to :meth:`fit` method. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. leaf_size: int, optional (default = 30) Leaf size passed to BallTree or KDTree. 
This can affect the @@ -53,13 +52,13 @@ class KNeighborsClassifier(NeighborsBase, NeighborsMixin, KNeighborsMixin, Additional keyword arguments for the metric function. n_jobs: int or None, optional (default=None) The number of parallel jobs to run for neighbors search. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` + context. ``-1`` means using all processors. Doesn't affect :meth:`fit` method. multivariate_metric: boolean, optional (default = False) - Indicates if the metric used is a sklearn distance between vectors (see - :class:`~sklearn.neighbors.DistanceMetric`) or a functional metric of - the module `skfda.misc.metrics` if ``False``. + Indicates if the metric used is a sklearn distance between vectors + (see :class:`~sklearn.neighbors.DistanceMetric`) or a functional + metric of the module `skfda.misc.metrics` if ``False``. Examples: Firstly, we will create a toy dataset with 2 classes @@ -96,8 +95,8 @@ class KNeighborsClassifier(NeighborsBase, NeighborsMixin, KNeighborsMixin, :class:`~skfda.ml.clustering.NearestNeighbors` Notes: - See Nearest Neighbors in the sklearn online documentation for a discussion - of the choice of ``algorithm`` and ``leaf_size``. + See Nearest Neighbors in the sklearn online documentation for a + discussion of the choice of ``algorithm`` and ``leaf_size``. This class wraps the sklearn classifier `sklearn.neighbors.KNeighborsClassifier`. @@ -168,8 +167,8 @@ class RadiusNeighborsClassifier(NeighborsBase, NeighborsMixin, Parameters: radius: float, optional (default = 1.0) - Range of parameter space to use by default for :meth:`radius_neighbors` - queries. + Range of parameter space to use by default for + :meth:`radius_neighbors` queries. weights: str or callable weight function used in prediction. Possible values: @@ -209,12 +208,12 @@ class RadiusNeighborsClassifier(NeighborsBase, NeighborsMixin, Additional keyword arguments for the metric function. n_jobs: int or None, optional (default=None) The number of parallel jobs to run for neighbors search. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` + context. ``-1`` means using all processors. multivariate_metric: boolean, optional (default = False) - Indicates if the metric used is a sklearn distance between vectors (see - :class:`sklearn.neighbors.DistanceMetric`) or a functional metric of - the module :mod:`skfda.misc.metrics`. + Indicates if the metric used is a sklearn distance between vectors + (see :class:`sklearn.neighbors.DistanceMetric`) or a functional + metric of the module :mod:`skfda.misc.metrics`. Examples: Firstly, we will create a toy dataset with 2 classes. @@ -246,8 +245,8 @@ class RadiusNeighborsClassifier(NeighborsBase, NeighborsMixin, :class:`~skfda.ml.clustering.NearestNeighbors` Notes: - See Nearest Neighbors in the sklearn online documentation for a discussion - of the choice of ``algorithm`` and ``leaf_size``. + See Nearest Neighbors in the sklearn online documentation for a + discussion of the choice of ``algorithm`` and ``leaf_size``. This class wraps the sklearn classifier `sklearn.neighbors.RadiusNeighborsClassifier`. 
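Editorial aside: the ``multivariate_metric`` flag documented above is the
switch between genuinely functional distances and plain vector distances.
A minimal sketch, not part of the patch, assuming the public API as it
stands at this point of the series (the ``'euclidean'`` string is assumed
to be passed straight through to sklearn):

    from skfda.datasets import make_sinusoidal_process
    from skfda.misc.metrics import lp_distance
    from skfda.ml.classification import KNeighborsClassifier

    # Same toy data as the docstring examples: two groups of sinusoidal
    # processes differing in phase.
    fd1 = make_sinusoidal_process(phase_std=.25, random_state=0)
    fd2 = make_sinusoidal_process(phase_mean=1.8, error_std=0.,
                                  phase_std=.25, random_state=0)
    fd = fd1.concatenate(fd2)
    y = 15 * [0] + 15 * [1]

    # Functional metric: distances are computed between whole curves.
    knn = KNeighborsClassifier(n_neighbors=3, metric=lp_distance).fit(fd, y)

    # Vector metric: each discretized curve is a point of R^n.
    knn_vec = KNeighborsClassifier(n_neighbors=3, metric='euclidean',
                                   multivariate_metric=True).fit(fd, y)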
@@ -342,10 +341,10 @@ class and return a :class:`FData` object with only one sample :class:`~skfda.ml.clustering.NearestNeighbors` """ - def __init__(self, metric='l2', mean='mean'): + def __init__(self, metric=l2_distance, centroid=mean): """Initialize the classifier.""" self.metric = metric - self.mean = mean + self.centroid = centroid def fit(self, X, y): """Fit the model using X as training data and y as target values. @@ -357,30 +356,12 @@ def fit(self, X, y): y (array-like or sparse matrix): Target values of shape = [n_samples] or [n_samples, n_outputs]. """ - if self.metric == 'precomputed': - raise ValueError("Precomputed is not supported.") - elif self.metric == 'l2': - self._pairwise_distance = pairwise_distance(lp_distance) - else: - self._pairwise_distance = pairwise_distance(self.metric) - - mean = l2_mean if self.mean == 'mean' else self.mean - - check_classification_targets(y) - - le = LabelEncoder() - y_ind = le.fit_transform(y) - self.classes_ = classes = le.classes_ - n_classes = classes.size - if n_classes < 2: - raise ValueError(f'The number of classes has to be greater than' - f' one; got {n_classes} class') - - self.centroids_ = mean(X[y_ind == 0]) - - for cur_class in range(1, n_classes): - center_mask = y_ind == cur_class - centroid = mean(X[center_mask]) + self.classes_, y_ind = _classifier_get_classes(y) + + self.centroids_ = self.centroid(X[y_ind == 0]) + + for cur_class in range(1, self.classes_.size): + centroid = self.centroid(X[y_ind == cur_class]) self.centroids_ = self.centroids_.concatenate(centroid) return self @@ -395,7 +376,7 @@ def predict(self, X): y (np.array): array of shape [n_samples] or [n_samples, n_outputs] with class labels for each data sample. """ - sklearn_check_is_fitted(self, 'centroids_') + sklearn_check_is_fitted(self) - return self.classes_[self._pairwise_distance( + return self.classes_[pairwise_distance(self.metric)( X, self.centroids_).argmin(axis=1)] diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index e18104784..821b440ed 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -462,5 +462,5 @@ def _classifier_get_classes(y): if classes.size < 2: raise ValueError(f'The number of classes has to be greater than' - f' one; got {classes.size} class') + f' one; got {classes.size} class') return classes, y_ind diff --git a/skfda/misc/metrics.py b/skfda/misc/metrics.py index a8b181923..fe564f830 100644 --- a/skfda/misc/metrics.py +++ b/skfda/misc/metrics.py @@ -112,12 +112,13 @@ def norm_distance(fdata1, fdata2): def pairwise_distance(distance, **kwargs): - r"""Return pairwise distance for FDataGrid objects. + r"""Returns a pairwise distance function for FData objects. - Given a distance returns the corresponding pairwise distance function. + Given a distance it returns the corresponding pairwise distance function. - The pairwise distance calculates the distance between all possible pairs of - one sample of the first FDataGrid object and one of the second one. + The returned pairwise distance function calculates the distance between + all possible pairs consisting of one sample of the first FDataGrid object + and one of the second one. The matrix returned by the pairwise distance is a matrix with as many rows as samples in the first object and as many columns as samples in the second @@ -267,15 +268,12 @@ def lp_norm(fdata, p=2, p2=None): def lp_distance(fdata1, fdata2, p=2, p2=2, *, eval_points=None, _check=True): r"""Lp distance for FDataGrid objects. 
- Calculates the distance between all possible pairs of one sample of - the first FDataGrid object and one of the second one. + Calculates the distance between two functional objects. For each pair of samples f and g the distance between them is defined as: .. math:: - d(f, g) = d(f, g) = \lVert f - g \rVert - - The norm is specified as a parameter but defaults to the l2 norm. + d(f, g) = d(g, f) = \lVert f - g \rVert Args: fdatagrid (FDataGrid): FDataGrid object. @@ -315,6 +313,44 @@ def lp_distance(fdata1, fdata2, p=2, p2=2, *, eval_points=None, _check=True): return lp_norm(fdata1 - fdata2, p=p, p2=p2) +def l1_distance(fdata1, fdata2, *, eval_points=None, _check=True): + r"""L1 distance for FDataGrid objects. + + Calculates the L1 distance between fdata1 and fdata2: + .. math:: + d(fdata1, fdata2) = + \left( \int_D \lvert fdata1(x)-fdata2(x) \rvert dx + \right) + """ + return lp_distance(fdata1, fdata2, p=1, p2=1, + eval_points=eval_points, _check=_check) + + +def l2_distance(fdata1, fdata2, *, eval_points=None, _check=True): + r"""L2 distance for FDataGrid objects. + + Calculates the euclidean distance between fdata1 and fdata2: + .. math:: + d(fdata1, fdata2) = + \left( \int_D \lvert fdata1(x)-fdata2(x) \rvert^2 dx + \right)^{\frac{1}{2}} + """ + return lp_distance(fdata1, fdata2, p=2, p2=2, + eval_points=eval_points, _check=_check) + + +def linf_distance(fdata1, fdata2, *, eval_points=None, _check=True): + r"""Linf distance for FDataGrid objects. + + Calculates the Linf distance between fdata1 and fdata2: + .. math:: + d(fdata1, fdata2) \equiv \inf \{ C\ge 0 : |fdata1(x)-fdata2(x)| + \le C a.e. \}. + """ + return lp_distance(fdata1, fdata2, p=np.inf, p2=np.inf, + eval_points=eval_points, _check=_check) + + def fisher_rao_distance(fdata1, fdata2, *, eval_points=None, _check=True): r"""Compute the Fisher-Rao distance between two functional objects. diff --git a/skfda/ml/classification/DTM_classifier.py b/skfda/ml/classification/DTM_classifier.py index d4e9e5799..0aaff5da9 100644 --- a/skfda/ml/classification/DTM_classifier.py +++ b/skfda/ml/classification/DTM_classifier.py @@ -1,17 +1,12 @@ """Distance to trimmed means (DTM) classification.""" -import numpy as np - -from sklearn.base import ClassifierMixin, BaseEstimator, clone -from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted - +from ..._neighbors.classification import NearestCentroid from ...exploratory.depth import Depth, ModifiedBandDepth -from ..._utils import _classifier_get_classes from ...exploratory.stats import trim_mean -from ...misc.metrics import lp_distance, pairwise_distance +from ...misc.metrics import lp_distance -class DTMClassifier(BaseEstimator, ClassifierMixin): +class DTMClassifier(NearestCentroid): """Distance to trimmed means (DTM) classification. Test samples are classified to the class that minimizes the distance of @@ -69,39 +64,7 @@ def __init__(self, proportiontocut, depth_method: Depth = ModifiedBandDepth(), metric=lp_distance): """Initialize the classifier.""" - self.proportiontocut = proportiontocut - self.depth_method = depth_method - self.metric = metric - - def fit(self, X, y): - """Fit the model using X as training data and y as target values. - - Args: - X (:class:`FDataGrid`): FDataGrid with the training data. - y (array-like): Target values of shape = [n_samples]. 
- """ - self.classes_, y_ind = _classifier_get_classes(y) - - self.trim_means_ = [trim_mean(X[y_ind == cur_class], - self.proportiontocut, - self.depth_method) - for cur_class in range(self.classes_.size)] - - return self - - def predict(self, X): - """Predict the class labels for the provided data. - - Args: - X (:class:`FDataGrid`): FDataGrid with the test samples. - - Returns: - y (np.array): array of shape [n_samples] with class labels - for each data sample. - """ - sklearn_check_is_fitted(self) - - distances = [self.metric(X, trim_mean) - for trim_mean in self.trim_means_] - - return self.classes_[np.argmin(distances, axis=0)] + super().__init__(metric=metric, + centroid=lambda fdatagrid: trim_mean(fdatagrid, + proportiontocut, + depth_method)) From 723934d08e2b0fba196e20d4ba24f53dbb145a82 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 25 Nov 2020 23:38:42 +0100 Subject: [PATCH 115/210] Sytle update --- skfda/_neighbors/classification.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/skfda/_neighbors/classification.py b/skfda/_neighbors/classification.py index 97f6391d2..15c38d7e6 100644 --- a/skfda/_neighbors/classification.py +++ b/skfda/_neighbors/classification.py @@ -53,7 +53,8 @@ class KNeighborsClassifier(NeighborsBase, NeighborsMixin, KNeighborsMixin, n_jobs: int or None, optional (default=None) The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` - context. ``-1`` means using all processors. + context. + ``-1`` means using all processors. Doesn't affect :meth:`fit` method. multivariate_metric: boolean, optional (default = False) Indicates if the metric used is a sklearn distance between vectors @@ -209,7 +210,8 @@ class RadiusNeighborsClassifier(NeighborsBase, NeighborsMixin, n_jobs: int or None, optional (default=None) The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` - context. ``-1`` means using all processors. + context. + ``-1`` means using all processors. 
multivariate_metric: boolean, optional (default = False) Indicates if the metric used is a sklearn distance between vectors (see :class:`sklearn.neighbors.DistanceMetric`) or a functional From 3d29ef908dee16859f0eeaf2e306d17cde1e6823 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Thu, 26 Nov 2020 00:17:11 +0100 Subject: [PATCH 116/210] First attempt --- skfda/_neighbors/__init__.py | 14 -- skfda/ml/__init__.py | 1 - skfda/ml/classification/DTM_classifier.py | 70 ------- skfda/ml/classification/__init__.py | 8 +- .../ml/classification/centroid_classifiers.py | 172 ++++++++++++++++++ ...pth_classifier.py => depth_classifiers.py} | 2 +- .../classification/neighbors_classifiers.py} | 106 +---------- skfda/ml/clustering/__init__.py | 7 +- skfda/ml/clustering/kmeans.py | 7 +- .../clustering/neighbors.py} | 5 +- .../base.py => ml/neighbors_base.py} | 2 +- .../outlier.py => ml/neighbors_outlier.py} | 6 +- skfda/ml/regression/__init__.py | 4 +- .../regression/neighbors.py} | 5 +- tests/test_neighbors.py | 4 +- 15 files changed, 197 insertions(+), 216 deletions(-) delete mode 100644 skfda/_neighbors/__init__.py delete mode 100644 skfda/ml/classification/DTM_classifier.py create mode 100644 skfda/ml/classification/centroid_classifiers.py rename skfda/ml/classification/{maximum_depth_classifier.py => depth_classifiers.py} (98%) rename skfda/{_neighbors/classification.py => ml/classification/neighbors_classifiers.py} (74%) rename skfda/{_neighbors/unsupervised.py => ml/clustering/neighbors.py} (97%) rename skfda/{_neighbors/base.py => ml/neighbors_base.py} (99%) rename skfda/{_neighbors/outlier.py => ml/neighbors_outlier.py} (98%) rename skfda/{_neighbors/regression.py => ml/regression/neighbors.py} (98%) diff --git a/skfda/_neighbors/__init__.py b/skfda/_neighbors/__init__.py deleted file mode 100644 index 22047b996..000000000 --- a/skfda/_neighbors/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -"""Private module with the implementation of the neighbors estimators -Includes the following classes: - - NearestNeighbors - - KNeighborsClassifier - - RadiusNeighborsClassifier - - NearestCentroid - - KNeighborsRegressor - - RadiusNeighborsRegressor - -""" -from .unsupervised import NearestNeighbors -from .regression import KNeighborsRegressor, RadiusNeighborsRegressor -from .classification import (KNeighborsClassifier, RadiusNeighborsClassifier, - NearestCentroid) diff --git a/skfda/ml/__init__.py b/skfda/ml/__init__.py index a65e22c8c..e0cea54ed 100644 --- a/skfda/ml/__init__.py +++ b/skfda/ml/__init__.py @@ -1,2 +1 @@ - from . import classification, clustering, regression diff --git a/skfda/ml/classification/DTM_classifier.py b/skfda/ml/classification/DTM_classifier.py deleted file mode 100644 index 0aaff5da9..000000000 --- a/skfda/ml/classification/DTM_classifier.py +++ /dev/null @@ -1,70 +0,0 @@ -"""Distance to trimmed means (DTM) classification.""" - -from ..._neighbors.classification import NearestCentroid -from ...exploratory.depth import Depth, ModifiedBandDepth -from ...exploratory.stats import trim_mean -from ...misc.metrics import lp_distance - - -class DTMClassifier(NearestCentroid): - """Distance to trimmed means (DTM) classification. - - Test samples are classified to the class that minimizes the distance of - the observation to the trimmed mean of the group. - - Parameters: - proportiontocut (float): indicates the percentage of functions to - remove. It is not easy to determine as it varies from dataset to - dataset. 
- depth_method (Depth, default - :class:`ModifiedBandDepth `): - The depth class used to order the data. See the documentation of - the depths module for a list of available depths. By default it - is ModifiedBandDepth. - metric (function, default - :func:`lp_distance `): - Distance function between two functional objects. See the - documentation of the metrics module for a list of available - metrics. - - Examples: - Firstly, we will import and split the Berkeley Growth Study dataset - - >>> from skfda.datasets import fetch_growth - >>> from sklearn.model_selection import train_test_split - >>> dataset = fetch_growth() - >>> fd = dataset['data'] - >>> y = dataset['target'] - >>> X_train, X_test, y_train, y_test = train_test_split( - ... fd, y, test_size=0.25, stratify=y, random_state=0) - - We will fit a Distance to trimmed means classifier - - >>> from skfda.ml.classification import DTMClassifier - >>> clf = DTMClassifier(proportiontocut=0.25) - >>> clf.fit(X_train, y_train) - DTMClassifier(...) - - We can predict the class of new samples - - >>> clf.predict(X_test) # Predict labels for test samples - array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1]) - - Finally, we calculate the mean accuracy for the test data - - >>> clf.score(X_test, y_test) - 0.875 - - See also: - :class:`~skfda.ml.classification.MaximumDepthClassifier - """ - - def __init__(self, proportiontocut, - depth_method: Depth = ModifiedBandDepth(), - metric=lp_distance): - """Initialize the classifier.""" - super().__init__(metric=metric, - centroid=lambda fdatagrid: trim_mean(fdatagrid, - proportiontocut, - depth_method)) diff --git a/skfda/ml/classification/__init__.py b/skfda/ml/classification/__init__.py index 94d7c2bff..c7fab00be 100644 --- a/skfda/ml/classification/__init__.py +++ b/skfda/ml/classification/__init__.py @@ -1,4 +1,4 @@ -from ..._neighbors import (KNeighborsClassifier, RadiusNeighborsClassifier, - NearestCentroid) -from .maximum_depth_classifier import MaximumDepthClassifier -from .DTM_classifier import DTMClassifier +from .neighbors_classifiers import (KNeighborsClassifier, + RadiusNeighborsClassifier) +from .depth_classifiers import MaximumDepthClassifier +from .centroid_classifiers import NearestCentroid, DTMClassifier diff --git a/skfda/ml/classification/centroid_classifiers.py b/skfda/ml/classification/centroid_classifiers.py new file mode 100644 index 000000000..3079dc321 --- /dev/null +++ b/skfda/ml/classification/centroid_classifiers.py @@ -0,0 +1,172 @@ +"""Centroid models for supervised classification.""" + +from sklearn.base import ClassifierMixin, BaseEstimator + +from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted + +from ...exploratory.stats import mean +from ...misc.metrics import l2_distance, pairwise_distance +from ...exploratory.depth import Depth, ModifiedBandDepth +from ...exploratory.stats import trim_mean +from ...misc.metrics import lp_distance +from ..._utils import _classifier_get_classes + + +class NearestCentroid(BaseEstimator, ClassifierMixin): + """Nearest centroid classifier for functional data. + + Each class is represented by its centroid, with test samples classified to + the class with the nearest centroid. + + Parameters: + metric: callable, (default + :func:`lp_distance `) + The metric to use when calculating distance between test samples + and centroids. See the documentation of the metrics module + for a list of available metrics. Defaults used L2 distance. 
+        centroid: callable, (default
+            :func:`mean <skfda.exploratory.stats.mean>`)
+            The centroid for the samples corresponding to each class is the
+            point from which the sum of the distances (according to the
+            metric) of all samples that belong to that particular class is
+            minimized. By default the usual mean is used, which minimizes
+            the sum of L2 distances. This parameter allows changing the
+            centroid constructor. The function must accept a :class:`FData`
+            with the samples of one class and return a :class:`FData`
+            object with only one sample representing the centroid.
+    Attributes:
+        centroids_: :class:`FDataGrid`
+            FDatagrid containing the centroid of each class
+    Examples:
+        Firstly, we will create a toy dataset with 2 classes
+
+        >>> from skfda.datasets import make_sinusoidal_process
+        >>> fd1 = make_sinusoidal_process(phase_std=.25, random_state=0)
+        >>> fd2 = make_sinusoidal_process(phase_mean=1.8, error_std=0.,
+        ...                               phase_std=.25, random_state=0)
+        >>> fd = fd1.concatenate(fd2)
+        >>> y = 15*[0] + 15*[1]
+
+        We will fit a Nearest centroids classifier
+
+        >>> from skfda.ml.classification import NearestCentroid
+        >>> neigh = NearestCentroid()
+        >>> neigh.fit(fd, y)
+        NearestCentroid(...)
+
+        We can predict the class of new samples
+
+        >>> neigh.predict(fd[::2])  # Predict labels for even samples
+        array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])
+
+    See also:
+        :class:`~skfda.ml.classification.KNeighborsClassifier`
+        :class:`~skfda.ml.classification.RadiusNeighborsClassifier`
+        :class:`~skfda.ml.regression.KNeighborsRegressor`
+        :class:`~skfda.ml.regression.RadiusNeighborsRegressor`
+        :class:`~skfda.ml.clustering.NearestNeighbors`
+    """
+
+    def __init__(self, metric=l2_distance, centroid=mean):
+        """Initialize the classifier."""
+        self.metric = metric
+        self.centroid = centroid
+
+    def fit(self, X, y):
+        """Fit the model using X as training data and y as target values.
+
+        Args:
+            X (:class:`FDataGrid`, array_matrix): Training data. FDataGrid
+                with the training data or array matrix with shape
+                [n_samples, n_samples] if metric='precomputed'.
+            y (array-like or sparse matrix): Target values of
+                shape = [n_samples] or [n_samples, n_outputs].
+        """
+        self.classes_, y_ind = _classifier_get_classes(y)
+
+        self.centroids_ = self.centroid(X[y_ind == 0])
+
+        for cur_class in range(1, self.classes_.size):
+            centroid = self.centroid(X[y_ind == cur_class])
+            self.centroids_ = self.centroids_.concatenate(centroid)
+
+        return self
+
+    def predict(self, X):
+        """Predict the class labels for the provided data.
+
+        Args:
+            X (:class:`FDataGrid`): FDataGrid with the test samples.
+
+        Returns:
+            y (np.array): array of shape [n_samples] or
+                [n_samples, n_outputs] with class labels for each data sample.
+        """
+        sklearn_check_is_fitted(self)
+
+        return self.classes_[pairwise_distance(self.metric)(
+            X, self.centroids_).argmin(axis=1)]
+
+
+class DTMClassifier(NearestCentroid):
+    """Distance to trimmed means (DTM) classification.
+
+    Test samples are classified to the class that minimizes the distance of
+    the observation to the trimmed mean of the group.
+
+    Parameters:
+        proportiontocut (float): indicates the percentage of functions to
+            remove. It is not easy to determine as it varies from dataset to
+            dataset.
+        depth_method (Depth, default
+            :class:`ModifiedBandDepth <skfda.exploratory.depth.ModifiedBandDepth>`):
+            The depth class used to order the data. See the documentation of
+            the depths module for a list of available depths. By default it
+            is ModifiedBandDepth.
+ metric (function, default + :func:`lp_distance `): + Distance function between two functional objects. See the + documentation of the metrics module for a list of available + metrics. + + Examples: + Firstly, we will import and split the Berkeley Growth Study dataset + + >>> from skfda.datasets import fetch_growth + >>> from sklearn.model_selection import train_test_split + >>> dataset = fetch_growth() + >>> fd = dataset['data'] + >>> y = dataset['target'] + >>> X_train, X_test, y_train, y_test = train_test_split( + ... fd, y, test_size=0.25, stratify=y, random_state=0) + + We will fit a Distance to trimmed means classifier + + >>> from skfda.ml.classification import DTMClassifier + >>> clf = DTMClassifier(proportiontocut=0.25) + >>> clf.fit(X_train, y_train) + DTMClassifier(...) + + We can predict the class of new samples + + >>> clf.predict(X_test) # Predict labels for test samples + array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1]) + + Finally, we calculate the mean accuracy for the test data + + >>> clf.score(X_test, y_test) + 0.875 + + See also: + :class:`~skfda.ml.classification.MaximumDepthClassifier + """ + + def __init__(self, proportiontocut, + depth_method: Depth = ModifiedBandDepth(), + metric=lp_distance): + """Initialize the classifier.""" + super().__init__(metric=metric, + centroid=lambda fdatagrid: trim_mean(fdatagrid, + proportiontocut, + depth_method)) diff --git a/skfda/ml/classification/maximum_depth_classifier.py b/skfda/ml/classification/depth_classifiers.py similarity index 98% rename from skfda/ml/classification/maximum_depth_classifier.py rename to skfda/ml/classification/depth_classifiers.py index 5f4a482fc..8135af714 100644 --- a/skfda/ml/classification/maximum_depth_classifier.py +++ b/skfda/ml/classification/depth_classifiers.py @@ -1,4 +1,4 @@ -"""Maximum depth for supervised classification.""" +"""Depth models for supervised classification.""" import numpy as np diff --git a/skfda/_neighbors/classification.py b/skfda/ml/classification/neighbors_classifiers.py similarity index 74% rename from skfda/_neighbors/classification.py rename to skfda/ml/classification/neighbors_classifiers.py index 15c38d7e6..3eac71b68 100644 --- a/skfda/_neighbors/classification.py +++ b/skfda/ml/classification/neighbors_classifiers.py @@ -1,13 +1,9 @@ """Neighbor models for supervised classification.""" -from sklearn.base import ClassifierMixin, BaseEstimator -from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted +from sklearn.base import ClassifierMixin -from ..exploratory.stats import mean -from ..misc.metrics import l2_distance, pairwise_distance -from .base import (NeighborsBase, NeighborsMixin, KNeighborsMixin, - NeighborsClassifierMixin, RadiusNeighborsMixin) -from .._utils import _classifier_get_classes +from ..neighbors_base import (NeighborsBase, NeighborsMixin, KNeighborsMixin, + NeighborsClassifierMixin, RadiusNeighborsMixin) class KNeighborsClassifier(NeighborsBase, NeighborsMixin, KNeighborsMixin, @@ -286,99 +282,3 @@ def _init_estimator(self, sklearn_metric): algorithm=self.algorithm, leaf_size=self.leaf_size, metric=sklearn_metric, metric_params=self.metric_params, outlier_label=self.outlier_label, n_jobs=self.n_jobs) - - -class NearestCentroid(BaseEstimator, ClassifierMixin): - """Nearest centroid classifier for functional data. - - Each class is represented by its centroid, with test samples classified to - the class with the nearest centroid. 
- - Parameters: - metric: callable, (default - :func:`lp_distance `) - The metric to use when calculating distance between test samples - and centroids. See the documentation of the metrics module - for a list of available metrics. Defaults used L2 distance. - centroid: callable, (default - :func:`mean `) - The centroids for the samples corresponding to each class is the - point from which the sum of the distances (according to the metric) - of all samples that belong to that particular class are minimized. - By default it is used the usual mean, which minimizes the sum of L2 - distances. This parameter allows change the centroid constructor. - The function must accept a :class:`FData` with the samples of one - class and return a :class:`FData` object with only one sample - representing the centroid. - Attributes: - centroids_: :class:`FDataGrid` - FDatagrid containing the centroid of each class - Examples: - Firstly, we will create a toy dataset with 2 classes - - >>> from skfda.datasets import make_sinusoidal_process - >>> fd1 = make_sinusoidal_process(phase_std=.25, random_state=0) - >>> fd2 = make_sinusoidal_process(phase_mean=1.8, error_std=0., - ... phase_std=.25, random_state=0) - >>> fd = fd1.concatenate(fd2) - >>> y = 15*[0] + 15*[1] - - We will fit a Nearest centroids classifier - - >>> from skfda.ml.classification import NearestCentroid - >>> neigh = NearestCentroid() - >>> neigh.fit(fd, y) - NearestCentroid(...) - - We can predict the class of new samples - - >>> neigh.predict(fd[::2]) # Predict labels for even samples - array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) - - See also: - :class:`~skfda.ml.classification.KNeighborsClassifier` - :class:`~skfda.ml.classification.RadiusNeighborsClassifier` - :class:`~skfda.ml.regression.KNeighborsRegressor` - :class:`~skfda.ml.regression.RadiusNeighborsRegressor` - :class:`~skfda.ml.clustering.NearestNeighbors` - """ - - def __init__(self, metric=l2_distance, centroid=mean): - """Initialize the classifier.""" - self.metric = metric - self.centroid = centroid - - def fit(self, X, y): - """Fit the model using X as training data and y as target values. - - Args: - X (:class:`FDataGrid`, array_matrix): Training data. FDataGrid - with the training data or array matrix with shape - [n_samples, n_samples] if metric='precomputed'. - y (array-like or sparse matrix): Target values of - shape = [n_samples] or [n_samples, n_outputs]. - """ - self.classes_, y_ind = _classifier_get_classes(y) - - self.centroids_ = self.centroid(X[y_ind == 0]) - - for cur_class in range(1, self.classes_.size): - centroid = self.centroid(X[y_ind == cur_class]) - self.centroids_ = self.centroids_.concatenate(centroid) - - return self - - def predict(self, X): - """Predict the class labels for the provided data. - - Args: - X (:class:`FDataGrid`): FDataGrid with the test samples. - - Returns: - y (np.array): array of shape [n_samples] or - [n_samples, n_outputs] with class labels for each data sample. - """ - sklearn_check_is_fitted(self) - - return self.classes_[pairwise_distance(self.metric)( - X, self.centroids_).argmin(axis=1)] diff --git a/skfda/ml/clustering/__init__.py b/skfda/ml/clustering/__init__.py index 01e2be6af..a996898e5 100644 --- a/skfda/ml/clustering/__init__.py +++ b/skfda/ml/clustering/__init__.py @@ -1,5 +1,2 @@ - - -from . 
import kmeans -from ..._neighbors import NearestNeighbors -from .kmeans import KMeans, FuzzyCMeans +from .neighbors import NearestNeighbors +from .kmeans import BaseKMeans, KMeans, FuzzyCMeans diff --git a/skfda/ml/clustering/kmeans.py b/skfda/ml/clustering/kmeans.py index 3692fedd1..3070e217f 100644 --- a/skfda/ml/clustering/kmeans.py +++ b/skfda/ml/clustering/kmeans.py @@ -12,10 +12,6 @@ from ...misc.metrics import pairwise_distance, lp_distance -__author__ = "Amanda Hernando Bernabé" -__email__ = "amanda.hernando@estudiante.uam.es" - - class BaseKMeans(BaseEstimator, ClusterMixin, TransformerMixin): """Base class to implement K-Means clustering algorithms. @@ -702,7 +698,8 @@ def _check_params(self): def _compute_inertia(self, membership, centroids, distances_to_centroids): - return np.sum(membership ** self.fuzzifier * distances_to_centroids ** 2) + return np.sum( + membership ** self.fuzzifier * distances_to_centroids ** 2) def _create_membership(self, n_samples): return np.empty((n_samples, self.n_clusters)) diff --git a/skfda/_neighbors/unsupervised.py b/skfda/ml/clustering/neighbors.py similarity index 97% rename from skfda/_neighbors/unsupervised.py rename to skfda/ml/clustering/neighbors.py index dcc067ead..c02f7ed11 100644 --- a/skfda/_neighbors/unsupervised.py +++ b/skfda/ml/clustering/neighbors.py @@ -1,7 +1,8 @@ """Unsupervised learner for implementing neighbor searches.""" -from .base import (NeighborsBase, NeighborsMixin, KNeighborsMixin, - RadiusNeighborsMixin) +from ..neighbors_base import (NeighborsBase, NeighborsMixin, + KNeighborsMixin, + RadiusNeighborsMixin) class NearestNeighbors(NeighborsBase, NeighborsMixin, KNeighborsMixin, diff --git a/skfda/_neighbors/base.py b/skfda/ml/neighbors_base.py similarity index 99% rename from skfda/_neighbors/base.py rename to skfda/ml/neighbors_base.py index 36ee1af09..afcad02b1 100644 --- a/skfda/_neighbors/base.py +++ b/skfda/ml/neighbors_base.py @@ -64,7 +64,7 @@ def _to_multivariate_metric(metric, grid_points): >>> import numpy as np >>> from skfda import FDataGrid >>> from skfda.misc.metrics import lp_distance - >>> from skfda._neighbors.base import _to_multivariate_metric + >>> from skfda.ml.neighbors_base import _to_multivariate_metric Calculate the Lp distance between fd and fd2. diff --git a/skfda/_neighbors/outlier.py b/skfda/ml/neighbors_outlier.py similarity index 98% rename from skfda/_neighbors/outlier.py rename to skfda/ml/neighbors_outlier.py index efe45108d..5e2e4756d 100644 --- a/skfda/_neighbors/outlier.py +++ b/skfda/ml/neighbors_outlier.py @@ -3,8 +3,8 @@ from sklearn.base import OutlierMixin from ..misc.metrics import lp_distance -from .base import (NeighborsBase, NeighborsMixin, KNeighborsMixin, - _to_multivariate_metric) +from .neighbors_base import (NeighborsBase, NeighborsMixin, KNeighborsMixin, + _to_multivariate_metric) class LocalOutlierFactor(NeighborsBase, NeighborsMixin, KNeighborsMixin, @@ -102,7 +102,7 @@ class LocalOutlierFactor(NeighborsBase, NeighborsMixin, KNeighborsMixin, **Local Outlier Factor (LOF) for outlier detection**. - >>> from skfda._neighbors.outlier import LocalOutlierFactor + >>> from skfda.ml.neighbors_outlier import LocalOutlierFactor Creation of simulated dataset with 2 outliers to be used with LOF. 
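Editorial aside: the docstring example is cut off at the hunk boundary
above. A minimal sketch of the workflow it describes, assuming the module
path created by this patch; the dataset parameters are illustrative, not
the docstring's exact ones:

    import numpy as np
    from skfda.datasets import make_sinusoidal_process
    from skfda.ml.neighbors_outlier import LocalOutlierFactor

    # Regular curves plus two phase-shifted curves acting as outliers.
    fd = make_sinusoidal_process(n_samples=25, error_std=0.,
                                 random_state=0)
    fd_outliers = make_sinusoidal_process(n_samples=2, phase_mean=1.5,
                                          error_std=0., random_state=1)
    fd = fd.concatenate(fd_outliers)

    lof = LocalOutlierFactor(n_neighbors=5)
    labels = lof.fit_predict(fd)     # -1 marks the outlying curves
    print(np.nonzero(labels == -1))  # expected: the last two samples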
diff --git a/skfda/ml/regression/__init__.py b/skfda/ml/regression/__init__.py index ed1ee3890..aaa0a09cc 100644 --- a/skfda/ml/regression/__init__.py +++ b/skfda/ml/regression/__init__.py @@ -1,4 +1,2 @@ - - -from ..._neighbors import KNeighborsRegressor, RadiusNeighborsRegressor +from .neighbors import KNeighborsRegressor, RadiusNeighborsRegressor from .linear import LinearRegression diff --git a/skfda/_neighbors/regression.py b/skfda/ml/regression/neighbors.py similarity index 98% rename from skfda/_neighbors/regression.py rename to skfda/ml/regression/neighbors.py index 69878cbf3..1f9808cee 100644 --- a/skfda/_neighbors/regression.py +++ b/skfda/ml/regression/neighbors.py @@ -1,7 +1,8 @@ """Neighbor models for regression.""" -from .base import (NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin, - NeighborsRegressorMixin) +from ..neighbors_base import (NeighborsBase, KNeighborsMixin, + RadiusNeighborsMixin, + NeighborsRegressorMixin) class KNeighborsRegressor(NeighborsBase, NeighborsRegressorMixin, diff --git a/tests/test_neighbors.py b/tests/test_neighbors.py index efea8946a..f737321e1 100644 --- a/tests/test_neighbors.py +++ b/tests/test_neighbors.py @@ -1,6 +1,6 @@ """Test neighbors classifiers and regressors""" -from skfda._neighbors.outlier import LocalOutlierFactor # Pending theory +from skfda.ml.neighbors_outlier import LocalOutlierFactor # Pending theory from skfda.datasets import make_multimodal_samples, make_sinusoidal_process from skfda.exploratory.stats import mean as l2_mean from skfda.misc.metrics import lp_distance, pairwise_distance @@ -15,7 +15,7 @@ import numpy as np -#from skfda.exploratory.outliers import LocalOutlierFactor +# from skfda.exploratory.outliers import LocalOutlierFactor class TestNeighbors(unittest.TestCase): def setUp(self): From ace05cbb43fa8878dfdd91809a76d63f567ab7c2 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Thu, 26 Nov 2020 00:43:16 +0100 Subject: [PATCH 117/210] update docstrings --- skfda/ml/classification/centroid_classifiers.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/skfda/ml/classification/centroid_classifiers.py b/skfda/ml/classification/centroid_classifiers.py index 3079dc321..6da2b86e2 100644 --- a/skfda/ml/classification/centroid_classifiers.py +++ b/skfda/ml/classification/centroid_classifiers.py @@ -60,11 +60,7 @@ class and return a :class:`FData` object with only one sample array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) See also: - :class:`~skfda.ml.classification.KNeighborsClassifier` - :class:`~skfda.ml.classification.RadiusNeighborsClassifier` - :class:`~skfda.ml.regression.KNeighborsRegressor` - :class:`~skfda.ml.regression.RadiusNeighborsRegressor` - :class:`~skfda.ml.clustering.NearestNeighbors` + :class:`~skfda.ml.classification.DTMClassifier """ def __init__(self, metric=l2_distance, centroid=mean): @@ -160,6 +156,7 @@ class DTMClassifier(NearestCentroid): See also: :class:`~skfda.ml.classification.MaximumDepthClassifier + :class:`~skfda.ml.classification.NearestCentroid """ def __init__(self, proportiontocut, From f8b2789904cdccbcaee7e62f6873ffce925e1389 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Thu, 26 Nov 2020 00:50:45 +0100 Subject: [PATCH 118/210] fix neighbors test --- tests/test_neighbors.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_neighbors.py b/tests/test_neighbors.py index f737321e1..f1cd494e9 100644 --- a/tests/test_neighbors.py +++ b/tests/test_neighbors.py @@ -2,7 +2,7 @@ from skfda.ml.neighbors_outlier import 
LocalOutlierFactor # Pending theory from skfda.datasets import make_multimodal_samples, make_sinusoidal_process -from skfda.exploratory.stats import mean as l2_mean +from skfda.exploratory.stats import mean from skfda.misc.metrics import lp_distance, pairwise_distance from skfda.ml.classification import (KNeighborsClassifier, RadiusNeighborsClassifier, @@ -56,7 +56,7 @@ def test_predict_classifier(self): for neigh in (KNeighborsClassifier(), RadiusNeighborsClassifier(radius=.1), NearestCentroid(), - NearestCentroid(metric=lp_distance, mean=l2_mean)): + NearestCentroid(metric=lp_distance, centroid=mean)): neigh.fit(self.X, self.y) pred = neigh.predict(self.X) @@ -347,7 +347,7 @@ def test_lof_fit_predict(self): res = lof.fit_predict(self.fd_lof) np.testing.assert_array_equal(expected, res) - # With explicit l2 distance + # With explicit l2 distance lof2 = LocalOutlierFactor(metric=lp_distance) res2 = lof2.fit_predict(self.fd_lof) np.testing.assert_array_equal(expected, res2) From a206ec2d470103a8ba534615d0346b2d93ebae63 Mon Sep 17 00:00:00 2001 From: pedrorponga <32200195+pedrorponga@users.noreply.github.com> Date: Thu, 26 Nov 2020 19:48:37 +0100 Subject: [PATCH 119/210] Feature/GitHub actions (#282) * Create github_actions * Delete .travis.yml * Delete settings.json * Update action * Remove Windows * Testing * Beautify * Update steps * Typo * Beautifying * Updating for coherence * Testing windows * Renaming to Tests * Upload travis * Typo * Delete .github\.travis.yml * Typo * Update Co-authored-by: pedrorponga --- .github/workflows/tests.yml | 37 +++++++++++++++++++++++++++++++++++++ .travis.yml | 3 ++- 2 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/tests.yml diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 000000000..521edf338 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,37 @@ +name: Tests + +on: + push: + pull_request: + +jobs: + build: + runs-on: ${{ matrix.os }} + name: Python ${{ matrix.python-version }} on ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: ['3.6', '3.7'] + + steps: + - uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} on ${{ matrix.os }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + pip3 install --upgrade pip cython numpy || pip3 install --upgrade --user pip cython numpy; + pip3 install flake8 || pip3 install --user flake8; + pip3 install codecov pytest-cov || pip3 install --user codecov pytest-cov; + + - name: Run tests + run: | + pip3 install . 
+ flake8 --exit-zero skfda; + coverage run --source=skfda/ setup.py test; + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1 diff --git a/.travis.yml b/.travis.yml index 412fb3656..db248d6a7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,5 @@ language: python + matrix: include: - name: "Python 3.6 on Linux" @@ -24,6 +25,7 @@ matrix: dist: xenial # required for Python >= 3.7 env: - PEP8COVERAGE=true # coverage test are only + install: - pip3 install --upgrade pip cython numpy || pip3 install --upgrade --user pip cython numpy # all three OSes agree about 'pip3' - | @@ -44,7 +46,6 @@ script: python3 setup.py test || python setup.py test; fi - after_success: - | if [[ $PEP8COVERAGE == true ]]; then From b58da464f1502eb1b57880a4e71c733843e934e0 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Thu, 26 Nov 2020 20:28:14 +0100 Subject: [PATCH 120/210] Update test_neighbors --- tests/test_neighbors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_neighbors.py b/tests/test_neighbors.py index efea8946a..257fada67 100644 --- a/tests/test_neighbors.py +++ b/tests/test_neighbors.py @@ -56,7 +56,7 @@ def test_predict_classifier(self): for neigh in (KNeighborsClassifier(), RadiusNeighborsClassifier(radius=.1), NearestCentroid(), - NearestCentroid(metric=lp_distance, mean=l2_mean)): + NearestCentroid(metric=lp_distance, centroid=l2_mean)): neigh.fit(self.X, self.y) pred = neigh.predict(self.X) From 18dc0985dc834ce29c488e2d293c038ece180f00 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Fri, 27 Nov 2020 00:44:35 +0100 Subject: [PATCH 121/210] Fix error in distance computation. An infinite loop has been detected but not fixed yet. --- skfda/exploratory/stats/_stats.py | 51 ++++++++++++++++++++++++++----- tests/test_stats.py | 42 +++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 8 deletions(-) create mode 100644 tests/test_stats.py diff --git a/skfda/exploratory/stats/_stats.py b/skfda/exploratory/stats/_stats.py index b59e68f0e..a60a08bc0 100644 --- a/skfda/exploratory/stats/_stats.py +++ b/skfda/exploratory/stats/_stats.py @@ -1,7 +1,13 @@ """Functional data descriptive statistics. """ +from builtins import isinstance + +from IPython.extensions.autoreload import isinstance2 + import numpy as np +from ...misc.metrics import lp_norm +from ...representation import FData from ..depth import ModifiedBandDepth @@ -95,15 +101,42 @@ def depth_based_median(fdatagrid, depth_method=ModifiedBandDepth()): return fdatagrid[indices_descending_depth[0]] -def geometric_median(fdata, rtol=1.e-5, atol=1.e-8): +def geometric_median(fdata: FData, tol: float=1.e-8): + r"""Compute the geometric median. + + The sample geometric median is the point that minimizes the :math:`L_1` + norm of the vector of Euclidean distances to all observations: + + .. math:: + + \underset{y \in L(\mathcal{T})}{\arg \min} + \sum_{i=1}^m \left \| x_i-y \right \|_2 + + Args: + fdata (FData): Object containing different samples of a + functional variable. + tol (float): tolerance used to check convergence. + + Returns: + FData: object containing the computed geometric median. 
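Editorial aside: the weighted iteration implemented below this docstring is
easier to verify on plain vectors. A hand-checkable sketch with Euclidean
distances, not part of the patch (the zero-distance branch handled by the
patch is omitted and the tolerance is illustrative):

    import numpy as np

    x = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [10.0, 10.0]])
    median = x.mean(axis=0)  # starting guess: the ordinary mean
    for _ in range(100):
        distances = np.linalg.norm(x - median, axis=1)
        weights = (1 / distances) / np.sum(1 / distances)
        new_median = weights @ x  # weighted average of the samples
        if np.linalg.norm(new_median - median) < 1e-8:
            break
        median = new_median

    # The result stays near the three clustered points; the outlier at
    # (10, 10) drags the ordinary mean much further than the median.
    print(median)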
+ + """ from ...misc import inner_product_matrix + def weighted_average(fdata, weights): + if isinstance(fdata, FData): + return (fdata * weights).sum() + else: + # To support also multivariate data + return (fdata.T * weights).T.sum(axis=0) + gram = inner_product_matrix(fdata) - identity = np.eye(fdata.n_samples) - weights = np.full(fdata.n_samples, 1 / fdata.n_samples) + identity = np.eye(len(fdata)) + weights = np.full(len(fdata), 1 / len(fdata)) prod_matrix = identity - weights - distances = np.einsum('ln,nn,nl->l', prod_matrix.T, gram, prod_matrix)**0.5 + distances = np.einsum('ln,nm,ml->l', prod_matrix.T, gram, prod_matrix)**0.5 + median = weighted_average(fdata, weights) while True: zero_distances = (distances == 0) @@ -111,16 +144,18 @@ def geometric_median(fdata, rtol=1.e-5, atol=1.e-8): weights_new = ((1 / distances) / np.sum(1 / distances) if n_zeros == 0 else (1 / n_zeros) * zero_distances) - if np.allclose(weights, weights_new, rtol=rtol, atol=atol): - return (fdata * weights_new).sum() + median_new = weighted_average(fdata, weights_new) + + if lp_norm(median_new - median) < tol: + return median_new prod_matrix = identity - weights_new - np.einsum('ln,nn,nl->l', prod_matrix.T, gram, + np.einsum('ln,nm,ml->l', prod_matrix.T, gram, prod_matrix, out=distances) distances **= 0.5 - weights[...] = weights_new + weights, median = (weights_new, median_new) def trim_mean(fdatagrid, diff --git a/tests/test_stats.py b/tests/test_stats.py new file mode 100644 index 000000000..8b35caa8e --- /dev/null +++ b/tests/test_stats.py @@ -0,0 +1,42 @@ +import skfda +from skfda.exploratory.stats import geometric_median +import unittest +import numpy as np + + +class TestGeometricMedian(unittest.TestCase): + + def test_R_comparison(self): + """ + Compare the results obtained using a real-world dataset with those in + R (Gmedian package). 
+ + """ + + X, _ = skfda.datasets.fetch_tecator(return_X_y=True) + + r_res = [2.750514, 2.752771, 2.755024, 2.75733, 2.759735, 2.762285, + 2.76502, 2.767978, 2.771194, 2.774686, 2.778441, 2.782469, + 2.786811, 2.791514, 2.796613, 2.802125, 2.808058, 2.814479, + 2.821395, 2.828766, 2.836444, 2.844187, 2.851768, 2.859055, + 2.86607, 2.872991, 2.880089, 2.887727, 2.896155, 2.905454, + 2.915467, 2.925852, 2.936333, 2.946924, 2.95798, 2.970123, + 2.983961, 2.999372, 3.015869, 3.032706, 3.049022, 3.064058, + 3.077409, 3.089294, 3.100633, 3.112871, 3.127676, 3.147024, + 3.171922, 3.203067, 3.240606, 3.283713, 3.330258, 3.376808, + 3.41942, 3.454856, 3.481628, 3.500368, 3.512892, 3.521134, + 3.526557, 3.530016, 3.531786, 3.531848, 3.530082, 3.526385, + 3.520757, 3.513308, 3.504218, 3.493666, 3.481803, 3.468755, + 3.454654, 3.439589, 3.423664, 3.406963, 3.389647, 3.371963, + 3.354073, 3.336043, 3.317809, 3.299259, 3.280295, 3.260775, + 3.240553, 3.219589, 3.198045, 3.176265, 3.15465, 3.133493, + 3.112882, 3.09274, 3.072943, 3.053437, 3.034223, 3.015319, + 2.996664, 2.978161, 2.959728, 2.941405] + + #median = geometric_median(X) + #median_multivariate = geometric_median(X.data_matrix[..., 0]) + + np.testing.assert_allclose( + median.data_matrix[0, :, 0], median_multivariate, rtol=1e-5) + + np.testing.assert_allclose(median_multivariate, r_res) From 62b4c3c25a7ad78339c5c343864a6d9a8f17ed55 Mon Sep 17 00:00:00 2001 From: pedrorponga <32200195+pedrorponga@users.noreply.github.com> Date: Sat, 28 Nov 2020 16:27:45 +0100 Subject: [PATCH 122/210] Update skfda/misc/metrics.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Ramos Carreño --- skfda/misc/metrics.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/skfda/misc/metrics.py b/skfda/misc/metrics.py index fe564f830..0a3bab538 100644 --- a/skfda/misc/metrics.py +++ b/skfda/misc/metrics.py @@ -273,7 +273,9 @@ def lp_distance(fdata1, fdata2, p=2, p2=2, *, eval_points=None, _check=True): For each pair of samples f and g the distance between them is defined as: .. math:: - d(f, g) = d(g, f) = \lVert f - g \rVert + d(f, g) = d(g, f) = \| f - g \|_p + + where :math:`\| {}\cdot{} \|_p` denotes the :func:`Lp norm `. Args: fdatagrid (FDataGrid): FDataGrid object. From ff47f945473926e43f0a90aab3480cd4746280ad Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Sat, 28 Nov 2020 16:51:01 +0100 Subject: [PATCH 123/210] References and docstrings in general --- skfda/misc/metrics.py | 28 +++++++++---------- skfda/ml/classification/DTM_classifier.py | 4 +++ .../maximum_depth_classifier.py | 4 +++ 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/skfda/misc/metrics.py b/skfda/misc/metrics.py index 0a3bab538..36e73a857 100644 --- a/skfda/misc/metrics.py +++ b/skfda/misc/metrics.py @@ -20,8 +20,10 @@ def _check_compatible(fdata1, fdata2): def _cast_to_grid(fdata1, fdata2, eval_points=None, _check=True, **kwargs): - """Checks if the fdatas passed as argument are unidimensional and - compatible and converts them to FDatagrid to compute their distances. + """Convert fdata1 and fdata2 to FDatagrid. + + Checks if the fdatas passed as argument are unidimensional and compatible + and converts them to FDatagrid to compute their distances. Args: fdata1: (:obj:`FData`): First functional object. @@ -30,7 +32,6 @@ def _cast_to_grid(fdata1, fdata2, eval_points=None, _check=True, **kwargs): Returns: tuple: Tuple with two :obj:`FDataGrid` with the same sample points. 
""" - # Dont perform any check if not _check: return fdata1, fdata2 @@ -64,7 +65,7 @@ def _cast_to_grid(fdata1, fdata2, eval_points=None, _check=True, **kwargs): def distance_from_norm(norm, **kwargs): - r"""Returns the distance induced by a norm. + r"""Return the distance induced by a norm. Given a norm :math:`\| \cdot \|: X \rightarrow \mathbb{R}`, returns the distance :math:`d: X \times X \rightarrow \mathbb{R}` induced @@ -112,7 +113,7 @@ def norm_distance(fdata1, fdata2): def pairwise_distance(distance, **kwargs): - r"""Returns a pairwise distance function for FData objects. + r"""Return a pairwise distance function for FData objects. Given a distance it returns the corresponding pairwise distance function. @@ -274,7 +275,7 @@ def lp_distance(fdata1, fdata2, p=2, p2=2, *, eval_points=None, _check=True): .. math:: d(f, g) = d(g, f) = \| f - g \|_p - + where :math:`\| {}\cdot{} \|_p` denotes the :func:`Lp norm `. Args: @@ -309,6 +310,11 @@ def lp_distance(fdata1, fdata2, p=2, p2=2, *, eval_points=None, _check=True): .... ValueError: ... + See also: + :func:`~skfda.misc.metrics.l1_distance + :func:`~skfda.misc.metrics.l2_distance + :func:`~skfda.misc.metrics.linf_distance + """ _check_compatible(fdata1, fdata2) @@ -342,9 +348,9 @@ def l2_distance(fdata1, fdata2, *, eval_points=None, _check=True): def linf_distance(fdata1, fdata2, *, eval_points=None, _check=True): - r"""Linf distance for FDataGrid objects. + r"""L_infinity distance for FDataGrid objects. - Calculates the Linf distance between fdata1 and fdata2: + Calculates the L_infinity distance between fdata1 and fdata2: .. math:: d(fdata1, fdata2) \equiv \inf \{ C\ge 0 : |fdata1(x)-fdata2(x)| \le C a.e. \}. @@ -389,7 +395,6 @@ def fisher_rao_distance(fdata1, fdata2, *, eval_points=None, _check=True): Metric* (pp. 5-7). arXiv:1103.3817v2. """ - fdata1, fdata2 = _cast_to_grid(fdata1, fdata2, eval_points=eval_points, _check=_check) @@ -459,7 +464,6 @@ def amplitude_distance(fdata1, fdata2, *, lam=0., eval_points=None, Functional and shape data analysis. In *Amplitude Space and a Metric Structure* (pp. 107-109). Springer. """ - fdata1, fdata2 = _cast_to_grid(fdata1, fdata2, eval_points=eval_points, _check=_check) @@ -532,14 +536,11 @@ def phase_distance(fdata1, fdata2, *, lam=0., eval_points=None, _check=True, Raises: ValueError: If the objects are not unidimensional. - Refereces: .. [SK16-4-10-2] Srivastava, Anuj & Klassen, Eric P. (2016). Functional and shape data analysis. In *Phase Space and a Metric Structure* (pp. 109-111). Springer. - """ - fdata1, fdata2 = _cast_to_grid(fdata1, fdata2, eval_points=eval_points, _check=_check) @@ -604,7 +605,6 @@ def warping_distance(warping1, warping2, *, eval_points=None, _check=True): Functions* (pp. 113-117). Springer. """ - warping1, warping2 = _cast_to_grid(warping1, warping2, eval_points=eval_points, _check=_check) diff --git a/skfda/ml/classification/DTM_classifier.py b/skfda/ml/classification/DTM_classifier.py index 0aaff5da9..7b0f467a6 100644 --- a/skfda/ml/classification/DTM_classifier.py +++ b/skfda/ml/classification/DTM_classifier.py @@ -58,6 +58,10 @@ class DTMClassifier(NearestCentroid): See also: :class:`~skfda.ml.classification.MaximumDepthClassifier + + References: + Fraiman, R. and Muniz, G. (2001). Trimmed means for functional + data. Test, 10, 419-440. 
""" def __init__(self, proportiontocut, diff --git a/skfda/ml/classification/maximum_depth_classifier.py b/skfda/ml/classification/maximum_depth_classifier.py index 5f4a482fc..ca71fb6c7 100644 --- a/skfda/ml/classification/maximum_depth_classifier.py +++ b/skfda/ml/classification/maximum_depth_classifier.py @@ -51,6 +51,10 @@ class MaximumDepthClassifier(BaseEstimator, ClassifierMixin): See also: :class:`~skfda.ml.classification.DTMClassifier + + References: + Ghosh, A. K. and Chaudhuri, P. (2005b). On maximum depth and + related classifiers. Scandinavian Journal of Statistics, 32, 327–350. """ def __init__(self, depth_method: Depth = ModifiedBandDepth()): From 00448441f2d9810eed56733d4582fb528da80f15 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Sat, 28 Nov 2020 17:59:41 +0100 Subject: [PATCH 124/210] DTMClassifier without inheritance from NearestCentroid --- skfda/misc/metrics.py | 12 +++---- skfda/ml/classification/DTM_classifier.py | 40 ++++++++++++++++++++--- 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/skfda/misc/metrics.py b/skfda/misc/metrics.py index 36e73a857..d337f012e 100644 --- a/skfda/misc/metrics.py +++ b/skfda/misc/metrics.py @@ -148,10 +148,10 @@ def pairwise(fdata1, fdata2=None): def lp_norm(fdata, p=2, p2=None): r"""Calculate the norm of all the samples in a FDataGrid object. - For each sample sample f the Lp norm is defined as: + For each sample f the Lp norm is defined as: .. math:: - \lVert f \rVert = \left( \int_D \lvert f \rvert^p dx \right)^{ + \| f \| = \left( \int_D \| f \|^p dx \right)^{ \frac{1}{p}} Where D is the domain over which the functions are defined. @@ -174,8 +174,8 @@ def lp_norm(fdata, p=2, p2=None): :math:`\| (x,y) \|_* = \sqrt{x^2 + y^2}`, the lp norm applied is .. math:: - \lVert f \rVert = \left( \int \int_D \left ( \sqrt{ \lvert f_1(x,y) - \rvert^2 + \lvert f_2(x,y) \rvert^2 } \right )^p dxdy \right)^{ + \| f \| = \left( \int \int_D \left ( \sqrt{ \| f_1(x,y) + \|^2 + \| f_2(x,y) \|^2 } \right )^p dxdy \right)^{ \frac{1}{p}} @@ -327,7 +327,7 @@ def l1_distance(fdata1, fdata2, *, eval_points=None, _check=True): Calculates the L1 distance between fdata1 and fdata2: .. math:: d(fdata1, fdata2) = - \left( \int_D \lvert fdata1(x)-fdata2(x) \rvert dx + \left( \int_D \| fdata1(x)-fdata2(x) \| dx \right) """ return lp_distance(fdata1, fdata2, p=1, p2=1, @@ -340,7 +340,7 @@ def l2_distance(fdata1, fdata2, *, eval_points=None, _check=True): Calculates the euclidean distance between fdata1 and fdata2: .. math:: d(fdata1, fdata2) = - \left( \int_D \lvert fdata1(x)-fdata2(x) \rvert^2 dx + \left( \int_D \| fdata1(x)-fdata2(x) \|^2 dx \right)^{\frac{1}{2}} """ return lp_distance(fdata1, fdata2, p=2, p2=2, diff --git a/skfda/ml/classification/DTM_classifier.py b/skfda/ml/classification/DTM_classifier.py index 7b0f467a6..489da1463 100644 --- a/skfda/ml/classification/DTM_classifier.py +++ b/skfda/ml/classification/DTM_classifier.py @@ -1,12 +1,14 @@ """Distance to trimmed means (DTM) classification.""" +from sklearn.base import ClassifierMixin, BaseEstimator + from ..._neighbors.classification import NearestCentroid from ...exploratory.depth import Depth, ModifiedBandDepth from ...exploratory.stats import trim_mean from ...misc.metrics import lp_distance -class DTMClassifier(NearestCentroid): +class DTMClassifier(BaseEstimator, ClassifierMixin): """Distance to trimmed means (DTM) classification. 
Test samples are classified to the class that minimizes the distance of @@ -68,7 +70,35 @@ def __init__(self, proportiontocut, depth_method: Depth = ModifiedBandDepth(), metric=lp_distance): """Initialize the classifier.""" - super().__init__(metric=metric, - centroid=lambda fdatagrid: trim_mean(fdatagrid, - proportiontocut, - depth_method)) + self.proportiontocut = proportiontocut + self.depth_method = depth_method + self.metric = metric + + def fit(self, X, y): + """Fit the model using X as training data and y as target values. + + Args: + X (:class:`FDataGrid`): FDataGrid with the training data. + y (array-like): Target values of shape = [n_samples]. + + """ + self.clf = NearestCentroid( + metric=self.metric, + centroid=lambda fdatagrid: trim_mean(fdatagrid, + self.proportiontocut, + self.depth_method)) + self.clf.fit(X, y) + + return self + + def predict(self, X): + """Predict the class labels for the provided data. + + Args: + X (:class:`FDataGrid`): FDataGrid with the test samples. + + Returns: + y (np.array): array of shape [n_samples] or + [n_samples, n_outputs] with class labels for each data sample. + """ + return self.clf.predict(X) From 05d8175538a1b1d36e57872e64673d45bff68a93 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Sat, 28 Nov 2020 18:07:48 +0100 Subject: [PATCH 125/210] sample -> observation --- skfda/misc/metrics.py | 51 ++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/skfda/misc/metrics.py b/skfda/misc/metrics.py index d337f012e..4fe7c9625 100644 --- a/skfda/misc/metrics.py +++ b/skfda/misc/metrics.py @@ -30,7 +30,8 @@ def _cast_to_grid(fdata1, fdata2, eval_points=None, _check=True, **kwargs): fdata2: (:obj:`FData`): Second functional object. Returns: - tuple: Tuple with two :obj:`FDataGrid` with the same sample points. + tuple: Tuple with two :obj:`FDataGrid` with the same observation + points. """ # Dont perform any check if not _check: @@ -58,7 +59,7 @@ def _cast_to_grid(fdata1, fdata2, eval_points=None, _check=True, **kwargs): elif not np.array_equal(fdata1.grid_points, fdata2.grid_points): - raise ValueError("Sample points for both objects must be equal or" + raise ValueError("Observation points for both objects must be equal or" "a new list evaluation points must be specified") return fdata1, fdata2 @@ -118,13 +119,14 @@ def pairwise_distance(distance, **kwargs): Given a distance it returns the corresponding pairwise distance function. The returned pairwise distance function calculates the distance between - all possible pairs consisting of one sample of the first FDataGrid object - and one of the second one. + all possible pairs consisting of one observation of the first FDataGrid + object and one of the second one. The matrix returned by the pairwise distance is a matrix with as many rows - as samples in the first object and as many columns as samples in the second - one. Each element (i, j) of the matrix is the distance between the ith - sample of the first object and the jth sample of the second one. + as observations in the first object and as many columns as observations in + the second one. Each element (i, j) of the matrix is the distance between + the ith observation of the first object and the jth observation of the + second one. Args: distance (:obj:`Function`): Distance functions between two functional @@ -146,9 +148,9 @@ def pairwise(fdata1, fdata2=None): def lp_norm(fdata, p=2, p2=None): - r"""Calculate the norm of all the samples in a FDataGrid object. 
+ r"""Calculate the norm of all the observations in a FDataGrid object. - For each sample f the Lp norm is defined as: + For each observation f the Lp norm is defined as: .. math:: \| f \| = \left( \int_D \| f \|^p dx \right)^{ @@ -188,10 +190,11 @@ def lp_norm(fdata, p=2, p2=None): multivariate objects. Defaults to 2. Returns: - numpy.darray: Matrix with as many rows as samples in the first - object and as many columns as samples in the second one. Each - element (i, j) of the matrix is the inner product of the ith sample - of the first object and the jth sample of the second one. + numpy.darray: Matrix with as many rows as observations in the first + object and as many columns as observations in the second one. Each + element (i, j) of the matrix is the inner product of the ith + observation of the first object and the jth observation of the second + one. Examples: Calculates the norm of a FDataGrid containing the functions y = 1 @@ -247,7 +250,8 @@ def lp_norm(fdata, p=2, p2=None): if fdata.dim_domain == 1: res = np.max(data_matrix[..., 0], axis=1) else: - res = np.array([np.max(sample) for sample in data_matrix]) + res = np.array([np.max(observation) + for observation in data_matrix]) elif fdata.dim_domain == 1: @@ -271,7 +275,8 @@ def lp_distance(fdata1, fdata2, p=2, p2=2, *, eval_points=None, _check=True): Calculates the distance between two functional objects. - For each pair of samples f and g the distance between them is defined as: + For each pair of observations f and g the distance between them is defined + as: .. math:: d(f, g) = d(g, f) = \| f - g \|_p @@ -375,8 +380,8 @@ def fisher_rao_distance(fdata1, fdata2, *, eval_points=None, _check=True): match with the usual fisher-rao distance in non-parametric form for probability distributions [S11-2]_. - If the samples are defined in a domain different than (0,1) their domains - are normalized to this interval with an affine transformation. + If the observations are defined in a domain different than (0,1) their + domains are normalized to this interval with an affine transformation. Args: fdata1 (FData): First FData object. @@ -398,7 +403,7 @@ def fisher_rao_distance(fdata1, fdata2, *, eval_points=None, _check=True): fdata1, fdata2 = _cast_to_grid(fdata1, fdata2, eval_points=eval_points, _check=_check) - # Both should have the same sample points + # Both should have the same observation points eval_points_normalized = _normalize_scale(fdata1.grid_points[0]) # Calculate the corresponding srsf and normalize to (0,1) @@ -442,8 +447,8 @@ def amplitude_distance(fdata1, fdata2, *, lam=0., eval_points=None, See [SK16-4-10-1]_ for a detailed explanation. - If the samples are defined in a domain different than (0,1) their domains - are normalized to this interval with an affine transformation. + If the observations are defined in a domain different than (0,1) their + domains are normalized to this interval with an affine transformation. Args: fdata1 (FData): First FData object. @@ -467,7 +472,7 @@ def amplitude_distance(fdata1, fdata2, *, lam=0., eval_points=None, fdata1, fdata2 = _cast_to_grid(fdata1, fdata2, eval_points=eval_points, _check=_check) - # Both should have the same sample points + # Both should have the same observation points eval_points_normalized = _normalize_scale(fdata1.grid_points[0]) # Calculate the corresponding srsf and normalize to (0,1) @@ -519,8 +524,8 @@ def phase_distance(fdata1, fdata2, *, lam=0., eval_points=None, _check=True, See [SK16-4-10-2]_ for a detailed explanation. 
- If the samples are defined in a domain different than (0,1) their domains - are normalized to this interval with an affine transformation. + If the observations are defined in a domain different than (0,1) their + domains are normalized to this interval with an affine transformation. Args: fdata1 (FData): First FData object. From 1baa39d6762cc5ae50aff1d94330a345b2f2bb42 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Sun, 29 Nov 2020 01:18:11 +0100 Subject: [PATCH 126/210] Requested changes --- skfda/misc/metrics.py | 29 ++++++++++++++++++----- skfda/ml/classification/DTM_classifier.py | 10 ++++---- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/skfda/misc/metrics.py b/skfda/misc/metrics.py index 4fe7c9625..eaeef2c4c 100644 --- a/skfda/misc/metrics.py +++ b/skfda/misc/metrics.py @@ -59,7 +59,7 @@ def _cast_to_grid(fdata1, fdata2, eval_points=None, _check=True, **kwargs): elif not np.array_equal(fdata1.grid_points, fdata2.grid_points): - raise ValueError("Observation points for both objects must be equal or" + raise ValueError("Grid points for both objects must be equal or" "a new list evaluation points must be specified") return fdata1, fdata2 @@ -334,6 +334,11 @@ def l1_distance(fdata1, fdata2, *, eval_points=None, _check=True): d(fdata1, fdata2) = \left( \int_D \| fdata1(x)-fdata2(x) \| dx \right) + + See also: + :func:`~skfda.misc.metrics.lp_distance + :func:`~skfda.misc.metrics.l2_distance + :func:`~skfda.misc.metrics.linf_distance """ return lp_distance(fdata1, fdata2, p=1, p2=1, eval_points=eval_points, _check=_check) @@ -347,6 +352,11 @@ def l2_distance(fdata1, fdata2, *, eval_points=None, _check=True): d(fdata1, fdata2) = \left( \int_D \| fdata1(x)-fdata2(x) \|^2 dx \right)^{\frac{1}{2}} + + See also: + :func:`~skfda.misc.metrics.lp_distance + :func:`~skfda.misc.metrics.l1_distance + :func:`~skfda.misc.metrics.linf_distance """ return lp_distance(fdata1, fdata2, p=2, p2=2, eval_points=eval_points, _check=_check) @@ -359,6 +369,11 @@ def linf_distance(fdata1, fdata2, *, eval_points=None, _check=True): .. math:: d(fdata1, fdata2) \equiv \inf \{ C\ge 0 : |fdata1(x)-fdata2(x)| \le C a.e. \}. + + See also: + :func:`~skfda.misc.metrics.lp_distance + :func:`~skfda.misc.metrics.l1_distance + :func:`~skfda.misc.metrics.l2_distance """ return lp_distance(fdata1, fdata2, p=np.inf, p2=np.inf, eval_points=eval_points, _check=_check) @@ -380,8 +395,9 @@ def fisher_rao_distance(fdata1, fdata2, *, eval_points=None, _check=True): match with the usual fisher-rao distance in non-parametric form for probability distributions [S11-2]_. - If the observations are defined in a domain different than (0,1) their - domains are normalized to this interval with an affine transformation. + If the observations are defined in a :term:`domain` different than (0,1) + their domains are normalized to this interval with an affine + transformation. Args: fdata1 (FData): First FData object. @@ -472,7 +488,7 @@ def amplitude_distance(fdata1, fdata2, *, lam=0., eval_points=None, fdata1, fdata2 = _cast_to_grid(fdata1, fdata2, eval_points=eval_points, _check=_check) - # Both should have the same observation points + # Both should have the same grid points eval_points_normalized = _normalize_scale(fdata1.grid_points[0]) # Calculate the corresponding srsf and normalize to (0,1) @@ -524,8 +540,9 @@ def phase_distance(fdata1, fdata2, *, lam=0., eval_points=None, _check=True, See [SK16-4-10-2]_ for a detailed explanation. 
- If the observations are defined in a domain different than (0,1) their - domains are normalized to this interval with an affine transformation. + If the observations are defined in a :term:`domain` different than (0,1) + their domains are normalized to this interval with an affine + transformation. Args: fdata1 (FData): First FData object. diff --git a/skfda/ml/classification/DTM_classifier.py b/skfda/ml/classification/DTM_classifier.py index 489da1463..904116186 100644 --- a/skfda/ml/classification/DTM_classifier.py +++ b/skfda/ml/classification/DTM_classifier.py @@ -66,9 +66,9 @@ class DTMClassifier(BaseEstimator, ClassifierMixin): data. Test, 10, 419-440. """ - def __init__(self, proportiontocut, + def __init__(self, proportiontocut: float, depth_method: Depth = ModifiedBandDepth(), - metric=lp_distance): + metric: function = lp_distance): """Initialize the classifier.""" self.proportiontocut = proportiontocut self.depth_method = depth_method @@ -82,12 +82,12 @@ def fit(self, X, y): y (array-like): Target values of shape = [n_samples]. """ - self.clf = NearestCentroid( + self._clf = NearestCentroid( metric=self.metric, centroid=lambda fdatagrid: trim_mean(fdatagrid, self.proportiontocut, self.depth_method)) - self.clf.fit(X, y) + self._clf.fit(X, y) return self @@ -101,4 +101,4 @@ def predict(self, X): y (np.array): array of shape [n_samples] or [n_samples, n_outputs] with class labels for each data sample. """ - return self.clf.predict(X) + return self._clf.predict(X) From 9cea4b969c51ace813767996fcebc5d1cf6e550c Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Sun, 29 Nov 2020 01:19:57 +0100 Subject: [PATCH 127/210] Style --- skfda/ml/classification/maximum_depth_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/ml/classification/maximum_depth_classifier.py b/skfda/ml/classification/maximum_depth_classifier.py index ca71fb6c7..533cbefd7 100644 --- a/skfda/ml/classification/maximum_depth_classifier.py +++ b/skfda/ml/classification/maximum_depth_classifier.py @@ -51,7 +51,7 @@ class MaximumDepthClassifier(BaseEstimator, ClassifierMixin): See also: :class:`~skfda.ml.classification.DTMClassifier - + References: Ghosh, A. K. and Chaudhuri, P. (2005b). On maximum depth and related classifiers. Scandinavian Journal of Statistics, 32, 327–350. 
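For context on the refactor in patches 124-126, a minimal end-to-end sketch of the resulting DTMClassifier API. The two-class synthetic dataset below is illustrative only (it is not part of any patch) and assumes the make_gaussian_process and skfda.concatenate helpers used elsewhere in this series:

import numpy as np
import skfda
from skfda.datasets import make_gaussian_process
from skfda.ml.classification import DTMClassifier

# Two classes of Gaussian trajectories; the second adds a linear trend.
X_0 = make_gaussian_process(n_samples=50, n_features=100, random_state=0)
X_1 = make_gaussian_process(n_samples=50, n_features=100,
                            mean=lambda t: 2 * t, random_state=1)
X = skfda.concatenate((X_0, X_1))
y = np.zeros(100)
y[50:] = 1

# fit() now builds the internal NearestCentroid; predict() delegates to it.
clf = DTMClassifier(proportiontocut=0.25).fit(X, y)
predictions = clf.predict(X)

Keeping proportiontocut, depth_method and metric as plain attributes set in __init__, instead of baking them into the parent constructor, is what makes the estimator clonable by scikit-learn model-selection tools.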
From d365a3cd3ea04b398147459211a7d07d27c0436f Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Sun, 29 Nov 2020 01:29:59 +0100 Subject: [PATCH 128/210] Fixed types --- skfda/ml/classification/DTM_classifier.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/skfda/ml/classification/DTM_classifier.py b/skfda/ml/classification/DTM_classifier.py index 904116186..12b67f3ae 100644 --- a/skfda/ml/classification/DTM_classifier.py +++ b/skfda/ml/classification/DTM_classifier.py @@ -1,5 +1,6 @@ """Distance to trimmed means (DTM) classification.""" +from typing import Callable from sklearn.base import ClassifierMixin, BaseEstimator from ..._neighbors.classification import NearestCentroid @@ -68,7 +69,7 @@ class DTMClassifier(BaseEstimator, ClassifierMixin): def __init__(self, proportiontocut: float, depth_method: Depth = ModifiedBandDepth(), - metric: function = lp_distance): + metric: Callable = lp_distance) -> None: """Initialize the classifier.""" self.proportiontocut = proportiontocut self.depth_method = depth_method From 33eec62986c98dfbdefd8d9ff02e170227eac5c1 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Sun, 29 Nov 2020 01:31:40 +0100 Subject: [PATCH 129/210] Docstrings type update --- skfda/ml/classification/DTM_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/ml/classification/DTM_classifier.py b/skfda/ml/classification/DTM_classifier.py index 12b67f3ae..95b839cb2 100644 --- a/skfda/ml/classification/DTM_classifier.py +++ b/skfda/ml/classification/DTM_classifier.py @@ -24,7 +24,7 @@ class DTMClassifier(BaseEstimator, ClassifierMixin): The depth class used to order the data. See the documentation of the depths module for a list of available depths. By default it is ModifiedBandDepth. - metric (function, default + metric (Callable, default :func:`lp_distance `): Distance function between two functional objects. See the documentation of the metrics module for a list of available From 946b6f79d6f5a11f78481e916b770cfc0d3dc045 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Sun, 29 Nov 2020 17:44:00 +0100 Subject: [PATCH 130/210] Fix geometric median. 
--- skfda/exploratory/stats/_stats.py | 2 +- tests/test_stats.py | 47 ++++++++++++++++--------------- 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/skfda/exploratory/stats/_stats.py b/skfda/exploratory/stats/_stats.py index a60a08bc0..23cb4b9e4 100644 --- a/skfda/exploratory/stats/_stats.py +++ b/skfda/exploratory/stats/_stats.py @@ -149,7 +149,7 @@ def weighted_average(fdata, weights): if lp_norm(median_new - median) < tol: return median_new - prod_matrix = identity - weights_new + prod_matrix = (identity - weights_new).T np.einsum('ln,nm,ml->l', prod_matrix.T, gram, prod_matrix, out=distances) diff --git a/tests/test_stats.py b/tests/test_stats.py index 8b35caa8e..0e7ad8a43 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -15,28 +15,31 @@ def test_R_comparison(self): X, _ = skfda.datasets.fetch_tecator(return_X_y=True) - r_res = [2.750514, 2.752771, 2.755024, 2.75733, 2.759735, 2.762285, - 2.76502, 2.767978, 2.771194, 2.774686, 2.778441, 2.782469, - 2.786811, 2.791514, 2.796613, 2.802125, 2.808058, 2.814479, - 2.821395, 2.828766, 2.836444, 2.844187, 2.851768, 2.859055, - 2.86607, 2.872991, 2.880089, 2.887727, 2.896155, 2.905454, - 2.915467, 2.925852, 2.936333, 2.946924, 2.95798, 2.970123, - 2.983961, 2.999372, 3.015869, 3.032706, 3.049022, 3.064058, - 3.077409, 3.089294, 3.100633, 3.112871, 3.127676, 3.147024, - 3.171922, 3.203067, 3.240606, 3.283713, 3.330258, 3.376808, - 3.41942, 3.454856, 3.481628, 3.500368, 3.512892, 3.521134, - 3.526557, 3.530016, 3.531786, 3.531848, 3.530082, 3.526385, - 3.520757, 3.513308, 3.504218, 3.493666, 3.481803, 3.468755, - 3.454654, 3.439589, 3.423664, 3.406963, 3.389647, 3.371963, - 3.354073, 3.336043, 3.317809, 3.299259, 3.280295, 3.260775, - 3.240553, 3.219589, 3.198045, 3.176265, 3.15465, 3.133493, - 3.112882, 3.09274, 3.072943, 3.053437, 3.034223, 3.015319, - 2.996664, 2.978161, 2.959728, 2.941405] - - #median = geometric_median(X) - #median_multivariate = geometric_median(X.data_matrix[..., 0]) + r_res = [2.74083, 2.742715, 2.744627, 2.74659, 2.748656, + 2.750879, 2.753307, 2.755984, 2.758927, 2.762182, + 2.765724, 2.76957, 2.773756, 2.778333, 2.783346, + 2.788818, 2.794758, 2.801225, 2.808233, 2.815714, + 2.82351, 2.831355, 2.838997, 2.846298, 2.853295, + 2.860186, 2.867332, 2.875107, 2.883778, 2.893419, + 2.903851, 2.914717, 2.925698, 2.936765, 2.948293, + 2.960908, 2.97526, 2.991206, 3.008222, 3.02552, + 3.042172, 3.057356, 3.070666, 3.082351, 3.093396, + 3.105338, 3.119946, 3.139307, 3.164418, 3.196014, + 3.234248, 3.278306, 3.326051, 3.374015, 3.418148, + 3.455051, 3.483095, 3.502789, 3.515961, 3.524557, + 3.530135, 3.53364, 3.535369, 3.535305, 3.533326, + 3.529343, 3.523357, 3.51548, 3.5059, 3.494807, + 3.482358, 3.468695, 3.453939, 3.438202, 3.421574, + 3.404169, 3.386148, 3.367751, 3.349166, 3.330441, + 3.311532, 3.292318, 3.272683, 3.252482, 3.23157, + 3.2099, 3.187632, 3.165129, 3.14282, 3.121008, + 3.099793, 3.079092, 3.058772, 3.038755, 3.019038, + 2.99963, 2.980476, 2.961467, 2.94252, 2.923682] + + median = geometric_median(X) + median_multivariate = geometric_median(X.data_matrix[..., 0]) np.testing.assert_allclose( - median.data_matrix[0, :, 0], median_multivariate, rtol=1e-5) + median.data_matrix[0, :, 0], median_multivariate, rtol=1e-4) - np.testing.assert_allclose(median_multivariate, r_res) + np.testing.assert_allclose(median_multivariate, r_res, rtol=1e-6) From 3225ff24570c775d6b0bebace2e28fe5118794ef Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Sun, 29 Nov 2020 18:59:42 +0100 Subject: [PATCH 
131/210] Domain and grid points --- skfda/misc/metrics.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/skfda/misc/metrics.py b/skfda/misc/metrics.py index eaeef2c4c..498766d64 100644 --- a/skfda/misc/metrics.py +++ b/skfda/misc/metrics.py @@ -30,8 +30,7 @@ def _cast_to_grid(fdata1, fdata2, eval_points=None, _check=True, **kwargs): fdata2: (:obj:`FData`): Second functional object. Returns: - tuple: Tuple with two :obj:`FDataGrid` with the same observation - points. + tuple: Tuple with two :obj:`FDataGrid` with the same grid points. """ # Dont perform any check if not _check: @@ -156,7 +155,7 @@ def lp_norm(fdata, p=2, p2=None): \| f \| = \left( \int_D \| f \|^p dx \right)^{ \frac{1}{p}} - Where D is the domain over which the functions are defined. + Where D is the :term:`domain` over which the functions are defined. The integral is approximated using Simpson's rule. @@ -419,7 +418,7 @@ def fisher_rao_distance(fdata1, fdata2, *, eval_points=None, _check=True): fdata1, fdata2 = _cast_to_grid(fdata1, fdata2, eval_points=eval_points, _check=_check) - # Both should have the same observation points + # Both should have the same grid points eval_points_normalized = _normalize_scale(fdata1.grid_points[0]) # Calculate the corresponding srsf and normalize to (0,1) @@ -463,7 +462,7 @@ def amplitude_distance(fdata1, fdata2, *, lam=0., eval_points=None, See [SK16-4-10-1]_ for a detailed explanation. - If the observations are defined in a domain different than (0,1) their + If the observations are defined in a :term:`domain` different than (0,1) their domains are normalized to this interval with an affine transformation. Args: @@ -608,7 +607,7 @@ def warping_distance(warping1, warping2, *, eval_points=None, _check=True): See [SK16-4-11-2]_ for a detailed explanation. If the warpings are not defined in [0,1], an affine transformation is maked - to change the domain. + to change the :term:`domain`. Args: fdata1 (:obj:`FData`): First warping. From e64afbfb3ad886e69c02e4c277b91a6d0d570891 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Sun, 29 Nov 2020 19:00:11 +0100 Subject: [PATCH 132/210] Line too long --- skfda/misc/metrics.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/skfda/misc/metrics.py b/skfda/misc/metrics.py index 498766d64..1cec68db2 100644 --- a/skfda/misc/metrics.py +++ b/skfda/misc/metrics.py @@ -462,8 +462,9 @@ def amplitude_distance(fdata1, fdata2, *, lam=0., eval_points=None, See [SK16-4-10-1]_ for a detailed explanation. - If the observations are defined in a :term:`domain` different than (0,1) their - domains are normalized to this interval with an affine transformation. + If the observations are defined in a :term:`domain` different than (0,1) + their domains are normalized to this interval with an affine + transformation. Args: fdata1 (FData): First FData object. 
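As a quick sanity check on the norm and distance docstrings reworked in the patches above, a small illustrative sketch (not part of any patch; the expected values follow directly from the documented formulas):

import numpy as np
from skfda import FDataGrid
from skfda.misc.metrics import lp_norm, l2_distance

grid_points = np.linspace(0, 1, 1001)
# Two observations: the constant function 1 and the identity function x.
fd = FDataGrid([np.ones(1001), grid_points], grid_points)

# ||1||_2 = 1 and ||x||_2 = sqrt(1/3) ~ 0.577 over the domain D = [0, 1].
print(lp_norm(fd, p=2))
# d(1, x) = (integral over D of (1 - x)^2 dx)^(1/2) = sqrt(1/3) as well.
print(l2_distance(fd[0], fd[1]))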
From 39db2e4efc2fef593e07ec9e9a13f05d05210eff Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Sun, 29 Nov 2020 19:26:49 +0100 Subject: [PATCH 133/210] Update centroid_classifier --- .../ml/classification/centroid_classifiers.py | 50 +++++++++++++++---- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/skfda/ml/classification/centroid_classifiers.py b/skfda/ml/classification/centroid_classifiers.py index 6da2b86e2..14fdcdc01 100644 --- a/skfda/ml/classification/centroid_classifiers.py +++ b/skfda/ml/classification/centroid_classifiers.py @@ -1,5 +1,6 @@ """Centroid models for supervised classification.""" +from typing import Callable from sklearn.base import ClassifierMixin, BaseEstimator from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted @@ -104,7 +105,7 @@ def predict(self, X): X, self.centroids_).argmin(axis=1)] -class DTMClassifier(NearestCentroid): +class DTMClassifier(BaseEstimator, ClassifierMixin): """Distance to trimmed means (DTM) classification. Test samples are classified to the class that minimizes the distance of @@ -119,7 +120,7 @@ class DTMClassifier(NearestCentroid): The depth class used to order the data. See the documentation of the depths module for a list of available depths. By default it is ModifiedBandDepth. - metric (function, default + metric (Callable, default :func:`lp_distance `): Distance function between two functional objects. See the documentation of the metrics module for a list of available @@ -156,14 +157,45 @@ class DTMClassifier(NearestCentroid): See also: :class:`~skfda.ml.classification.MaximumDepthClassifier - :class:`~skfda.ml.classification.NearestCentroid + + References: + Fraiman, R. and Muniz, G. (2001). Trimmed means for functional + data. Test, 10, 419-440. """ - def __init__(self, proportiontocut, + def __init__(self, proportiontocut: float, depth_method: Depth = ModifiedBandDepth(), - metric=lp_distance): + metric: Callable = lp_distance) -> None: """Initialize the classifier.""" - super().__init__(metric=metric, - centroid=lambda fdatagrid: trim_mean(fdatagrid, - proportiontocut, - depth_method)) + self.proportiontocut = proportiontocut + self.depth_method = depth_method + self.metric = metric + + def fit(self, X, y): + """Fit the model using X as training data and y as target values. + + Args: + X (:class:`FDataGrid`): FDataGrid with the training data. + y (array-like): Target values of shape = [n_samples]. + + """ + self._clf = NearestCentroid( + metric=self.metric, + centroid=lambda fdatagrid: trim_mean(fdatagrid, + self.proportiontocut, + self.depth_method)) + self._clf.fit(X, y) + + return self + + def predict(self, X): + """Predict the class labels for the provided data. + + Args: + X (:class:`FDataGrid`): FDataGrid with the test samples. + + Returns: + y (np.array): array of shape [n_samples] or + [n_samples, n_outputs] with class labels for each data sample. 
+ """ + return self._clf.predict(X) From 81ef7298e47ba5401edd5398e8817882f128e85e Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Sun, 29 Nov 2020 20:31:34 +0100 Subject: [PATCH 134/210] fit_transform had no coverage --- tests/test_clustering.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_clustering.py b/tests/test_clustering.py index 2499237dd..21d306581 100644 --- a/tests/test_clustering.py +++ b/tests/test_clustering.py @@ -18,7 +18,12 @@ def test_kmeans_univariate(self): init = np.array([[0, 0, 0, 0, 0, 0], [2, 1, -1, 0.5, 0, -0.5]]) init_fd = FDataGrid(init, grid_points) kmeans = KMeans(init=init_fd) - kmeans.fit(fd) + distances_to_centers = kmeans.fit_transform(fd) + np.testing.assert_allclose(distances_to_centers, + np.array([[2.98142397, 9.23534876], + [0.68718427, 6.50960828], + [3.31243449, 4.39222798], + [6.49679408, 0.]])) np.testing.assert_array_equal(kmeans.predict(fd), np.array([0, 0, 0, 1])) np.testing.assert_allclose(kmeans.transform(fd), From 37ce626ccc2299628e76a74cd9ac6247d1c14f40 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Sun, 29 Nov 2020 23:23:38 +0100 Subject: [PATCH 135/210] Optimize geometric mean. --- skfda/exploratory/stats/_stats.py | 65 ++++++++++++++++++----------- skfda/misc/metrics.py | 15 ++++--- tests/test_stats.py | 68 ++++++++++++++++++++++++++++++- 3 files changed, 118 insertions(+), 30 deletions(-) diff --git a/skfda/exploratory/stats/_stats.py b/skfda/exploratory/stats/_stats.py index 23cb4b9e4..8b663025c 100644 --- a/skfda/exploratory/stats/_stats.py +++ b/skfda/exploratory/stats/_stats.py @@ -2,11 +2,11 @@ """ from builtins import isinstance -from IPython.extensions.autoreload import isinstance2 +from typing import Callable, TypeVar, Union import numpy as np -from ...misc.metrics import lp_norm +from ...misc.metrics import lp_norm, lp_distance from ...representation import FData from ..depth import ModifiedBandDepth @@ -93,6 +93,9 @@ def depth_based_median(fdatagrid, depth_method=ModifiedBandDepth()): Returns: FDataGrid: object containing the computed depth_based median. + See also: + geometric_median + """ depth = depth_method(fdatagrid) indices_descending_depth = (-depth).argsort(axis=0) @@ -101,7 +104,11 @@ def depth_based_median(fdatagrid, depth_method=ModifiedBandDepth()): return fdatagrid[indices_descending_depth[0]] -def geometric_median(fdata: FData, tol: float=1.e-8): +T = TypeVar('T', bound=Union[np.array, FData]) + + +def geometric_median(X: T, tol: float=1.e-8, + metric: Callable = lp_distance) -> T: r"""Compute the geometric median. The sample geometric median is the point that minimizes the :math:`L_1` @@ -112,31 +119,47 @@ def geometric_median(fdata: FData, tol: float=1.e-8): \underset{y \in L(\mathcal{T})}{\arg \min} \sum_{i=1}^m \left \| x_i-y \right \|_2 + It uses the corrected Weiszfeld algorithm to compute the median, + precalculating the inner product matrix in order to compute the + distances. + Args: - fdata (FData): Object containing different samples of a + X: Object containing different samples of a functional variable. - tol (float): tolerance used to check convergence. + tol: tolerance used to check convergence. Returns: FData: object containing the computed geometric median. - """ + Example: - from ...misc import inner_product_matrix + >>> from skfda import FDataGrid + >>> data_matrix = [[0.5, 1, 2, .5], [1.5, 1, 4, .5]] + >>> X = FDataGrid(data_matrix) + >>> median = geometric_median(X) + >>> median.data_matrix[0, ..., 0] + array([ 1. , 1. , 3. 
, 0.5]) - def weighted_average(fdata, weights): - if isinstance(fdata, FData): - return (fdata * weights).sum() + See also: + depth_based_median + + References: + Gervini, D. (2008). Robust functional estimation using the median and + spherical principal components. Biometrika, 95(3), 587–600. + https://doi.org/10.1093/biomet/asn031 + + """ + + def weighted_average(X, weights): + if isinstance(X, FData): + return (X * weights).sum() else: # To support also multivariate data - return (fdata.T * weights).T.sum(axis=0) + return (X.T * weights).T.sum(axis=0) - gram = inner_product_matrix(fdata) - identity = np.eye(len(fdata)) - weights = np.full(len(fdata), 1 / len(fdata)) - prod_matrix = identity - weights - distances = np.einsum('ln,nm,ml->l', prod_matrix.T, gram, prod_matrix)**0.5 - median = weighted_average(fdata, weights) + weights = np.full(len(X), 1 / len(X)) + median = weighted_average(X, weights) + distances = metric(X, median) while True: zero_distances = (distances == 0) @@ -144,16 +167,12 @@ def weighted_average(fdata, weights): weights_new = ((1 / distances) / np.sum(1 / distances) if n_zeros == 0 else (1 / n_zeros) * zero_distances) - median_new = weighted_average(fdata, weights_new) + median_new = weighted_average(X, weights_new) if lp_norm(median_new - median) < tol: return median_new - prod_matrix = (identity - weights_new).T - - np.einsum('ln,nm,ml->l', prod_matrix.T, gram, - prod_matrix, out=distances) - distances **= 0.5 + distances = metric(X, median_new) weights, median = (weights_new, median_new) diff --git a/skfda/misc/metrics.py b/skfda/misc/metrics.py index a8b181923..3d76d9c18 100644 --- a/skfda/misc/metrics.py +++ b/skfda/misc/metrics.py @@ -1,3 +1,5 @@ +from builtins import isinstance + import scipy.integrate import numpy as np @@ -6,17 +8,18 @@ from ..preprocessing.registration import normalize_warping, ElasticRegistration from ..preprocessing.registration._warping import _normalize_scale from ..preprocessing.registration.elastic import SRSF -from ..representation import FDataGrid, FDataBasis +from ..representation import FData, FDataGrid, FDataBasis def _check_compatible(fdata1, fdata2): - if (fdata2.dim_codomain != fdata1.dim_codomain or - fdata2.dim_domain != fdata1.dim_domain): - raise ValueError("Objects should have the same dimensions") + if isinstance(fdata1, FData) and isinstance(fdata2, FData): + if (fdata2.dim_codomain != fdata1.dim_codomain or + fdata2.dim_domain != fdata1.dim_domain): + raise ValueError("Objects should have the same dimensions") - if not np.array_equal(fdata1.domain_range, fdata2.domain_range): - raise ValueError("Domain ranges for both objects must be equal") + if not np.array_equal(fdata1.domain_range, fdata2.domain_range): + raise ValueError("Domain ranges for both objects must be equal") def _cast_to_grid(fdata1, fdata2, eval_points=None, _check=True, **kwargs): diff --git a/tests/test_stats.py b/tests/test_stats.py index 0e7ad8a43..234aa909c 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -36,10 +36,76 @@ def test_R_comparison(self): 3.099793, 3.079092, 3.058772, 3.038755, 3.019038, 2.99963, 2.980476, 2.961467, 2.94252, 2.923682] - median = geometric_median(X) median_multivariate = geometric_median(X.data_matrix[..., 0]) + median = geometric_median(X) np.testing.assert_allclose( median.data_matrix[0, :, 0], median_multivariate, rtol=1e-4) np.testing.assert_allclose(median_multivariate, r_res, rtol=1e-6) + + def test_big(self): + + X, _ = skfda.datasets.fetch_phoneme(return_X_y=True) + + res = np.array( + 
[10.87814495, 12.10539654, 15.19841961, 16.29929599, 15.52206033, + 15.35123923, 16.44119775, 16.92255038, 16.70263134, 16.62235371, + 16.76616863, 16.80691414, 16.67460045, 16.64628944, 16.60898231, + 16.64735698, 16.7749517, 16.84533289, 16.8134475, 16.69540395, + 16.56083649, 16.3716527, 16.13744993, 15.95246457, 15.78934047, + 15.64383354, 15.55120344, 15.4363593, 15.36998848, 15.35300094, + 15.23606121, 15.16001392, 15.07326127, 14.92863818, 14.77405828, + 14.63772985, 14.4496911, 14.22752646, 14.07162908, 13.90989422, + 13.68979176, 13.53664058, 13.45465055, 13.40192835, 13.39111557, + 13.32592256, 13.26068118, 13.2314264, 13.29364741, 13.30700552, + 13.30579737, 13.35277966, 13.36572257, 13.45244228, 13.50615096, + 13.54872786, 13.65412519, 13.74737364, 13.79203753, 13.87827636, + 13.97728725, 14.06989886, 14.09950082, 14.13697733, 14.18414727, + 14.1914785, 14.17973283, 14.19655855, 14.20551814, 14.23059727, + 14.23195262, 14.21091905, 14.22234481, 14.17687285, 14.1732165, + 14.13488535, 14.11564007, 14.0296303, 13.99540104, 13.9383672, + 13.85056848, 13.73195466, 13.66840843, 13.64387247, 13.52972191, + 13.43092629, 13.37470213, 13.31847522, 13.21687255, 13.15170299, + 13.15372387, 13.1059763, 13.09445287, 13.09041529, 13.11710243, + 13.14386673, 13.22359963, 13.27466107, 13.31319886, 13.34650331, + 13.45574711, 13.50415149, 13.53131719, 13.58150982, 13.65962685, + 13.63699657, 13.61248827, 13.60584663, 13.61072488, 13.54361538, + 13.48274699, 13.39589291, 13.33557961, 13.27237689, 13.15525989, + 13.0201153, 12.92930916, 12.81669859, 12.67134652, 12.58933066, + 12.48431933, 12.35395795, 12.23358723, 12.1604567, 12.02565859, + 11.92888167, 11.81510299, 11.74115444, 11.62986853, 11.51119027, + 11.41922977, 11.32781545, 11.23709771, 11.1553455, 11.06238304, + 10.97654662, 10.89217886, 10.837813, 10.76259305, 10.74123747, + 10.63519376, 10.58236217, 10.50270085, 10.43664285, 10.36198002, + 10.29128265, 10.27590625, 10.21337539, 10.14368936, 10.11450364, + 10.12276595, 10.0811153, 10.03603621, 10.00381717, 9.94299925, + 9.91830306, 9.90583771, 9.87254886, 9.84294024, 9.85472138, + 9.82047669, 9.8222713, 9.82272407, 9.78949033, 9.78038714, + 9.78720474, 9.81027704, 9.77565195, 9.80675363, 9.77084177, + 9.75289156, 9.75404079, 9.72316608, 9.7325137, 9.70562447, + 9.74528393, 9.70416261, 9.67298074, 9.6888954, 9.6765554, + 9.62346413, 9.65547732, 9.59897653, 9.64655533, 9.57719677, + 9.52660027, 9.54591084, 9.5389796, 9.53577489, 9.50843709, + 9.4889757, 9.46656255, 9.46875593, 9.48179707, 9.44946697, + 9.4798432, 9.46992684, 9.47672347, 9.50141949, 9.45946886, + 9.48043777, 9.49121177, 9.48771047, 9.51135703, 9.5309805, + 9.52914508, 9.54184114, 9.49902134, 9.5184432, 9.48091512, + 9.4951481, 9.51101019, 9.49815911, 9.48404411, 9.45754481, + 9.43717866, 9.38444679, 9.39625792, 9.38149371, 9.40279467, + 9.37378114, 9.31453485, 9.29494997, 9.30214391, 9.24839539, + 9.25834154, 9.24655115, 9.25298293, 9.22182526, 9.18142295, + 9.16692765, 9.1253291, 9.17396507, 9.11561516, 9.13792622, + 9.14151424, 9.10477211, 9.13132802, 9.10557653, 9.10442614, + 9.09571574, 9.13986784, 9.08555206, 9.11363748, 9.14300157, + 9.13020252, 9.15901185, 9.15329127, 9.19107506, 9.19507704, + 9.16421159, 9.18975673, 9.14399055, 9.15376256, 9.17409705, + 8.50360777]) + + median_multivariate = geometric_median(X.data_matrix[..., 0]) + median = geometric_median(X) + + np.testing.assert_allclose( + median.data_matrix[0, :, 0], median_multivariate, rtol=1e-2) + + np.testing.assert_allclose(median_multivariate, res, 
rtol=1e-6) From f81de247362067b2541f40f5dfc11c54bd89ce91 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 30 Nov 2020 00:17:05 +0100 Subject: [PATCH 136/210] Add documentation for statistics. --- docs/modules/exploratory.rst | 3 ++- docs/modules/exploratory/stats.rst | 34 ++++++++++++++++++++++++++++++ skfda/exploratory/stats/_stats.py | 14 ++++++------ 3 files changed, 43 insertions(+), 8 deletions(-) create mode 100644 docs/modules/exploratory/stats.rst diff --git a/docs/modules/exploratory.rst b/docs/modules/exploratory.rst index 832b93193..90a719e24 100644 --- a/docs/modules/exploratory.rst +++ b/docs/modules/exploratory.rst @@ -8,6 +8,7 @@ and visualize functional data. :maxdepth: 4 :caption: Modules: - exploratory/visualization exploratory/depth exploratory/outliers + exploratory/stats + exploratory/visualization diff --git a/docs/modules/exploratory/stats.rst b/docs/modules/exploratory/stats.rst new file mode 100644 index 000000000..13eba1c95 --- /dev/null +++ b/docs/modules/exploratory/stats.rst @@ -0,0 +1,34 @@ +Summary statistics +================== + +As in univariate and multivariate analysis, in :term:`FDA` summary statistics +can be used to summarize a set of :term:`functional observations`. + +Location +-------- + +The following statistics are available in scikit-fda in order to obtain a +measure of the location or central tendency of :term:`functional data`. + +.. autosummary:: + :toctree: autosummary + + skfda.exploratory.stats.mean + skfda.exploratory.stats.gmean + skfda.exploratory.stats.trim_mean + skfda.exploratory.stats.depth_based_median + skfda.exploratory.stats.geometric_median + +Dispersion +---------- + +For obtaining a measure of the dispersion of the data, the following +statistics can be used. + +.. autosummary:: + :toctree: autosummary + + skfda.exploratory.stats.cov + skfda.exploratory.stats.var + + diff --git a/skfda/exploratory/stats/_stats.py b/skfda/exploratory/stats/_stats.py index 8b663025c..816c6b07f 100644 --- a/skfda/exploratory/stats/_stats.py +++ b/skfda/exploratory/stats/_stats.py @@ -94,7 +94,7 @@ def depth_based_median(fdatagrid, depth_method=ModifiedBandDepth()): FDataGrid: object containing the computed depth_based median. See also: - geometric_median + :func:`geometric_median` """ depth = depth_method(fdatagrid) @@ -112,21 +112,21 @@ def geometric_median(X: T, tol: float=1.e-8, r"""Compute the geometric median. The sample geometric median is the point that minimizes the :math:`L_1` - norm of the vector of Euclidean distances to all observations: + norm of the vector of distances to all observations: .. math:: \underset{y \in L(\mathcal{T})}{\arg \min} - \sum_{i=1}^m \left \| x_i-y \right \|_2 + \sum_{i=1}^N \left \| x_i-y \right \| - It uses the corrected Weiszfeld algorithm to compute the median, - precalculating the inner product matrix in order to compute the - distances. + It uses the corrected Weiszfeld algorithm to compute the median. Args: X: Object containing different samples of a functional variable. tol: tolerance used to check convergence. + metric: metric used to compute the vector of distances. By + default is the :math:`L_2` distance. Returns: FData: object containing the computed geometric median. @@ -141,7 +141,7 @@ def geometric_median(X: T, tol: float=1.e-8, array([ 1. , 1. , 3. , 0.5]) See also: - depth_based_median + :func:`depth_based_median` References: Gervini, D. (2008). 
Robust functional estimation using the median and From ea0267d610ea3a3e18723884768309bcbbac3ad4 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 30 Nov 2020 00:20:11 +0100 Subject: [PATCH 137/210] Change default metric to `l2_distance`. --- skfda/exploratory/stats/_stats.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skfda/exploratory/stats/_stats.py b/skfda/exploratory/stats/_stats.py index 872adbbd1..8a23d505a 100644 --- a/skfda/exploratory/stats/_stats.py +++ b/skfda/exploratory/stats/_stats.py @@ -7,7 +7,7 @@ import numpy as np -from ...misc.metrics import lp_norm, lp_distance +from ...misc.metrics import lp_norm, l2_distance from ...representation import FData from ..depth import ModifiedBandDepth @@ -109,7 +109,7 @@ def depth_based_median(fdatagrid, depth_method=ModifiedBandDepth()): def geometric_median(X: T, tol: float=1.e-8, - metric: Callable = lp_distance) -> T: + metric: Callable = l2_distance) -> T: r"""Compute the geometric median. The sample geometric median is the point that minimizes the :math:`L_1` From ff377338c388bdd45f4e0f482613c5503cb6460e Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Mon, 30 Nov 2020 17:28:17 +0100 Subject: [PATCH 138/210] Improve reclassification --- skfda/exploratory/outliers/__init__.py | 1 + skfda/{ml => exploratory/outliers}/neighbors_outlier.py | 8 +++----- skfda/ml/{neighbors_base.py => _neighbors_base.py} | 4 ++-- skfda/ml/classification/__init__.py | 2 +- .../{centroid_classifiers.py => _centroid_classifiers.py} | 0 skfda/ml/classification/neighbors_classifiers.py | 2 +- skfda/ml/clustering/neighbors.py | 2 +- skfda/ml/regression/neighbors.py | 2 +- tests/test_neighbors.py | 6 +++--- 9 files changed, 13 insertions(+), 14 deletions(-) rename skfda/{ml => exploratory/outliers}/neighbors_outlier.py (98%) rename skfda/ml/{neighbors_base.py => _neighbors_base.py} (99%) rename skfda/ml/classification/{centroid_classifiers.py => _centroid_classifiers.py} (100%) diff --git a/skfda/exploratory/outliers/__init__.py b/skfda/exploratory/outliers/__init__.py index 666ee83f6..71515c3bd 100644 --- a/skfda/exploratory/outliers/__init__.py +++ b/skfda/exploratory/outliers/__init__.py @@ -1,3 +1,4 @@ from ._directional_outlyingness import (directional_outlyingness_stats, DirectionalOutlierDetector) from ._iqr import IQROutlierDetector +from .neighbors_outlier import LocalOutlierFactor diff --git a/skfda/ml/neighbors_outlier.py b/skfda/exploratory/outliers/neighbors_outlier.py similarity index 98% rename from skfda/ml/neighbors_outlier.py rename to skfda/exploratory/outliers/neighbors_outlier.py index 5e2e4756d..70a4b6ea3 100644 --- a/skfda/ml/neighbors_outlier.py +++ b/skfda/exploratory/outliers/neighbors_outlier.py @@ -1,9 +1,7 @@ - - from sklearn.base import OutlierMixin -from ..misc.metrics import lp_distance -from .neighbors_base import (NeighborsBase, NeighborsMixin, KNeighborsMixin, +from ...misc.metrics import lp_distance +from ...ml._neighbors_base import (NeighborsBase, NeighborsMixin, KNeighborsMixin, _to_multivariate_metric) @@ -102,7 +100,7 @@ class LocalOutlierFactor(NeighborsBase, NeighborsMixin, KNeighborsMixin, **Local Outlier Factor (LOF) for outlier detection**. - >>> from skfda.ml.neighbors_outlier import LocalOutlierFactor + >>> from skfda.exploratory.outliers import LocalOutlierFactor Creation of simulated dataset with 2 outliers to be used with LOF. 
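To make the relocation of LocalOutlierFactor concrete, a short sketch with the new import path; the simulated dataset mirrors the one referenced in the class docstring, and the parameter values here are assumptions for illustration:

from skfda.datasets import make_sinusoidal_process
from skfda.exploratory.outliers import LocalOutlierFactor

# 25 regular sinusoidal curves plus 2 phase-shifted outliers.
fd_clean = make_sinusoidal_process(n_samples=25, error_std=0,
                                   phase_std=0.1, random_state=0)
fd_outliers = make_sinusoidal_process(n_samples=2, error_std=0,
                                      phase_mean=0.5, random_state=5)
fd = fd_outliers.concatenate(fd_clean)

lof = LocalOutlierFactor()
labels = lof.fit_predict(fd)  # -1 for detected outliers, 1 for inliers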
diff --git a/skfda/ml/neighbors_base.py b/skfda/ml/_neighbors_base.py similarity index 99% rename from skfda/ml/neighbors_base.py rename to skfda/ml/_neighbors_base.py index afcad02b1..ac7b02662 100644 --- a/skfda/ml/neighbors_base.py +++ b/skfda/ml/_neighbors_base.py @@ -1,6 +1,6 @@ """Base classes for the neighbor estimators""" -from abc import ABC, abstractmethod +from abc import ABC from sklearn.base import BaseEstimator from sklearn.base import RegressorMixin @@ -64,7 +64,7 @@ def _to_multivariate_metric(metric, grid_points): >>> import numpy as np >>> from skfda import FDataGrid >>> from skfda.misc.metrics import lp_distance - >>> from skfda.ml.neighbors_base import _to_multivariate_metric + >>> from skfda.ml._neighbors_base import _to_multivariate_metric Calculate the Lp distance between fd and fd2. diff --git a/skfda/ml/classification/__init__.py b/skfda/ml/classification/__init__.py index c7fab00be..a28cb78b1 100644 --- a/skfda/ml/classification/__init__.py +++ b/skfda/ml/classification/__init__.py @@ -1,4 +1,4 @@ from .neighbors_classifiers import (KNeighborsClassifier, RadiusNeighborsClassifier) from .depth_classifiers import MaximumDepthClassifier -from .centroid_classifiers import NearestCentroid, DTMClassifier +from ._centroid_classifiers import NearestCentroid, DTMClassifier diff --git a/skfda/ml/classification/centroid_classifiers.py b/skfda/ml/classification/_centroid_classifiers.py similarity index 100% rename from skfda/ml/classification/centroid_classifiers.py rename to skfda/ml/classification/_centroid_classifiers.py diff --git a/skfda/ml/classification/neighbors_classifiers.py b/skfda/ml/classification/neighbors_classifiers.py index 3eac71b68..9c92cdc68 100644 --- a/skfda/ml/classification/neighbors_classifiers.py +++ b/skfda/ml/classification/neighbors_classifiers.py @@ -2,7 +2,7 @@ from sklearn.base import ClassifierMixin -from ..neighbors_base import (NeighborsBase, NeighborsMixin, KNeighborsMixin, +from .._neighbors_base import (NeighborsBase, NeighborsMixin, KNeighborsMixin, NeighborsClassifierMixin, RadiusNeighborsMixin) diff --git a/skfda/ml/clustering/neighbors.py b/skfda/ml/clustering/neighbors.py index c02f7ed11..b7b78f4ff 100644 --- a/skfda/ml/clustering/neighbors.py +++ b/skfda/ml/clustering/neighbors.py @@ -1,6 +1,6 @@ """Unsupervised learner for implementing neighbor searches.""" -from ..neighbors_base import (NeighborsBase, NeighborsMixin, +from .._neighbors_base import (NeighborsBase, NeighborsMixin, KNeighborsMixin, RadiusNeighborsMixin) diff --git a/skfda/ml/regression/neighbors.py b/skfda/ml/regression/neighbors.py index 1f9808cee..b874237da 100644 --- a/skfda/ml/regression/neighbors.py +++ b/skfda/ml/regression/neighbors.py @@ -1,6 +1,6 @@ """Neighbor models for regression.""" -from ..neighbors_base import (NeighborsBase, KNeighborsMixin, +from .._neighbors_base import (NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin, NeighborsRegressorMixin) diff --git a/tests/test_neighbors.py b/tests/test_neighbors.py index f1cd494e9..3d29c6506 100644 --- a/tests/test_neighbors.py +++ b/tests/test_neighbors.py @@ -1,9 +1,9 @@ """Test neighbors classifiers and regressors""" -from skfda.ml.neighbors_outlier import LocalOutlierFactor # Pending theory +from skfda.exploratory.outliers import LocalOutlierFactor # Pending theory from skfda.datasets import make_multimodal_samples, make_sinusoidal_process from skfda.exploratory.stats import mean -from skfda.misc.metrics import lp_distance, pairwise_distance +from skfda.misc.metrics import lp_distance, l2_distance, 
pairwise_distance from skfda.ml.classification import (KNeighborsClassifier, RadiusNeighborsClassifier, NearestCentroid) @@ -348,7 +348,7 @@ def test_lof_fit_predict(self): np.testing.assert_array_equal(expected, res) # With explicit l2 distance - lof2 = LocalOutlierFactor(metric=lp_distance) + lof2 = LocalOutlierFactor(metric=l2_distance) res2 = lof2.fit_predict(self.fd_lof) np.testing.assert_array_equal(expected, res2) From 1ea8953dffb26cac9bc991c480e06c0462d8ece9 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 30 Nov 2020 18:40:48 +0100 Subject: [PATCH 139/210] Move type information to the description. --- docs/conf.py | 6 ++++-- skfda/exploratory/stats/_stats.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index a5b3f2d07..2be688a8e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -53,7 +53,8 @@ 'sphinx_gallery.gen_gallery', 'sphinx.ext.intersphinx', 'sphinx.ext.doctest', - 'jupyter_sphinx'] + 'jupyter_sphinx', + 'sphinx.ext.autodoc.typehints'] autodoc_default_flags = ['members', 'inherited-members'] @@ -237,7 +238,8 @@ } autosummary_generate = True - +autodoc_typehints = "description" +napoleon_use_rtype = True # Napoleon fix for attributes # Taken from diff --git a/skfda/exploratory/stats/_stats.py b/skfda/exploratory/stats/_stats.py index 8a23d505a..1a67b9676 100644 --- a/skfda/exploratory/stats/_stats.py +++ b/skfda/exploratory/stats/_stats.py @@ -130,7 +130,7 @@ def geometric_median(X: T, tol: float=1.e-8, default is the :math:`L_2` distance. Returns: - FData: object containing the computed geometric median. + Object containing the computed geometric median. Example: From 70d6f0b5bf32de45c3a714782307b788d4e9ca11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Mon, 30 Nov 2020 19:13:53 +0100 Subject: [PATCH 140/210] Try style Github action --- .github/workflows/main.yml | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 .github/workflows/main.yml diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 000000000..0bf1f37bb --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,35 @@ +name: Style + +# Controls when the action will run. +on: + # Triggers the workflow on push or pull request events but only for the develop branch + push: + pull_request: + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + # This workflow contains a single job called "build" + build: + # The type of runner that the job will run on + runs-on: ubuntu-latest + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v2 + + # Runs a single command using the runners shell + - name: wemake-python-styleguide + # You may pin to the exact commit or the version. + # uses: wemake-services/wemake-python-styleguide@8068e6634aaacf1eecba3f27a529213df3bd6284 + uses: wemake-services/wemake-python-styleguide@0.14.1 + with: + # Path or space-separated list of paths to lint + path: # optional, default is . + # How would you like the results to be displayed? 
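For quick manual verification of the geometric_median work in patches 135-137, a standalone version of the doctest shown earlier. This sketch assumes the l2_distance default adopted in patch 137; with only two observations the Weiszfeld iteration settles at their midpoint:

from skfda import FDataGrid
from skfda.exploratory.stats import geometric_median

X = FDataGrid([[0.5, 1, 2, 0.5], [1.5, 1, 4, 0.5]])
median = geometric_median(X)

# Expected output, matching the doctest: [ 1.   1.   3.   0.5]
print(median.data_matrix[0, ..., 0])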
+ reporter: 'github-pr-review' # optional, default is terminal + env: + GITHUB_TOKEN: ${{ secrets.github_token }} From cf82e9c3eac81ab29c404af039450797fc075a19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Mon, 30 Nov 2020 19:15:59 +0100 Subject: [PATCH 141/210] Update main.yml --- .github/workflows/main.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 0bf1f37bb..465a43479 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,8 +2,6 @@ name: Style # Controls when the action will run. on: - # Triggers the workflow on push or pull request events but only for the develop branch - push: pull_request: # Allows you to run this workflow manually from the Actions tab From e72161c240c6fe30bdff28fecaea67c806a7dc90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Mon, 30 Nov 2020 19:23:12 +0100 Subject: [PATCH 142/210] Update main.yml --- .github/workflows/main.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 465a43479..bfd73a7fd 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -25,8 +25,6 @@ jobs: # uses: wemake-services/wemake-python-styleguide@8068e6634aaacf1eecba3f27a529213df3bd6284 uses: wemake-services/wemake-python-styleguide@0.14.1 with: - # Path or space-separated list of paths to lint - path: # optional, default is . # How would you like the results to be displayed? reporter: 'github-pr-review' # optional, default is terminal env: From 4392b9e9f73e9a0418b093e00ef556250d0d976c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Mon, 30 Nov 2020 20:28:52 +0100 Subject: [PATCH 143/210] Update and rename main.yml to style.yml --- .github/workflows/{main.yml => style.yml} | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) rename .github/workflows/{main.yml => style.yml} (82%) diff --git a/.github/workflows/main.yml b/.github/workflows/style.yml similarity index 82% rename from .github/workflows/main.yml rename to .github/workflows/style.yml index bfd73a7fd..4e87796a7 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/style.yml @@ -18,13 +18,18 @@ jobs: steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - uses: actions/checkout@v2 + + - id: changed-files + uses: jitterbit/get-changed-files@v1 # Runs a single command using the runners shell - - name: wemake-python-styleguide + - id: wemake-python-styleguide # You may pin to the exact commit or the version. # uses: wemake-services/wemake-python-styleguide@8068e6634aaacf1eecba3f27a529213df3bd6284 uses: wemake-services/wemake-python-styleguide@0.14.1 with: + # Path or space-separated list of paths to lint + path: ${{ steps.changed-files.outputs.added_modified }} # How would you like the results to be displayed? 
reporter: 'github-pr-review' # optional, default is terminal env: From 007d3190bad15730999aec35b925c1c75e142f39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Mon, 30 Nov 2020 20:47:39 +0100 Subject: [PATCH 144/210] Update style.yml --- .github/workflows/style.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index 4e87796a7..9704a352c 100644 --- a/.github/workflows/style.yml +++ b/.github/workflows/style.yml @@ -18,9 +18,6 @@ jobs: steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - uses: actions/checkout@v2 - - - id: changed-files - uses: jitterbit/get-changed-files@v1 # Runs a single command using the runners shell - id: wemake-python-styleguide @@ -28,8 +25,6 @@ jobs: # uses: wemake-services/wemake-python-styleguide@8068e6634aaacf1eecba3f27a529213df3bd6284 uses: wemake-services/wemake-python-styleguide@0.14.1 with: - # Path or space-separated list of paths to lint - path: ${{ steps.changed-files.outputs.added_modified }} # How would you like the results to be displayed? reporter: 'github-pr-review' # optional, default is terminal env: From 3fbc965c6b864308dbc44883142c62037ae21e18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Mon, 30 Nov 2020 22:08:11 +0100 Subject: [PATCH 145/210] Add Mypy checks --- .github/workflows/mypy.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 .github/workflows/mypy.yml diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml new file mode 100644 index 000000000..21bd6b384 --- /dev/null +++ b/.github/workflows/mypy.yml @@ -0,0 +1,24 @@ +name: Mypy + +on: + push: + pull_request: + workflow_dispatch: + +jobs: + build: + runs-on: ubuntu-latest + name: Mypy + steps: + - uses: actions/checkout@v1 + - name: Set up Python 3.8 + uses: actions/setup-python@v1 + with: + python-version: 3.8 + - name: Install Dependencies + run: | + pip install mypy + pip install . + - name: mypy + run: | + mypy . From 356d20f3cd58a983f45a1cbf54c755dd5f6453c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Tue, 1 Dec 2020 12:30:02 +0100 Subject: [PATCH 146/210] Update style.yml --- .github/workflows/style.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index 9704a352c..002a8a953 100644 --- a/.github/workflows/style.yml +++ b/.github/workflows/style.yml @@ -18,14 +18,18 @@ jobs: steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - uses: actions/checkout@v2 + + - run: | + echo "_CHANGED_FILES=$(git diff --name-only origin/${{ github.base_ref }} -- '*.py' | tr -s '\n' ' ' )" >> ${GITHUB_ENV} # Runs a single command using the runners shell - id: wemake-python-styleguide # You may pin to the exact commit or the version. # uses: wemake-services/wemake-python-styleguide@8068e6634aaacf1eecba3f27a529213df3bd6284 uses: wemake-services/wemake-python-styleguide@0.14.1 + if: ${{ env._CHANGED_FILES }} with: - # How would you like the results to be displayed? 
reporter: 'github-pr-review' # optional, default is terminal + path: "${{ env._CHANGED_FILES }}" env: GITHUB_TOKEN: ${{ secrets.github_token }} From 0ad2d5d959766bc35a8c3eac549e0bc24e5cc1a0 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Tue, 1 Dec 2020 14:08:29 +0100 Subject: [PATCH 147/210] Style --- skfda/ml/regression/neighbors.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/skfda/ml/regression/neighbors.py b/skfda/ml/regression/neighbors.py index b874237da..872d4df5c 100644 --- a/skfda/ml/regression/neighbors.py +++ b/skfda/ml/regression/neighbors.py @@ -135,7 +135,6 @@ def __init__(self, n_neighbors=5, weights='uniform', regressor='mean', algorithm='auto', leaf_size=30, metric='l2', metric_params=None, n_jobs=1, multivariate_metric=False): """Initialize the regressor.""" - super().__init__(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, leaf_size=leaf_size, metric=metric, @@ -300,7 +299,6 @@ def __init__(self, radius=1.0, weights='uniform', regressor='mean', metric_params=None, outlier_response=None, n_jobs=1, multivariate_metric=False): """Initialize the classifier.""" - super().__init__(radius=radius, weights=weights, algorithm=algorithm, leaf_size=leaf_size, metric=metric, metric_params=metric_params, n_jobs=n_jobs, @@ -330,5 +328,5 @@ def _init_multivariate_estimator(self, sklearn_metric): n_jobs=self.n_jobs) def _query(self, X): - """Return distances and neighbors of given sample""" + """Return distances and neighbors of given sample.""" return self.estimator_.radius_neighbors(X) From a62ffb1e14f6cee5e3dc54bcef63b3aeea52f155 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Tue, 1 Dec 2020 15:19:49 +0100 Subject: [PATCH 148/210] Update style.yml --- .github/workflows/style.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index 002a8a953..ad2ecdbbb 100644 --- a/.github/workflows/style.yml +++ b/.github/workflows/style.yml @@ -17,7 +17,7 @@ jobs: # Steps represent a sequence of tasks that will be executed as part of the job steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - uses: actions/checkout@v2 + - uses: actions/checkout@v1 - run: | echo "_CHANGED_FILES=$(git diff --name-only origin/${{ github.base_ref }} -- '*.py' | tr -s '\n' ' ' )" >> ${GITHUB_ENV} From 949b32bc4ef8f73bef7b5ea5bab93d1d071530fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Tue, 1 Dec 2020 15:26:17 +0100 Subject: [PATCH 149/210] Use master branch of the style guide --- .github/workflows/style.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index ad2ecdbbb..b3d1088de 100644 --- a/.github/workflows/style.yml +++ b/.github/workflows/style.yml @@ -26,7 +26,7 @@ jobs: - id: wemake-python-styleguide # You may pin to the exact commit or the version. # uses: wemake-services/wemake-python-styleguide@8068e6634aaacf1eecba3f27a529213df3bd6284 - uses: wemake-services/wemake-python-styleguide@0.14.1 + uses: wemake-services/wemake-python-styleguide@master if: ${{ env._CHANGED_FILES }} with: reporter: 'github-pr-review' # optional, default is terminal From d66d49556acc351946bd11447cca01250e6ad171 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Tue, 1 Dec 2020 15:46:04 +0100 Subject: [PATCH 150/210] Add skips for mypy. 
--- setup.cfg | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/setup.cfg b/setup.cfg index 6ceed60a0..d370ac7bd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -15,3 +15,30 @@ omit = */datasets/* # Omit reporting for __init__.py files */__init__.py + +[mypy-dcor.*] +ignore_missing_imports = True + +[mypy-matplotlib.*] +ignore_missing_imports = True + +[mypy-multimethod.*] +ignore_missing_imports = True + +[mypy-numpy.*] +ignore_missing_imports = True + +[mypy-pandas.*] +ignore_missing_imports = True + +[mypy-pytest.*] +ignore_missing_imports = True + +[mypy-scipy.*] +ignore_missing_imports = True + +[mypy-skdatasets.*] +ignore_missing_imports = True + +[mypy-sklearn.*] +ignore_missing_imports = True From f77c48f380dea98270aff38dd6770173b5a67f25 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Tue, 1 Dec 2020 16:14:22 +0100 Subject: [PATCH 151/210] Add errors to ignore in the style. --- setup.cfg | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 6ceed60a0..98cb844fd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -7,7 +7,20 @@ doctest_optionflags = NORMALIZE_WHITESPACE ELLIPSIS norecursedirs = '.*', 'build', 'dist' '*.egg' 'venv' .svn _build docs/auto_examples examples [flake8] -ignore = F401,W504,W503 +ignore = + # Line break occurred before a binary operator (antipattern) + W503, + # We like local imports, thanks + WPS300, + # Our private modules are fine to import + # (check https://github.com/wemake-services/wemake-python-styleguide/issues/1441) + WPS436, + # Our private objects are fine to import + WPS450 + +per-file-ignores = + # Unused modules are allowed in __init__, to reduce imports + __init__.py: F401 [coverage:run] omit = From 347b7fb79d1043ca9788d6b6af8ab4effb8fcdf0 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Tue, 1 Dec 2020 16:49:01 +0100 Subject: [PATCH 152/210] Add more ignored style violations. 
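
Most of the new exceptions exist because we follow scikit-learn API
conventions that the linter flags by default. A minimal sketch of an estimator
that would otherwise be rejected (the class is hypothetical; the rule codes
match the comments in the configuration below):

    class ToyEstimator:

        def fit(self, X, y):  # N803: uppercase argument; WPS111: short names
            self.coef_ = X.sum(axis=0)  # WPS120: trailing underscore
            return self

The rst-directives and rst-roles lists tell the RST docstring checker bundled
with the style guide which Sphinx constructs (autosummary, :class:, :func:,
...) may legitimately appear in docstrings.
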
--- setup.cfg | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/setup.cfg b/setup.cfg index 98cb844fd..ea7df5ade 100644 --- a/setup.cfg +++ b/setup.cfg @@ -8,8 +8,14 @@ norecursedirs = '.*', 'build', 'dist' '*.egg' 'venv' .svn _build docs/auto_examp [flake8] ignore = + # Uppercase arguments like X are common in scikit-learn + N803, # Line break occurred before a binary operator (antipattern) W503, + # Short names like X or y are common in scikit-learn + WPS111, + # Trailing underscores are a scikit-learn convention + WPS120, # We like local imports, thanks WPS300, # Our private modules are fine to import @@ -21,6 +27,15 @@ ignore = per-file-ignores = # Unused modules are allowed in __init__, to reduce imports __init__.py: F401 + +rst-directives = + # These are sorted alphabetically - but that does not matter + autosummary,data,currentmodule,deprecated, + glossary,moduleauthor,plot,testcode, + versionadded,versionchanged, + +rst-roles = + attr,class,func,meth,mod,obj,ref,term, [coverage:run] omit = From 6c7f8c62ed46b3ec31f89fca23af2d75155a05f2 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Tue, 1 Dec 2020 20:01:49 +0100 Subject: [PATCH 153/210] Small fixes --- skfda/ml/classification/_centroid_classifiers.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/skfda/ml/classification/_centroid_classifiers.py b/skfda/ml/classification/_centroid_classifiers.py index 14fdcdc01..308613174 100644 --- a/skfda/ml/classification/_centroid_classifiers.py +++ b/skfda/ml/classification/_centroid_classifiers.py @@ -61,7 +61,7 @@ class and return a :class:`FData` object with only one sample array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) See also: - :class:`~skfda.ml.classification.DTMClassifier + :class:`~skfda.ml.classification.DTMClassifier` """ def __init__(self, metric=l2_distance, centroid=mean): @@ -156,7 +156,7 @@ class DTMClassifier(BaseEstimator, ClassifierMixin): 0.875 See also: - :class:`~skfda.ml.classification.MaximumDepthClassifier + :class:`~skfda.ml.classification.MaximumDepthClassifier` References: Fraiman, R. and Muniz, G. (2001). 
Trimmed means for functional @@ -180,10 +180,10 @@ def fit(self, X, y): """ self._clf = NearestCentroid( - metric=self.metric, - centroid=lambda fdatagrid: trim_mean(fdatagrid, - self.proportiontocut, - self.depth_method)) + metric=self.metric, + centroid=lambda fdatagrid: trim_mean(fdatagrid, + self.proportiontocut, + self.depth_method)) self._clf.fit(X, y) return self From 2c827fe4bd9b5f0899f6317c3055bb614045907d Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Tue, 1 Dec 2020 20:17:41 +0100 Subject: [PATCH 154/210] More changes --- skfda/exploratory/outliers/__init__.py | 4 ++-- skfda/exploratory/outliers/neighbors_outlier.py | 4 ++-- skfda/ml/classification/__init__.py | 4 ++-- skfda/ml/classification/_centroid_classifiers.py | 10 ++++------ skfda/ml/classification/neighbors_classifiers.py | 13 +++++++------ skfda/ml/clustering/__init__.py | 2 +- skfda/ml/clustering/kmeans.py | 3 ++- skfda/ml/clustering/neighbors.py | 5 ++--- skfda/ml/regression/neighbors.py | 14 +++++++------- tests/test_clustering.py | 3 ++- 10 files changed, 31 insertions(+), 31 deletions(-) diff --git a/skfda/exploratory/outliers/__init__.py b/skfda/exploratory/outliers/__init__.py index 71515c3bd..d8adfc7b2 100644 --- a/skfda/exploratory/outliers/__init__.py +++ b/skfda/exploratory/outliers/__init__.py @@ -1,4 +1,4 @@ -from ._directional_outlyingness import (directional_outlyingness_stats, - DirectionalOutlierDetector) +from ._directional_outlyingness import (DirectionalOutlierDetector, + directional_outlyingness_stats) from ._iqr import IQROutlierDetector from .neighbors_outlier import LocalOutlierFactor diff --git a/skfda/exploratory/outliers/neighbors_outlier.py b/skfda/exploratory/outliers/neighbors_outlier.py index 70a4b6ea3..f7fe5817c 100644 --- a/skfda/exploratory/outliers/neighbors_outlier.py +++ b/skfda/exploratory/outliers/neighbors_outlier.py @@ -1,8 +1,8 @@ from sklearn.base import OutlierMixin from ...misc.metrics import lp_distance -from ...ml._neighbors_base import (NeighborsBase, NeighborsMixin, KNeighborsMixin, - _to_multivariate_metric) +from ...ml._neighbors_base import (KNeighborsMixin, NeighborsBase, + NeighborsMixin, _to_multivariate_metric) class LocalOutlierFactor(NeighborsBase, NeighborsMixin, KNeighborsMixin, diff --git a/skfda/ml/classification/__init__.py b/skfda/ml/classification/__init__.py index a28cb78b1..80a2546c4 100644 --- a/skfda/ml/classification/__init__.py +++ b/skfda/ml/classification/__init__.py @@ -1,4 +1,4 @@ +from ._centroid_classifiers import DTMClassifier, NearestCentroid +from .depth_classifiers import MaximumDepthClassifier from .neighbors_classifiers import (KNeighborsClassifier, RadiusNeighborsClassifier) -from .depth_classifiers import MaximumDepthClassifier -from ._centroid_classifiers import NearestCentroid, DTMClassifier diff --git a/skfda/ml/classification/_centroid_classifiers.py b/skfda/ml/classification/_centroid_classifiers.py index 308613174..d09d748fd 100644 --- a/skfda/ml/classification/_centroid_classifiers.py +++ b/skfda/ml/classification/_centroid_classifiers.py @@ -1,16 +1,14 @@ """Centroid models for supervised classification.""" from typing import Callable -from sklearn.base import ClassifierMixin, BaseEstimator +from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted -from ...exploratory.stats import mean -from ...misc.metrics import l2_distance, pairwise_distance -from ...exploratory.depth import Depth, ModifiedBandDepth -from ...exploratory.stats import trim_mean 
-from ...misc.metrics import lp_distance from ..._utils import _classifier_get_classes +from ...exploratory.depth import Depth, ModifiedBandDepth +from ...exploratory.stats import mean, trim_mean +from ...misc.metrics import l2_distance, lp_distance, pairwise_distance class NearestCentroid(BaseEstimator, ClassifierMixin): diff --git a/skfda/ml/classification/neighbors_classifiers.py b/skfda/ml/classification/neighbors_classifiers.py index 9c92cdc68..ed933fcd3 100644 --- a/skfda/ml/classification/neighbors_classifiers.py +++ b/skfda/ml/classification/neighbors_classifiers.py @@ -2,8 +2,9 @@ from sklearn.base import ClassifierMixin -from .._neighbors_base import (NeighborsBase, NeighborsMixin, KNeighborsMixin, - NeighborsClassifierMixin, RadiusNeighborsMixin) +from .._neighbors_base import (KNeighborsMixin, NeighborsBase, + NeighborsClassifierMixin, NeighborsMixin, + RadiusNeighborsMixin) class KNeighborsClassifier(NeighborsBase, NeighborsMixin, KNeighborsMixin, @@ -128,8 +129,8 @@ def _init_estimator(self, sklearn_metric): Returns: Sklearn K Neighbors estimator initialized. """ - from sklearn.neighbors import (KNeighborsClassifier as - _KNeighborsClassifier) + from sklearn.neighbors import \ + KNeighborsClassifier as _KNeighborsClassifier return _KNeighborsClassifier( n_neighbors=self.n_neighbors, weights=self.weights, @@ -274,8 +275,8 @@ def _init_estimator(self, sklearn_metric): Returns: Sklearn Radius Neighbors estimator initialized. """ - from sklearn.neighbors import (RadiusNeighborsClassifier as - _RadiusNeighborsClassifier) + from sklearn.neighbors import \ + RadiusNeighborsClassifier as _RadiusNeighborsClassifier return _RadiusNeighborsClassifier( radius=self.radius, weights=self.weights, diff --git a/skfda/ml/clustering/__init__.py b/skfda/ml/clustering/__init__.py index a996898e5..6d73ea1f0 100644 --- a/skfda/ml/clustering/__init__.py +++ b/skfda/ml/clustering/__init__.py @@ -1,2 +1,2 @@ +from .kmeans import BaseKMeans, FuzzyCMeans, KMeans from .neighbors import NearestNeighbors -from .kmeans import BaseKMeans, KMeans, FuzzyCMeans diff --git a/skfda/ml/clustering/kmeans.py b/skfda/ml/clustering/kmeans.py index 3070e217f..94605b3c8 100644 --- a/skfda/ml/clustering/kmeans.py +++ b/skfda/ml/clustering/kmeans.py @@ -699,7 +699,8 @@ def _check_params(self): def _compute_inertia(self, membership, centroids, distances_to_centroids): return np.sum( - membership ** self.fuzzifier * distances_to_centroids ** 2) + membership ** self.fuzzifier * distances_to_centroids ** 2 + ) def _create_membership(self, n_samples): return np.empty((n_samples, self.n_clusters)) diff --git a/skfda/ml/clustering/neighbors.py b/skfda/ml/clustering/neighbors.py index b7b78f4ff..1dcfa24e9 100644 --- a/skfda/ml/clustering/neighbors.py +++ b/skfda/ml/clustering/neighbors.py @@ -1,8 +1,7 @@ """Unsupervised learner for implementing neighbor searches.""" -from .._neighbors_base import (NeighborsBase, NeighborsMixin, - KNeighborsMixin, - RadiusNeighborsMixin) +from .._neighbors_base import (KNeighborsMixin, NeighborsBase, NeighborsMixin, + RadiusNeighborsMixin) class NearestNeighbors(NeighborsBase, NeighborsMixin, KNeighborsMixin, diff --git a/skfda/ml/regression/neighbors.py b/skfda/ml/regression/neighbors.py index 872d4df5c..02cd1ff85 100644 --- a/skfda/ml/regression/neighbors.py +++ b/skfda/ml/regression/neighbors.py @@ -1,8 +1,7 @@ """Neighbor models for regression.""" -from .._neighbors_base import (NeighborsBase, KNeighborsMixin, - RadiusNeighborsMixin, - NeighborsRegressorMixin) +from .._neighbors_base import 
(KNeighborsMixin, NeighborsBase, + NeighborsRegressorMixin, RadiusNeighborsMixin) class KNeighborsRegressor(NeighborsBase, NeighborsRegressorMixin, @@ -154,8 +153,8 @@ def _init_multivariate_estimator(self, sklearn_metric): Sklearn K Neighbors estimator initialized. """ - from sklearn.neighbors import (KNeighborsRegressor as - _KNeighborsRegressor) + from sklearn.neighbors import \ + KNeighborsRegressor as _KNeighborsRegressor return _KNeighborsRegressor( n_neighbors=self.n_neighbors, weights=self.weights, @@ -318,8 +317,8 @@ def _init_multivariate_estimator(self, sklearn_metric): Sklearn Radius Neighbors estimator initialized. """ - from sklearn.neighbors import (RadiusNeighborsRegressor as - _RadiusNeighborsRegressor) + from sklearn.neighbors import \ + RadiusNeighborsRegressor as _RadiusNeighborsRegressor return _RadiusNeighborsRegressor( radius=self.radius, weights=self.weights, @@ -329,4 +328,5 @@ def _init_multivariate_estimator(self, sklearn_metric): def _query(self, X): """Return distances and neighbors of given sample.""" + return self.estimator_.radius_neighbors(X) diff --git a/tests/test_clustering.py b/tests/test_clustering.py index 21d306581..dc93bb4e9 100644 --- a/tests/test_clustering.py +++ b/tests/test_clustering.py @@ -23,7 +23,8 @@ def test_kmeans_univariate(self): np.array([[2.98142397, 9.23534876], [0.68718427, 6.50960828], [3.31243449, 4.39222798], - [6.49679408, 0.]])) + [6.49679408, 0.] + ])) np.testing.assert_array_equal(kmeans.predict(fd), np.array([0, 0, 0, 1])) np.testing.assert_allclose(kmeans.transform(fd), From bbf1d10ac4c64ed6830c5d06c9889ed10cec3083 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Tue, 1 Dec 2020 21:13:07 +0100 Subject: [PATCH 155/210] More changes --- tests/test_neighbors.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/tests/test_neighbors.py b/tests/test_neighbors.py index 3d29c6506..30786409c 100644 --- a/tests/test_neighbors.py +++ b/tests/test_neighbors.py @@ -1,21 +1,19 @@ """Test neighbors classifiers and regressors""" -from skfda.exploratory.outliers import LocalOutlierFactor # Pending theory +import unittest + +import numpy as np from skfda.datasets import make_multimodal_samples, make_sinusoidal_process +from skfda.exploratory.outliers import LocalOutlierFactor # Pending theory from skfda.exploratory.stats import mean -from skfda.misc.metrics import lp_distance, l2_distance, pairwise_distance -from skfda.ml.classification import (KNeighborsClassifier, - RadiusNeighborsClassifier, - NearestCentroid) +from skfda.misc.metrics import l2_distance, lp_distance, pairwise_distance +from skfda.ml.classification import (KNeighborsClassifier, NearestCentroid, + RadiusNeighborsClassifier) from skfda.ml.clustering import NearestNeighbors from skfda.ml.regression import KNeighborsRegressor, RadiusNeighborsRegressor from skfda.representation.basis import Fourier -import unittest - -import numpy as np -# from skfda.exploratory.outliers import LocalOutlierFactor class TestNeighbors(unittest.TestCase): def setUp(self): @@ -74,7 +72,7 @@ def test_predict_proba_classifier(self): np.testing.assert_array_almost_equal(probs, self.probs) def test_predict_regressor(self): - """Test scalar regression, predics mode location""" + """Test scalar regression, predics mode location.""" # Dummy test, with weight = distance, only the sample with distance 0 # will be returned, obtaining the exact location @@ -90,8 +88,7 @@ def test_predict_regressor(self): self.modes_location) def test_kneighbors(self): - """Test k 
neighbor searches for all k-neighbors estimators""" - + """Test k neighbor searches for all k-neighbors estimators.""" nn = NearestNeighbors() nn.fit(self.X) @@ -124,7 +121,7 @@ def test_kneighbors(self): self.assertEqual(graph[0, i] == 0.0, i not in links[0]) def test_radius_neighbors(self): - """Test query with radius""" + """Test query with radius.""" nn = NearestNeighbors(radius=.1) nn.fit(self.X) From 6283dcc7a4bd22bdf9123dc8eb26035eb7e8b4b9 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Wed, 2 Dec 2020 01:02:51 +0100 Subject: [PATCH 156/210] Mypy configuration. Add mypy ignores of untyped modules. Typing of basis. --- setup.cfg | 9 +++++ skfda/representation/basis/_basis.py | 58 ++++++++++++++-------------- 2 files changed, 38 insertions(+), 29 deletions(-) diff --git a/setup.cfg b/setup.cfg index d370ac7bd..bf91bd7c1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,6 +19,9 @@ omit = [mypy-dcor.*] ignore_missing_imports = True +[mypy-findiff.*] +ignore_missing_imports = True + [mypy-matplotlib.*] ignore_missing_imports = True @@ -37,8 +40,14 @@ ignore_missing_imports = True [mypy-scipy.*] ignore_missing_imports = True +[mypy-setuptools.*] +ignore_missing_imports = True + [mypy-skdatasets.*] ignore_missing_imports = True [mypy-sklearn.*] ignore_missing_imports = True + +[mypy-sphinx.*] +ignore_missing_imports = True diff --git a/skfda/representation/basis/_basis.py b/skfda/representation/basis/_basis.py index f634b8a8b..aa9701b68 100644 --- a/skfda/representation/basis/_basis.py +++ b/skfda/representation/basis/_basis.py @@ -4,15 +4,15 @@ the corresponding basis classes. """ -from abc import ABC, abstractmethod import copy import warnings +from abc import ABC, abstractmethod +from typing import Tuple import numpy as np -from ..._utils import (_domain_range, _same_domain, - _reshape_eval_points) - +from ..._utils import _domain_range, _reshape_eval_points, _same_domain +from . import _fdatabasis __author__ = "Miguel Carbajo Berrocal" __email__ = "miguel.carbajo@estudiante.uam.es" @@ -36,7 +36,7 @@ class Basis(ABC): """ - def __init__(self, domain_range=None, n_basis=1): + def __init__(self, domain_range=None, n_basis: int=1): """Basis constructor. Args: @@ -62,30 +62,30 @@ def __init__(self, domain_range=None, n_basis=1): super().__init__() @property - def dim_domain(self): + def dim_domain(self) -> int: return 1 @property - def dim_codomain(self): + def dim_codomain(self) -> int: return 1 @property - def domain_range(self): + def domain_range(self) -> Tuple[Tuple[float, float], ...]: if self._domain_range is None: return ((0, 1),) * self.dim_domain else: return self._domain_range @property - def n_basis(self): + def n_basis(self) -> int: return self._n_basis @abstractmethod - def _evaluate(self, eval_points): + def _evaluate(self, eval_points) -> np.ndarray: """Subclasses must override this to provide basis evaluation.""" pass - def evaluate(self, eval_points, *, derivative=0): + def evaluate(self, eval_points, *, derivative: int = 0) -> np.ndarray: """Evaluate Basis objects and its derivatives. Evaluates the basis function system or its derivatives at a list of @@ -96,7 +96,7 @@ def evaluate(self, eval_points, *, derivative=0): evaluated. Returns: - (numpy.darray): Matrix whose rows are the values of the each + Matrix whose rows are the values of the each basis function or its derivatives at the values specified in eval_points. 
@@ -116,26 +116,26 @@ def evaluate(self, eval_points, *, derivative=0): return self._evaluate(eval_points).reshape( (self.n_basis, len(eval_points), self.dim_codomain)) - def __call__(self, *args, **kwargs): + def __call__(self, *args, **kwargs) -> np.ndarray: return self.evaluate(*args, **kwargs) - def __len__(self): + def __len__(self) -> int: return self.n_basis - def derivative(self, *, order=1): + def derivative(self, *, order: int = 1) -> '_fdatabasis.FDataBasis': """Construct a FDataBasis object containing the derivative. Args: - order (int, optional): Order of the derivative. Defaults to 1. + order: Order of the derivative. Defaults to 1. Returns: - (FDataBasis): Derivative object. + Derivative object. """ return self.to_basis().derivative(order=order) - def _derivative_basis_and_coefs(self, coefs, order=1): + def _derivative_basis_and_coefs(self, coefs: np.ndarray, order: int = 1): """ Subclasses can override this to provide derivative construction. @@ -222,7 +222,7 @@ def copy(self, domain_range=None): return new_copy - def to_basis(self): + def to_basis(self) -> '_fdatabasis.FDataBasis': from . import FDataBasis return FDataBasis(self.copy(), np.identity(self.n_basis)) @@ -235,7 +235,7 @@ def _list_to_R(self, knots): def _to_R(self): raise NotImplementedError - def inner_product_matrix(self, other=None): + def inner_product_matrix(self, other: 'Basis' = None) -> np.array: r"""Return the Inner Product Matrix of a pair of basis. The Inner Product Matrix is defined as @@ -249,12 +249,12 @@ def inner_product_matrix(self, other=None): between objects on two basis and for the change of basis. Args: - other (:class:`Basis`): Basis to compute the inner product + other: Basis to compute the inner product matrix. If not basis is given, it computes the matrix with itself returning the Gram Matrix Returns: - numpy.array: Inner Product Matrix of two basis + Inner Product Matrix of two basis """ from ...misc import inner_product_matrix @@ -264,7 +264,7 @@ def inner_product_matrix(self, other=None): return inner_product_matrix(self, other) - def _gram_matrix_numerical(self): + def _gram_matrix_numerical(self) -> np.array: """ Compute the Gram matrix numerically. @@ -273,7 +273,7 @@ def _gram_matrix_numerical(self): return inner_product_matrix(self, force_numerical=True) - def _gram_matrix(self): + def _gram_matrix(self) -> np.array: """ Compute the Gram matrix. @@ -283,7 +283,7 @@ def _gram_matrix(self): """ return self._gram_matrix_numerical() - def gram_matrix(self): + def gram_matrix(self) -> np.array: r"""Return the Gram Matrix of a basis The Gram Matrix is defined as @@ -295,7 +295,7 @@ def gram_matrix(self): symmetric matrix and positive-semidefinite. Returns: - numpy.array: Gram Matrix of the basis. + Gram Matrix of the basis. 
""" @@ -334,17 +334,17 @@ def _mul_constant(self, coefs, other): return self.copy(), coefs - def __repr__(self): + def __repr__(self) -> str: """Representation of a Basis object.""" return (f"{self.__class__.__name__}(domain_range={self.domain_range}, " f"n_basis={self.n_basis})") - def __eq__(self, other): + def __eq__(self, other) -> bool: """Equality of Basis""" return (type(self) == type(other) and _same_domain(self, other) and self.n_basis == other.n_basis) - def __hash__(self): + def __hash__(self) -> int: """Hash of Basis""" return hash((self.domain_range, self.n_basis)) From a6cb08dc648796b3c0980470e571039a7985e4a0 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 2 Dec 2020 02:58:23 +0100 Subject: [PATCH 157/210] Style --- setup.cfg | 17 +- .../classification/_centroid_classifiers.py | 33 ++- tests/test_neighbors.py | 271 +++++++++++------- 3 files changed, 209 insertions(+), 112 deletions(-) diff --git a/setup.cfg b/setup.cfg index ea7df5ade..9f74f46bb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,11 +22,19 @@ ignore = # (check https://github.com/wemake-services/wemake-python-styleguide/issues/1441) WPS436, # Our private objects are fine to import - WPS450 + WPS450, + # We need multine loops + WPS352, + # We love f-strings + WPS305, + # There are no bad quotes + Q000 per-file-ignores = # Unused modules are allowed in __init__, to reduce imports __init__.py: F401 + # Tests benefit from magic numbers + test_*.py: WPS432 rst-directives = # These are sorted alphabetically - but that does not matter @@ -37,6 +45,13 @@ rst-directives = rst-roles = attr,class,func,meth,mod,obj,ref,term, +# Needs to be tuned +max-line-complexity = 25 +max-methods = 25 +max-local-variables = 15 +max-expressions = 15 +max-module-expressions = 15 + [coverage:run] omit = # Omit reporting for dataset module diff --git a/skfda/ml/classification/_centroid_classifiers.py b/skfda/ml/classification/_centroid_classifiers.py index d09d748fd..f12736987 100644 --- a/skfda/ml/classification/_centroid_classifiers.py +++ b/skfda/ml/classification/_centroid_classifiers.py @@ -76,9 +76,13 @@ def fit(self, X, y): [n_samples, n_samples] if metric='precomputed'. y (array-like or sparse matrix): Target values of shape = [n_samples] or [n_samples, n_outputs]. + + Returns: + self """ - self.classes_, y_ind = _classifier_get_classes(y) + classes_, y_ind = _classifier_get_classes(y) + self.classes_ = classes_ self.centroids_ = self.centroid(X[y_ind == 0]) for cur_class in range(1, self.classes_.size): @@ -100,7 +104,10 @@ def predict(self, X): sklearn_check_is_fitted(self) return self.classes_[pairwise_distance(self.metric)( - X, self.centroids_).argmin(axis=1)] + X, + self.centroids_, + ).argmin(axis=1) + ] class DTMClassifier(BaseEstimator, ClassifierMixin): @@ -161,11 +168,16 @@ class DTMClassifier(BaseEstimator, ClassifierMixin): data. Test, 10, 419-440. """ - def __init__(self, proportiontocut: float, - depth_method: Depth = ModifiedBandDepth(), - metric: Callable = lp_distance) -> None: + def __init__( + self, + proportiontocut: float, + depth_method: Depth = None, + metric: Callable = lp_distance, + ) -> None: """Initialize the classifier.""" self.proportiontocut = proportiontocut + if depth_method is None: + self.depth_method = ModifiedBandDepth() self.depth_method = depth_method self.metric = metric @@ -176,12 +188,17 @@ def fit(self, X, y): X (:class:`FDataGrid`): FDataGrid with the training data. y (array-like): Target values of shape = [n_samples]. 
+ Returns: + self """ self._clf = NearestCentroid( metric=self.metric, - centroid=lambda fdatagrid: trim_mean(fdatagrid, - self.proportiontocut, - self.depth_method)) + centroid=lambda fdatagrid: trim_mean( + fdatagrid, + self.proportiontocut, + self.depth_method, + ), + ) self._clf.fit(X, y) return self diff --git a/tests/test_neighbors.py b/tests/test_neighbors.py index 30786409c..423e6da98 100644 --- a/tests/test_neighbors.py +++ b/tests/test_neighbors.py @@ -1,8 +1,9 @@ -"""Test neighbors classifiers and regressors""" +"""Test neighbors classifiers and regressors.""" import unittest import numpy as np + from skfda.datasets import make_multimodal_samples, make_sinusoidal_process from skfda.exploratory.outliers import LocalOutlierFactor # Pending theory from skfda.exploratory.stats import mean @@ -17,11 +18,12 @@ class TestNeighbors(unittest.TestCase): def setUp(self): - """Creates test data""" + """Create test data.""" random_state = np.random.RandomState(0) - modes_location = np.concatenate( - (random_state.normal(-.3, .04, size=15), - random_state.normal(.3, .04, size=15))) + modes_location = np.concatenate(( + random_state.normal(-0.3, 0.04, size=15), + random_state.normal(0.3, 0.04, size=15), + )) idx = np.arange(30) random_state.shuffle(idx) @@ -30,40 +32,55 @@ def setUp(self): self.modes_location = modes_location self.y = np.array(15 * [0] + 15 * [1])[idx] - self.X = make_multimodal_samples(n_samples=30, - modes_location=modes_location, - noise=.05, - random_state=random_state) - self.X2 = make_multimodal_samples(n_samples=30, - modes_location=modes_location, - noise=.05, - random_state=1) - - self.probs = np.array(15 * [[1., 0.]] + 15 * [[0., 1.]])[idx] + self.X = make_multimodal_samples( + n_samples=30, + modes_location=modes_location, + noise=0.05, + random_state=random_state, + ) + self.X2 = make_multimodal_samples( + n_samples=30, + modes_location=modes_location, + noise=0.05, + random_state=1, + ) + + self.probs = np.array(15 * [[1.0, 0.0]] + 15 * [[0.0, 1.0]])[idx] # Dataset with outliers - fd_clean = make_sinusoidal_process(n_samples=25, error_std=0, - phase_std=0.1, random_state=0) - fd_outliers = make_sinusoidal_process(n_samples=2, error_std=0, - phase_mean=0.5, random_state=5) + fd_clean = make_sinusoidal_process( + n_samples=25, + error_std=0, + phase_std=0.1, + random_state=0 + ) + fd_outliers = make_sinusoidal_process( + n_samples=2, + error_std=0, + phase_mean=0.5, + random_state=5, + ) self.fd_lof = fd_outliers.concatenate(fd_clean) def test_predict_classifier(self): - """Tests predict for neighbors classifier""" - - for neigh in (KNeighborsClassifier(), - RadiusNeighborsClassifier(radius=.1), - NearestCentroid(), - NearestCentroid(metric=lp_distance, centroid=mean)): + """Tests predict for neighbors classifier.""" + for neigh in ( + KNeighborsClassifier(), + RadiusNeighborsClassifier(radius=0.1), + NearestCentroid(), + NearestCentroid(metric=lp_distance, centroid=mean), + ): neigh.fit(self.X, self.y) pred = neigh.predict(self.X) - np.testing.assert_array_equal(pred, self.y, - err_msg=f"fail in {type(neigh)}") + np.testing.assert_array_equal( + pred, + self.y, + err_msg=f'fail in {type(neigh)}', + ) def test_predict_proba_classifier(self): - """Tests predict proba for k neighbors classifier""" - + """Tests predict proba for k neighbors classifier.""" neigh = KNeighborsClassifier(metric=lp_distance) neigh.fit(self.X, self.y) @@ -73,19 +90,22 @@ def test_predict_proba_classifier(self): def test_predict_regressor(self): """Test scalar regression, predics mode 
location.""" - # Dummy test, with weight = distance, only the sample with distance 0 # will be returned, obtaining the exact location knnr = KNeighborsRegressor(weights='distance') - rnnr = RadiusNeighborsRegressor(weights='distance', radius=.1) + rnnr = RadiusNeighborsRegressor(weights='distance', radius=0.1) knnr.fit(self.X, self.modes_location) rnnr.fit(self.X, self.modes_location) - np.testing.assert_array_almost_equal(knnr.predict(self.X), - self.modes_location) - np.testing.assert_array_almost_equal(rnnr.predict(self.X), - self.modes_location) + np.testing.assert_array_almost_equal( + knnr.predict(self.X), + self.modes_location, + ) + np.testing.assert_array_almost_equal( + rnnr.predict(self.X), + self.modes_location, + ) def test_kneighbors(self): """Test k neighbor searches for all k-neighbors estimators.""" @@ -101,14 +121,18 @@ def test_kneighbors(self): knnr = KNeighborsRegressor() knnr.fit(self.X, self.modes_location) - for neigh in [nn, knn, knnr, lof]: + for neigh in (nn, knn, knnr, lof): dist, links = neigh.kneighbors(self.X[:4]) - np.testing.assert_array_equal(links, [[0, 7, 21, 23, 15], - [1, 12, 19, 18, 17], - [2, 17, 22, 27, 26], - [3, 4, 9, 5, 25]]) + np.testing.assert_array_equal( + links, + [[0, 7, 21, 23, 15], + [1, 12, 19, 18, 17], + [2, 17, 22, 27, 26], + [3, 4, 9, 5, 25], + ], + ) graph = neigh.kneighbors_graph(self.X[:4]) @@ -122,16 +146,16 @@ def test_kneighbors(self): def test_radius_neighbors(self): """Test query with radius.""" - nn = NearestNeighbors(radius=.1) + nn = NearestNeighbors(radius=0.1) nn.fit(self.X) - knn = RadiusNeighborsClassifier(radius=.1) + knn = RadiusNeighborsClassifier(radius=0.1) knn.fit(self.X, self.y) - knnr = RadiusNeighborsRegressor(radius=.1) + knnr = RadiusNeighborsRegressor(radius=0.1) knnr.fit(self.X, self.modes_location) - for neigh in [nn, knn, knnr]: + for neigh in (nn, knn, knnr): dist, links = neigh.radius_neighbors(self.X[:4]) @@ -156,40 +180,54 @@ def test_knn_functional_response(self): knnr.fit(self.X, self.X) res = knnr.predict(self.X) - np.testing.assert_array_almost_equal(res.data_matrix, - self.X.data_matrix) + np.testing.assert_array_almost_equal( + res.data_matrix, + self.X.data_matrix, + ) def test_knn_functional_response_sklearn(self): # Check sklearn metric - knnr = KNeighborsRegressor(n_neighbors=1, metric='euclidean', - multivariate_metric=True) + knnr = KNeighborsRegressor( + n_neighbors=1, + metric='euclidean', + multivariate_metric=True, + ) knnr.fit(self.X, self.X) res = knnr.predict(self.X) - np.testing.assert_array_almost_equal(res.data_matrix, - self.X.data_matrix) + np.testing.assert_array_almost_equal( + res.data_matrix, + self.X.data_matrix, + ) def test_knn_functional_response_precomputed(self): - knnr = KNeighborsRegressor(n_neighbors=4, weights='distance', - metric='precomputed') + knnr = KNeighborsRegressor( + n_neighbors=4, + weights='distance', + metric='precomputed', + ) d = pairwise_distance(lp_distance) distances = d(self.X[:4], self.X[:4]) knnr.fit(distances, self.X[:4]) res = knnr.predict(distances) - np.testing.assert_array_almost_equal(res.data_matrix, - self.X[:4].data_matrix) + np.testing.assert_array_almost_equal( + res.data_matrix, self.X[:4].data_matrix, + ) def test_radius_functional_response(self): - knnr = RadiusNeighborsRegressor(metric=lp_distance, - weights='distance') + knnr = RadiusNeighborsRegressor( + metric=lp_distance, + weights='distance', + ) knnr.fit(self.X, self.X) res = knnr.predict(self.X) - np.testing.assert_array_almost_equal(res.data_matrix, - self.X.data_matrix) 
+ np.testing.assert_array_almost_equal( + res.data_matrix, self.X.data_matrix, + ) def test_functional_response_custom_weights(self): @@ -202,13 +240,15 @@ def weights(weights): knnr.fit(self.X, response) res = knnr.predict(self.X) - np.testing.assert_array_almost_equal(res.coefficients, - response.coefficients) + np.testing.assert_array_almost_equal( + res.coefficients, response.coefficients, + ) def test_functional_regression_distance_weights(self): knnr = KNeighborsRegressor( - weights='distance', n_neighbors=10) + weights='distance', n_neighbors=10, + ) knnr.fit(self.X[:10], self.X[:10]) res = knnr.predict(self.X[11]) @@ -219,8 +259,9 @@ def test_functional_regression_distance_weights(self): weights /= weights.sum() response = (self.X[:10] * weights).sum() - np.testing.assert_array_almost_equal(res.data_matrix, - response.data_matrix) + np.testing.assert_array_almost_equal( + res.data_matrix, response.data_matrix, + ) def test_functional_response_basis(self): knnr = KNeighborsRegressor(weights='distance', n_neighbors=5) @@ -228,8 +269,9 @@ def test_functional_response_basis(self): knnr.fit(self.X, response) res = knnr.predict(self.X) - np.testing.assert_array_almost_equal(res.coefficients, - response.coefficients) + np.testing.assert_array_almost_equal( + res.coefficients, response.coefficients, + ) def test_radius_outlier_functional_response(self): knnr = RadiusNeighborsRegressor(radius=0.001) @@ -240,25 +282,27 @@ def test_radius_outlier_functional_response(self): knnr.predict(self.X[:10]) # Test response - knnr = RadiusNeighborsRegressor(radius=0.001, - outlier_response=self.X[0]) + knnr = RadiusNeighborsRegressor( + radius=0.001, outlier_response=self.X[0], + ) knnr.fit(self.X[:6], self.X[:6]) res = knnr.predict(self.X[:7]) - np.testing.assert_array_almost_equal(self.X[0].data_matrix, - res[6].data_matrix) + np.testing.assert_array_almost_equal( + self.X[0].data_matrix, res[6].data_matrix, + ) def test_nearest_centroids_exceptions(self): # Test more than one class nn = NearestCentroid() with np.testing.assert_raises(ValueError): - nn.fit(self.X[0:3], 3 * [0]) + nn.fit(self.X[:3], 3 * [0]) # Precomputed not supported nn = NearestCentroid(metric='precomputed') with np.testing.assert_raises(ValueError): - nn.fit(self.X[0:3], 3 * [0]) + nn.fit(self.X[:3], 3 * [0]) def test_functional_regressor_exceptions(self): @@ -276,19 +320,26 @@ def test_search_neighbors_precomputed(self): _, neighbors = nn.kneighbors(distances) - result = np.array([[0, 3], [1, 2], [2, 1], [3, 0]]) - np.testing.assert_array_almost_equal(neighbors, result) + np.testing.assert_array_almost_equal( + neighbors, + np.array([[0, 3], [1, 2], [2, 1], [3, 0]]), + ) def test_search_neighbors_sklearn(self): - nn = NearestNeighbors(metric='euclidean', multivariate_metric=True, - n_neighbors=2) + nn = NearestNeighbors( + metric='euclidean', + multivariate_metric=True, + n_neighbors=2, + ) nn.fit(self.X[:4], self.y[:4]) _, neighbors = nn.kneighbors(self.X[:4]) - result = np.array([[0, 3], [1, 2], [2, 1], [3, 0]]) - np.testing.assert_array_almost_equal(neighbors, result) + np.testing.assert_array_almost_equal( + neighbors, + np.array([[0, 3], [1, 2], [2, 1], [3, 0]]), + ) def test_score_scalar_response(self): @@ -311,8 +362,11 @@ def test_score_functional_response(self): y = y.to_basis(Fourier(domain_range=y.domain_range[0], n_basis=5)) neigh.fit(self.X, y) - r = neigh.score(self.X[:7], y[:7], - sample_weight=4 * [1. / 5] + 3 * [1. 
/ 15]) + r = neigh.score( + self.X[:7], + y[:7], + sample_weight=4 * [1.0 / 5] + 3 * [1.0 / 15], + ) np.testing.assert_almost_equal(r, 0.9982527586114364) def test_score_functional_response_exceptions(self): @@ -333,11 +387,10 @@ def test_multivariate_response_score(self): neigh.score(self.X[:5], y) def test_lof_fit_predict(self): - """ Test same results with different forms to call fit_predict""" - + """ Test same results with different forms to call fit_predict.""" # Outliers expected = np.ones(len(self.fd_lof)) - expected[0:2] = -1 + expected[:2] = -1 # With default l2 distance lof = LocalOutlierFactor() @@ -353,12 +406,12 @@ def test_lof_fit_predict(self): distances = d(self.fd_lof, self.fd_lof) # With precompute distances - lof3 = LocalOutlierFactor(metric="precomputed") + lof3 = LocalOutlierFactor(metric='precomputed') res3 = lof3.fit_predict(distances) np.testing.assert_array_equal(expected, res3) # With multivariate sklearn - lof4 = LocalOutlierFactor(metric="euclidean", multivariate_metric=True) + lof4 = LocalOutlierFactor(metric='euclidean', multivariate_metric=True) res4 = lof4.fit_predict(self.fd_lof) np.testing.assert_array_equal(expected, res4) @@ -368,42 +421,55 @@ def test_lof_fit_predict(self): np.testing.assert_array_equal(expected, res5) # Check values of negative outlier factor - negative_lof = [-7.1068, -1.5412, -0.9961, -0.9854, -0.9896, -1.0993, - -1.065, -0.9871, -0.9821, -0.9955, -1.0385, -1.0072, - -0.9832, -1.0134, -0.9939, -1.0074, -0.992, -0.992, - -0.9883, -1.0012, -1.1149, -1.002, -0.9994, -0.9869, - -0.9726, -0.9989, -0.9904] + negative_lof = [ # noqa: WPS317 + -7.1068, -1.5412, -0.9961, + -0.9854, -0.9896, -1.0993, + -1.065, -0.9871, -0.9821, + -0.9955, -1.0385, -1.0072, + -0.9832, -1.0134, -0.9939, + -1.0074, -0.992, -0.992, + -0.9883, -1.0012, -1.1149, + -1.002, -0.9994, -0.9869, + -0.9726, -0.9989, -0.9904, + ] np.testing.assert_array_almost_equal( - lof.negative_outlier_factor_.round(4), negative_lof) + lof.negative_outlier_factor_.round(4), negative_lof, + ) # Check same negative outlier factor - np.testing.assert_array_almost_equal(lof.negative_outlier_factor_, - lof2.negative_outlier_factor_) + np.testing.assert_array_almost_equal( + lof.negative_outlier_factor_, + lof2.negative_outlier_factor_, + ) - np.testing.assert_array_almost_equal(lof.negative_outlier_factor_, - lof3.negative_outlier_factor_) + np.testing.assert_array_almost_equal( + lof.negative_outlier_factor_, + lof3.negative_outlier_factor_, + ) def test_lof_decision_function(self): - """ Test decision function and score samples of LOF""" - + """ Test decision function and score samples of LOF.""" lof = LocalOutlierFactor(novelty=True) lof.fit(self.fd_lof[5:]) score = lof.score_samples(self.fd_lof[:5]) np.testing.assert_array_almost_equal( - score.round(4), [-5.9726, -1.3445, -0.9853, -0.9817, -0.985], - err_msg='Error in LocalOutlierFactor.score_samples') + score.round(4), + [-5.9726, -1.3445, -0.9853, -0.9817, -0.985], + err_msg='Error in LocalOutlierFactor.score_samples', + ) # Test decision_function = score_function - offset np.testing.assert_array_almost_equal( - lof.decision_function(self.fd_lof[:5]), score - lof.offset_, - err_msg='Error in LocalOutlierFactor.decision_function') + lof.decision_function(self.fd_lof[:5]), + score - lof.offset_, + err_msg='Error in LocalOutlierFactor.decision_function', + ) def test_lof_exceptions(self): - """ Test error due to novelty attribute""" - + """ Test error due to novelty attribute.""" lof = LocalOutlierFactor(novelty=True) # Error in 
fit_predict function @@ -419,5 +485,4 @@ def test_lof_exceptions(self): if __name__ == '__main__': - print() unittest.main() From 6963f4445d0c15f6c6cb5d5a80d33ec9d23db65e Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 2 Dec 2020 03:12:39 +0100 Subject: [PATCH 158/210] isort --- setup.cfg | 5 +++++ tests/test_neighbors.py | 12 +++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/setup.cfg b/setup.cfg index 9f74f46bb..3823a58ee 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,6 +51,7 @@ max-methods = 25 max-local-variables = 15 max-expressions = 15 max-module-expressions = 15 +max-string-usages = 10 [coverage:run] omit = @@ -58,3 +59,7 @@ omit = */datasets/* # Omit reporting for __init__.py files */__init__.py + +[isort] +multi_line_output = 3 +include_trailing_comma = true \ No newline at end of file diff --git a/tests/test_neighbors.py b/tests/test_neighbors.py index 423e6da98..3bd20decc 100644 --- a/tests/test_neighbors.py +++ b/tests/test_neighbors.py @@ -8,8 +8,11 @@ from skfda.exploratory.outliers import LocalOutlierFactor # Pending theory from skfda.exploratory.stats import mean from skfda.misc.metrics import l2_distance, lp_distance, pairwise_distance -from skfda.ml.classification import (KNeighborsClassifier, NearestCentroid, - RadiusNeighborsClassifier) +from skfda.ml.classification import ( + KNeighborsClassifier, + NearestCentroid, + RadiusNeighborsClassifier, +) from skfda.ml.clustering import NearestNeighbors from skfda.ml.regression import KNeighborsRegressor, RadiusNeighborsRegressor from skfda.representation.basis import Fourier @@ -231,9 +234,8 @@ def test_radius_functional_response(self): def test_functional_response_custom_weights(self): - def weights(weights): - - return np.array([w == 0 for w in weights], dtype=float) + def weights(weights_): + return np.array([w == 0 for w in weights_], dtype=float) knnr = KNeighborsRegressor(weights=weights, n_neighbors=5) response = self.X.to_basis(Fourier(domain_range=(-1, 1), n_basis=10)) From 7fb8770de44d60938f5a6dbb3f0d9fdd34d48f43 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 2 Dec 2020 03:18:06 +0100 Subject: [PATCH 159/210] test_neighbors without warnings --- setup.cfg | 2 +- tests/test_neighbors.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/setup.cfg b/setup.cfg index 3823a58ee..5067fd991 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,7 +47,7 @@ rst-roles = # Needs to be tuned max-line-complexity = 25 -max-methods = 25 +max-methods = 30 max-local-variables = 15 max-expressions = 15 max-module-expressions = 15 diff --git a/tests/test_neighbors.py b/tests/test_neighbors.py index 3bd20decc..ca09054bd 100644 --- a/tests/test_neighbors.py +++ b/tests/test_neighbors.py @@ -232,12 +232,12 @@ def test_radius_functional_response(self): res.data_matrix, self.X.data_matrix, ) - def test_functional_response_custom_weights(self): + def weights(self, weights_): + return np.array([w == 0 for w in weights_], dtype=float) - def weights(weights_): - return np.array([w == 0 for w in weights_], dtype=float) + def test_functional_response_custom_weights(self): - knnr = KNeighborsRegressor(weights=weights, n_neighbors=5) + knnr = KNeighborsRegressor(weights=self.weights, n_neighbors=5) response = self.X.to_basis(Fourier(domain_range=(-1, 1), n_basis=10)) knnr.fit(self.X, response) From 25a7d789bcb47bcd9c14d1473f2758f8cc246ea3 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 2 Dec 2020 03:45:10 +0100 Subject: [PATCH 160/210] More changes --- 
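Note: the import blocks below are rewritten into the layout enforced by the
[isort] settings introduced earlier in this series. With multi_line_output = 3
(vertical hanging indent) and include_trailing_comma = true, a long import is
formatted as

    from ._directional_outlyingness import (
        DirectionalOutlierDetector,
        directional_outlyingness_stats,
    )

so adding or removing one name touches exactly one line in future diffs.
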
skfda/exploratory/outliers/__init__.py | 6 ++-- .../exploratory/outliers/neighbors_outlier.py | 8 +++-- skfda/ml/classification/__init__.py | 6 ++-- .../classification/neighbors_classifiers.py | 14 +++++---- skfda/ml/clustering/kmeans.py | 2 +- skfda/ml/clustering/neighbors.py | 8 +++-- skfda/ml/regression/neighbors.py | 29 ++++++++++++------- tests/test_clustering.py | 15 ++++++---- tests/test_neighbors.py | 8 ++--- 9 files changed, 61 insertions(+), 35 deletions(-) diff --git a/skfda/exploratory/outliers/__init__.py b/skfda/exploratory/outliers/__init__.py index d8adfc7b2..862e66fd9 100644 --- a/skfda/exploratory/outliers/__init__.py +++ b/skfda/exploratory/outliers/__init__.py @@ -1,4 +1,6 @@ -from ._directional_outlyingness import (DirectionalOutlierDetector, - directional_outlyingness_stats) +from ._directional_outlyingness import ( + DirectionalOutlierDetector, + directional_outlyingness_stats, +) from ._iqr import IQROutlierDetector from .neighbors_outlier import LocalOutlierFactor diff --git a/skfda/exploratory/outliers/neighbors_outlier.py b/skfda/exploratory/outliers/neighbors_outlier.py index f7fe5817c..41a92e56e 100644 --- a/skfda/exploratory/outliers/neighbors_outlier.py +++ b/skfda/exploratory/outliers/neighbors_outlier.py @@ -1,8 +1,12 @@ from sklearn.base import OutlierMixin from ...misc.metrics import lp_distance -from ...ml._neighbors_base import (KNeighborsMixin, NeighborsBase, - NeighborsMixin, _to_multivariate_metric) +from ...ml._neighbors_base import ( + KNeighborsMixin, + NeighborsBase, + NeighborsMixin, + _to_multivariate_metric, +) class LocalOutlierFactor(NeighborsBase, NeighborsMixin, KNeighborsMixin, diff --git a/skfda/ml/classification/__init__.py b/skfda/ml/classification/__init__.py index 80a2546c4..f90822dbf 100644 --- a/skfda/ml/classification/__init__.py +++ b/skfda/ml/classification/__init__.py @@ -1,4 +1,6 @@ from ._centroid_classifiers import DTMClassifier, NearestCentroid from .depth_classifiers import MaximumDepthClassifier -from .neighbors_classifiers import (KNeighborsClassifier, - RadiusNeighborsClassifier) +from .neighbors_classifiers import ( + KNeighborsClassifier, + RadiusNeighborsClassifier, +) diff --git a/skfda/ml/classification/neighbors_classifiers.py b/skfda/ml/classification/neighbors_classifiers.py index ed933fcd3..e5165e083 100644 --- a/skfda/ml/classification/neighbors_classifiers.py +++ b/skfda/ml/classification/neighbors_classifiers.py @@ -1,10 +1,15 @@ """Neighbor models for supervised classification.""" from sklearn.base import ClassifierMixin +from sklearn.neighbors import KNeighborsClassifier as _KNeighborsClassifier -from .._neighbors_base import (KNeighborsMixin, NeighborsBase, - NeighborsClassifierMixin, NeighborsMixin, - RadiusNeighborsMixin) +from .._neighbors_base import ( + KNeighborsMixin, + NeighborsBase, + NeighborsClassifierMixin, + NeighborsMixin, + RadiusNeighborsMixin, +) class KNeighborsClassifier(NeighborsBase, NeighborsMixin, KNeighborsMixin, @@ -129,9 +134,6 @@ def _init_estimator(self, sklearn_metric): Returns: Sklearn K Neighbors estimator initialized. 
""" - from sklearn.neighbors import \ - KNeighborsClassifier as _KNeighborsClassifier - return _KNeighborsClassifier( n_neighbors=self.n_neighbors, weights=self.weights, algorithm=self.algorithm, leaf_size=self.leaf_size, diff --git a/skfda/ml/clustering/kmeans.py b/skfda/ml/clustering/kmeans.py index 94605b3c8..285658dc6 100644 --- a/skfda/ml/clustering/kmeans.py +++ b/skfda/ml/clustering/kmeans.py @@ -699,7 +699,7 @@ def _check_params(self): def _compute_inertia(self, membership, centroids, distances_to_centroids): return np.sum( - membership ** self.fuzzifier * distances_to_centroids ** 2 + membership ** self.fuzzifier * distances_to_centroids ** 2, ) def _create_membership(self, n_samples): diff --git a/skfda/ml/clustering/neighbors.py b/skfda/ml/clustering/neighbors.py index 1dcfa24e9..3b305b961 100644 --- a/skfda/ml/clustering/neighbors.py +++ b/skfda/ml/clustering/neighbors.py @@ -1,7 +1,11 @@ """Unsupervised learner for implementing neighbor searches.""" -from .._neighbors_base import (KNeighborsMixin, NeighborsBase, NeighborsMixin, - RadiusNeighborsMixin) +from .._neighbors_base import ( + KNeighborsMixin, + NeighborsBase, + NeighborsMixin, + RadiusNeighborsMixin, +) class NearestNeighbors(NeighborsBase, NeighborsMixin, KNeighborsMixin, diff --git a/skfda/ml/regression/neighbors.py b/skfda/ml/regression/neighbors.py index 02cd1ff85..091b7507b 100644 --- a/skfda/ml/regression/neighbors.py +++ b/skfda/ml/regression/neighbors.py @@ -1,7 +1,15 @@ """Neighbor models for regression.""" -from .._neighbors_base import (KNeighborsMixin, NeighborsBase, - NeighborsRegressorMixin, RadiusNeighborsMixin) +from sklearn.neighbors import KNeighborsRegressor as _KNeighborsRegressor +from sklearn.neighbors import \ + RadiusNeighborsRegressor as _RadiusNeighborsRegressor + +from .._neighbors_base import ( + KNeighborsMixin, + NeighborsBase, + NeighborsRegressorMixin, + RadiusNeighborsMixin, +) class KNeighborsRegressor(NeighborsBase, NeighborsRegressorMixin, @@ -153,9 +161,6 @@ def _init_multivariate_estimator(self, sklearn_metric): Sklearn K Neighbors estimator initialized. """ - from sklearn.neighbors import \ - KNeighborsRegressor as _KNeighborsRegressor - return _KNeighborsRegressor( n_neighbors=self.n_neighbors, weights=self.weights, algorithm=self.algorithm, leaf_size=self.leaf_size, @@ -317,9 +322,6 @@ def _init_multivariate_estimator(self, sklearn_metric): Sklearn Radius Neighbors estimator initialized. """ - from sklearn.neighbors import \ - RadiusNeighborsRegressor as _RadiusNeighborsRegressor - return _RadiusNeighborsRegressor( radius=self.radius, weights=self.weights, algorithm=self.algorithm, leaf_size=self.leaf_size, @@ -327,6 +329,13 @@ def _init_multivariate_estimator(self, sklearn_metric): n_jobs=self.n_jobs) def _query(self, X): - """Return distances and neighbors of given sample.""" - + """Return distances and neighbors of given sample. + + Args: + X: the sample + + Returns: + Distances and neighbors of a given sample + + """ return self.estimator_.radius_neighbors(X) diff --git a/tests/test_clustering.py b/tests/test_clustering.py index dc93bb4e9..b29c06d2b 100644 --- a/tests/test_clustering.py +++ b/tests/test_clustering.py @@ -19,12 +19,15 @@ def test_kmeans_univariate(self): init_fd = FDataGrid(init, grid_points) kmeans = KMeans(init=init_fd) distances_to_centers = kmeans.fit_transform(fd) - np.testing.assert_allclose(distances_to_centers, - np.array([[2.98142397, 9.23534876], - [0.68718427, 6.50960828], - [3.31243449, 4.39222798], - [6.49679408, 0.] 
- ])) + np.testing.assert_allclose( + distances_to_centers, + np.array([ + [2.98142397, 9.23534876], + [0.68718427, 6.50960828], + [3.31243449, 4.39222798], + [6.49679408, 0.0], + ]), + ) np.testing.assert_array_equal(kmeans.predict(fd), np.array([0, 0, 0, 1])) np.testing.assert_allclose(kmeans.transform(fd), diff --git a/tests/test_neighbors.py b/tests/test_neighbors.py index ca09054bd..46c6b486a 100644 --- a/tests/test_neighbors.py +++ b/tests/test_neighbors.py @@ -55,7 +55,7 @@ def setUp(self): n_samples=25, error_std=0, phase_std=0.1, - random_state=0 + random_state=0, ) fd_outliers = make_sinusoidal_process( n_samples=2, @@ -389,7 +389,7 @@ def test_multivariate_response_score(self): neigh.score(self.X[:5], y) def test_lof_fit_predict(self): - """ Test same results with different forms to call fit_predict.""" + """Test same results with different forms to call fit_predict.""" # Outliers expected = np.ones(len(self.fd_lof)) expected[:2] = -1 @@ -451,7 +451,7 @@ def test_lof_fit_predict(self): ) def test_lof_decision_function(self): - """ Test decision function and score samples of LOF.""" + """Test decision function and score samples of LOF.""" lof = LocalOutlierFactor(novelty=True) lof.fit(self.fd_lof[5:]) @@ -471,7 +471,7 @@ def test_lof_decision_function(self): ) def test_lof_exceptions(self): - """ Test error due to novelty attribute.""" + """Test error due to novelty attribute.""" lof = LocalOutlierFactor(novelty=True) # Error in fit_predict function From 5182fb8b76a946ead07e2d0fd5dbce844f9bcc1d Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 2 Dec 2020 04:00:29 +0100 Subject: [PATCH 161/210] More updates --- skfda/exploratory/outliers/neighbors_outlier.py | 1 + skfda/ml/classification/_centroid_classifiers.py | 13 +++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/skfda/exploratory/outliers/neighbors_outlier.py b/skfda/exploratory/outliers/neighbors_outlier.py index 41a92e56e..d9fd591a0 100644 --- a/skfda/exploratory/outliers/neighbors_outlier.py +++ b/skfda/exploratory/outliers/neighbors_outlier.py @@ -1,3 +1,4 @@ +"""Neighbors outlier detection methods.""" from sklearn.base import OutlierMixin from ...misc.metrics import lp_distance diff --git a/skfda/ml/classification/_centroid_classifiers.py b/skfda/ml/classification/_centroid_classifiers.py index f12736987..fd95d096e 100644 --- a/skfda/ml/classification/_centroid_classifiers.py +++ b/skfda/ml/classification/_centroid_classifiers.py @@ -33,9 +33,6 @@ class NearestCentroid(BaseEstimator, ClassifierMixin): The function must accept a :class:`FData` with the samples of one class and return a :class:`FData` object with only one sample representing the centroid. - Attributes: - centroids_: :class:`FDataGrid` - FDatagrid containing the centroid of each class Examples: Firstly, we will create a toy dataset with 2 classes @@ -63,7 +60,15 @@ class and return a :class:`FData` object with only one sample """ def __init__(self, metric=l2_distance, centroid=mean): - """Initialize the classifier.""" + """Initialize the classifier. + + Args: + metric: The metric to use when calculating distance between test + samples and centroids. + centroid: Point from which the sum of the distances (according to + the metric) of all samples that belong to that particular class + are minimized. 
+ """ self.metric = metric self.centroid = centroid From 9718c5b90c15d4fa7a1c15959593f25d553a9eb5 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 2 Dec 2020 04:04:03 +0100 Subject: [PATCH 162/210] No doctrings for __init__.py --- setup.cfg | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 5067fd991..fe1afa846 100644 --- a/setup.cfg +++ b/setup.cfg @@ -31,8 +31,11 @@ ignore = Q000 per-file-ignores = - # Unused modules are allowed in __init__, to reduce imports - __init__.py: F401 + __init__.py: + # Unused modules are allowed in __init__, to reduce imports + F401, + # No docstrings are allowed in __init__ + D104 # Tests benefit from magic numbers test_*.py: WPS432 From 831b0522d085394468748aa83f4ab1f563169f47 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 2 Dec 2020 04:23:23 +0100 Subject: [PATCH 163/210] __init__ method --- skfda/ml/classification/_centroid_classifiers.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/skfda/ml/classification/_centroid_classifiers.py b/skfda/ml/classification/_centroid_classifiers.py index fd95d096e..cae9b45d3 100644 --- a/skfda/ml/classification/_centroid_classifiers.py +++ b/skfda/ml/classification/_centroid_classifiers.py @@ -60,15 +60,7 @@ class and return a :class:`FData` object with only one sample """ def __init__(self, metric=l2_distance, centroid=mean): - """Initialize the classifier. - - Args: - metric: The metric to use when calculating distance between test - samples and centroids. - centroid: Point from which the sum of the distances (according to - the metric) of all samples that belong to that particular class - are minimized. - """ + """Initialize the classifier.""" self.metric = metric self.centroid = centroid From e479110652fa37d45068766b8f74f3be8660cce8 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 2 Dec 2020 14:00:15 +0100 Subject: [PATCH 164/210] Imports style --- setup.cfg | 25 +++++++++++-------- skfda/ml/classification/depth_classifiers.py | 5 ++-- .../classification/neighbors_classifiers.py | 8 +++--- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/setup.cfg b/setup.cfg index fe1afa846..95d346135 100644 --- a/setup.cfg +++ b/setup.cfg @@ -10,6 +10,8 @@ norecursedirs = '.*', 'build', 'dist' '*.egg' 'venv' .svn _build docs/auto_examp ignore = # Uppercase arguments like X are common in scikit-learn N803, + # There are no bad quotes + Q000, # Line break occurred before a binary operator (antipattern) W503, # Short names like X or y are common in scikit-learn @@ -18,24 +20,23 @@ ignore = WPS120, # We like local imports, thanks WPS300, + # We love f-strings + WPS305, + # We need multine loops + WPS352, # Our private modules are fine to import # (check https://github.com/wemake-services/wemake-python-styleguide/issues/1441) WPS436, # Our private objects are fine to import - WPS450, - # We need multine loops - WPS352, - # We love f-strings - WPS305, - # There are no bad quotes - Q000 + WPS450 per-file-ignores = __init__.py: - # Unused modules are allowed in __init__, to reduce imports - F401, # No docstrings are allowed in __init__ - D104 + D104, + # Unused modules are allowed in __init__, to reduce imports + F401 + # Tests benefit from magic numbers test_*.py: WPS432 @@ -65,4 +66,6 @@ omit = [isort] multi_line_output = 3 -include_trailing_comma = true \ No newline at end of file +include_trailing_comma = true +use_parentheses = true +combine_as_imports = 1 diff --git a/skfda/ml/classification/depth_classifiers.py 
b/skfda/ml/classification/depth_classifiers.py index 429ec3267..80478976a 100644 --- a/skfda/ml/classification/depth_classifiers.py +++ b/skfda/ml/classification/depth_classifiers.py @@ -1,12 +1,11 @@ """Depth models for supervised classification.""" import numpy as np - -from sklearn.base import ClassifierMixin, BaseEstimator, clone +from sklearn.base import BaseEstimator, ClassifierMixin, clone from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted -from ...exploratory.depth import Depth, ModifiedBandDepth from ..._utils import _classifier_get_classes +from ...exploratory.depth import Depth, ModifiedBandDepth class MaximumDepthClassifier(BaseEstimator, ClassifierMixin): diff --git a/skfda/ml/classification/neighbors_classifiers.py b/skfda/ml/classification/neighbors_classifiers.py index e5165e083..743d269ee 100644 --- a/skfda/ml/classification/neighbors_classifiers.py +++ b/skfda/ml/classification/neighbors_classifiers.py @@ -1,7 +1,10 @@ """Neighbor models for supervised classification.""" from sklearn.base import ClassifierMixin -from sklearn.neighbors import KNeighborsClassifier as _KNeighborsClassifier +from sklearn.neighbors import ( + KNeighborsClassifier as _KNeighborsClassifier, + RadiusNeighborsClassifier as _RadiusNeighborsClassifier, +) from .._neighbors_base import ( KNeighborsMixin, @@ -277,9 +280,6 @@ def _init_estimator(self, sklearn_metric): Returns: Sklearn Radius Neighbors estimator initialized. """ - from sklearn.neighbors import \ - RadiusNeighborsClassifier as _RadiusNeighborsClassifier - return _RadiusNeighborsClassifier( radius=self.radius, weights=self.weights, algorithm=self.algorithm, leaf_size=self.leaf_size, From 3e6c94c073f91663b8f81ee4da12b2b12b63f6d6 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 2 Dec 2020 14:02:16 +0100 Subject: [PATCH 165/210] More isort --- skfda/ml/regression/neighbors.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/skfda/ml/regression/neighbors.py b/skfda/ml/regression/neighbors.py index 091b7507b..9308f824b 100644 --- a/skfda/ml/regression/neighbors.py +++ b/skfda/ml/regression/neighbors.py @@ -1,8 +1,9 @@ """Neighbor models for regression.""" -from sklearn.neighbors import KNeighborsRegressor as _KNeighborsRegressor -from sklearn.neighbors import \ - RadiusNeighborsRegressor as _RadiusNeighborsRegressor +from sklearn.neighbors import ( + KNeighborsRegressor as _KNeighborsRegressor, + RadiusNeighborsRegressor as _RadiusNeighborsRegressor, +) from .._neighbors_base import ( KNeighborsMixin, From d062336bd6698050501bedfa8f18f81e12bb2ca6 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 2 Dec 2020 14:12:23 +0100 Subject: [PATCH 166/210] No docstrings in __init__ method --- setup.cfg | 2 ++ skfda/ml/classification/_centroid_classifiers.py | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 95d346135..c6d6e7487 100644 --- a/setup.cfg +++ b/setup.cfg @@ -8,6 +8,8 @@ norecursedirs = '.*', 'build', 'dist' '*.egg' 'venv' .svn _build docs/auto_examp [flake8] ignore = + # No docstrings in __init__ + D107, # Uppercase arguments like X are common in scikit-learn N803, # There are no bad quotes diff --git a/skfda/ml/classification/_centroid_classifiers.py b/skfda/ml/classification/_centroid_classifiers.py index cae9b45d3..74c80f279 100644 --- a/skfda/ml/classification/_centroid_classifiers.py +++ b/skfda/ml/classification/_centroid_classifiers.py @@ -60,7 +60,6 @@ class and return a :class:`FData` object with only one sample """ def 
__init__(self, metric=l2_distance, centroid=mean): - """Initialize the classifier.""" self.metric = metric self.centroid = centroid @@ -171,7 +170,6 @@ def __init__( depth_method: Depth = None, metric: Callable = lp_distance, ) -> None: - """Initialize the classifier.""" self.proportiontocut = proportiontocut if depth_method is None: self.depth_method = ModifiedBandDepth() From 87aaa9bc0391b995a71caf31859087bd3d98a3cf Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 2 Dec 2020 14:21:53 +0100 Subject: [PATCH 167/210] self type --- skfda/ml/classification/_centroid_classifiers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skfda/ml/classification/_centroid_classifiers.py b/skfda/ml/classification/_centroid_classifiers.py index 74c80f279..0c4c384af 100644 --- a/skfda/ml/classification/_centroid_classifiers.py +++ b/skfda/ml/classification/_centroid_classifiers.py @@ -74,7 +74,7 @@ def fit(self, X, y): shape = [n_samples] or [n_samples, n_outputs]. Returns: - self + self (object) """ classes_, y_ind = _classifier_get_classes(y) @@ -184,7 +184,7 @@ def fit(self, X, y): y (array-like): Target values of shape = [n_samples]. Returns: - self + self (object) """ self._clf = NearestCentroid( metric=self.metric, From e0a346354304bde1ca729cef49d6a04ab71f3318 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 2 Dec 2020 14:30:41 +0100 Subject: [PATCH 168/210] Make aux function private --- tests/test_neighbors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_neighbors.py b/tests/test_neighbors.py index 46c6b486a..7ae5a2bd3 100644 --- a/tests/test_neighbors.py +++ b/tests/test_neighbors.py @@ -232,12 +232,12 @@ def test_radius_functional_response(self): res.data_matrix, self.X.data_matrix, ) - def weights(self, weights_): + def _weights(self, weights_): return np.array([w == 0 for w in weights_], dtype=float) def test_functional_response_custom_weights(self): - knnr = KNeighborsRegressor(weights=self.weights, n_neighbors=5) + knnr = KNeighborsRegressor(weights=self._weights, n_neighbors=5) response = self.X.to_basis(Fourier(domain_range=(-1, 1), n_basis=10)) knnr.fit(self.X, response) From 9c39ccb747f548f6075d8edcfddb528c0f3b4fa2 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 2 Dec 2020 14:33:35 +0100 Subject: [PATCH 169/210] Ordering methods --- tests/test_neighbors.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_neighbors.py b/tests/test_neighbors.py index 7ae5a2bd3..7d6288bf7 100644 --- a/tests/test_neighbors.py +++ b/tests/test_neighbors.py @@ -232,9 +232,6 @@ def test_radius_functional_response(self): res.data_matrix, self.X.data_matrix, ) - def _weights(self, weights_): - return np.array([w == 0 for w in weights_], dtype=float) - def test_functional_response_custom_weights(self): knnr = KNeighborsRegressor(weights=self._weights, n_neighbors=5) @@ -485,6 +482,9 @@ def test_lof_exceptions(self): with np.testing.assert_raises(AttributeError): lof.predict(self.fd_lof[5:]) + def _weights(self, weights_): + return np.array([w == 0 for w in weights_], dtype=float) + if __name__ == '__main__': unittest.main() From 1e5b3de8df9ae2fb31012d0643cd93d8cd828e77 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 2 Dec 2020 16:45:54 +0100 Subject: [PATCH 170/210] Typos --- skfda/ml/classification/depth_classifiers.py | 2 +- skfda/ml/clustering/kmeans.py | 6 +++--- tests/test_neighbors.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/skfda/ml/classification/depth_classifiers.py 
b/skfda/ml/classification/depth_classifiers.py index 80478976a..ecca1e600 100644 --- a/skfda/ml/classification/depth_classifiers.py +++ b/skfda/ml/classification/depth_classifiers.py @@ -1,4 +1,4 @@ -"""Depth models for supervised classification.""" +"""Depth-based models for supervised classification.""" import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin, clone diff --git a/skfda/ml/clustering/kmeans.py b/skfda/ml/clustering/kmeans.py index 285658dc6..2f9ef8bc2 100644 --- a/skfda/ml/clustering/kmeans.py +++ b/skfda/ml/clustering/kmeans.py @@ -514,7 +514,7 @@ def _compute_inertia(self, membership, centroids, distances_to_their_center = np.choose(membership, distances_to_centroids.T) - return np.sum(distances_to_their_center ** 2) + return np.sum(distances_to_their_center**2) def _create_membership(self, n_samples): return np.empty(n_samples, dtype=int) @@ -699,7 +699,7 @@ def _check_params(self): def _compute_inertia(self, membership, centroids, distances_to_centroids): return np.sum( - membership ** self.fuzzifier * distances_to_centroids ** 2, + membership**self.fuzzifier * distances_to_centroids**2, ) def _create_membership(self, n_samples): @@ -709,7 +709,7 @@ def _update(self, fdata, membership_matrix, distances_to_centroids, centroids): # Divisions by zero allowed with np.errstate(divide='ignore'): - distances_to_centers_raised = (distances_to_centroids ** ( + distances_to_centers_raised = (distances_to_centroids**( 2 / (1 - self.fuzzifier))) # Divisions infinity by infinity allowed diff --git a/tests/test_neighbors.py b/tests/test_neighbors.py index 7d6288bf7..e699afa22 100644 --- a/tests/test_neighbors.py +++ b/tests/test_neighbors.py @@ -92,7 +92,7 @@ def test_predict_proba_classifier(self): np.testing.assert_array_almost_equal(probs, self.probs) def test_predict_regressor(self): - """Test scalar regression, predics mode location.""" + """Test scalar regression, predicts mode location.""" # Dummy test, with weight = distance, only the sample with distance 0 # will be returned, obtaining the exact location knnr = KNeighborsRegressor(weights='distance') From 7b8089c83c6fea252ab2a3de6708219d0bdbaac2 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 2 Dec 2020 17:26:26 +0100 Subject: [PATCH 171/210] None bug --- .../classification/_centroid_classifiers.py | 5 +++- skfda/ml/classification/depth_classifiers.py | 28 +++++++++++++------ 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/skfda/ml/classification/_centroid_classifiers.py b/skfda/ml/classification/_centroid_classifiers.py index 0c4c384af..6e064d586 100644 --- a/skfda/ml/classification/_centroid_classifiers.py +++ b/skfda/ml/classification/_centroid_classifiers.py @@ -171,9 +171,12 @@ def __init__( metric: Callable = lp_distance, ) -> None: self.proportiontocut = proportiontocut + if depth_method is None: self.depth_method = ModifiedBandDepth() - self.depth_method = depth_method + else: + self.depth_method = depth_method + self.metric = metric def fit(self, X, y): diff --git a/skfda/ml/classification/depth_classifiers.py b/skfda/ml/classification/depth_classifiers.py index ecca1e600..cf1e32e20 100644 --- a/skfda/ml/classification/depth_classifiers.py +++ b/skfda/ml/classification/depth_classifiers.py @@ -49,17 +49,21 @@ class MaximumDepthClassifier(BaseEstimator, ClassifierMixin): 0.875 See also: - :class:`~skfda.ml.classification.DTMClassifier + :class:`~skfda.ml.classification.DTMClassifier` References: Ghosh, A. K. and Chaudhuri, P. (2005b). On maximum depth and related classifiers. 
Scandinavian Journal of Statistics, 32, 327–350. """ - def __init__(self, depth_method: Depth = ModifiedBandDepth()): - """Initialize the classifier.""" + def __init__(self, depth_method: Depth = None): self.depth_method = depth_method + if depth_method is None: + self.depth_method = ModifiedBandDepth() + else: + self.depth_method = depth_method + def fit(self, X, y): """Fit the model using X as training data and y as target values. @@ -67,11 +71,17 @@ def fit(self, X, y): X (:class:`FDataGrid`): FDataGrid with the training data. y (array-like): Target values of shape = [n_samples]. + Returns: + self (object) + """ - self.classes_, y_ind = _classifier_get_classes(y) + classes_, y_ind = _classifier_get_classes(y) - self.distributions_ = [clone(self.depth_method).fit( - X[y_ind == cur_class]) for cur_class in range(self.classes_.size)] + self.classes_ = classes_ + self.distributions_ = [ + clone(self.depth_method).fit(X[y_ind == cur_class]) + for cur_class in range(self.classes_.size) + ] return self @@ -88,7 +98,9 @@ def predict(self, X): """ sklearn_check_is_fitted(self) - depths = [distribution.predict(X) - for distribution in self.distributions_] + depths = [ + distribution.predict(X) + for distribution in self.distributions_ + ] return self.classes_[np.argmax(depths, axis=0)] From 732b26ea20a98c821da7e0455939c0350328abab Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 2 Dec 2020 17:36:34 +0100 Subject: [PATCH 172/210] Updating pytest --- skfda/ml/classification/depth_classifiers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/ml/classification/depth_classifiers.py b/skfda/ml/classification/depth_classifiers.py index cf1e32e20..37aa27f42 100644 --- a/skfda/ml/classification/depth_classifiers.py +++ b/skfda/ml/classification/depth_classifiers.py @@ -35,7 +35,7 @@ class MaximumDepthClassifier(BaseEstimator, ClassifierMixin): >>> from skfda.ml.classification import MaximumDepthClassifier >>> clf = MaximumDepthClassifier() >>> clf.fit(X_train, y_train) - MaximumDepthClassifier() + MaximumDepthClassifier(...) We can predict the class of new samples From 1d935dc2c01d1da58d6c307f1ab61380301cb488 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Wed, 2 Dec 2020 22:33:41 +0100 Subject: [PATCH 173/210] Changes in basis. --- skfda/representation/basis/_basis.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/skfda/representation/basis/_basis.py b/skfda/representation/basis/_basis.py index aa9701b68..740be02cb 100644 --- a/skfda/representation/basis/_basis.py +++ b/skfda/representation/basis/_basis.py @@ -14,11 +14,6 @@ from ..._utils import _domain_range, _reshape_eval_points, _same_domain from . import _fdatabasis -__author__ = "Miguel Carbajo Berrocal" -__email__ = "miguel.carbajo@estudiante.uam.es" - -# aux functions - def _check_domain(domain_range): for domain in domain_range: @@ -36,7 +31,7 @@ class Basis(ABC): """ - def __init__(self, domain_range=None, n_basis: int=1): + def __init__(self, *, domain_range=None, n_basis: int=1): """Basis constructor. 
Args:
@@ -61,6 +56,9 @@ def __init__(self, domain_range=None, n_basis: int=1):

         super().__init__()

+    def __call__(self, *args, **kwargs) -> np.ndarray:
+        return self.evaluate(*args, **kwargs)
+
     @property
     def dim_domain(self) -> int:
         return 1
@@ -116,9 +114,6 @@ def evaluate(self, eval_points, *, derivative: int = 0) -> np.ndarray:
         return self._evaluate(eval_points).reshape(
             (self.n_basis, len(eval_points), self.dim_codomain))

-    def __call__(self, *args, **kwargs) -> np.ndarray:
-        return self.evaluate(*args, **kwargs)
-
     def __len__(self) -> int:
         return self.n_basis

From 7b8dcde99f4ba76cf45ed472287c7222031e483f Mon Sep 17 00:00:00 2001
From: pedrorponga
Date: Wed, 2 Dec 2020 23:15:33 +0100
Subject: [PATCH 174/210] Final commit

---
 setup.cfg                                                  | 8 ++++----
 skfda/ml/classification/__init__.py                        | 4 ++--
 skfda/ml/classification/_centroid_classifiers.py           | 2 +-
 .../{depth_classifiers.py => _depth_classifiers.py}        | 0
 ...neighbors_classifiers.py => _neighbors_classifiers.py}  | 0
 skfda/ml/clustering/__init__.py                            | 4 ++--
 skfda/ml/clustering/{kmeans.py => _kmeans.py}              | 0
 .../clustering/{neighbors.py => _neighbors_clustering.py}  | 0
 skfda/ml/regression/__init__.py                            | 7 +++++--
 skfda/ml/regression/{linear.py => _linear_regression.py}   | 0
 .../regression/{neighbors.py => _neighbors_regression.py}  | 0
 tests/test_regularization.py                               | 2 +-
 12 files changed, 15 insertions(+), 12 deletions(-)
 rename skfda/ml/classification/{depth_classifiers.py => _depth_classifiers.py} (100%)
 rename skfda/ml/classification/{neighbors_classifiers.py => _neighbors_classifiers.py} (100%)
 rename skfda/ml/clustering/{kmeans.py => _kmeans.py} (100%)
 rename skfda/ml/clustering/{neighbors.py => _neighbors_clustering.py} (100%)
 rename skfda/ml/regression/{linear.py => _linear_regression.py} (100%)
 rename skfda/ml/regression/{neighbors.py => _neighbors_regression.py} (100%)

diff --git a/setup.cfg b/setup.cfg
index c6d6e7487..0f63cfa4b 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,10 +34,10 @@ ignore =

 per-file-ignores =
     __init__.py:
-        # No docstrings are allowed in __init__
-        D104,
-        # Unused modules are allowed in __init__, to reduce imports
-        F401
+        # Unused modules are allowed in `__init__.py`, to reduce imports
+        F401,
+        # Logic is allowed in `__init__.py`
+        WPS412

     # Tests benefit from magic numbers
     test_*.py: WPS432

diff --git a/skfda/ml/classification/__init__.py b/skfda/ml/classification/__init__.py
index f90822dbf..e728da47a 100644
--- a/skfda/ml/classification/__init__.py
+++ b/skfda/ml/classification/__init__.py
@@ -1,6 +1,6 @@
 from ._centroid_classifiers import DTMClassifier, NearestCentroid
-from .depth_classifiers import MaximumDepthClassifier
-from .neighbors_classifiers import (
+from ._depth_classifiers import MaximumDepthClassifier
+from ._neighbors_classifiers import (
     KNeighborsClassifier,
     RadiusNeighborsClassifier,
 )
diff --git a/skfda/ml/classification/_centroid_classifiers.py b/skfda/ml/classification/_centroid_classifiers.py
index 6e064d586..613920964 100644
--- a/skfda/ml/classification/_centroid_classifiers.py
+++ b/skfda/ml/classification/_centroid_classifiers.py
@@ -1,4 +1,4 @@
-"""Centroid models for supervised classification."""
+"""Centroid-based models for supervised classification."""

 from typing import Callable

diff --git a/skfda/ml/classification/depth_classifiers.py b/skfda/ml/classification/_depth_classifiers.py
similarity index 100%
rename from skfda/ml/classification/depth_classifiers.py
rename to skfda/ml/classification/_depth_classifiers.py
diff --git a/skfda/ml/classification/neighbors_classifiers.py 
b/skfda/ml/classification/_neighbors_classifiers.py similarity index 100% rename from skfda/ml/classification/neighbors_classifiers.py rename to skfda/ml/classification/_neighbors_classifiers.py diff --git a/skfda/ml/clustering/__init__.py b/skfda/ml/clustering/__init__.py index 6d73ea1f0..7ac86a0a9 100644 --- a/skfda/ml/clustering/__init__.py +++ b/skfda/ml/clustering/__init__.py @@ -1,2 +1,2 @@ -from .kmeans import BaseKMeans, FuzzyCMeans, KMeans -from .neighbors import NearestNeighbors +from ._kmeans import BaseKMeans, FuzzyCMeans, KMeans +from ._neighbors_clustering import NearestNeighbors diff --git a/skfda/ml/clustering/kmeans.py b/skfda/ml/clustering/_kmeans.py similarity index 100% rename from skfda/ml/clustering/kmeans.py rename to skfda/ml/clustering/_kmeans.py diff --git a/skfda/ml/clustering/neighbors.py b/skfda/ml/clustering/_neighbors_clustering.py similarity index 100% rename from skfda/ml/clustering/neighbors.py rename to skfda/ml/clustering/_neighbors_clustering.py diff --git a/skfda/ml/regression/__init__.py b/skfda/ml/regression/__init__.py index aaa0a09cc..721586d42 100644 --- a/skfda/ml/regression/__init__.py +++ b/skfda/ml/regression/__init__.py @@ -1,2 +1,5 @@ -from .neighbors import KNeighborsRegressor, RadiusNeighborsRegressor -from .linear import LinearRegression +from ._linear_regression import LinearRegression +from ._neighbors_regression import ( + KNeighborsRegressor, + RadiusNeighborsRegressor, +) diff --git a/skfda/ml/regression/linear.py b/skfda/ml/regression/_linear_regression.py similarity index 100% rename from skfda/ml/regression/linear.py rename to skfda/ml/regression/_linear_regression.py diff --git a/skfda/ml/regression/neighbors.py b/skfda/ml/regression/_neighbors_regression.py similarity index 100% rename from skfda/ml/regression/neighbors.py rename to skfda/ml/regression/_neighbors_regression.py diff --git a/tests/test_regularization.py b/tests/test_regularization.py index 3daadcb33..f9b8b1db9 100644 --- a/tests/test_regularization.py +++ b/tests/test_regularization.py @@ -4,7 +4,7 @@ _monomial_evaluate_constant_linear_diff_op) from skfda.misc.operators._operators import gramian_matrix_numerical from skfda.misc.regularization import TikhonovRegularization, L2Regularization -from skfda.ml.regression.linear import LinearRegression +from skfda.ml.regression import LinearRegression from skfda.representation.basis import Constant, Monomial, BSpline, Fourier import unittest import warnings From 86d881babd6a20a8740f3b7c7c8987887e2b60a0 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 2 Dec 2020 23:30:19 +0100 Subject: [PATCH 175/210] Second final commit --- skfda/ml/classification/__init__.py | 1 + skfda/ml/clustering/__init__.py | 1 + skfda/ml/regression/__init__.py | 1 + 3 files changed, 3 insertions(+) diff --git a/skfda/ml/classification/__init__.py b/skfda/ml/classification/__init__.py index e728da47a..da2bca618 100644 --- a/skfda/ml/classification/__init__.py +++ b/skfda/ml/classification/__init__.py @@ -1,3 +1,4 @@ +"""Classification.""" from ._centroid_classifiers import DTMClassifier, NearestCentroid from ._depth_classifiers import MaximumDepthClassifier from ._neighbors_classifiers import ( diff --git a/skfda/ml/clustering/__init__.py b/skfda/ml/clustering/__init__.py index 7ac86a0a9..1ac00d603 100644 --- a/skfda/ml/clustering/__init__.py +++ b/skfda/ml/clustering/__init__.py @@ -1,2 +1,3 @@ +"""Clustering.""" from ._kmeans import BaseKMeans, FuzzyCMeans, KMeans from ._neighbors_clustering import NearestNeighbors diff --git 
a/skfda/ml/regression/__init__.py b/skfda/ml/regression/__init__.py index 721586d42..7ba5a1ac9 100644 --- a/skfda/ml/regression/__init__.py +++ b/skfda/ml/regression/__init__.py @@ -1,3 +1,4 @@ +"""Regression.""" from ._linear_regression import LinearRegression from ._neighbors_regression import ( KNeighborsRegressor, From 379e65aad58f94d45e9bda725a02914d3177470a Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 3 Dec 2020 02:00:24 +0100 Subject: [PATCH 176/210] Fix errors in stats. --- setup.cfg | 3 + skfda/exploratory/stats/_stats.py | 154 ++++++++++-------- .../classification/_centroid_classifiers.py | 2 +- 3 files changed, 87 insertions(+), 72 deletions(-) diff --git a/setup.cfg b/setup.cfg index 0f63cfa4b..9521525ca 100644 --- a/setup.cfg +++ b/setup.cfg @@ -14,6 +14,9 @@ ignore = N803, # There are no bad quotes Q000, + # Google Python style is not RST until after processed by Napoleon + # See https://github.com/peterjc/flake8-rst-docstrings/issues/17 + RST201, RST203, RST301, # Line break occurred before a binary operator (antipattern) W503, # Short names like X or y are common in scikit-learn diff --git a/skfda/exploratory/stats/_stats.py b/skfda/exploratory/stats/_stats.py index 1a67b9676..46a1d4545 100644 --- a/skfda/exploratory/stats/_stats.py +++ b/skfda/exploratory/stats/_stats.py @@ -1,115 +1,128 @@ -""" -Functional data descriptive statistics. -""" -from builtins import isinstance +"""Functional data descriptive statistics.""" -from typing import Callable, TypeVar, Union +from builtins import isinstance +from typing import Callable, Optional, TypeVar, Union import numpy as np -from ...misc.metrics import lp_norm, l2_distance -from ...representation import FData -from ..depth import ModifiedBandDepth +from ...misc.metrics import l2_distance, lp_norm +from ...representation import FData, FDataGrid +from ..depth import Depth, ModifiedBandDepth +F = TypeVar('F', bound=FData) -def mean(fdata): - """Compute the mean of all the samples in a FData object. - Computes the mean of all the samples in a FDataGrid or FDataBasis object. +def mean(X: F) -> F: + """Compute the mean of all the samples in a FData object. Args: - fdata (FDataGrid or FDataBasis): Object containing all the samples - whose mean is wanted. + X: Object containing all the samples whose mean is wanted. Returns: - FDataGrid or FDataBasis: A FDataGrid or FDataBasis object with just - one sample representing the mean of all the samples in the original - object. + A :term:`functional data object` with just one sample representing + the mean of all the samples in the original object. """ - return fdata.mean() + return X.mean() -def var(fdatagrid): +def var(X: FData) -> FDataGrid: # noqa: WPS110 """Compute the variance of a set of samples in a FDataGrid object. Args: - fdatagrid (FDataGrid): Object containing all the set of samples - whose variance is desired. + X: Object containing all the set of samples whose variance is desired. Returns: - FDataGrid: A FDataGrid object with just one sample representing the - variance of all the samples in the original FDataGrid object. + A :term:`functional data object` with just one sample representing the + variance of all the samples in the original object. """ - return fdatagrid.var() + return X.var() -def gmean(fdatagrid): +def gmean(X: FDataGrid) -> FDataGrid: """Compute the geometric mean of all the samples in a FDataGrid object. Args: - fdatagrid (FDataGrid): Object containing all the samples whose - geometric mean is wanted. 
+ X: Object containing all the samples whose geometric mean is wanted. Returns: - FDataGrid: A FDataGrid object with just one sample representing the - geometric mean of all the samples in the original FDataGrid object. + A :term:`functional data object` with just one sample representing the + geometric mean of all the samples in the original object. """ - return fdatagrid.gmean() + return X.gmean() -def cov(fdatagrid): +def cov(X: FData) -> FDataGrid: """Compute the covariance. Calculates the covariance matrix representing the covariance of the functional samples at the observation points. Args: - fdatagrid (FDataGrid): Object containing different samples of a - functional variable. + X: Object containing different samples of a functional variable. Returns: - numpy.darray: Matrix of covariances. + A :term:`functional data object` with just one sample representing the + covariance of all the samples in the original object. """ - return fdatagrid.cov() + return X.cov() -def depth_based_median(fdatagrid, depth_method=ModifiedBandDepth()): +def depth_based_median( + X: FDataGrid, + depth_method: Optional[Depth] = None, +) -> FDataGrid: """Compute the median based on a depth measure. The depth based median is the deepest curve given a certain - depth measure + depth measure. Args: - fdatagrid (FDataGrid): Object containing different samples of a + X: Object containing different samples of a functional variable. - depth_method (:ref:`depth measure `, optional): - Method used to order the data. Defaults to :func:`modified - band depth `. + depth_method: Method used to order the data. Defaults to + :func:`modified band + depth `. Returns: - FDataGrid: object containing the computed depth_based median. + Object containing the computed depth_based median. See also: :func:`geometric_median` """ - depth = depth_method(fdatagrid) + if depth_method is None: + depth_method = ModifiedBandDepth() + + depth = depth_method(X) indices_descending_depth = (-depth).argsort(axis=0) # The median is the deepest curve - return fdatagrid[indices_descending_depth[0]] + return X[indices_descending_depth[0]] -T = TypeVar('T', bound=Union[np.array, FData]) +T = TypeVar('T', bound=Union[np.ndarray, FData]) -def geometric_median(X: T, tol: float=1.e-8, - metric: Callable = l2_distance) -> T: +def _weighted_average(X: T, weights: np.ndarray) -> T: + + return ( + (X * weights).sum() if isinstance(X, FData) + # To support also multivariate data + else (X.T * weights).T.sum(axis=0) + ) + + +def geometric_median( + X: T, + *, + tol: float = 1.e-8, + metric: Callable[[T, T], np.ndarray] = l2_distance, +) -> T: r"""Compute the geometric median. 
The sample geometric median is the point that minimizes the :math:`L_1` @@ -150,25 +163,19 @@ def geometric_median(X: T, tol: float=1.e-8, https://doi.org/10.1093/biomet/asn031 """ - - def weighted_average(X, weights): - if isinstance(X, FData): - return (X * weights).sum() - else: - # To support also multivariate data - return (X.T * weights).T.sum(axis=0) - weights = np.full(len(X), 1 / len(X)) - median = weighted_average(X, weights) + median = _weighted_average(X, weights) distances = metric(X, median) while True: zero_distances = (distances == 0) n_zeros = np.sum(zero_distances) - weights_new = ((1 / distances) / np.sum(1 / distances) if n_zeros == 0 - else (1 / n_zeros) * zero_distances) + weights_new = ( + (1 / distances) / np.sum(1 / distances) if n_zeros == 0 + else (1 / n_zeros) * zero_distances + ) - median_new = weighted_average(X, weights_new) + median_new = _weighted_average(X, weights_new) if lp_norm(median_new - median) < tol: return median_new @@ -178,9 +185,12 @@ def weighted_average(X, weights): weights, median = (weights_new, median_new) -def trim_mean(fdatagrid, - proportiontocut, - depth_method=ModifiedBandDepth()): +def trim_mean( + X: FDataGrid, + proportiontocut: float, + *, + depth_method: Optional[Depth] = None, +) -> FDataGrid: """Compute the trimmed means based on a depth measure. The trimmed means consists in computing the mean function without a @@ -192,26 +202,28 @@ def trim_mean(fdatagrid, those that have the least depth values. Args: - fdatagrid (FDataGrid): Object containing different samples of a + X: Object containing different samples of a functional variable. - proportiontocut (float): indicates the percentage of functions to + proportiontocut: Indicates the percentage of functions to remove. It is not easy to determine as it varies from dataset to dataset. - depth_method (:ref:`depth measure `, optional): - Method used to order the data. Defaults to :func:`modified - band depth `. + depth_method: Method used to order the data. Defaults to + :func:`modified band depth + `. Returns: - FDataGrid: object containing the computed trimmed mean. + Object containing the computed trimmed mean. """ - n_samples_to_keep = (fdatagrid.n_samples - - int(fdatagrid.n_samples * proportiontocut)) + if depth_method is None: + depth_method = ModifiedBandDepth() + + n_samples_to_keep = (len(X) - int(len(X) * proportiontocut)) # compute the depth of each curve and store the indexes in descending order - depth = depth_method(fdatagrid) + depth = depth_method(X) indices_descending_depth = (-depth).argsort(axis=0) - trimmed_curves = fdatagrid[indices_descending_depth[:n_samples_to_keep]] + trimmed_curves = X[indices_descending_depth[:n_samples_to_keep]] return trimmed_curves.mean() diff --git a/skfda/ml/classification/_centroid_classifiers.py b/skfda/ml/classification/_centroid_classifiers.py index 613920964..8b36fc001 100644 --- a/skfda/ml/classification/_centroid_classifiers.py +++ b/skfda/ml/classification/_centroid_classifiers.py @@ -194,7 +194,7 @@ def fit(self, X, y): centroid=lambda fdatagrid: trim_mean( fdatagrid, self.proportiontocut, - self.depth_method, + depth_method=self.depth_method, ), ) self._clf.fit(X, y) From 8aff5fac48fa516376703915ea66b8ffd37f1bbf Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 3 Dec 2020 02:10:25 +0100 Subject: [PATCH 177/210] More style fixes. 
--- skfda/representation/basis/_basis.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/skfda/representation/basis/_basis.py b/skfda/representation/basis/_basis.py index 740be02cb..8c7a479a1 100644 --- a/skfda/representation/basis/_basis.py +++ b/skfda/representation/basis/_basis.py @@ -31,15 +31,15 @@ class Basis(ABC): """ - def __init__(self, *, domain_range=None, n_basis: int=1): + def __init__(self, *, domain_range=None, n_basis: int = 1): """Basis constructor. Args: domain_range (tuple or list of tuples, optional): Definition of the interval where the basis defines a space. Defaults to (0,1). n_basis: Number of functions that form the basis. Defaults to 1. - """ + """ if domain_range is not None: domain_range = _domain_range(domain_range) @@ -48,8 +48,9 @@ def __init__(self, *, domain_range=None, n_basis: int=1): _check_domain(domain_range) if n_basis < 1: - raise ValueError("The number of basis has to be strictly " - "possitive.") + raise ValueError( + "The number of basis has to be strictly positive.", + ) self._domain_range = domain_range self._n_basis = n_basis From 00eb47249c3872c3c5141cf8966b5a73c6660ca7 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 3 Dec 2020 21:54:44 +0100 Subject: [PATCH 178/210] Modify style complexity options. --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 6b1f360ba..baf48d0c4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -60,6 +60,7 @@ max-methods = 30 max-local-variables = 15 max-expressions = 15 max-module-expressions = 15 +max-module-members = 10 max-string-usages = 10 [coverage:run] From 191761748addd70ec443e2ca78774da5ef0377c9 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 3 Dec 2020 23:02:21 +0100 Subject: [PATCH 179/210] Fix tests. --- .../operators/_linear_differential_operator.py | 12 +++++------- .../dim_reduction/projection/_fpca.py | 16 ++++++++-------- skfda/representation/basis/_bspline.py | 13 +++++-------- skfda/representation/basis/_constant.py | 3 ++- skfda/representation/basis/_fdatabasis.py | 11 +++++------ skfda/representation/basis/_fourier.py | 5 ++--- skfda/representation/basis/_monomial.py | 11 ++++++----- skfda/representation/basis/_tensor_basis.py | 4 ++-- skfda/representation/basis/_vector_basis.py | 7 +++---- tests/test_linear_differential_operator.py | 18 ++++++++++-------- 10 files changed, 48 insertions(+), 52 deletions(-) diff --git a/skfda/misc/operators/_linear_differential_operator.py b/skfda/misc/operators/_linear_differential_operator.py index 1009a4417..2597b6740 100644 --- a/skfda/misc/operators/_linear_differential_operator.py +++ b/skfda/misc/operators/_linear_differential_operator.py @@ -1,17 +1,15 @@ import numbers -from numpy import polyder, polyint, polymul, polyval +import numpy as np import scipy.integrate +from numpy import polyder, polyint, polymul, polyval from scipy.interpolate import PPoly -import numpy as np - -from ..._utils import _same_domain, _FDataCallable +from ..._utils import _FDataCallable, _same_domain from ...representation import FDataGrid -from ...representation.basis import Constant, Monomial, Fourier, BSpline +from ...representation.basis import BSpline, Constant, Fourier, Monomial from ._operators import Operator, gramian_matrix_optimization - __author__ = "Pablo Pérez Manso" __email__ = "92manso@gmail.com" @@ -79,7 +77,7 @@ class LinearDifferentialOperator(Operator): Create a linear differential operator with non-constant weights. 
>>> constant = Constant() - >>> monomial = Monomial((0, 1), n_basis=3) + >>> monomial = Monomial(domain_range=(0, 1), n_basis=3) >>> fdlist = [FDataBasis(constant, [0.]), ... FDataBasis(constant, [0.]), ... FDataBasis(monomial, [1., 2., 3.])] diff --git a/skfda/preprocessing/dim_reduction/projection/_fpca.py b/skfda/preprocessing/dim_reduction/projection/_fpca.py index 76346b24a..d3a7726f2 100644 --- a/skfda/preprocessing/dim_reduction/projection/_fpca.py +++ b/skfda/preprocessing/dim_reduction/projection/_fpca.py @@ -1,16 +1,14 @@ """Functional Principal Component Analysis Module.""" -import skfda -from skfda.misc.regularization import compute_penalty_matrix -from skfda.representation.basis import FDataBasis -from skfda.representation.grid import FDataGrid - +import numpy as np from scipy.linalg import solve_triangular from sklearn.base import BaseEstimator, TransformerMixin from sklearn.decomposition import PCA -import numpy as np - +import skfda +from skfda.misc.regularization import compute_penalty_matrix +from skfda.representation.basis import FDataBasis +from skfda.representation.grid import FDataGrid __author__ = "Yujian Hong" __email__ = "yujian.hong@estudiante.uam.es" @@ -58,7 +56,9 @@ class FPCA(BaseEstimator, TransformerMixin): >>> data_matrix = np.array([[1.0, 0.0], [0.0, 2.0]]) >>> grid_points = [0, 1] >>> fd = FDataGrid(data_matrix, grid_points) - >>> basis = skfda.representation.basis.Monomial((0,1), n_basis=2) + >>> basis = skfda.representation.basis.Monomial( + ... domain_range=(0,1), n_basis=2 + ... ) >>> basis_fd = fd.to_basis(basis) >>> fpca_basis = FPCA(2) >>> fpca_basis = fpca_basis.fit(basis_fd) diff --git a/skfda/representation/basis/_bspline.py b/skfda/representation/basis/_bspline.py index 7aea4593d..f95936c17 100644 --- a/skfda/representation/basis/_bspline.py +++ b/skfda/representation/basis/_bspline.py @@ -1,12 +1,9 @@ -from numpy import polyder, polyint, polymul, polyval -from scipy.interpolate import BSpline as SciBSpline -from scipy.interpolate import PPoly -import scipy.interpolate - import numpy as np +import scipy.interpolate +from numpy import polyder, polyint, polymul, polyval +from scipy.interpolate import BSpline as SciBSpline, PPoly -from ..._utils import _domain_range -from ..._utils import _same_domain +from ..._utils import _domain_range, _same_domain from ._basis import Basis @@ -137,7 +134,7 @@ def __init__(self, domain_range=None, n_basis=None, order=4, knots=None): self._order = order self._knots = None if knots is None else tuple(knots) - super().__init__(domain_range, n_basis) + super().__init__(domain_range=domain_range, n_basis=n_basis) # Checks if self.n_basis != self.order + len(self.knots) - 2: diff --git a/skfda/representation/basis/_constant.py b/skfda/representation/basis/_constant.py index 283d5a93b..220adc8b6 100644 --- a/skfda/representation/basis/_constant.py +++ b/skfda/representation/basis/_constant.py @@ -1,4 +1,5 @@ import numpy as np + from ..._utils import _same_domain from ._basis import Basis @@ -28,7 +29,7 @@ def __init__(self, domain_range=None): function is defined. 
""" - super().__init__(domain_range, 1) + super().__init__(domain_range=domain_range, n_basis=1) def _evaluate(self, eval_points): return np.ones((1, len(eval_points))) diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py index e7151dc7b..3fa04c0f9 100644 --- a/skfda/representation/basis/_fdatabasis.py +++ b/skfda/representation/basis/_fdatabasis.py @@ -1,15 +1,14 @@ -from builtins import isinstance import copy import numbers -from typing import Any import warnings - -import pandas.api.extensions +from builtins import isinstance +from typing import Any import numpy as np +import pandas.api.extensions +from ..._utils import _check_array_key, _int_to_real, constants from .. import grid -from ..._utils import constants, _int_to_real, _check_array_key from .._functional_data import FData @@ -487,7 +486,7 @@ def to_grid(self, grid_points=None, *, sample_points=None): >>> from skfda.representation.basis import FDataBasis, Monomial >>> fd = FDataBasis(coefficients=[[1, 1, 1], [1, 0, 1]], - ... basis=Monomial((0,5), n_basis=3)) + ... basis=Monomial(domain_range=(0,5), n_basis=3)) >>> fd.to_grid([0, 1, 2]) FDataGrid( array([[[ 1.], diff --git a/skfda/representation/basis/_fourier.py b/skfda/representation/basis/_fourier.py index 3895e322e..a9a4530ae 100644 --- a/skfda/representation/basis/_fourier.py +++ b/skfda/representation/basis/_fourier.py @@ -1,7 +1,6 @@ import numpy as np -from ..._utils import _domain_range -from ..._utils import _same_domain +from ..._utils import _domain_range, _same_domain from ._basis import Basis @@ -94,7 +93,7 @@ def __init__(self, domain_range=None, n_basis=3, period=None): self._period = period # If number of basis is even, add 1 n_basis += 1 - n_basis % 2 - super().__init__(domain_range, n_basis) + super().__init__(domain_range=domain_range, n_basis=n_basis) @property def period(self): diff --git a/skfda/representation/basis/_monomial.py b/skfda/representation/basis/_monomial.py index f6782ef0c..7489c51e4 100644 --- a/skfda/representation/basis/_monomial.py +++ b/skfda/representation/basis/_monomial.py @@ -1,6 +1,5 @@ -import scipy.linalg - import numpy as np +import scipy.linalg from ..._utils import _same_domain from ._basis import Basis @@ -23,7 +22,7 @@ class Monomial(Basis): Defines a monomial base over the interval :math:`[0, 5]` consisting on the first 3 powers of :math:`t`: :math:`1, t, t^2`. - >>> bs_mon = Monomial((0,5), n_basis=3) + >>> bs_mon = Monomial(domain_range=(0,5), n_basis=3) And evaluates all the functions in the basis in a list of descrete values. 
@@ -77,10 +76,12 @@ def _evaluate(self, eval_points): def _derivative_basis_and_coefs(self, coefs, order=1): if order >= self.n_basis: - return (Monomial(self.domain_range, 1), + return (Monomial(domain_range=self.domain_range, + n_basis=1), np.zeros((len(coefs), 1))) else: - return (Monomial(self.domain_range, self.n_basis - order), + return (Monomial(domain_range=self.domain_range, + n_basis=self.n_basis - order), np.array([np.polyder(x[::-1], order)[::-1] for x in coefs])) diff --git a/skfda/representation/basis/_tensor_basis.py b/skfda/representation/basis/_tensor_basis.py index a6bbf2f16..9c20f88f8 100644 --- a/skfda/representation/basis/_tensor_basis.py +++ b/skfda/representation/basis/_tensor_basis.py @@ -29,8 +29,8 @@ class Tensor(Basis): >>> from skfda.representation.basis import Tensor, Monomial >>> - >>> basis_x = Monomial((0,5), n_basis=3) - >>> basis_y = Monomial((0,3), n_basis=2) + >>> basis_x = Monomial(domain_range=(0,5), n_basis=3) + >>> basis_y = Monomial(domain_range=(0,3), n_basis=2) >>> >>> basis = Tensor([basis_x, basis_y]) diff --git a/skfda/representation/basis/_vector_basis.py b/skfda/representation/basis/_vector_basis.py index 839a8cbbe..29b7080c0 100644 --- a/skfda/representation/basis/_vector_basis.py +++ b/skfda/representation/basis/_vector_basis.py @@ -1,6 +1,5 @@ -import scipy.linalg - import numpy as np +import scipy.linalg from ..._utils import _same_domain from ._basis import Basis @@ -30,8 +29,8 @@ class VectorValued(Basis): >>> from skfda.representation.basis import VectorValued, Monomial >>> - >>> basis_x = Monomial((0,5), n_basis=3) - >>> basis_y = Monomial((0,5), n_basis=2) + >>> basis_x = Monomial(domain_range=(0,5), n_basis=3) + >>> basis_y = Monomial(domain_range=(0,5), n_basis=2) >>> >>> basis = VectorValued([basis_x, basis_y]) diff --git a/tests/test_linear_differential_operator.py b/tests/test_linear_differential_operator.py index 2e22e0948..d8de85aa0 100644 --- a/tests/test_linear_differential_operator.py +++ b/tests/test_linear_differential_operator.py @@ -1,9 +1,10 @@ -from skfda.misc.operators import LinearDifferentialOperator -from skfda.representation.basis import FDataBasis, Constant, Monomial import unittest import numpy as np +from skfda.misc.operators import LinearDifferentialOperator +from skfda.representation.basis import Constant, FDataBasis, Monomial + class TestLinearDifferentialOperator(unittest.TestCase): @@ -22,7 +23,7 @@ def _assert_equal_weights(self, weights, weights2, msg): def test_init_default(self): """Tests default initialization (do not penalize).""" lfd = LinearDifferentialOperator() - weightfd = [FDataBasis(Constant((0, 1)), 0)] + weightfd = [FDataBasis(Constant(domain_range=(0, 1)), 0)] self._assert_equal_weights( lfd.weights, weightfd, @@ -33,7 +34,7 @@ def test_init_integer(self): # Checks for a zero order Lfd object lfd_0 = LinearDifferentialOperator(order=0) - weightfd = [FDataBasis(Constant((0, 1)), 1)] + weightfd = [FDataBasis(Constant(domain_range=(0, 1)), 1)] self._assert_equal_weights( lfd_0.weights, weightfd, @@ -41,7 +42,8 @@ def test_init_integer(self): # Checks for a non zero order Lfd object lfd_3 = LinearDifferentialOperator(3) - consfd = FDataBasis(Constant((0, 1)), [[0], [0], [0], [1]]) + consfd = FDataBasis(Constant(domain_range=(0, 1)), + [[0], [0], [0], [1]]) bwtlist3 = list(consfd) self._assert_equal_weights( @@ -72,7 +74,7 @@ def test_init_list_fdatabasis(self): n_basis = 4 n_weights = 6 - monomial = Monomial((0, 1), n_basis=n_basis) + monomial = Monomial(domain_range=(0, 1), n_basis=n_basis) 
weights = np.arange(n_basis * n_weights).reshape((n_weights, n_basis)) @@ -86,7 +88,7 @@ def test_init_list_fdatabasis(self): "Wrong list of weight functions of the linear operator") # Check failure if intervals do not match - constant = Constant((0, 2)) + constant = Constant(domain_range=(0, 2)) fdlist.append(FDataBasis(constant, 1)) with np.testing.assert_raises(ValueError): LinearDifferentialOperator(weights=fdlist) @@ -98,7 +100,7 @@ def test_init_wrong_params(self): LinearDifferentialOperator(1, weights=[1, 1]) # Check invalid domain range - monomial = Monomial((0, 1), n_basis=3) + monomial = Monomial(domain_range=(0, 1), n_basis=3) fdlist = [FDataBasis(monomial, [1, 2, 3])] with np.testing.assert_raises(ValueError): From 750dce71b6f3931c81038d5229f8ddd64acf7ee1 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Fri, 4 Dec 2020 11:57:19 +0100 Subject: [PATCH 180/210] Fix style errors. --- setup.cfg | 6 ++++++ .../dim_reduction/projection/_fpca.py | 1 - skfda/representation/basis/_basis.py | 10 +++++++-- skfda/representation/basis/_bspline.py | 4 ++-- skfda/representation/basis/_fourier.py | 2 +- skfda/representation/basis/_monomial.py | 21 +++++++++++-------- tests/test_linear_differential_operator.py | 6 ++++-- 7 files changed, 33 insertions(+), 17 deletions(-) diff --git a/setup.cfg b/setup.cfg index baf48d0c4..6c7b209ab 100644 --- a/setup.cfg +++ b/setup.cfg @@ -8,6 +8,8 @@ norecursedirs = '.*', 'build', 'dist' '*.egg' 'venv' .svn _build docs/auto_examp [flake8] ignore = + # No docstring for magic methods + D105, # No docstrings in __init__ D107, # Uppercase arguments like X are common in scikit-learn @@ -25,6 +27,8 @@ ignore = WPS120, # We like local imports, thanks WPS300, + # Dotted imports are ok + WPS301, # We love f-strings WPS305, # We need multine loops @@ -63,6 +67,8 @@ max-module-expressions = 15 max-module-members = 10 max-string-usages = 10 +ignore-decorators=property + [coverage:run] omit = # Omit reporting for dataset module diff --git a/skfda/preprocessing/dim_reduction/projection/_fpca.py b/skfda/preprocessing/dim_reduction/projection/_fpca.py index d3a7726f2..e63ea3168 100644 --- a/skfda/preprocessing/dim_reduction/projection/_fpca.py +++ b/skfda/preprocessing/dim_reduction/projection/_fpca.py @@ -5,7 +5,6 @@ from sklearn.base import BaseEstimator, TransformerMixin from sklearn.decomposition import PCA -import skfda from skfda.misc.regularization import compute_penalty_matrix from skfda.representation.basis import FDataBasis from skfda.representation.grid import FDataGrid diff --git a/skfda/representation/basis/_basis.py b/skfda/representation/basis/_basis.py index 8c7a479a1..c607e2185 100644 --- a/skfda/representation/basis/_basis.py +++ b/skfda/representation/basis/_basis.py @@ -58,6 +58,7 @@ def __init__(self, *, domain_range=None, n_basis: int = 1): super().__init__() def __call__(self, *args, **kwargs) -> np.ndarray: + """Evaluate the basis using :meth:`evaluate`.""" return self.evaluate(*args, **kwargs) @property @@ -219,6 +220,11 @@ def copy(self, domain_range=None): return new_copy def to_basis(self) -> '_fdatabasis.FDataBasis': + """Convert the Basis to FDatabasis. + + The resulting FDataBasis will have this basis as its basis, and as + observations it will have all functions in the basis. + """ from . import FDataBasis return FDataBasis(self.copy(), np.identity(self.n_basis)) @@ -246,8 +252,8 @@ def inner_product_matrix(self, other: 'Basis' = None) -> np.array: Args: other: Basis to compute the inner product - matrix. 
If not basis is given, it computes the matrix with - itself returning the Gram Matrix + matrix. If not basis is given, it computes the matrix with + itself returning the Gram Matrix Returns: Inner Product Matrix of two basis diff --git a/skfda/representation/basis/_bspline.py b/skfda/representation/basis/_bspline.py index f95936c17..791699398 100644 --- a/skfda/representation/basis/_bspline.py +++ b/skfda/representation/basis/_bspline.py @@ -1,9 +1,9 @@ import numpy as np import scipy.interpolate -from numpy import polyder, polyint, polymul, polyval +from numpy import polyint, polymul, polyval from scipy.interpolate import BSpline as SciBSpline, PPoly -from ..._utils import _domain_range, _same_domain +from ..._utils import _domain_range from ._basis import Basis diff --git a/skfda/representation/basis/_fourier.py b/skfda/representation/basis/_fourier.py index a9a4530ae..a6d89623e 100644 --- a/skfda/representation/basis/_fourier.py +++ b/skfda/representation/basis/_fourier.py @@ -1,6 +1,6 @@ import numpy as np -from ..._utils import _domain_range, _same_domain +from ..._utils import _domain_range from ._basis import Basis diff --git a/skfda/representation/basis/_monomial.py b/skfda/representation/basis/_monomial.py index 7489c51e4..273b31c97 100644 --- a/skfda/representation/basis/_monomial.py +++ b/skfda/representation/basis/_monomial.py @@ -1,7 +1,6 @@ import numpy as np import scipy.linalg -from ..._utils import _same_domain from ._basis import Basis @@ -76,14 +75,18 @@ def _evaluate(self, eval_points): def _derivative_basis_and_coefs(self, coefs, order=1): if order >= self.n_basis: - return (Monomial(domain_range=self.domain_range, - n_basis=1), - np.zeros((len(coefs), 1))) - else: - return (Monomial(domain_range=self.domain_range, - n_basis=self.n_basis - order), - np.array([np.polyder(x[::-1], order)[::-1] - for x in coefs])) + return ( + Monomial(domain_range=self.domain_range, n_basis=1), + np.zeros((len(coefs), 1)), + ) + + return ( + Monomial( + domain_range=self.domain_range, + n_basis=self.n_basis - order, + ), + np.array([np.polyder(x[::-1], order)[::-1] for x in coefs]), + ) def _gram_matrix(self): integral_coefs = np.polyint(np.ones(2 * self.n_basis - 1)) diff --git a/tests/test_linear_differential_operator.py b/tests/test_linear_differential_operator.py index d8de85aa0..d871f70bc 100644 --- a/tests/test_linear_differential_operator.py +++ b/tests/test_linear_differential_operator.py @@ -42,8 +42,10 @@ def test_init_integer(self): # Checks for a non zero order Lfd object lfd_3 = LinearDifferentialOperator(3) - consfd = FDataBasis(Constant(domain_range=(0, 1)), - [[0], [0], [0], [1]]) + consfd = FDataBasis( + Constant(domain_range=(0, 1)), + [[0], [0], [0], [1]], + ) bwtlist3 = list(consfd) self._assert_equal_weights( From 79f73188c303fb58f81f7eb9888b613b832b3dcf Mon Sep 17 00:00:00 2001 From: vnmabus Date: Fri, 4 Dec 2020 12:15:14 +0100 Subject: [PATCH 181/210] Fix style errors. 
--- setup.cfg | 4 +++- skfda/preprocessing/dim_reduction/projection/_fpca.py | 1 + skfda/representation/basis/_basis.py | 6 ++++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index 6c7b209ab..54ab6552e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -67,7 +67,9 @@ max-module-expressions = 15 max-module-members = 10 max-string-usages = 10 -ignore-decorators=property +ignore-decorators = property + +strictness = long [coverage:run] omit = diff --git a/skfda/preprocessing/dim_reduction/projection/_fpca.py b/skfda/preprocessing/dim_reduction/projection/_fpca.py index e63ea3168..07b8a07df 100644 --- a/skfda/preprocessing/dim_reduction/projection/_fpca.py +++ b/skfda/preprocessing/dim_reduction/projection/_fpca.py @@ -52,6 +52,7 @@ class FPCA(BaseEstimator, TransformerMixin): The resulting principal components are not compared because there are several equivalent possibilities. + >>> import skfda >>> data_matrix = np.array([[1.0, 0.0], [0.0, 2.0]]) >>> grid_points = [0, 1] >>> fd = FDataGrid(data_matrix, grid_points) diff --git a/skfda/representation/basis/_basis.py b/skfda/representation/basis/_basis.py index c607e2185..e6e666e5f 100644 --- a/skfda/representation/basis/_basis.py +++ b/skfda/representation/basis/_basis.py @@ -222,8 +222,10 @@ def copy(self, domain_range=None): def to_basis(self) -> '_fdatabasis.FDataBasis': """Convert the Basis to FDatabasis. - The resulting FDataBasis will have this basis as its basis, and as - observations it will have all functions in the basis. + Returns: + FDataBasis with this basis as its basis, and all basis functions + as observations. + """ from . import FDataBasis return FDataBasis(self.copy(), np.identity(self.n_basis)) From 9c39c511fa896f1474e49cca56c7258bbc5a1c9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Fri, 4 Dec 2020 12:46:48 +0100 Subject: [PATCH 182/210] Update tests - Remove flake8 in test - Remove numpy and cython prior installation - Test in Python 3.8 --- .github/workflows/tests.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 521edf338..e1c1eae94 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -11,7 +11,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python-version: ['3.6', '3.7'] + python-version: ['3.6', '3.7', '3.8'] steps: - uses: actions/checkout@v2 @@ -23,14 +23,11 @@ jobs: - name: Install dependencies run: | - pip3 install --upgrade pip cython numpy || pip3 install --upgrade --user pip cython numpy; - pip3 install flake8 || pip3 install --user flake8; pip3 install codecov pytest-cov || pip3 install --user codecov pytest-cov; - name: Run tests run: | pip3 install . - flake8 --exit-zero skfda; coverage run --source=skfda/ setup.py test; - name: Upload coverage to Codecov From 5d693f0fb7cd45ae3e81c69953f1e6b98b024734 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Fri, 4 Dec 2020 16:52:41 +0100 Subject: [PATCH 183/210] First version of ANOVA typing. 
---
 setup.cfg                             |  1 +
 skfda/inference/anova/anova_oneway.py | 27 +++++++++++++++++----------
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 54ab6552e..bbfc0dc82 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -66,6 +66,7 @@ max-expressions = 15
 max-module-expressions = 15
 max-module-members = 10
 max-string-usages = 10
+max-cognitive-score = 30

 ignore-decorators = property

diff --git a/skfda/inference/anova/anova_oneway.py b/skfda/inference/anova/anova_oneway.py
index f25bede87..fcea00cdb 100644
--- a/skfda/inference/anova/anova_oneway.py
+++ b/skfda/inference/anova/anova_oneway.py
@@ -1,12 +1,13 @@
+from typing import Tuple
+
+import numpy as np
+from sklearn.utils import check_random_state
+
 from skfda import concatenate
 from skfda.datasets import make_gaussian_process
 from skfda.misc.metrics import lp_distance
 from skfda.representation import FData, FDataGrid

-from sklearn.utils import check_random_state
-
-import numpy as np
-

 def v_sample_stat(fd, weights, p=2):
     r"""
@@ -202,8 +203,14 @@ def _anova_bootstrap(fd_grouped, n_reps, random_state=None, p=2,
     return v_samples


-def oneway_anova(*args, n_reps=2000, return_dist=False, random_state=None,
-                 p=2, equal_var=True):
+def oneway_anova(
+    *args,
+    n_reps: int = 2000,
+    return_dist: bool = False,
+    random_state=None,
+    p: int = 2,
+    equal_var: bool = True,
+) -> Tuple[float, float]:
     r"""
     Performs one-way functional ANOVA.

@@ -237,20 +244,20 @@ def oneway_anova(*args, n_reps=2000, return_dist=False, random_state=None,

     Args:
         fd1,fd2,.... (FDataGrid): The sample measurements for each
            each group.
-        n_reps (int, optional): Number of simulations for the bootstrap
+        n_reps: Number of simulations for the bootstrap
             procedure. Defaults to 2000 (This value may change in future
             versions).
-        return_dist (bool, optional): Flag to indicate if the function should
+        return_dist: Flag to indicate if the function should
             return a numpy.array with the sampling distribution simulated.
         random_state (optional): Random state.
-        p (int, optional): p of the lp norm. Must be greater or equal
+        p: p of the lp norm. Must be greater or equal
             than 1. If p='inf' or p=np.inf it is used the L infinity metric.
             Defaults to 2.
-        equal_var (bool, optional): If True (default), perform a One-way
+        equal_var: If True (default), perform a One-way
             ANOVA assuming the same covariance operator for all the groups,
             else considers an independent covariance operator for each
             group.

From 2e8c05261b6b25efd84e685707120cef3c3da895 Mon Sep 17 00:00:00 2001
From: pedrorponga
Date: Fri, 4 Dec 2020 18:07:38 +0100
Subject: [PATCH 184/210] Linting

---
 linting.png | Bin 0 -> 36452 bytes
 setup.cfg   | 12 ++++++++----
 2 files changed, 8 insertions(+), 4 deletions(-)
 create mode 100644 linting.png

diff --git a/linting.png b/linting.png
new file mode 100644
index 0000000000000000000000000000000000000000..f4cf1fb2eaf784823439a2b60d2c514c4c160fa7
GIT binary patch
literal 36452
[36452 bytes of base85-encoded PNG data omitted]
z$+O8#Wd`u;Qje#VQ`nWFdxFKMWY1W=2h>yWp_&+36sxBR<)UbC#Q zu@jMs_<5>9cIO!1LhKQzOCN~YzIFoK%j)%gpmlNb1nW%1xkBOr4OOMv$d)fD%t2(? zf`mwraXk;7$cbO*wf*?Ii`*gSLzX(9_@!izONa6ijVsYdd;X%a7M`Dfg|y_jNnQ_d znfdjqM#`FrRLOpGQr;vxG^)jJ<7MOiWsR4i5VQ4?oq11vc1+f5c&)ZYKaGqH5kR~hE0&b2I3IBtK#EZESMv+|M zx3f#s51?EAPdUKxc_p5u;AkH8Q#B~R)bB?VZUQuG!iNJ$sZL@QsluyaP_Q&%R#2k};2CN`e?Qy=7E7`NKKN^H zdH$GNsVKGiWxj=#-<4NH7!&oMkMxZ!wS7EvE9F5H7HIbsO7!G~Yxmv5B*^WF+|{Dy z1q7q4`G|o4%lp60jhgr@U|g%p_$OqAGqQb`I=!u&07GmAUwZb(7oK(N-NW+=dp_){ zUfDJl{3BRU`bJds|6uPu!&%OK zStjO<6__FGi9*BhIY4fAw8bB6VN$o2qIhtSgI zX9w9$KSQ?fW!(e`XfO}WZ0ppK3!bcSMuYqCYQ#6BnLUr?3Js)p=dWV9JW^EAo>#am z0vM^KNKkawH!ls{Y@9T)mPvasJWc7|7%$3HB=yE_*O>$N?}z|AEADz>uX?{oYfmRz!@ z$8&`6gMp4Dk4afrZ+ezk&x+fXa%nX%|878D+ZtOA?`xaJ*{!r;}=F;~$Or;UQG!_&Mk zPF}w`uOBFo<-l+Ix{v*&Z~o(UOXo>lTVyqvONXSX2dOOk&{@uP|Ikc28)1Y~NERqs zTa#GEb0N{E4_!W&DVFamPY0C8;!=6JvQXx)raM&glkXlf_)NaNZZY`Vy6c0#Y#G+- z*>}+el^s*hi-PR7iR_0;?zfNRwjFC`7n}$|3xo@VKYlzy-X;NdqyeP#z+y^z#%tAV z=2Y@Gs5#cYBzy-u*|K}%+Gzmnggv_OP(B>!*P>o6*<1hWmwP5yplv=&QphSF*p-xx z0&WuGwgt?W213opv6EU$d3##2<^b18$TM;Ovp|k>-mw|%+9mtMM0wh3KJ=|!+$%*- zRo(NafvQ<2#>cuIqM>zSlnlLKAOQ{DIvsr9L+xI6i>xJcqoA(m=;T)0z8je96@0UF zlTLl@<1DEUq75f(uPeD0+W?uJea}$2n_N!QjV9gWe)#V6Gxr=mRBBY;PcF3(C-+;6 zh$l;2$IbCbzB}WF>u!)2cj&%o&v;d`|7?fVF$h{fBku ztNfmw?%?|AxVoF-J5b2t-P!)aO5ddq%oZ3g{{#5mTkh#i029F8x|P$#WUI+RqH=_p|2z+IyA-+j`Xy?Dh(M!wb98u~XH-y1TNo0ZvkPZu#_(`gg#; zSc+t_oE+-pC?a5eE zaJ4zkm(nd(NLWPUfR2!T-*0HM4ShtNY1<7QW6ZJ7wk*KJrPq@oVI*);8b32XlPt;K zNmcK@6j-p7O&$PY2L!eF?uG209Okl<0?K}r1K~?HjwKl$+Q{XS>i)Y269MR!4~MH! zy39Fj{n6*xl*d~tV%>5ZC_Uor%UiDX4y8kwhKbE@k@s6ZFE92ej|WO}^J5F_OJ_nR z1MvCF!qPU>-S$Qd!&enoqi}ejI1L|g_cp*C4_PH;2h+7u>0juV57U=6& z$8gIDh3R@d1A}zi=R!{-2=xtCTn`j+QoFz42r@)T;n5LgWP>yeMr#D?O!Fz&dT`^& zJ@*z=})d-=s4I2VlwUPB|`#EKg$s12fHP;2In>H_5UxStb}`QjdnH*IHjwX1+P zlyKq|ms7aN@0K8hLApHGS6Jz${uinxUV~RBf`Ibwv?zo#o^>2Q&=%4jIQ)%4rkZ%? z!fVS!?Uy@EBI)?$wm?(YD)1S>fzh=9#;=q|m7Vu7<_CGvo{X%$OmA<%o?wa#_g1kp zQN3X_NLRe+EcOLp;6=g}8~J~}VBH_}4l#7X~GhlYDo@=3{uJ zrSylbFHEOwt~yl!=3)%z8?N4<5qR{D#056772Ki8^oho2e_QZM4H4TxBJS`pkrTJ~C zteQhhz70k0(wSAD2;Iy|Qk~;ZG$YDvr#hsVr@33(%m4D7*K0S-^p=aB1z1ytglXys#c7d26!@}-ikYD4&%)Q-*zP1Y~ z7Oy4`Dkm2$yIG*)&COs#$V99oMp*3#2|031Yyy{O2HVo+WmvbLR@}kE7+gBoXw5k0 ziuBO2K)(ACn^EwjWX^Ur>z~p4!-Z;2!m?^1rG`LQ)H6+dZoHWFgx(Idtck^WEn|7v zMbBH81)zL!Et4&6q~r!h(J!-WWTl6abzf+Y0qT&w$l2dkxA(LSS6kln-R%b=A`vp} z>H8hH32!sHNZe-j2{{I`q1=DBYNdi=uNa87zpk~{Gtti1{UpENaF*N7wEy@~T6#P% zKNZ<`_$kZ?hQ#pMnxTZ1>L+?e9B@O89qDB8E#ivvq|)d20s)Nm@stsPWR8KP{qaEy zK;OlquS)Y3Gz*O*`UE?;M+N+(cIthz&Yb$jind7J@Ez|1Wsd%;u&QOGDee%2%tk11 ztSKQB60^OtnEbfnfG6~tEZS%-9`X|z;1LrCmkJ`p{na+#GBH~3JH35%5@+FAG@dh4 zN0js<08U1|1?Ea;T#hQwT|yalSHlH}VC~!YxNOhGEQtnH!q1umej(I-0S5k9-}KtQ z%31G#aj?r@2Iq|^lI7T7T1jAP#JA6E_>=GEHPj4((Kn2h4!AJh%5FU_c+BfRP+A}`a- z2-Aie?OXtQ^hT{Z>wLPc#85h$=Gd)B5PFw z9kk(4PfJCOvP>dT)SurpdJyS?-feOHuZeJv24xhm%xQCl(rjpNJd1pXN%_Sdc9bjY ze)0IRLQ8xgoBgt!hVAlI4}Lu(cG4RJ#VAc{Ow^u>{pQ8Zxz-nw{4t{Y?;&Kf_Mjm9 z%sZj-w$U?$2ZTUaY_IS1XnbH(G^gz%{R!M-2w6|_XkISvA2~jKXf*v3IV1}d{o$E- zF2af9Qcdg`wr#SuRM&l6gw57MPd&pz_&Ks;S#=3vS%?T>;30(@l_LOgRF+%DnaA~x zOfy=WPl+WD(sF^Bj-32VC=757&4?0unOtY)F}QYV&k6Js*n@xa5#6kfjnX+jSP1P4 zOP^}Bqw2f-x|MQs+R*$c`EvyK(u=lP`L_nwCL>mFaN4HFYDzqOHSA^36Z1s-%}s=? 
zJ-hAha$f_GmP}Bg*x%RC`RAoh+Me4Oks7LzT=P9_LOA#Jbu$>nDb``4^q2UIC<#cz zjhuR;KOex)q2dpH^y)ns;L=JKB_|?H$s;rdDPWg+Mcl!+q{W=^4QWnLmD*UtDtd&D zY#05q`XuKP924LroVK$LbBw-N$SU_GV#4ovs!W;%b)i!xVJ$aS z8Y#q+{)&1^tx})UE>V%y0KGu%J6*dT3W4_fZ@ zJD?wOVk@_5HFZ8uuf1N9R=Yij2_Tg;3_1q1MqnU0UY1MRqve3=UxIU!O`ivxgnz{s zcob>e_Io#Hn&o{_z+l)O;lOdQMAp;#Q5Fwx;x@f9*{#_`UBwo!Fi!*h`eae)A3SuS z-4rifM;xVz&U~~|hUp3XXREro@!jt@VWFWz_$$5{C`SbRRuCSjOK`zyrT)k5LSz`o z8`!EpU*MwJz-GJjtgst$ z`x^DK0Jc*&c4XkDkr&^!({RJ8UZ4gxPIT_I4n}Mlv8BYDEn1_PhReE9F}$JBUC@*qCEVKDsf4aSTPMBWje&@uIxg7{u~i}HcA(5-4I6<7DHY)ftEZJ-xRKIGk&74 z|NV*m62?KQ*0Ib-^TiP<9PQ=nc+E%Ne|9Zi3q4|*I0 z+R4)L`v~zXlJqmq;AZaJJ$Sqs;qVsp8Ox$xxW%xEFQ&of9rp`+<;LENavMT@z20k7U~54Nz`KC9n>iRtY$ zK5|;k1p^P;gBj^S6#svVjsCaif15aN8~?7|U^lO6p$CmH25-2mtU7ehijg=WKC%Ck zsd~NV3}22=-i>V}Z4A72M_D=y@n)w~gWpAh3W*!+V^5utVeMiqQswIDpx?EF&wTdc z6HCz`v998>u)7MbUA{ATQf-U@2;1b}!Ao-Ba9s(zo1E#81n6Jj@c13ii2`A|glu-$ z$9%WeDJzmGXUPYnz24%X_pvOk&u}D(8*@Yeh`i$V$OVklmd~$LtxFnGhab1 z>sh{=Bc|A&eHndK$p}LY>NB0tu+l*8rOIEOsju{*`@)l$o*vvY zTSJ@eyer~X#0Y*d*oj)$xZ&g~Xc<@_p!{;^qdB{+0j}|Jc1lpTdx^0d(?DT&WWCBg zu2O1ku&B@h3t@07x0xx5UC(-dJ!`w4M=RA_i)~1cFcZ7D-vJe!`F173Rl`7k{AWFmSl{f@izf3o6Q29MZiRkQ#B9wjPRM!*;EjYq%dyOC% z5BoR_f4nqAi*4!XM@-3@YS7;0Z0`ta2rlk&;-W(s$AwI(LtOE$t3GR}q!D8)9>F0S z{`5>S%`{5=2But5LB8DQr}AX+z)#V6rU19fRY=oQAE0FGrpx_ljkb?%@MW{-QAhS$ z;L2}{G0RSpq7O+CMO3yOob2Qj->5tTa#8mTM#wUKCCkZe(5npVgWPCVfE_)E zY)3yAq1w4;ReBBg#CnA5r-)((*@!HZB$*_PJ!wus0ikHhAs9W|`>)e_*>|xNmDQF) zwbo-|k7TC4)S=aK9L|VJlN=Wu3T@ePm!%}d-Q6U}`d^oh73VHrfdg|hq!7`AxpF4H zbfiDbe7Zg{9%2G3(r=dO7u>?lVHUF;(uf7_zKcM=wTbuA23*QinDRfy#DgapiZ zxs0O+QCT&uDk)8VwczKRDM^@8rKt{&o;IHA;!Mkt#tI8+rvl7X(RtNXhEmdKq#1EaY zRRU5-4H_rR{nLb|=FzlK>0C73u?^D`$-Po}oWM=(zJ||>wJf0H-`+|wCnt@RU9-R) z*Gyk3zB*}8w}lz~=JaR*IvOF z9o+d7C6OJ+XlLE*m4ahx@admM_^UWBzl;P;Uy{j)J%YKWR$yywlW}*3f6VYIn+*L- ztgZ*KU1mwTctzWx_oK(wiDB@}?#5xq-U}Cr5OisHPBqkGp!K77<`Sl_pOtR!bcar= ztuzO7Os5vWUwZV;7<=Z9McrHrxU5u=ZI^QqV6svWYg^-c$fsi_4D|O+xOu;aZ$hkj zj)2O-%nHxktY3b__jf&VC!iQq^6IceLzX!3{u(C%yRYMqdIX6b*(#tvxmGezh~0yf zsf4{)t23Ir0O*d|R<&*GzN^t)Ez0`{QPAcfne*NZank;U-Q6vcA@kbe2~mTW5up5T zldl+m%IPeZN0EmOG_tcb3HOsn%q@iCJzP)7Md~`cLpqUTH+>C*iD#BK#d0buH?X6a ze&<+ogvL#{Y1rW8Pz9{gPnCZ)&&m+3XG_nn&@USx48e*qm?BkTMpmePeA;CzV#%Il z^vNKzIZ4b8+TfDfvG(+maK95J3I*6FGz;MqB6h;DQNrim)Ia;eX<@}yZQCAIqgP*= z%zp4p-2~^^8i8=ocdbLF5Q&Fu?Qbd!zM!uwN3L1pr)Tj}TJ?g5@V<9dYs6T#c$ zo%<^}lTi_{u-rM_dMVlPeBWsbt+G0Tn+1ySfit)+XHL-f$H`aXeZ!y5_zAyc_bqwf zxon9wCpHgWk63#N7j@hvGB%klnVQ;h6)T$5kwnizLgUOVe@!!B7kBqWz)iSU*m8Dz z;FUXSEylOLHvC&oPbI%sk_gXPvJJF4lX>}_U-`rX8{S7UX|5o= zaEeT4iLTJcIqR1f-rKU;ps7$-qk`cAX80h*39L1(G@{<05B2b7;@RrlaE97XCwgaG z)P;H&G1LygBur|gl&ISFA~QTTX6@allDJProl?1q??xTjM>7*6+j4OktXMz$(GxWt z_2~8$6CE;+O9zk9iUgHZ9N?&%s4XpvC8V4=uWW(yDD*gYY53O`c)lh5O; zU71?zSVC2p*LK&l_t@eX1G^0Z?(|h-XqgilA8+rfw}Au+OA97%)od1K)bZns?whj8SM+WSc5Cr@A3`Kn=#3n@oHd*D z^7&uYRtlQe41SipwhA^^*uS14pBZ66@5Q{A1!O~*K=k&wQhkuO6l`VUY42)DPF+Mc zYeYkj32srXHIBadkXEY)*MGSj0u#8Jd1CGf{d@m@*pCb$f)??FK0O8b0RnT+WH>rO zF3uS!F{hPyA_>Fl{UqozJonqc&9&5|NX8RDWIzS^n4*&GAM5!O=%wwW%b@;@rTQ~; zQNYsW;uwF4hT_W|+hdWrY5*`VnnWaPrdCjld&VL|Tovnj_`Ml{A zLLr*Ic@4T0ea6Jvq7yce7khak#lE(<2SoSv@N5uL3|~>&@i(7j0@Z9w07Ce-6a`v5 z6g-k-*j>JfkaFH76k`Tc#^Z&H$7)uYX#=zG@^RQDr*s zliiDidHtx81_Aqvh;FgSDP|0!-0CY_-PktcqKAaO)+1O|_F%WM1?(36QLzWxVSok{ z5hQWZ-YomR=ts=ta-mrZ=)v-`7+>LFp5@`?_(XD!NLm{oX~uY5VLZCUu9=~b692s1 zmDRT99ufjj*)!!>t|RLJU-?ticYEYZJ`T|J zF1`YRa|QKq&A4izkfZOy^GX&l7T^eS1#cbKY{XG03_AK0R!3dHGt#7UbWKy z`|saSZQtV)d)9gH=zIpe#oqCM{?lVU;a;u1WS@JzixE;^AG;BsRpnRc9AZ%*m652= zkL!AKnLTc~(U#*MPO%%j4M4^Y$b8OuzVjH;oHw!$A@G>j*}ayiS#*CVf8qBd_gT+N 
zM3{$l9vNGVmNs=qNb=))J{uE-@9~_$N?B1(nlmwOj_sj!XI2u@rW!O&1WhFfqJNsb zzG|{no0)iy0^xWX}W2?w!Iq)W8R&1jY~kP^&hA?uZ0!vH8QVuyoN?N zS|GF0Ry-M`b4s45610O#x!YZ2iTRTh{^x&&o+w*q%5-h-UAXH-YFH{J-sLJZ%!np|UJCqe~LzHF_pq6XUE@ zBbO&exQx~~-jzL89qzWx_Ua>teow-C_T>mrj0rY*_ukXGY_~9Xmv6_%ywm6|59}~C zdLDf%5n~OLmSsY=DqbibQ{XLvbWNN>u!}h6lZ1Z*8E09($qWMndS5becy+YX`O0M^8v}H@WS)lkk|Gn;fn}m6tJiCsRx!Aqf&Eh(j zeF)HM!m*8&z_F{ACj4YWvy^h;HL}IFC*Ply1KF`v-^bZxMv5Zl%wQ;&NIgAAa$PoI zR+?XO3A%+H?mY98w#eZ4ugCDUl_cn!Is_GzD6Z<6iz61Y#QV^g{H=GnF#vx~N@mJr zuv~L+5-ObQPXnTD4VTjc=GuF4h3;!^Q0a)YWc}ijO~+z|IU;+tV=apL>CsM5uT_=j z>}q7?pT)=p!W;TsiAOFPb62%lz=mbVD)hBeM=>#447jvM{FWL+o>ksCD6Zh8HmBZvOf|vuzMCJ%6-P=&Lm1Q@bZGAm1a~Z*p@99A#cUOI(fNjLM zdFLRV#Yn@hH@5=SuWn_5-bt?rMEsEkKA8v~;&he1!WRvVM9YQ{gx+%3%jDE$?y76P zg|RalQ^QNDPM^&W9{4@7uaT#;Q*#|QlO=A9HF&;8*+{DUX(1cnmqx~rx+ZA159R|8 zZcDxkNO2$`OOQCu|5_Db@ztDV*$wv=|1O!g=M6j3QAo&0wZM@OipPF7PU#-pjZf(i zTfCis!xofI-A)KJ=q`{;ITCQ$po?{h+OKPFx%nd$Ppb?r?Ot}7Ynlr@(1vGAE3 z^b-x;-@9>{>=@j4JB}x(!9a8!{2&Wg_wniNUZCytezl|iZcOOW$A1)$4yWy3VqRUW z6Pvirs$t0>SjQEa+TZ|KaMmqx2fI+gFY}NazClfy^%`h)rwsRPk@8)(KLgMb7mj5) zf2zwa6cSm41!4O=_-7s5GOmf0Axd{rww*R5lQu1@F+-Lnb-ui@Re!_=*u$?Qi|b~E z)YB7QSBp7V(&#nrD(zB`!l0nOVNvrrji~F*w@m7OS%c#Ltdommgf2P)A9=YS6x6CD z=F#=?eBrKuOzyw8l?1ANDoZwArryKJv!qhL`nZJ%`3&xJ!k(KczZ}=~XB*a>Z@a>% z9=4>zl0%*tDU{gYc0;WIhQUX3hCL6NSs>NIy6%Y0xzCpWPLm42(ANQ0%&ND`(DSsC zb~BMe`CuI<9nkUm0*7V(y^;ic;pH){Ksw{?$f|_|S*kn5y2^luiZmDI zHXfst&PgRZ7K=F${9yz=Ma_RD4lvqRkAQ}F%ToeYHxH({rcJ$LXvkMKRV zSFR$6GIHv*j})4nQCnige&UMV4+VcN>Gime)!X2hYresl1j_B?D91nZr~^cnEyNJU z%$%f??&iiPObx6=EDK_NUl+xRzqs(zVXUH0s?iVdXsnwIlF#&T_Wv19EKgge!Z)<# zWkvN`eNm#SFd1Kxn2*^ZHN$zAGZVT2T|w%RuM%bz>(V+JElqe=ea-6~B1@EmUb?4G zTr)O3WIHe%QSz(jyI6MOY0`8%#yR~ zlK$iQ_Ntafg18IMlA&iDP2GD}qg?+g$&}Om4>iHhEb2;xdF`mw&`@UzTDcvzhMXaOiuU{p!N$(T=pVPZsgnQq^1QsGj zfL^-9MC-*AQn|?QE&c2u@ISo)EptjgX~$aJ(h;Bsx+m(myrwyJ3+vd|shgJG-=pW1 zllZzi1#5?nrL0z^jbF1=C`+k@HCatn<~^DyZ=_@r9YbJs}Y-DQ^~|Ub9lobEqe*(@3K}zb}Xj9GV;VO z{-9!|x(=jgdt^9lHUe}fJf&}6PeF76V=EUg?XG(K-@7bg-kEq&U#4zWDWK_>mAl)%M=F2JSPd-RkHrwEWPXB%7-*ocAuyW;0ZryvHY_hw!Ol=6Lx>L>i-h@{oet^ z|Ig0i*BZd> zf@#j8puUrHLWm^DxHz0)G5dN|!!!BtzQ-WW?w-b+k_uMAVw~Xh zjHQl+qS=qk=Q%w+)`7wM=G$x)!IwENt;$fyknVOOQ5+9lkTFNi$tbh1U<3iSD?Y~3 zlGSAGxa)ltO0o75!B%L`CpH`ck1kFXW+z#$u!=Hs;8;l_QQB}<#e9uC zM@k-mPk*esR|ggktwpHxcPo3-v(}ry`@EkKb{WL3DItxt7E?TmB=p9wtABDkZ-VYl z4X6Aq0!D~~S*cbbt2QX|RVWy$DM(*|H!$D=z%mIQ;2TP55gF+6ql`$OC5LEShK$7Y zG6a{}CGhUFX!L?#-#tPD(N)qrKVD5sc5*mpPkCVnSVN~W$<_3RkbSAT=CVT;;$2f2Kk zs<=*;r)$HpNn2o%t;Nk3OtEKCy0mk?bB422=j*R^r4oc_tuDOX!4+~ib{HfVe{0`S z#KDCIpgUZU@f;wLPXlT zx(PHG#iuWom~&SuJ-kfU&evzHgBdyf^ESd!dRIq~$*lG*FB78K?OZrvTf;%mN$SHj z;MCKb_(XY8^c)1amz^H%KvK*mGY@~hkoJAvSZe$1=HXBi;?m%H58c%}Y?GsuWei=j zk;vG+zTynRRR)r{Q*{Q~`?XU``phk>LPF*`-N7&nZ9+hE#%J-0M9hwvuRG#cy&!>h zEQmS2Iq`TVOT#p3(J)YW>x;-3#*?_-Rem!YsxxJL4R5uMRJ*9CmbO*fn)TYtFn%>W z*v9khthyp}Dt-qwGdY$Erf;$C3gRw^)*5<_=8^=L8DgzdIU0|Got;#&<;!ieY_{j< zzhEKdX>m5ZoLrA)y!YvrUu_8Qf0(7nZ1{CfWV^eQ*n)hraqjf_p%u6@4mY}9>)(f9 z`fu;b_Ae+%!o^nYt*BQ*w`=WVVtuKoY;npc2fJ)e#BwU zf!zVa!&vKmMkl2Pzh35aE_v^?2xI1ra23>eoI7wC{4(NloL1_Xb2f1JYJr?+EYGXT5E5QPjHwRH-yj5R%|ci3DpH&DlN{s8?JH9$=Ox7-2H#YG&A6ggVr=;Z6WqenPv{wK zSsur7?6|p6%FkE7$GzRObDfHa~~%;)}SGT6~Xu zj=~rAaoQ$AeaSJ-l(vlyrYxg=&|4Gu)+!$OerIFtxCpZ#nIJ!eM#=y>tLX0%)9&`SXcV1VCej} zJit5(E$^`MKeQ4y8^4C545j4jUh~yUj=8NeF)+Il`tu2{PZ$x>KvS$&trre?>&5((x<3)DSGl(z13EGUiq0I z&nM`k>YUNsRw5>MAdOKC=I{5rQPflQWjDLz@-O_94+_7{T;1xDT^E*Z@F`bHc-x>*gPaIE9S!Hau8kJA(+>W0xw53z5HK%Wv9)~WE zm``|ZK7A->m>RXDII)CtWnhoc@Q8kN5QOq$^Th;Vwp=$)+MkirOk}04T})|K+6teD 
zu~>=|tFPCCur=qjA%+^b^izVt)9f&>471Ip}V!6#7d~_Crn~8!+LEyMf!c)y2ym- zP{=j&Idk}Cr^aXpg1>lP`5_9HvtpWKe9+2$BRhIb0eHSr$mZdJV8&S*>f0|ApWuu) zG$giJ2sY8~k&C&sYPu=I-g#xGXdGWLXIBaCrVXx1$D0<#kGi>K3^uAKFhY1eq9=F# zSu?tgf{2U>41AYewJs~vsy!~PR;qvtxoux5PA67h&pd{7{b+9KFGORymWJM$q7-3f z;R)$JlA<2&H3~BZ1a5g4@l@Z3p+=m4!G^iGJeu&>WbaL zAPVLk1*$_k=Lrd$dYA)rJAB3$OaWAA8E@z(Y`|8HL$FQ!^!RxG&zbw1%P6tzX3!@gEIU zn+WJvLrDqX@CnLp3%RJVlu@vn{Q$kD`DJ!=dpGeY|I$6e>1OKrBw%-2WoYX01v=Q? zq$zQ@BtYw{e$LE_(Z0Q3(MtUxphl{Sgx@96uSMwxKOJ-n1vvG|_x2~&!ELgppPoKg zs+lULr~M6qW0h=`ZwAgBfqYIu%UF2jF`$W%)FIgg_Vm!!b%L>#Nyzlct+P!fR5RTLP6a|8yGnG%xhsucySekYg7wf8zR55SC(Rb;58)8d- zv1@9_^EZ>GD%#mkuu#Tff{%K(ASKddKUi!VEq$ci9s>5H8E;zYjsFlTe%y3-&cMjo z-+IIhFBt8u_t#X*xC-x(tWE}_K@kRn?a22NMJbu?NrH`S8gkLlwX}_sY+0#p`}|y= z#`ulr{F;cnNB;>A&iZA4yOnuY$oSliYUKPVhGxoW`bHyX8>=t)6;8YLUeQQy%DEIe zxRrHJC|HAW&vap#492fMACunFpwvt-jiJ_eX}d{cq_T~eV3-JJST1uQAtgJfrOl_h zf9^-sV5Kgma8V7L1AMOSdUjC03PLs#z8paw!gL#Y_n(-;Ny!TwMMHNKY=k_Y_7F9~ zti@CC8j7lc9W$9Y#>2pVht+8Lf#)@4O|h&{(--&*9YB;KpkCJqWm^^rLhY^Ovw+#y zlO+&lIFwP_ANiQSxNiZKzB}9!&AZr-eHtKNR46wdWDg#PcoaSr4h`VxQ>^}*W?LLzQ2o>Sn1Dd z3js)Ap8Ytjl@IVR-)ar1?r%`}3?LLhtV~!x;-})NSg5aKd(6sgM44cYE%!LMOu`W&rbJy~0SvrgOajq+<)!Uz8TwO7ItY z36K#(dFjgl1*^=n9);Qfsm{eBRmeT^Q6*;kAT)hu&4rd_N6V#4Hyc3jh;8Z!i5yY4 z$(Q~v8Rx2?U>K%IZ=2G*?iK@`a{LUB!4bEqiw%AsFMEA!e3P)1(>&lf{bmOW+Ht!> z<7NzQS7~M&=T}Ord$m=%P|x2X>%(3wuWOuvpcfb2y-Q`ZEE}(nDqVWg4`z+xgy!C? z-N7runy;|c31IvRJ~vkeeDl~PM=%XZp-&8b2u_69i%MGu7nHe}U;1Aw@Y$zl&8%!Y z&!rNlYT~BcVgsfM0wR{Bnt4WqX2wGrh5=UBnpHH8b!SWre`52tMbsk=DH)ZhtY#+* z#Okz1LeVN1abiA~V(W!sw`&PlJ!3O=+=lN|`0dyu(&nG;^M0xabvDZ7wR3fPIiEY3 z27HXce&U0XndjjvWrmC#;L9R7Wgrm>-|AU5WQfv*Bw2GQbW$=Q-2i8AnE0dPCiSB; zqneWTOjDSH@o$+`MhBI;gVktam`;3V`n@J!*^peC%sag zvxnFEooCCAz;@;Uhu40Uut)yb<{?g8^a{siWP$s8z&f-><&_Sa8@VWH z#Pm^14SaXPgQ7FYkACBKr4(JGK`DfVbxQQ@E%o+mm> zOO|~zbk5$xFz_OHdgtpi!Iv&Df(ozwCg03MmF!kp4;85ife*q}X;`5?=C!v5U(f|J zUb{s$)D@_Fh8R^VuTdsoQh>M^(_t)571+sTTLWDIR6zZE3R7H)+&W-9(%le2LS z2{6NaibMt$jVX;{Cy~uLyQ4huk3O|N-B=Wu4^RyqxYERmol)lIFaHdPNvy_(!`w#} zH+Yp&rQNhDZ#sfYp9|p~l;H?}0Z?Bo#_2L?Oph= zj5h2J-7pvdzO8)>k)li2pzhOo)C}WU2sL&t2k=_xQ#|0qZntWRRylvadCkR#mV>N=>SiySl{}_=V1^KEUB6RasO{chNDv*XJ$5Z)M!w7Sr7#}`}2$b7ZKIH?i1c7+;4eP=D3KU^TG{8k)B zgrB}q*h+pLN_@rer;hXev+UQ(R6o91RaSqr{N&a@hQVc!Sfj_>9VWh{ntLb^5AMzd zZ-hw+ri_QEBsE3B0KjL+G^7fNt`whC7+A`R+$UODxI=#&lHxR(vdf7D@E_(dhMMu1DMDmuJ?K}>ijVxO{ zU`hWrmO!R*oBjrTxv&Zjx{N_puRdRsS28#z0FkthBi;83tW+F(9IWaU zcm^6fJTgBKWC!)>g+7c6^{KjlwQALl-cCy&hBVECZgrNydP^7&mkAL3Qj_~<%2Q;3 zQ~~;S{H+gqAg6ps{rG`T6manpb4w5XY6~-v#0+De3eo@!x+2SLkP9X{eqIh67fPy} z)uK$Pz9fZ)i*9M405ooFCCHK+^wB2_F64X)yf~1BgUxf;Bea2MTs&eDokd+fO;!&!0wBc(7nwDPXfulxZdY||$hdy;j zw^9H-UqwFZiv`aM+38MkBep&)fo}^Y)P>0e=j;Yk1UK9ViThWa+zB^Wx`-nR=mf&c zEvktbVzssQ?B5PmH-0Dw7yRTMI>2GiN?&H}s6+AT$> zt!{XTw9^8+;dWGFlNql~*hbq=#qKIf%wrSC+O48r`H8JDCbW-doAwn1Y)j__HnU2M6a&2Sb#OA_5|%{rm~0Ht%oJ&PP*)CnX+)Y0ZhF7mQeVxlIjSS?W zo>6w1PCe7#5ylTwYnuT+%$=hh+<2Vsxt)1_B(6I=ymV%AsT`SKZatlKO}snEq57 z)WY*Z{->hNmEVDow9~Ns#XbB>J(!xvey#2CW!ob{!FP;yU5!L{7NAW<8WC3+e9QMX zU+`?r)OH*eg;`1%pSahyFTbAU%$!S!ApjzO})a*@$0n8|S3dwNzbpRB*aVwX# z{!5Q)Hv>uCn0gxuCh@AS>x<7xxrcyT`GdDVCXJ97Uu@>@dTjNac)uHkue64D2Y~x& z^{g}pt)2VS7ykETQDIuE{vp$*zN`2}mh7$2bek)Dymsn48HvnUuVH;+UfX)E_X=v0 zV{u`9{73kUm}rWOw3l&%((1@N_)C_3yzZ6J+oZ#BYihARyy9{lbFMy?xf-j%0t7=9 zgGbBm8&(qWrrXZH_yu4rF$G*O?DN3Puh9S>sdE_RE5>1Qelv3;)A6p8hh5mK;O8$x zOKBQ!IUQN_pRk8TSN(dm7H$XIyu}D|NDn?yD}$)e#Ze5NTm$V~q*7zT%qN!E`jQ4p zl;)EaO&Yvi#X24QeQLk<`e3l!Nt_#FU%CAA!O#FE0w<&#T|JjNvidz3vHP(0YFOE# z;fE#^>wWo@+k2*(8h5PW1|J6wkPN}-o~`Kj*|YNGs6jW_h?{fA(?j-4UW&hJ@k%*z 
z3oW}jZ}2l+C{Hjk{i=H1wQ}BZE6vzx%qo*Y4Hg_=?x(#nZ3O{XOY* z`xDzwr|nPfv-vUo_miO2F}K&8{GMQTGMgiT&jUDZpt(P^`hVR{&7Br@|L>(g?R?Vo zb7MY#-R}>K4;eTUpUEEkyCCIXKfjfUxA@fEaud&fx1Zg5=;!op%N~B1e7%1{Fk?#<5ADX SKYu%DF3!`{&t;ucLK6ThZQeNm literal 0 HcmV?d00001 diff --git a/setup.cfg b/setup.cfg index 54ab6552e..9e312b979 100644 --- a/setup.cfg +++ b/setup.cfg @@ -38,7 +38,7 @@ ignore = WPS436, # Our private objects are fine to import WPS450 - + per-file-ignores = __init__.py: # Unused modules are allowed in `__init__.py`, to reduce imports @@ -48,13 +48,13 @@ per-file-ignores = # Tests benefit from magic numbers test_*.py: WPS432 - + rst-directives = # These are sorted alphabetically - but that does not matter autosummary,data,currentmodule,deprecated, glossary,moduleauthor,plot,testcode, versionadded,versionchanged, - + rst-roles = attr,class,func,meth,mod,obj,ref,term, @@ -71,13 +71,17 @@ ignore-decorators = property strictness = long +# Beautify output and make it more informative +format = wemake +show-source = true + [coverage:run] omit = # Omit reporting for dataset module */datasets/* # Omit reporting for __init__.py files */__init__.py - + [isort] multi_line_output = 3 include_trailing_comma = true From 31d70c2639aa675718fb4e0a924803937f25eee9 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Tue, 15 Dec 2020 01:05:37 +0100 Subject: [PATCH 185/210] Update types. --- skfda/_utils/__init__.py | 29 ++++++++---- skfda/_utils/_utils.py | 8 ++-- skfda/inference/anova/__init__.py | 3 +- .../{anova_oneway.py => _anova_oneway.py} | 47 ++++++++++--------- 4 files changed, 50 insertions(+), 37 deletions(-) rename skfda/inference/anova/{anova_oneway.py => _anova_oneway.py} (90%) diff --git a/skfda/_utils/__init__.py b/skfda/_utils/__init__.py index 9b78fb479..8852c319e 100644 --- a/skfda/_utils/__init__.py +++ b/skfda/_utils/__init__.py @@ -1,11 +1,20 @@ from . import constants - -from ._utils import (_tuple_of_arrays, _cartesian_product, - _check_estimator, _int_to_real, - _to_grid, check_is_univariate, - _same_domain, _to_array_maybe_ragged, - _reshape_eval_points, - _evaluate_grid, nquad_vec, - _FDataCallable, _pairwise_commutative, - _domain_range, _check_array_key, - _classifier_get_classes) +from ._utils import ( + RandomStateLike, + _cartesian_product, + _check_array_key, + _check_estimator, + _classifier_get_classes, + _domain_range, + _evaluate_grid, + _FDataCallable, + _int_to_real, + _pairwise_commutative, + _reshape_eval_points, + _same_domain, + _to_array_maybe_ragged, + _to_grid, + _tuple_of_arrays, + check_is_univariate, + nquad_vec, +) diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index 821b440ed..2fecf9d8b 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -1,14 +1,14 @@ """Module with generic methods""" -from builtins import getattr import functools import numbers -import types +from typing import Optional, Union -from pandas.api.indexers import check_array_indexer +import numpy as np import scipy.integrate +from pandas.api.indexers import check_array_indexer -import numpy as np +RandomStateLike = Optional[Union[int, np.random.RandomState]] class _FDataCallable(): diff --git a/skfda/inference/anova/__init__.py b/skfda/inference/anova/__init__.py index 516031100..e5aa240de 100644 --- a/skfda/inference/anova/__init__.py +++ b/skfda/inference/anova/__init__.py @@ -1,2 +1 @@ -from . 
import anova_oneway -from .anova_oneway import v_sample_stat, v_asymptotic_stat, oneway_anova +from ._anova_oneway import oneway_anova, v_asymptotic_stat, v_sample_stat diff --git a/skfda/inference/anova/anova_oneway.py b/skfda/inference/anova/_anova_oneway.py similarity index 90% rename from skfda/inference/anova/anova_oneway.py rename to skfda/inference/anova/_anova_oneway.py index fcea00cdb..ad5be9d40 100644 --- a/skfda/inference/anova/anova_oneway.py +++ b/skfda/inference/anova/_anova_oneway.py @@ -1,4 +1,4 @@ -from typing import Tuple +from typing import List, Tuple, Union import numpy as np from sklearn.utils import check_random_state @@ -8,8 +8,10 @@ from skfda.misc.metrics import lp_distance from skfda.representation import FData, FDataGrid +from ..._utils import RandomStateLike -def v_sample_stat(fd, weights, p=2): + +def v_sample_stat(fd: FData, weights: List[int], p: int = 2) -> float: r""" Calculates a statistic that measures the variability between groups of samples in a :class:`skfda.representation.FData` object. @@ -29,12 +31,12 @@ def v_sample_stat(fd, weights, p=2): This statistic is defined in Cuevas[1]. Args: - fd (FData): Object containing all the samples for which we want + fd: Object containing all the samples for which we want to calculate the statistic. - weights (list of int): Weights related to each sample. Each + weights: Weights related to each sample. Each weight is expected to appear in the same position as its corresponding sample in the FData object. - p (int, optional): p of the lp norm. Must be greater or equal + p: p of the lp norm. Must be greater or equal than 1. If p='inf' or p=np.inf it is used the L infinity metric. Defaults to 2. @@ -82,7 +84,7 @@ def v_sample_stat(fd, weights, p=2): return np.sum(coef * lp_distance(fd[t_ind[0]], fd[t_ind[1]], p=p) ** p) -def v_asymptotic_stat(fd, weights, p=2): +def v_asymptotic_stat(fd: FData, weights: List[int], p: int = 2) -> float: r""" Calculates a statistic that measures the variability between groups of samples in a :class:`skfda.representation.FData` object. @@ -102,12 +104,12 @@ def v_asymptotic_stat(fd, weights, p=2): This statistic is defined in Cuevas[1]. Args: - fd (FData): Object containing all the samples for which we want + fd: Object containing all the samples for which we want to calculate the statistic. - weights (list of int): Weights related to each sample. Each + weights: Weights related to each sample. Each weight is expected to appear in the same position as its corresponding sample in the FData object. - p (int, optional): p of the lp norm. Must be greater or equal + p: p of the lp norm. Must be greater or equal than 1. If p='inf' or p=np.inf it is used the L infinity metric. Defaults to 2. 
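A minimal usage sketch of the statistics annotated above (illustrative only, not part of the patch, and assuming the scikit-fda layout at this point of the series):

    from skfda.representation import FDataGrid
    from skfda.inference.anova import v_asymptotic_stat, v_sample_stat

    # Three constant trajectories and one integer weight per sample,
    # matching the new List[int] annotation of `weights`.
    fd = FDataGrid([[1, 1, 1], [3, 3, 3], [10, 10, 10]])
    weights = [10, 20, 30]

    v_n = v_sample_stat(fd, weights, p=2)      # plain float, as annotated
    v_a = v_asymptotic_stat(fd, weights, p=2)  # plain float as well

Only annotations change in this commit; the runtime behaviour of both statistics is untouched, and the new RandomStateLike alias simply admits None, an int seed or a numpy RandomState for the bootstrap helpers.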
@@ -158,8 +160,13 @@ def v_asymptotic_stat(fd, weights, p=2):
     return np.sum(lp_distance(left_fd, right_fd, p=p) ** p)
 
 
-def _anova_bootstrap(fd_grouped, n_reps, random_state=None, p=2,
-                     equal_var=True):
+def _anova_bootstrap(
+    fd_grouped: Tuple[FData, ...],
+    n_reps: int,
+    random_state: RandomStateLike = None,
+    p: int = 2,
+    equal_var: bool = True
+) -> np.ndarray:
 
     n_groups = len(fd_grouped)
     if n_groups < 2:
@@ -204,13 +211,13 @@ def _anova_bootstrap(fd_grouped, n_reps, random_state=None, p=2,
 
 
 def oneway_anova(
-    *args,
+    *args: FData,
     n_reps: int = 2000,
    return_dist: bool = False,
-    random_state=None,
+    random_state: RandomStateLike = None,
     p: int = 2,
     equal_var: bool = True,
-) -> Tuple[float, float]:
+) -> Union[Tuple[float, float], Tuple[float, float, np.ndarray]]:
     r"""
     Performs one-way functional ANOVA.
 
@@ -242,7 +249,7 @@ def oneway_anova(
     This procedure is from Cuevas[1].
 
     Args:
-        fd1,fd2,.... (FDataGrid): The sample measurements for each each group.
+        args: The sample measurements for each group.
 
         n_reps: Number of simulations for the bootstrap
             procedure. Defaults to 2000 (This value may change in future
@@ -251,7 +258,7 @@ def oneway_anova(
         return_dist: Flag to indicate if the function should return a
             numpy.array with the sampling distribution simulated.
 
-        random_state (optional): Random state.
+        random_state: Random state.
 
         p: p of the lp norm. Must be greater or equal
             than 1. If p='inf' or p=np.inf it is used the L infinity metric.
@@ -262,11 +269,9 @@ def oneway_anova(
             else considers an independent covariance operator for each group.
 
     Returns:
-        Value of the sample statistic, p-value and sampling distribution of
-        the simulated asymptotic statistic.
-
-    Return type:
-        (float, float, numpy.array)
+        Tuple containing the value of the sample statistic, p-value (and
+        sampling distribution of the simulated asymptotic statistic if
+        `return_dist` is `True`).
 
     Raises:
         ValueError: In case of bad arguments.

From 0b160b4094446ef6306ed95f16aaa0dd87d35e94 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Tue, 15 Dec 2020 02:27:32 +0100
Subject: [PATCH 186/210] Fix style.

---
 setup.cfg                              |  3 +++
 skfda/inference/anova/__init__.py      |  3 +++
 skfda/inference/anova/_anova_oneway.py | 14 +++++---------
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index bbfc0dc82..c71aeb604 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -43,6 +43,8 @@ per-file-ignores =
     __init__.py:
         # Unused modules are allowed in `__init__.py`, to reduce imports
        F401,
+        # Import multiple names is allowed in `__init__.py`
+        WPS235,
        # Logic is allowed in `__init__.py`
        WPS412
 
@@ -59,6 +61,7 @@ rst-roles =
    attr,class,func,meth,mod,obj,ref,term,
 
 # Needs to be tuned
+max-arguments = 10
 max-line-complexity = 25
 max-methods = 30
 max-local-variables = 15
diff --git a/skfda/inference/anova/__init__.py b/skfda/inference/anova/__init__.py
index e5aa240de..e16aa8b77 100644
--- a/skfda/inference/anova/__init__.py
+++ b/skfda/inference/anova/__init__.py
@@ -1 +1,4 @@
+"""
+Implementation of ANOVA for functional data. 
+""" from ._anova_oneway import oneway_anova, v_asymptotic_stat, v_sample_stat diff --git a/skfda/inference/anova/_anova_oneway.py b/skfda/inference/anova/_anova_oneway.py index ad5be9d40..41fe8f5d6 100644 --- a/skfda/inference/anova/_anova_oneway.py +++ b/skfda/inference/anova/_anova_oneway.py @@ -3,12 +3,11 @@ import numpy as np from sklearn.utils import check_random_state -from skfda import concatenate -from skfda.datasets import make_gaussian_process -from skfda.misc.metrics import lp_distance -from skfda.representation import FData, FDataGrid - +from ... import concatenate from ..._utils import RandomStateLike +from ...datasets import make_gaussian_process +from ...misc.metrics import lp_distance +from ...representation import FData, FDataGrid def v_sample_stat(fd: FData, weights: List[int], p: int = 2) -> float: @@ -165,7 +164,7 @@ def _anova_bootstrap( n_reps: int, random_state: RandomStateLike = None, p: int = 2, - equal_var: bool = True + equal_var: bool = True, ) -> np.ndarray: n_groups = len(fd_grouped) @@ -273,9 +272,6 @@ def oneway_anova( sampling distribution of the simulated asymptotic statistic if `return_dist` is `True`). - Raises: - ValueError: In case of bad arguments. - Examples: >>> from skfda.inference.anova import oneway_anova >>> from skfda.datasets import fetch_gait From b09f3e8227b64cea489dd2b6d95418892269d461 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Tue, 15 Dec 2020 12:58:40 +0100 Subject: [PATCH 187/210] Put ANOVA docstring in one line. --- skfda/inference/anova/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/skfda/inference/anova/__init__.py b/skfda/inference/anova/__init__.py index e16aa8b77..044f4adb0 100644 --- a/skfda/inference/anova/__init__.py +++ b/skfda/inference/anova/__init__.py @@ -1,4 +1,2 @@ -""" -Implementation of ANOVA for functional data. -""" +"""Implementation of ANOVA for functional data.""" from ._anova_oneway import oneway_anova, v_asymptotic_stat, v_sample_stat From 52427a4cd3316144cd3d8a4c0fd15e0498cdd3ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Tue, 15 Dec 2020 22:58:04 +0100 Subject: [PATCH 188/210] Use existing action. --- .github/workflows/mypy.yml | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml index 21bd6b384..ccec5a8b9 100644 --- a/.github/workflows/mypy.yml +++ b/.github/workflows/mypy.yml @@ -11,14 +11,11 @@ jobs: name: Mypy steps: - uses: actions/checkout@v1 - - name: Set up Python 3.8 - uses: actions/setup-python@v1 + - uses: tsuyoshicho/action-mypy@v1 with: - python-version: 3.8 - - name: Install Dependencies - run: | - pip install mypy - pip install . - - name: mypy - run: | - mypy . + github_token: ${{ secrets.github_token }} + # Change reviewdog reporter if you need [github-pr-check,github-check,github-pr-review]. + reporter: github-pr-review + # Change reporter level if you need. + # GitHub Status Check won't become failure with warning. + level: warning From 8abf2fe827f51d83bcd7b6d10533fdb1c2ed14fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Tue, 15 Dec 2020 23:09:29 +0100 Subject: [PATCH 189/210] Remove trigger outside PR. 
--- .github/workflows/mypy.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml index ccec5a8b9..0e90e4224 100644 --- a/.github/workflows/mypy.yml +++ b/.github/workflows/mypy.yml @@ -1,16 +1,14 @@ name: Mypy on: - push: pull_request: - workflow_dispatch: jobs: build: runs-on: ubuntu-latest name: Mypy steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v2 - uses: tsuyoshicho/action-mypy@v1 with: github_token: ${{ secrets.github_token }} From a54a47e1c6e826ed59959813e4ffcd4a133715ef Mon Sep 17 00:00:00 2001 From: vnmabus Date: Wed, 16 Dec 2020 21:11:26 +0100 Subject: [PATCH 190/210] Add overloads for real datasets. --- examples/plot_kernel_smoothing.py | 5 +- setup.py | 6 +- skfda/datasets/_real_datasets.py | 294 +++++++++++++++++++++++++++--- 3 files changed, 277 insertions(+), 28 deletions(-) diff --git a/examples/plot_kernel_smoothing.py b/examples/plot_kernel_smoothing.py index a118be297..7f661e89d 100644 --- a/examples/plot_kernel_smoothing.py +++ b/examples/plot_kernel_smoothing.py @@ -11,14 +11,13 @@ # Author: Miguel Carbajo Berrocal # License: MIT -import skfda - import matplotlib.pylab as plt import numpy as np + +import skfda import skfda.preprocessing.smoothing.kernel_smoothers as ks import skfda.preprocessing.smoothing.validation as val - ############################################################################## # # For this example, we will use the diff --git a/setup.py b/setup.py index a71366d6e..4a840bc26 100644 --- a/setup.py +++ b/setup.py @@ -23,8 +23,7 @@ import os import sys -from setuptools import setup, find_packages - +from setuptools import find_packages, setup needs_pytest = {'pytest', 'test', 'ptr'}.intersection(sys.argv) pytest_runner = ['pytest-runner'] if needs_pytest else [] @@ -72,7 +71,8 @@ 'rdata', 'scikit-datasets[cran]>=0.1.24', 'scikit-learn>=0.20', - 'scipy>=1.3.0' + 'scipy>=1.3.0', + 'typing-extensions', ], setup_requires=pytest_runner, tests_require=['pytest'], diff --git a/skfda/datasets/_real_datasets.py b/skfda/datasets/_real_datasets.py index a0bcfb116..6ac6b78f5 100644 --- a/skfda/datasets/_real_datasets.py +++ b/skfda/datasets/_real_datasets.py @@ -1,10 +1,14 @@ -import rdata import warnings - -from sklearn.utils import Bunch +from typing import Any, Tuple, Union, overload import numpy as np import pandas as pd +from numpy import ndarray +from pandas import DataFrame, Series +from sklearn.utils import Bunch +from typing_extensions import Literal + +import rdata from .. import FDataGrid @@ -59,8 +63,13 @@ def functional_constructor(obj, attrs): coordinate_names=(values_label[0],)), target) -def fetch_cran(name, package_name, *, converter=None, - **kwargs): +def fetch_cran( + name: str, + package_name: str, + *, + converter: rdata.conversion.Converter = None, + **kwargs, +) -> Any: """ Fetch a dataset from CRAN. @@ -81,7 +90,7 @@ def fetch_cran(name, package_name, *, converter=None, converter=converter, **kwargs) -def fetch_ucr(name, **kwargs): +def fetch_ucr(name: str, **kwargs) -> Bunch: """ Fetch a dataset from the UCR. @@ -204,7 +213,38 @@ def _fetch_fda_usc(name): """ -def fetch_phoneme(return_X_y: bool = False, as_frame: bool = False): +@overload +def fetch_phoneme( + *, + return_X_y: Literal[False] = False, + as_frame: bool = False, +) -> Bunch: + ... + + +@overload +def fetch_phoneme( + *, + return_X_y: Literal[True], + as_frame: Literal[False] = False, +) -> Tuple[FDataGrid, ndarray]: + ... 
+ + +@overload +def fetch_phoneme( + *, + return_X_y: Literal[True], + as_frame: Literal[True], +) -> Tuple[DataFrame, Series]: + ... + + +def fetch_phoneme( + *, + return_X_y: bool = False, + as_frame: bool = False, +) -> Union[Bunch, Tuple[FDataGrid, ndarray], Tuple[DataFrame, Series]]: """ Load the phoneme dataset. @@ -258,7 +298,7 @@ def fetch_phoneme(return_X_y: bool = False, as_frame: bool = False): DESCR=DESCR) -if hasattr(fetch_phoneme, "__doc__"): # docstrings can be stripped off +if fetch_phoneme.__doc__ is not None: # docstrings can be stripped off fetch_phoneme.__doc__ += _phoneme_descr + _param_descr _growth_descr = """ @@ -276,7 +316,37 @@ def fetch_phoneme(return_X_y: bool = False, as_frame: bool = False): """ -def fetch_growth(return_X_y: bool = False, as_frame: bool = False): +@overload +def fetch_growth( + *, + return_X_y: Literal[False] = False, + as_frame: bool = False, +) -> Bunch: + ... + + +@overload +def fetch_growth( + *, + return_X_y: Literal[True], + as_frame: Literal[False] = False, +) -> Tuple[FDataGrid, ndarray]: + ... + + +@overload +def fetch_growth( + *, + return_X_y: Literal[True], + as_frame: Literal[True], +) -> Tuple[DataFrame, Series]: + ... + + +def fetch_growth( + return_X_y: bool = False, + as_frame: bool = False +) -> Union[Bunch, Tuple[FDataGrid, ndarray], Tuple[DataFrame, Series]]: """ Load the Berkeley Growth Study dataset. @@ -325,7 +395,7 @@ def fetch_growth(return_X_y: bool = False, as_frame: bool = False): DESCR=DESCR) -if hasattr(fetch_growth, "__doc__"): # docstrings can be stripped off +if fetch_growth.__doc__ is not None: # docstrings can be stripped off fetch_growth.__doc__ += _growth_descr + _param_descr _tecator_descr = """ @@ -366,7 +436,37 @@ def fetch_growth(return_X_y: bool = False, as_frame: bool = False): """ -def fetch_tecator(return_X_y: bool = False, as_frame: bool = False): +@overload +def fetch_tecator( + *, + return_X_y: Literal[False] = False, + as_frame: bool = False, +) -> Bunch: + ... + + +@overload +def fetch_tecator( + *, + return_X_y: Literal[True], + as_frame: Literal[False] = False, +) -> Tuple[FDataGrid, ndarray]: + ... + + +@overload +def fetch_tecator( + *, + return_X_y: Literal[True], + as_frame: Literal[True], +) -> Tuple[DataFrame, DataFrame]: + ... + + +def fetch_tecator( + return_X_y: bool = False, + as_frame: bool = False +) -> Union[Bunch, Tuple[FDataGrid, ndarray], Tuple[DataFrame, DataFrame]]: """ Load the Tecator dataset. @@ -405,7 +505,7 @@ def fetch_tecator(return_X_y: bool = False, as_frame: bool = False): DESCR=DESCR) -if hasattr(fetch_tecator, "__doc__"): # docstrings can be stripped off +if fetch_tecator.__doc__ is not None: # docstrings can be stripped off fetch_tecator.__doc__ += _tecator_descr + _param_descr _medflies_descr = """ @@ -446,7 +546,37 @@ def fetch_tecator(return_X_y: bool = False, as_frame: bool = False): """ -def fetch_medflies(return_X_y: bool = False, as_frame: bool = False): +@overload +def fetch_medflies( + *, + return_X_y: Literal[False] = False, + as_frame: bool = False, +) -> Bunch: + ... + + +@overload +def fetch_medflies( + *, + return_X_y: Literal[True], + as_frame: Literal[False] = False, +) -> Tuple[FDataGrid, ndarray]: + ... + + +@overload +def fetch_medflies( + *, + return_X_y: Literal[True], + as_frame: Literal[True], +) -> Tuple[DataFrame, Series]: + ... 
+ + +def fetch_medflies( + return_X_y: bool = False, + as_frame: bool = False +) -> Union[Bunch, Tuple[FDataGrid, ndarray], Tuple[DataFrame, Series]]: """ Load the Medflies dataset, where the flies are separated in two classes according to their longevity. @@ -491,7 +621,7 @@ def fetch_medflies(return_X_y: bool = False, as_frame: bool = False): DESCR=DESCR) -if hasattr(fetch_medflies, "__doc__"): # docstrings can be stripped off +if fetch_medflies.__doc__ is not None: # docstrings can be stripped off fetch_medflies.__doc__ += _medflies_descr + _param_descr _weather_descr = """ @@ -507,7 +637,37 @@ def fetch_medflies(return_X_y: bool = False, as_frame: bool = False): """ -def fetch_weather(return_X_y: bool = False, as_frame: bool = False): +@overload +def fetch_weather( + *, + return_X_y: Literal[False] = False, + as_frame: bool = False, +) -> Bunch: + ... + + +@overload +def fetch_weather( + *, + return_X_y: Literal[True], + as_frame: Literal[False] = False, +) -> Tuple[FDataGrid, ndarray]: + ... + + +@overload +def fetch_weather( + *, + return_X_y: Literal[True], + as_frame: Literal[True], +) -> Tuple[DataFrame, Series]: + ... + + +def fetch_weather( + return_X_y: bool = False, + as_frame: bool = False +) -> Union[Bunch, Tuple[FDataGrid, ndarray], Tuple[DataFrame, Series]]: """ Load the Canadian Weather dataset. @@ -592,7 +752,7 @@ def fetch_weather(return_X_y: bool = False, as_frame: bool = False): DESCR=DESCR) -if hasattr(fetch_weather, "__doc__"): # docstrings can be stripped off +if fetch_weather.__doc__ is not None: # docstrings can be stripped off fetch_weather.__doc__ += _weather_descr + _param_descr _aemet_descr = """ @@ -610,7 +770,37 @@ def fetch_weather(return_X_y: bool = False, as_frame: bool = False): """ -def fetch_aemet(return_X_y: bool = False, as_frame: bool = False): +@overload +def fetch_aemet( + *, + return_X_y: Literal[False] = False, + as_frame: bool = False, +) -> Bunch: + ... + + +@overload +def fetch_aemet( + *, + return_X_y: Literal[True], + as_frame: Literal[False] = False, +) -> Tuple[FDataGrid, None]: + ... + + +@overload +def fetch_aemet( + *, + return_X_y: Literal[True], + as_frame: Literal[True], +) -> Tuple[DataFrame, None]: + ... + + +def fetch_aemet( + return_X_y: bool = False, + as_frame: bool = False +) -> Union[Bunch, Tuple[FDataGrid, None], Tuple[DataFrame, None]]: """ Load the Spanish Weather dataset. @@ -673,7 +863,7 @@ def fetch_aemet(return_X_y: bool = False, as_frame: bool = False): DESCR=DESCR) -if hasattr(fetch_aemet, "__doc__"): # docstrings can be stripped off +if fetch_aemet.__doc__ is not None: # docstrings can be stripped off fetch_aemet.__doc__ += _aemet_descr + _param_descr @@ -702,7 +892,37 @@ def fetch_aemet(return_X_y: bool = False, as_frame: bool = False): """ -def fetch_octane(return_X_y: bool = False, as_frame: bool = False): +@overload +def fetch_octane( + *, + return_X_y: Literal[False] = False, + as_frame: bool = False, +) -> Bunch: + ... + + +@overload +def fetch_octane( + *, + return_X_y: Literal[True], + as_frame: Literal[False] = False, +) -> Tuple[FDataGrid, ndarray]: + ... + + +@overload +def fetch_octane( + *, + return_X_y: Literal[True], + as_frame: Literal[True], +) -> Tuple[DataFrame, Series]: + ... + + +def fetch_octane( + return_X_y: bool = False, + as_frame: bool = False +) -> Union[Bunch, Tuple[FDataGrid, ndarray], Tuple[DataFrame, Series]]: """Load near infrared spectra of gasoline samples. 
This function fetches the octane dataset from the R package 'mrfDepth'
@@ -756,7 +976,7 @@ def fetch_octane(return_X_y: bool = False, as_frame: bool = False):
             DESCR=DESCR)
 
 
-if hasattr(fetch_octane, "__doc__"):  # docstrings can be stripped off
+if fetch_octane.__doc__ is not None:  # docstrings can be stripped off
     fetch_octane.__doc__ += _octane_descr + _param_descr
 
 _gait_descr = """
@@ -772,7 +992,37 @@ def fetch_octane(return_X_y: bool = False, as_frame: bool = False):
 """
 
 
+@overload
+def fetch_gait(
+    *,
+    return_X_y: Literal[False] = False,
+    as_frame: bool = False,
+) -> Bunch:
+    ...
+
+
+@overload
+def fetch_gait(
+    *,
+    return_X_y: Literal[True],
+    as_frame: Literal[False] = False,
+) -> Tuple[FDataGrid, None]:
+    ...
+
+
+@overload
+def fetch_gait(
+    *,
+    return_X_y: Literal[True],
+    as_frame: Literal[True],
+) -> Tuple[DataFrame, None]:
+    ...
+
+
-def fetch_gait(return_X_y: bool = False, as_frame: bool = False):
+def fetch_gait(
+    return_X_y: bool = False,
+    as_frame: bool = False
+) -> Union[Bunch, Tuple[FDataGrid, None], Tuple[DataFrame, None]]:
     """
     Load the GAIT dataset.
 
@@ -817,5 +1067,5 @@ def fetch_gait(return_X_y: bool = False, as_frame: bool = False):
             DESCR=DESCR)
 
 
-if hasattr(fetch_gait, "__doc__"):  # docstrings can be stripped off
+if fetch_gait.__doc__ is not None:  # docstrings can be stripped off
     fetch_gait.__doc__ += _gait_descr + _param_descr

From 3ab04e3d7182d2ab908b69b284ff92732d9b3a55 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Thu, 17 Dec 2020 00:22:57 +0100
Subject: [PATCH 191/210] Style fixes.

---
 setup.cfg                         |   6 +-
 skfda/datasets/_real_datasets.py  | 186 +++++++++++++++++++-----------
 skfda/exploratory/stats/_stats.py |   2 +-
 3 files changed, 126 insertions(+), 68 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 10bec4995..d297fce04 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -33,6 +33,8 @@ ignore =
     WPS305,
     # We need multiline loops
     WPS352,
+    # We use nested imports sometimes, and it is not THAT bad
+    WPS433,
     # Our private modules are fine to import
     # (check https://github.com/wemake-services/wemake-python-styleguide/issues/1441)
     WPS436,
@@ -59,6 +61,8 @@ rst-roles =
     attr,class,func,meth,mod,obj,ref,term,
 
+allowed-domain-names = data, obj, var
+
 # Needs to be tuned
 max-arguments = 10
 max-line-complexity = 25
 max-methods = 30
 max-local-variables = 15
@@ -71,7 +75,7 @@ max-module-members = 10
 max-string-usages = 10
 max-cognitive-score = 30
 
-ignore-decorators = property
+ignore-decorators = (property)|(overload)
 
 strictness = long

diff --git a/skfda/datasets/_real_datasets.py b/skfda/datasets/_real_datasets.py
index 6ac6b78f5..e733df06c 100644
--- a/skfda/datasets/_real_datasets.py
+++ b/skfda/datasets/_real_datasets.py
@@ -24,17 +24,35 @@ def _get_skdatasets_repositories():
 
 
 def fdata_constructor(obj, attrs):
+    """
+    Construct a :func:`FDataGrid` object from an R `fdata` object.
+
+    This constructor can be used in the dict passed to
+    :func:`rdata.conversion.SimpleConverter` in order to
+    convert `fdata` objects from the fda.usc package.
+
+    """
     names = obj["names"]
 
-    return FDataGrid(data_matrix=obj["data"],
-                     grid_points=obj["argvals"],
-                     domain_range=obj["rangeval"],
-                     dataset_name=names['main'][0],
-                     argument_names=(names['xlab'][0],),
-                     coordinate_names=(names['ylab'][0],))
+    return FDataGrid(
+        data_matrix=obj["data"],
+        grid_points=obj["argvals"],
+        domain_range=obj["rangeval"],
+        dataset_name=names['main'][0],
+        argument_names=(names['xlab'][0],),
+        coordinate_names=(names['ylab'][0],),
+    )
 
 
 def functional_constructor(obj, attrs):
+    """
+    Construct a :func:`FDataGrid` object from an R `functional` object. 
+ + This constructor can be used in the dict passed to + :func:`rdata.conversion.SimpleConverter` in order to + convert `functional` objects from the ddalpha package. + + """ name = obj['name'] args_label = obj['args'] values_label = obj['vals'] @@ -46,8 +64,7 @@ def functional_constructor(obj, attrs): args_init = min(grid_points_set) args_end = max(grid_points_set) - grid_points = np.arange(args_init, - args_end + 1) + grid_points = np.arange(args_init, args_end + 1) data_matrix = np.zeros(shape=(len(dataf), len(grid_points))) @@ -55,12 +72,17 @@ def functional_constructor(obj, attrs): for t, x in zip(o["args"], o["vals"]): data_matrix[num_sample, t - args_init] = x - return (FDataGrid(data_matrix=data_matrix, - grid_points=grid_points, - domain_range=(args_init, args_end), - dataset_name=name[0], - argument_names=(args_label[0],), - coordinate_names=(values_label[0],)), target) + return ( + FDataGrid( + data_matrix=data_matrix, + grid_points=grid_points, + domain_range=(args_init, args_end), + dataset_name=name[0], + argument_names=(args_label[0],), + coordinate_names=(values_label[0],), + ), + target, + ) def fetch_cran( @@ -76,6 +98,14 @@ def fetch_cran( Args: name: Dataset name. package_name: Name of the R package containing the dataset. + converter: Object that performs the conversion of the R objects to + Python objects. + kwargs: Additional parameters for the function + :func:`skdatasets.repositories.cran.fetch_dataset`. + + Returns: + The dataset, with the R types converted to suitable Python + types. """ repositories = _get_skdatasets_repositories() @@ -84,10 +114,30 @@ def fetch_cran( converter = rdata.conversion.SimpleConverter({ **rdata.conversion.DEFAULT_CLASS_MAP, "fdata": fdata_constructor, - "functional": functional_constructor}) + "functional": functional_constructor, + }) + + return repositories.cran.fetch_dataset( + name, + package_name, + converter=converter, + **kwargs, + ) - return repositories.cran.fetch_dataset(name, package_name, - converter=converter, **kwargs) + +def _ucr_to_fdatagrid(data): + if data.dtype == np.object_: + data = np.array(data.tolist()) + + # n_instances := data.shape[0] + # dim_output := data.shape[1] + # n_points := data.shape[2] + + data = np.transpose(data, axes=(0, 2, 1)) + + grid_points = range(data.shape[1]) + + return FDataGrid(data, grid_points=grid_points) def fetch_ucr(name: str, **kwargs) -> Bunch: @@ -96,6 +146,11 @@ def fetch_ucr(name: str, **kwargs) -> Bunch: Args: name: Dataset name. + kwargs: Additional parameters for the function + :func:`skdatasets.repositories.ucr.fetch`. + + Returns: + The dataset requested. Note: Functional multivariate datasets are not yet supported. 
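For orientation, a sketch of how the documented fetcher is meant to be called (the dataset name is only an example; network access and the scikit-datasets dependency are assumed):

    from skfda.datasets import fetch_ucr

    # The returned Bunch has its 'data' entry converted to an FDataGrid by
    # _ucr_to_fdatagrid; 'data_test' is converted too when present.
    dataset = fetch_ucr("GunPoint")
    X = dataset["data"]
    y = dataset["target"]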
@@ -113,26 +168,12 @@ def fetch_ucr(name: str, **kwargs) -> Bunch: dataset = repositories.ucr.fetch(name, **kwargs) - def ucr_to_fdatagrid(data): - if data.dtype == np.object_: - data = np.array(data.tolist()) - - # n_instances = data.shape[0] - # dim_output = data.shape[1] - # n_points = data.shape[2] - - data = np.transpose(data, axes=(0, 2, 1)) - - grid_points = range(data.shape[1]) - - return FDataGrid(data, grid_points=grid_points) - - dataset['data'] = ucr_to_fdatagrid(dataset['data']) - del dataset['feature_names'] + dataset['data'] = _ucr_to_fdatagrid(dataset['data']) + dataset.pop('feature_names') data_test = dataset.get('data_test', None) if data_test is not None: - dataset['data_test'] = ucr_to_fdatagrid(data_test) + dataset['data_test'] = _ucr_to_fdatagrid(data_test) return dataset @@ -140,14 +181,20 @@ def ucr_to_fdatagrid(data): def _fetch_cran_no_encoding_warning(*args, **kwargs): # Probably non thread safe with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning, - message="Unknown encoding. Assumed ASCII.") + warnings.filterwarnings( + "ignore", + category=UserWarning, + message="Unknown encoding. Assumed ASCII.", + ) return fetch_cran(*args, **kwargs) def _fetch_elem_stat_learn(name): - return _fetch_cran_no_encoding_warning(name, "ElemStatLearn", - version="0.1-7.1") + return _fetch_cran_no_encoding_warning( + name, + "ElemStatLearn", + version="0.1-7.1", + ) def _fetch_ddalpha(name): @@ -252,30 +299,36 @@ def fetch_phoneme( from the dataset in `https://web.stanford.edu/~hastie/ElemStatLearn/`. """ - DESCR = _phoneme_descr + descr = _phoneme_descr raw_dataset = _fetch_elem_stat_learn("phoneme") data = raw_dataset["phoneme"] - curve_data = data.iloc[:, 0:256] + n_points = 256 + + curve_data = data.iloc[:, 0:n_points] sound = data["g"].values speaker = data["speaker"].values - curves = FDataGrid(data_matrix=curve_data.values, - grid_points=np.linspace(0, 8, 256), - domain_range=[0, 8], - dataset_name="Phoneme", - argument_names=("frequency (kHz)",), - coordinate_names=("log-periodogram",)) + curves = FDataGrid( + data_matrix=curve_data.values, + grid_points=np.linspace(0, 8, n_points), + domain_range=[0, 8], + dataset_name="Phoneme", + argument_names=("frequency (kHz)",), + coordinate_names=("log-periodogram",), + ) curve_name = "log-periodogram" target_name = "phoneme" frame = None if as_frame: - frame = pd.DataFrame({curve_name: curves, - target_name: sound}) + frame = pd.DataFrame({ + curve_name: curves, + target_name: sound + }) curves = frame.iloc[:, [0]] target = frame.iloc[:, 1] meta = pd.Series(speaker, name="speaker") @@ -285,17 +338,18 @@ def fetch_phoneme( if return_X_y: return curves, target - else: - return Bunch( - data=curves, - target=target, - frame=frame, - categories={target_name: sound.categories.tolist()}, - feature_names=[curve_name], - target_names=[target_name], - meta=meta, - meta_names=["speaker"], - DESCR=DESCR) + + return Bunch( + data=curves, + target=target, + frame=frame, + categories={target_name: sound.categories.tolist()}, + feature_names=[curve_name], + target_names=[target_name], + meta=meta, + meta_names=["speaker"], + DESCR=descr + ) if fetch_phoneme.__doc__ is not None: # docstrings can be stripped off @@ -345,7 +399,7 @@ def fetch_growth( def fetch_growth( return_X_y: bool = False, - as_frame: bool = False + as_frame: bool = False, ) -> Union[Bunch, Tuple[FDataGrid, ndarray], Tuple[DataFrame, Series]]: """ Load the Berkeley Growth Study dataset. 
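A sketch of what the Literal overloads introduced in the previous commit buy once the calls are type checked (the comments show the narrowing a checker such as mypy should perform; illustrative only, not part of the patch):

    from skfda.datasets import fetch_growth

    bunch = fetch_growth()                     # inferred as Bunch
    X, y = fetch_growth(return_X_y=True)       # Tuple[FDataGrid, ndarray]
    frame, sex = fetch_growth(return_X_y=True, as_frame=True)
    # inferred as Tuple[DataFrame, Series]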
@@ -465,7 +519,7 @@ def fetch_tecator(
 
 def fetch_tecator(
     return_X_y: bool = False,
-    as_frame: bool = False
+    as_frame: bool = False,
 ) -> Union[Bunch, Tuple[FDataGrid, ndarray], Tuple[DataFrame, DataFrame]]:
     """
     Load the Tecator dataset.
@@ -575,7 +629,7 @@ def fetch_medflies(
 
 def fetch_medflies(
     return_X_y: bool = False,
-    as_frame: bool = False
+    as_frame: bool = False,
 ) -> Union[Bunch, Tuple[FDataGrid, ndarray], Tuple[DataFrame, Series]]:
     """
     Load the Medflies dataset, where the flies are separated in two classes
@@ -666,7 +720,7 @@ def fetch_weather(
 
 def fetch_weather(
     return_X_y: bool = False,
-    as_frame: bool = False
+    as_frame: bool = False,
 ) -> Union[Bunch, Tuple[FDataGrid, ndarray], Tuple[DataFrame, Series]]:
     """
     Load the Canadian Weather dataset.
@@ -799,7 +853,7 @@ def fetch_aemet(
 
 def fetch_aemet(
     return_X_y: bool = False,
-    as_frame: bool = False
+    as_frame: bool = False,
 ) -> Union[Bunch, Tuple[FDataGrid, None], Tuple[DataFrame, None]]:
     """
     Load the Spanish Weather dataset.
@@ -921,7 +975,7 @@ def fetch_octane(
 
 def fetch_octane(
     return_X_y: bool = False,
-    as_frame: bool = False
+    as_frame: bool = False,
 ) -> Union[Bunch, Tuple[FDataGrid, ndarray], Tuple[DataFrame, Series]]:
     """Load near infrared spectra of gasoline samples.
@@ -1021,7 +1075,7 @@ def fetch_gait(
 
 def fetch_gait(
     return_X_y: bool = False,
-    as_frame: bool = False
+    as_frame: bool = False,
 ) -> Union[Bunch, Tuple[FDataGrid, None], Tuple[DataFrame, None]]:
     """
     Load the GAIT dataset.
diff --git a/skfda/exploratory/stats/_stats.py b/skfda/exploratory/stats/_stats.py
index 46a1d4545..32871ffb9 100644
--- a/skfda/exploratory/stats/_stats.py
+++ b/skfda/exploratory/stats/_stats.py
@@ -27,7 +27,7 @@ def mean(X: F) -> F:
     return X.mean()
 
 
-def var(X: FData) -> FDataGrid:  # noqa: WPS110
+def var(X: FData) -> FDataGrid:
     """Compute the variance of a set of samples in a FDataGrid object.
 
     Args:

From 7a52c58adc7b69e77800a33b3ed605401dc11eed Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Thu, 17 Dec 2020 23:03:36 +0100
Subject: [PATCH 192/210] Fix real datasets.

---
 setup.cfg                        |   9 +
 skfda/datasets/_real_datasets.py | 399 ++++++++++++++++++-------------
 2 files changed, 243 insertions(+), 165 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index d297fce04..5841335ce 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -14,6 +14,8 @@ ignore =
     D107,
     # Uppercase arguments like X are common in scikit-learn
     N803,
+    # Uppercase variables like X are common in scikit-learn
+    N806,
     # There are no bad quotes
     Q000,
     # Google Python style is not RST until after processed by Napoleon
@@ -33,8 +35,12 @@ ignore =
     WPS305,
     # We need multiline loops
     WPS352,
+    # All keywords are beautiful
+    WPS420,
     # We use nested imports sometimes, and it is not THAT bad
     WPS433,
+    # We use list multiplication to allocate list with immutable values (None or numbers)
+    WPS435,
     # Our private modules are fine to import
     # (check https://github.com/wemake-services/wemake-python-styleguide/issues/1441)
     WPS436,
@@ -49,6 +55,9 @@ per-file-ignores =
         WPS235,
         # Logic is allowed in `__init__.py`
        WPS412
 
+    # There are many datasets
+    _real_datasets.py: WPS202
+
     # Tests benefit from magic numbers
     test_*.py: WPS432

diff --git a/skfda/datasets/_real_datasets.py b/skfda/datasets/_real_datasets.py
index e733df06c..98515643a 100644
--- a/skfda/datasets/_real_datasets.py
+++ b/skfda/datasets/_real_datasets.py
@@ -266,7 +266,7 @@ def fetch_phoneme(
     return_X_y: Literal[False] = False,
     as_frame: bool = False,
 ) -> Bunch:
-    ... 
+ pass @overload @@ -275,7 +275,7 @@ def fetch_phoneme( return_X_y: Literal[True], as_frame: Literal[False] = False, ) -> Tuple[FDataGrid, ndarray]: - ... + pass @overload @@ -284,7 +284,7 @@ def fetch_phoneme( return_X_y: Literal[True], as_frame: Literal[True], ) -> Tuple[DataFrame, Series]: - ... + pass def fetch_phoneme( @@ -327,7 +327,7 @@ def fetch_phoneme( if as_frame: frame = pd.DataFrame({ curve_name: curves, - target_name: sound + target_name: sound, }) curves = frame.iloc[:, [0]] target = frame.iloc[:, 1] @@ -348,7 +348,7 @@ def fetch_phoneme( target_names=[target_name], meta=meta, meta_names=["speaker"], - DESCR=descr + DESCR=descr, ) @@ -376,7 +376,7 @@ def fetch_growth( return_X_y: Literal[False] = False, as_frame: bool = False, ) -> Bunch: - ... + pass @overload @@ -385,7 +385,7 @@ def fetch_growth( return_X_y: Literal[True], as_frame: Literal[False] = False, ) -> Tuple[FDataGrid, ndarray]: - ... + pass @overload @@ -394,7 +394,7 @@ def fetch_growth( return_X_y: Literal[True], as_frame: Literal[True], ) -> Tuple[DataFrame, Series]: - ... + pass def fetch_growth( @@ -408,7 +408,7 @@ def fetch_growth( Berkeley Growth Study. """ - DESCR = _growth_descr + descr = _growth_descr raw_dataset = _fetch_fda("growth") @@ -419,11 +419,13 @@ def fetch_growth( males = data["hgtm"].T sex = np.array([0] * males.shape[0] + [1] * females.shape[0]) - curves = FDataGrid(data_matrix=np.concatenate((males, females), axis=0), - grid_points=ages, - dataset_name="Berkeley Growth Study", - argument_names=("age",), - coordinate_names=("height",)) + curves = FDataGrid( + data_matrix=np.concatenate((males, females), axis=0), + grid_points=ages, + dataset_name="Berkeley Growth Study", + argument_names=("age",), + coordinate_names=("height",), + ) curve_name = "height" target_name = "sex" @@ -432,21 +434,25 @@ def fetch_growth( if as_frame: sex = pd.Categorical.from_codes(sex, categories=target_categories) - frame = pd.DataFrame({curve_name: curves, - target_name: sex}) + frame = pd.DataFrame({ + curve_name: curves, + target_name: sex, + }) curves = frame.iloc[:, [0]] sex = frame.iloc[:, 1] if return_X_y: return curves, sex - else: - return Bunch(data=curves, - target=sex, - frame=frame, - categories={target_name: target_categories}, - feature_names=[curve_name], - target_names=[target_name], - DESCR=DESCR) + + return Bunch( + data=curves, + target=sex, + frame=frame, + categories={target_name: target_categories}, + feature_names=[curve_name], + target_names=[target_name], + DESCR=descr, + ) if fetch_growth.__doc__ is not None: # docstrings can be stripped off @@ -496,7 +502,7 @@ def fetch_tecator( return_X_y: Literal[False] = False, as_frame: bool = False, ) -> Bunch: - ... + pass @overload @@ -505,7 +511,7 @@ def fetch_tecator( return_X_y: Literal[True], as_frame: Literal[False] = False, ) -> Tuple[FDataGrid, ndarray]: - ... + pass @overload @@ -514,7 +520,7 @@ def fetch_tecator( return_X_y: Literal[True], as_frame: Literal[True], ) -> Tuple[DataFrame, DataFrame]: - ... + pass def fetch_tecator( @@ -528,7 +534,7 @@ def fetch_tecator( http://lib.stat.cmu.edu/datasets/tecator. 
""" - DESCR = _tecator_descr + descr = _tecator_descr raw_dataset = _fetch_fda_usc("tecator") @@ -549,14 +555,16 @@ def fetch_tecator( if return_X_y: return curves, target - else: - return Bunch(data=curves, - target=target, - frame=frame, - categories={}, - feature_names=[feature_name], - target_names=target_names, - DESCR=DESCR) + + return Bunch( + data=curves, + target=target, + frame=frame, + categories={}, + feature_names=[feature_name], + target_names=target_names, + DESCR=descr, + ) if fetch_tecator.__doc__ is not None: # docstrings can be stripped off @@ -606,7 +614,7 @@ def fetch_medflies( return_X_y: Literal[False] = False, as_frame: bool = False, ) -> Bunch: - ... + pass @overload @@ -615,7 +623,7 @@ def fetch_medflies( return_X_y: Literal[True], as_frame: Literal[False] = False, ) -> Tuple[FDataGrid, ndarray]: - ... + pass @overload @@ -624,7 +632,7 @@ def fetch_medflies( return_X_y: Literal[True], as_frame: Literal[True], ) -> Tuple[DataFrame, Series]: - ... + pass def fetch_medflies( @@ -632,14 +640,13 @@ def fetch_medflies( as_frame: bool = False, ) -> Union[Bunch, Tuple[FDataGrid, ndarray], Tuple[DataFrame, Series]]: """ - Load the Medflies dataset, where the flies are separated in two classes - according to their longevity. + Load the Medflies dataset. The data is obtained from the R package 'ddalpha', which its a modification of the dataset in http://www.stat.ucdavis.edu/~wang/data/medfly1000.htm. """ - DESCR = _medflies_descr + descr = _medflies_descr raw_dataset = _fetch_ddalpha("medflies") @@ -657,22 +664,28 @@ def fetch_medflies( if as_frame: target = pd.Categorical.from_codes( - target, categories=target_categories) - frame = pd.DataFrame({curve_name: curves, - target_name: target}) + target, + categories=target_categories, + ) + frame = pd.DataFrame({ + curve_name: curves, + target_name: target, + }) curves = frame.iloc[:, [0]] target = frame.iloc[:, 1] if return_X_y: return curves, target - else: - return Bunch(data=curves, - target=target, - frame=frame, - categories={target_name: target_categories}, - feature_names=[curve_name], - target_names=[target_name], - DESCR=DESCR) + + return Bunch( + data=curves, + target=target, + frame=frame, + categories={target_name: target_categories}, + feature_names=[curve_name], + target_names=[target_name], + DESCR=descr, + ) if fetch_medflies.__doc__ is not None: # docstrings can be stripped off @@ -697,7 +710,7 @@ def fetch_weather( return_X_y: Literal[False] = False, as_frame: bool = False, ) -> Bunch: - ... + pass @overload @@ -706,7 +719,7 @@ def fetch_weather( return_X_y: Literal[True], as_frame: Literal[False] = False, ) -> Tuple[FDataGrid, ndarray]: - ... + pass @overload @@ -715,7 +728,7 @@ def fetch_weather( return_X_y: Literal[True], as_frame: Literal[True], ) -> Tuple[DataFrame, Series]: - ... + pass def fetch_weather( @@ -728,27 +741,29 @@ def fetch_weather( The data is obtained from the R package 'fda' from CRAN. """ - DESCR = _weather_descr - - raw_dataset = _fetch_fda("CanadianWeather") + descr = _weather_descr - data = raw_dataset["CanadianWeather"] - - weather_daily = np.asarray(data["dailyAv"]) + data = _fetch_fda("CanadianWeather")["CanadianWeather"] # Axes 0 and 1 must be transposed since in the downloaded dataset the # data_matrix shape is (nfeatures, n_samples, dim_codomain) while our # data_matrix shape is (n_samples, nfeatures, dim_codomain). 
- temp_prec_daily = np.transpose(weather_daily[:, :, 0:2], axes=(1, 0, 2)) + temp_prec_daily = np.transpose(data["dailyAv"][:, :, 0:2], axes=(1, 0, 2)) + + days_in_year = 365 - curves = FDataGrid(data_matrix=temp_prec_daily, - grid_points=np.arange(0, 365) + 0.5, - domain_range=(0, 365), - dataset_name="Canadian Weather", - sample_names=data["place"], - argument_names=("day",), - coordinate_names=("temperature (ºC)", - "precipitation (mm.)")) + curves = FDataGrid( + data_matrix=temp_prec_daily, + grid_points=np.arange(0, days_in_year) + 0.5, + domain_range=(0, days_in_year), + dataset_name="Canadian Weather", + sample_names=data["place"], + argument_names=("day",), + coordinate_names=( + "temperature (ºC)", + "precipitation (mm.)", + ), + ) curve_name = "daily averages" target_name = "region" @@ -758,7 +773,9 @@ def fetch_weather( if as_frame: target = pd.Categorical.from_codes( - target, categories=target_categories) + target, + categories=target_categories, + ) frame = pd.DataFrame({ curve_name: curves, "place": data["place"], @@ -767,10 +784,13 @@ def fetch_weather( "longitude": np.asarray(data["coordinates"])[:, 1], "index": data["geogindex"], "monthly temperatures": np.asarray( - data["monthlyTemp"]).T.tolist(), + data["monthlyTemp"], + ).T.tolist(), "monthly precipitation": np.asarray( - data["monthlyPrecip"]).T.tolist(), - target_name: target}) + data["monthlyPrecip"], + ).T.tolist(), + target_name: target, + }) X = frame.iloc[:, :-1] target = frame.iloc[:, -1] feature_names = list(X.columns.values) @@ -779,31 +799,43 @@ def fetch_weather( else: feature_names = [curve_name] X = curves - meta = np.array(list(zip(data["place"], - data["province"], - np.asarray(data["coordinates"])[:, 0], - np.asarray(data["coordinates"])[:, 1], - data["geogindex"], - np.asarray(data["monthlyTemp"]).T, - np.asarray(data["monthlyPrecip"]).T))) - meta_names = ["place", "province", "latitude", "longitude", - "index", "monthly temperatures", - "monthly precipitation"], - - additional_dict = {"meta": meta, - "meta_names": meta_names} + meta = np.array(list(zip( + data["place"], + data["province"], + np.asarray(data["coordinates"])[:, 0], + np.asarray(data["coordinates"])[:, 1], + data["geogindex"], + np.asarray(data["monthlyTemp"]).T, + np.asarray(data["monthlyPrecip"]).T, + ))) + meta_names = [ + "place", + "province", + "latitude", + "longitude", + "index", + "monthly temperatures", + "monthly precipitation", + ] + + additional_dict = { + "meta": meta, + "meta_names": meta_names, + } if return_X_y: return X, target - else: - return Bunch(data=X, - target=target, - frame=frame, - categories={target_name: target_categories}, - feature_names=feature_names, - target_names=[target_name], - **additional_dict, - DESCR=DESCR) + + return Bunch( + data=X, + target=target, + frame=frame, + categories={target_name: target_categories}, + feature_names=feature_names, + target_names=[target_name], + **additional_dict, + DESCR=descr, + ) if fetch_weather.__doc__ is not None: # docstrings can be stripped off @@ -830,7 +862,7 @@ def fetch_aemet( return_X_y: Literal[False] = False, as_frame: bool = False, ) -> Bunch: - ... + pass @overload @@ -839,7 +871,7 @@ def fetch_aemet( return_X_y: Literal[True], as_frame: Literal[False] = False, ) -> Tuple[FDataGrid, None]: - ... + pass @overload @@ -848,7 +880,7 @@ def fetch_aemet( return_X_y: Literal[True], as_frame: Literal[True], ) -> Tuple[DataFrame, None]: - ... 
+ pass def fetch_aemet( @@ -861,28 +893,36 @@ def fetch_aemet( The data is obtained from the R package 'fda.usc' from CRAN. """ - DESCR = _aemet_descr - - raw_dataset = _fetch_fda_usc("aemet") + descr = _aemet_descr - data = raw_dataset["aemet"] + data = _fetch_fda_usc("aemet")["aemet"] data_matrix = np.empty((73, 365, 3)) data_matrix[:, :, 0] = data["temp"].data_matrix[:, :, 0] data_matrix[:, :, 1] = data["logprec"].data_matrix[:, :, 0] data_matrix[:, :, 2] = data["wind.speed"].data_matrix[:, :, 0] - curves = data["temp"].copy(data_matrix=data_matrix, - dataset_name="aemet", - sample_names=data["df"].iloc[:, 1], - argument_names=("day",), - coordinate_names=("temperature (ºC)", - "logprecipitation", - "wind speed (m/s)")) + curves = data["temp"].copy( + data_matrix=data_matrix, + dataset_name="aemet", + sample_names=data["df"].iloc[:, 1], + argument_names=("day",), + coordinate_names=( + "temperature (ºC)", + "logprecipitation", + "wind speed (m/s)", + ), + ) curve_name = "daily averages" - df_names = ["index", "place", "province", "altitude", - "longitude", "latitude"] + df_names = [ + "index", + "place", + "province", + "altitude", + "longitude", + "latitude", + ] df_indexes = np.array([0, 1, 2, 3, 6, 7]) frame = None @@ -890,7 +930,11 @@ def fetch_aemet( if as_frame: frame = pd.DataFrame({ curve_name: curves, - **{n: data["df"].iloc[:, d] for (n, d) in zip(df_names, df_indexes)}}) + **{ + n: data["df"].iloc[:, d] + for (n, d) in zip(df_names, df_indexes) + }, + }) X = frame feature_names = list(X.columns.values) @@ -901,20 +945,23 @@ def fetch_aemet( X = curves meta = np.asarray(data["df"])[:, df_indexes] meta_names = df_names - additional_dict = {"meta": meta, - "meta_names": meta_names} + additional_dict = { + "meta": meta, + "meta_names": meta_names, + } if return_X_y: return X, None - else: - return Bunch( - data=X, - target=None, - frame=frame, - categories={}, - feature_names=feature_names, - **additional_dict, - DESCR=DESCR) + + return Bunch( + data=X, + target=None, + frame=frame, + categories={}, + feature_names=feature_names, + **additional_dict, + DESCR=descr, + ) if fetch_aemet.__doc__ is not None: # docstrings can be stripped off @@ -952,7 +999,7 @@ def fetch_octane( return_X_y: Literal[False] = False, as_frame: bool = False, ) -> Bunch: - ... + pass @overload @@ -961,7 +1008,7 @@ def fetch_octane( return_X_y: Literal[True], as_frame: Literal[False] = False, ) -> Tuple[FDataGrid, ndarray]: - ... + pass @overload @@ -970,7 +1017,7 @@ def fetch_octane( return_X_y: Literal[True], as_frame: Literal[True], ) -> Tuple[DataFrame, Series]: - ... + pass def fetch_octane( @@ -983,7 +1030,7 @@ def fetch_octane( from CRAN. """ - DESCR = _octane_descr + descr = _octane_descr # octane file from mrfDepth R package raw_dataset = fetch_cran("octane", "mrfDepth", version="1.0.11") @@ -994,40 +1041,56 @@ def fetch_octane( # "wavelengths ranging from 1102nm to 1552nm with measurements every two # nm."" - grid_points = np.linspace(1102, 1552, 226) + wavelength_start = 1102 + wavelength_end = 1552 + wavelength_count = 226 + + grid_points = np.linspace( + wavelength_start, + wavelength_end, + wavelength_count, + ) # "The octane data set contains six outliers (25, 26, 36–39) to which # alcohol was added". 
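    # For reference: the quoted sample numbers are 1-based, while the
    # assignments below use 0-based indices, so samples 25 and 26 become
    # target[24:26] and samples 36-39 become target[35:39].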
target = np.zeros(len(data), dtype=np.bool_) - target[24] = target[25] = target[35:39] = 1 # Outliers 1 + target[24:26] = 1 # noqa: WPS432 + target[35:39] = 1 # noqa: WPS432 + target_name = "is outlier" curve_name = "absorbances" - curves = FDataGrid(data, - grid_points=grid_points, - dataset_name="octane", - argument_names=("wavelength (nm)",), - coordinate_names=("absorbances",)) + curves = FDataGrid( + data, + grid_points=grid_points, + dataset_name="octane", + argument_names=("wavelength (nm)",), + coordinate_names=("absorbances",), + ) frame = None if as_frame: - frame = pd.DataFrame({curve_name: curves, - target_name: target}) + frame = pd.DataFrame({ + curve_name: curves, + target_name: target, + }) curves = frame.iloc[:, [0]] target = frame.iloc[:, 1] if return_X_y: return curves, target - else: - return Bunch(data=curves, - target=target, - frame=frame, - categories={}, - feature_names=[curve_name], - target_names=[target_name], - DESCR=DESCR) + + return Bunch( + data=curves, + target=target, + frame=frame, + categories={}, + feature_names=[curve_name], + target_names=[target_name], + DESCR=descr, + ) if fetch_octane.__doc__ is not None: # docstrings can be stripped off @@ -1052,7 +1115,7 @@ def fetch_gait( return_X_y: Literal[False] = False, as_frame: bool = False, ) -> Bunch: - ... + pass @overload @@ -1061,7 +1124,7 @@ def fetch_gait( return_X_y: Literal[True], as_frame: Literal[False] = False, ) -> Tuple[FDataGrid, None]: - ... + pass @overload @@ -1070,7 +1133,7 @@ def fetch_gait( return_X_y: Literal[True], as_frame: Literal[True], ) -> Tuple[DataFrame, None]: - ... + pass def fetch_gait( @@ -1083,7 +1146,7 @@ def fetch_gait( The data is obtained from the R package 'fda' from CRAN. """ - DESCR = _gait_descr + descr = _gait_descr raw_data = _fetch_fda("gait") @@ -1095,13 +1158,17 @@ def fetch_gait( sample_names = np.asarray(data.coords.get('dim_1')) feature_name = 'gait' - curves = FDataGrid(data_matrix=data_matrix, - grid_points=grid_points, - dataset_name=feature_name, - sample_names=sample_names, - argument_names=("time (proportion of gait cycle)",), - coordinate_names=("hip angle (degrees)", - "knee angle (degrees)")) + curves = FDataGrid( + data_matrix=data_matrix, + grid_points=grid_points, + dataset_name=feature_name, + sample_names=sample_names, + argument_names=("time (proportion of gait cycle)",), + coordinate_names=( + "hip angle (degrees)", + "knee angle (degrees)", + ), + ) frame = None @@ -1111,14 +1178,16 @@ def fetch_gait( if return_X_y: return curves, None - else: - return Bunch(data=curves, - target=None, - frame=frame, - categories={}, - feature_names=[feature_name], - target_names=[], - DESCR=DESCR) + + return Bunch( + data=curves, + target=None, + frame=frame, + categories={}, + feature_names=[feature_name], + target_names=[], + DESCR=descr, + ) if fetch_gait.__doc__ is not None: # docstrings can be stripped off From 9a2efd48c9cd56bfe1221fc575f1eb86cf0823ce Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 17 Dec 2020 23:23:04 +0100 Subject: [PATCH 193/210] Fix tests. 
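
The fix below guards against ``data["dailyAv"]`` not being a NumPy array:
values deserialized from an R data file may arrive as nested lists, which do
not support NumPy's multidimensional indexing. A minimal sketch of the
failure mode, with a hypothetical nested list standing in for the downloaded
data:

    import numpy as np

    # A plain nested list, as the deserialized value might be.
    daily_av = [[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]]

    # daily_av[:, :, 0:2] would raise TypeError on a list; converting to
    # an ndarray first makes the slicing and transposition valid.
    temp_prec = np.transpose(np.asarray(daily_av)[:, :, 0:2], axes=(1, 0, 2))
    print(temp_prec.shape)  # (2, 1, 2)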
--- skfda/datasets/_real_datasets.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/skfda/datasets/_real_datasets.py b/skfda/datasets/_real_datasets.py index 98515643a..5af17ec9b 100644 --- a/skfda/datasets/_real_datasets.py +++ b/skfda/datasets/_real_datasets.py @@ -748,7 +748,9 @@ def fetch_weather( # Axes 0 and 1 must be transposed since in the downloaded dataset the # data_matrix shape is (nfeatures, n_samples, dim_codomain) while our # data_matrix shape is (n_samples, nfeatures, dim_codomain). - temp_prec_daily = np.transpose(data["dailyAv"][:, :, 0:2], axes=(1, 0, 2)) + temp_prec_daily = np.transpose( + np.asarray(data["dailyAv"])[:, :, 0:2], axes=(1, 0, 2) + ) days_in_year = 365 From e77a7c66a2948271a4462bbf355505d665bc76a5 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Sat, 19 Dec 2020 18:48:30 +0100 Subject: [PATCH 194/210] Fix typing strict errors in real datasets. --- setup.cfg | 5 +++++ skfda/datasets/_real_datasets.py | 34 +++++++++++++++++++------------- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/setup.cfg b/setup.cfg index 5841335ce..780300dcb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -105,6 +105,11 @@ include_trailing_comma = true use_parentheses = true combine_as_imports = 1 +[mypy] +strict = True +strict_equality = True +implicit_reexport = True + [mypy-dcor.*] ignore_missing_imports = True diff --git a/skfda/datasets/_real_datasets.py b/skfda/datasets/_real_datasets.py index 5af17ec9b..8a1117e93 100644 --- a/skfda/datasets/_real_datasets.py +++ b/skfda/datasets/_real_datasets.py @@ -1,5 +1,5 @@ import warnings -from typing import Any, Tuple, Union, overload +from typing import Any, Mapping, Optional, Tuple, Union, overload import numpy as np import pandas as pd @@ -13,7 +13,7 @@ from .. import FDataGrid -def _get_skdatasets_repositories(): +def _get_skdatasets_repositories() -> Any: import skdatasets repositories = getattr(skdatasets, "repositories", None) @@ -23,7 +23,10 @@ def _get_skdatasets_repositories(): return repositories -def fdata_constructor(obj, attrs): +def fdata_constructor( + obj: Any, + attrs: Mapping[Union[str, bytes], Any], +) -> FDataGrid: """ Construct a :func:`FDataGrid` objet from a R `fdata` object. @@ -44,7 +47,10 @@ def fdata_constructor(obj, attrs): ) -def functional_constructor(obj, attrs): +def functional_constructor( + obj: Any, + attrs: Mapping[Union[str, bytes], Any], +) -> FDataGrid: """ Construct a :func:`FDataGrid` objet from a R `functional` object. @@ -89,8 +95,8 @@ def fetch_cran( name: str, package_name: str, *, - converter: rdata.conversion.Converter = None, - **kwargs, + converter: Optional[rdata.conversion.Converter] = None, + **kwargs: Any, ) -> Any: """ Fetch a dataset from CRAN. @@ -125,7 +131,7 @@ def fetch_cran( ) -def _ucr_to_fdatagrid(data): +def _ucr_to_fdatagrid(data: np.ndarray) -> FDataGrid: if data.dtype == np.object_: data = np.array(data.tolist()) @@ -140,7 +146,7 @@ def _ucr_to_fdatagrid(data): return FDataGrid(data, grid_points=grid_points) -def fetch_ucr(name: str, **kwargs) -> Bunch: +def fetch_ucr(name: str, **kwargs: Any) -> Bunch: """ Fetch a dataset from the UCR. 
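
    Usage sketch, assuming the usual scikit-learn ``Bunch`` fields;
    ``"GunPoint"`` is only an illustrative dataset name, and fetching
    requires the optional scikit-datasets dependency plus network access:

        >>> data = fetch_ucr("GunPoint")  # doctest: +SKIP
        >>> X, y = data.data, data.target  # doctest: +SKIP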
@@ -178,7 +184,7 @@ def fetch_ucr(name: str, **kwargs) -> Bunch: return dataset -def _fetch_cran_no_encoding_warning(*args, **kwargs): +def _fetch_cran_no_encoding_warning(*args: Any, **kwargs: Any) -> Any: # Probably non thread safe with warnings.catch_warnings(): warnings.filterwarnings( @@ -189,7 +195,7 @@ def _fetch_cran_no_encoding_warning(*args, **kwargs): return fetch_cran(*args, **kwargs) -def _fetch_elem_stat_learn(name): +def _fetch_elem_stat_learn(name: str) -> Any: return _fetch_cran_no_encoding_warning( name, "ElemStatLearn", @@ -197,15 +203,15 @@ def _fetch_elem_stat_learn(name): ) -def _fetch_ddalpha(name): +def _fetch_ddalpha(name: str) -> Any: return _fetch_cran_no_encoding_warning(name, "ddalpha", version="1.3.4") -def _fetch_fda(name): +def _fetch_fda(name: str) -> Any: return _fetch_cran_no_encoding_warning(name, "fda", version="2.4.7") -def _fetch_fda_usc(name): +def _fetch_fda_usc(name: str) -> Any: return _fetch_cran_no_encoding_warning(name, "fda.usc", version="1.3.0") @@ -749,7 +755,7 @@ def fetch_weather( # data_matrix shape is (nfeatures, n_samples, dim_codomain) while our # data_matrix shape is (n_samples, nfeatures, dim_codomain). temp_prec_daily = np.transpose( - np.asarray(data["dailyAv"])[:, :, 0:2], axes=(1, 0, 2) + np.asarray(data["dailyAv"])[:, :, 0:2], axes=(1, 0, 2), ) days_in_year = 365 From 935ced061e9bec562306f7aa7b0ab09c8a360e29 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Sat, 19 Dec 2020 19:23:50 +0100 Subject: [PATCH 195/210] Fix the domain of the aemet dataset. The domain range is now (0, 365). The grid points are now in the middle of the day interval. --- skfda/datasets/_real_datasets.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/skfda/datasets/_real_datasets.py b/skfda/datasets/_real_datasets.py index 8a1117e93..a1d027134 100644 --- a/skfda/datasets/_real_datasets.py +++ b/skfda/datasets/_real_datasets.py @@ -891,7 +891,7 @@ def fetch_aemet( pass -def fetch_aemet( +def fetch_aemet( # noqa: WPS210 return_X_y: bool = False, as_frame: bool = False, ) -> Union[Bunch, Tuple[FDataGrid, None], Tuple[DataFrame, None]]: @@ -905,13 +905,17 @@ def fetch_aemet( data = _fetch_fda_usc("aemet")["aemet"] - data_matrix = np.empty((73, 365, 3)) + days_in_year = 365 + + data_matrix = np.empty((73, days_in_year, 3)) data_matrix[:, :, 0] = data["temp"].data_matrix[:, :, 0] data_matrix[:, :, 1] = data["logprec"].data_matrix[:, :, 0] data_matrix[:, :, 2] = data["wind.speed"].data_matrix[:, :, 0] curves = data["temp"].copy( data_matrix=data_matrix, + grid_points=np.arange(0, days_in_year) + 0.5, + domain_range=(0, days_in_year), dataset_name="aemet", sample_names=data["df"].iloc[:, 1], argument_names=("day",), From c80ec4292dc383f5c440656e097c6164abb66fae Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 21 Dec 2020 03:04:19 +0100 Subject: [PATCH 196/210] Add typing to FData. 
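
The central device in this patch is a ``TypeVar`` bound to ``FData``, used
as the type of ``self`` so that methods such as ``copy`` and ``derivative``
preserve the subclass type under mypy's strict mode. A condensed sketch of
the pattern, with shortened illustrative names:

    from abc import ABC, abstractmethod
    from typing import TypeVar

    T = TypeVar('T', bound='Base')

    class Base(ABC):
        @abstractmethod
        def copy(self: T) -> T:
            """Return a copy with the same runtime type."""

    # For a subclass Grid(Base), mypy now infers grid.copy() as Grid
    # rather than Base, which the FData annotations below rely on.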
--- setup.cfg | 16 +- skfda/_utils/_utils.py | 35 +- skfda/representation/_functional_data.py | 811 +++++++++++++---------- skfda/representation/evaluator.py | 59 +- skfda/representation/extrapolation.py | 6 +- 5 files changed, 563 insertions(+), 364 deletions(-) diff --git a/setup.cfg b/setup.cfg index 780300dcb..183436e12 100644 --- a/setup.cfg +++ b/setup.cfg @@ -12,6 +12,12 @@ ignore = D105, # No docstrings in __init__ D107, + # Ignore until https://github.com/terrencepreilly/darglint/issues/54 is closed + DAR202, + # Ignore until https://github.com/terrencepreilly/darglint/issues/144 is closed + DAR401, + # Non-explicit exceptions may be documented in raises + DAR402, # Uppercase arguments like X are common in scikit-learn N803, # Uppercase variables like X are common in scikit-learn @@ -33,6 +39,10 @@ ignore = WPS301, # We love f-strings WPS305, + # Implicit string concatenation is useful for exception messages + WPS326, + # We allow multiline conditions + WPS337, # We need multine loops WPS352, # All keywords are beautiful @@ -45,7 +55,9 @@ ignore = # (check https://github.com/wemake-services/wemake-python-styleguide/issues/1441) WPS436, # Our private objects are fine to import - WPS450 + WPS450, + # Explicit len compare is better than implicit + WPS507, per-file-ignores = __init__.py: @@ -71,7 +83,7 @@ rst-directives = rst-roles = attr,class,func,meth,mod,obj,ref,term, -allowed-domain-names = data, obj, var +allowed-domain-names = data, obj, result, value, values, var # Needs to be tuned max-arguments = 10 diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index 2fecf9d8b..e4d023cb1 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -2,7 +2,7 @@ import functools import numbers -from typing import Optional, Union +from typing import Optional, Sequence, Union import numpy as np import scipy.integrate @@ -215,18 +215,25 @@ def _same_domain(fd, fd2): return np.array_equal(fd.domain_range, fd2.domain_range) -def _reshape_eval_points(eval_points, *, aligned, n_samples, dim_domain): +def _reshape_eval_points( + eval_points: np.ndarray, + *, + aligned: bool, + n_samples: int, + dim_domain: int, +) -> np.ndarray: """Convert and reshape the eval_points to ndarray with the corresponding shape. Args: - eval_points (array_like): Evaluation points to be reshaped. - aligned (bool): Boolean flag. True if all the samples + eval_points: Evaluation points to be reshaped. + aligned: Boolean flag. True if all the samples will be evaluated at the same evaluation_points. - dim_domain (int): Dimension of the domain. + n_samples: Number of observations. + dim_domain: Dimension of the domain. Returns: - (np.ndarray): Numpy array with the eval_points, if + Numpy array with the eval_points, if evaluation_aligned is True with shape `number of evaluation points` x `dim_domain`. If the points are not aligned the shape of the points will be `n_samples` x `number of evaluation points` @@ -283,10 +290,16 @@ def _one_grid_to_points(axes, *, dim_domain): return cartesian, shape -def _evaluate_grid(axes, *, evaluate_method, - n_samples, dim_domain, dim_codomain, - extrapolation=None, - aligned=True): +def _evaluate_grid( + axes: Sequence[np.ndarray], + *, + evaluate_method, + n_samples, + dim_domain, + dim_codomain, + extrapolation=None, + aligned=True +): """Evaluate the functional object in the cartesian grid. This method is called internally by :meth:`evaluate` when the argument @@ -310,7 +323,7 @@ def _evaluate_grid(axes, *, evaluate_method, option, but with worst performance. 
Args: - axes (array_like): List of axes to generated the grid where the + axes: List of axes to generated the grid where the object will be evaluated. extrapolation (str or Extrapolation, optional): Controls the extrapolation mode for elements outside the domain range. By diff --git a/skfda/representation/_functional_data.py b/skfda/representation/_functional_data.py index dd63b1501..03565fba2 100644 --- a/skfda/representation/_functional_data.py +++ b/skfda/representation/_functional_data.py @@ -4,18 +4,37 @@ objects of the package and contains some commons methods. """ -from abc import ABC, abstractmethod import warnings - -import pandas.api.extensions +from abc import ABC, abstractmethod +from typing import ( + TYPE_CHECKING, + Any, + Iterable, + Iterator, + NoReturn, + Optional, + Sequence, + Tuple, + TypeVar, + Union, +) import numpy as np +import pandas.api.extensions -from .._utils import (_evaluate_grid, _reshape_eval_points) +from .._utils import _evaluate_grid, _reshape_eval_points +from .evaluator import Evaluator from .extrapolation import _parse_extrapolation +if TYPE_CHECKING: + from . import FDataGrid, FDataBasis + from .basis import Basis -class FData(ABC, pandas.api.extensions.ExtensionArray): +T = TypeVar('T', bound='FData') +DomainRange = Tuple[Tuple[float, float], ...] + + +class FData(ABC, pandas.api.extensions.ExtensionArray): # type: ignore """Defines the structure of a functional data object. Attributes: @@ -31,13 +50,17 @@ class FData(ABC, pandas.api.extensions.ExtensionArray): """ - def __init__(self, *, extrapolation, - dataset_name=None, - dataset_label=None, - axes_labels=None, - argument_names=None, - coordinate_names=None, - sample_names=None): + def __init__( + self, + *, + extrapolation: Evaluator, + dataset_name: Optional[str] = None, + dataset_label: Optional[str] = None, + axes_labels: Optional[Tuple[Optional[str], ...]] = None, + argument_names: Optional[Tuple[Optional[str], ...]] = None, + coordinate_names: Optional[Tuple[Optional[str], ...]] = None, + sample_names: Optional[Tuple[Optional[str], ...]] = None, + ) -> None: self.extrapolation = extrapolation self.dataset_name = dataset_name @@ -47,136 +70,162 @@ def __init__(self, *, extrapolation, self.argument_names = argument_names self.coordinate_names = coordinate_names - self.axes_labels = axes_labels + if axes_labels is not None: + self.axes_labels = axes_labels self.sample_names = sample_names @property - def dataset_label(self): - warnings.warn("Parameter dataset_label is deprecated. Use the " - "parameter dataset_name instead.", - DeprecationWarning) + def dataset_label(self) -> Optional[str]: + warnings.warn( + "Parameter dataset_label is deprecated. Use the " + "parameter dataset_name instead.", + DeprecationWarning, + ) return self.dataset_name @dataset_label.setter - def dataset_label(self, name): - warnings.warn("Parameter dataset_label is deprecated. Use the " - "parameter dataset_name instead.", - DeprecationWarning) + def dataset_label(self, name: Optional[str]) -> None: + warnings.warn( + "Parameter dataset_label is deprecated. 
Use the " + "parameter dataset_name instead.", + DeprecationWarning, + ) self.dataset_name = name @property - def argument_names(self): + def argument_names(self) -> Tuple[Optional[str], ...]: return self._argument_names @argument_names.setter - def argument_names(self, names): + def argument_names( + self, + names: Optional[Tuple[Optional[str], ...]], + ) -> None: if names is None: names = (None,) * self.dim_domain else: names = tuple(names) if len(names) != self.dim_domain: - raise ValueError("There must be a name for each of the " - "dimensions of the domain.") + raise ValueError( + "There must be a name for each of the " + "dimensions of the domain.", + ) self._argument_names = names @property - def coordinate_names(self): + def coordinate_names(self) -> Tuple[Optional[str], ...]: return self._coordinate_names @coordinate_names.setter - def coordinate_names(self, names): + def coordinate_names( + self, + names: Optional[Tuple[Optional[str], ...]], + ) -> None: if names is None: names = (None,) * self.dim_codomain else: names = tuple(names) if len(names) != self.dim_codomain: - raise ValueError("There must be a name for each of the " - "dimensions of the codomain.") + raise ValueError( + "There must be a name for each of the " + "dimensions of the codomain.", + ) self._coordinate_names = names @property - def axes_labels(self): - warnings.warn("Parameter axes_labels is deprecated. Use the " - "parameters argument_names and " - "coordinate_names instead.", DeprecationWarning) + def axes_labels(self) -> Tuple[Optional[str], ...]: + warnings.warn( + "Parameter axes_labels is deprecated. Use the " + "parameters argument_names and " + "coordinate_names instead.", + DeprecationWarning, + ) return self.argument_names + self.coordinate_names @axes_labels.setter - def axes_labels(self, labels): - """Sets the list of labels""" - + def axes_labels(self, labels: Tuple[Optional[str], ...]) -> None: + """Set the list of labels.""" if labels is not None: - warnings.warn("Parameter axes_labels is deprecated. Use the " - "parameters argument_names and " - "coordinate_names instead.", DeprecationWarning) - - labels = np.asarray(labels) - if len(labels) > (self.dim_domain + self.dim_codomain): - raise ValueError("There must be a label for each of the " - "dimensions of the domain and the image.") - if len(labels) < (self.dim_domain + self.dim_codomain): - diff = (self.dim_domain + self.dim_codomain) - len(labels) - labels = np.concatenate((labels, diff * [None])) - - self.argument_names = labels[:self.dim_domain] - self.coordinate_names = labels[self.dim_domain:] + warnings.warn( + "Parameter axes_labels is deprecated. 
Use the " + "parameters argument_names and " + "coordinate_names instead.", + DeprecationWarning, + ) + + labels_array = np.asarray(labels) + if len(labels_array) > (self.dim_domain + self.dim_codomain): + raise ValueError( + "There must be a label for each of the " + "dimensions of the domain and the image.", + ) + if len(labels_array) < (self.dim_domain + self.dim_codomain): + diff = ( + (self.dim_domain + self.dim_codomain) + - len(labels_array) + ) + labels_array = np.concatenate((labels_array, diff * [None])) + + self.argument_names = labels_array[:self.dim_domain] + self.coordinate_names = labels_array[self.dim_domain:] @property - def sample_names(self): + def sample_names(self) -> Tuple[Optional[str], ...]: return self._sample_names @sample_names.setter - def sample_names(self, names): + def sample_names(self, names: Optional[Tuple[Optional[str], ...]]) -> None: if names is None: names = (None,) * self.n_samples else: names = tuple(names) if len(names) != self.n_samples: - raise ValueError("There must be a name for each of the " - "samples.") + raise ValueError( + "There must be a name for each of the samples.", + ) self._sample_names = names @property @abstractmethod - def n_samples(self): + def n_samples(self) -> int: """Return the number of samples. Returns: - int: Number of samples of the FData object. + Number of samples of the FData object. """ pass @property @abstractmethod - def dim_domain(self): + def dim_domain(self) -> int: """Return number of dimensions of the :term:`domain`. Returns: - int: Number of dimensions of the domain. + Number of dimensions of the domain. """ pass @property @abstractmethod - def dim_codomain(self): + def dim_codomain(self) -> int: """Return number of dimensions of the :term:`codomain`. Returns: - int: Number of dimensions of the codomain. + Number of dimensions of the codomain. """ pass @property @abstractmethod - def coordinates(self): + def coordinates(self: T) -> T: r"""Return a component of the FDataGrid. If the functional object contains multivariate samples @@ -187,23 +236,18 @@ def coordinates(self): pass @property - def extrapolation(self): + def extrapolation(self) -> Optional[Evaluator]: """Return default type of extrapolation.""" - return self._extrapolation @extrapolation.setter - def extrapolation(self, value): - """Sets the type of extrapolation.""" - - if value is None: - self._extrapolation = None - else: - self._extrapolation = _parse_extrapolation(value) + def extrapolation(self, value: Optional[Union[str, Evaluator]]) -> None: + """Set the type of extrapolation.""" + self._extrapolation = _parse_extrapolation(value) @property @abstractmethod - def domain_range(self): + def domain_range(self) -> DomainRange: """Return the :term:`domain` range of the object Returns: @@ -211,20 +255,19 @@ def domain_range(self): """ pass - def _extrapolation_index(self, eval_points): - """Checks the points that need to be extrapolated. + def _extrapolation_index(self, eval_points: np.ndarray) -> np.ndarray: + """Check the points that need to be extrapolated. Args: - eval_points (np.ndarray): Array with shape `n_eval_points` x + eval_points: Array with shape `n_eval_points` x `dim_domain` with the evaluation points, or shape ´n_samples´ x `n_eval_points` x `dim_domain` with different evaluation points for each sample. Returns: - - (np.ndarray): Array with boolean index. The positions with True - in the index are outside the domain range and extrapolation - should be applied. + Array with boolean index. 
The positions with True + in the index are outside the domain range and extrapolation + should be applied. """ index = np.zeros(eval_points.shape[:-1], dtype=np.bool) @@ -236,29 +279,38 @@ def _extrapolation_index(self, eval_points): return index - def _join_evaluation(self, index_matrix, index_ext, index_ev, - res_extrapolation, res_evaluation): + def _join_evaluation( + self, + index_matrix: np.ndarray, + index_ext: np.ndarray, + index_ev: np.ndarray, + res_extrapolation: np.ndarray, + res_evaluation: np.ndarray, + ) -> np.ndarray: """Join the points evaluated. This method is used internally by :func:`evaluate` to join the result of the evaluation and the result of the extrapolation. Args: - index_matrix (ndarray): Boolean index with the points extrapolated. - index_ext (ndarray): Boolean index with the columns that contains + index_matrix: Boolean index with the points extrapolated. + index_ext: Boolean index with the columns that contains points extrapolated. - index_ev (ndarray): Boolean index with the columns that contains + index_ev: Boolean index with the columns that contains points evaluated. - res_extrapolation (ndarray): Result of the extrapolation. - res_evaluation (ndarray): Result of the evaluation. + res_extrapolation: Result of the extrapolation. + res_evaluation: Result of the evaluation. Returns: - (ndarray): Matrix with the points evaluated with shape + Matrix with the points evaluated with shape `n_samples` x `number of points evaluated` x `dim_codomain`. """ - res = np.empty((self.n_samples, index_matrix.shape[-1], - self.dim_codomain)) + res = np.empty(( + self.n_samples, + index_matrix.shape[-1], + self.dim_codomain, + )) # Case aligned evaluation if index_matrix.ndim == 1: @@ -273,78 +325,95 @@ def _join_evaluation(self, index_matrix, index_ext, index_ev, return res @abstractmethod - def _evaluate(self, eval_points, *, aligned=True): - """Internal evaluation method, defines the evaluation of the FData. + def _evaluate( + self, + eval_points: np.ndarray, + *, + aligned: bool = True, + ) -> np.ndarray: + """Define the evaluation of the FData. Evaluates the samples of an FData object at several points. Subclasses must override this method to implement evaluation. Args: - eval_points (array_like): List of points where the functions are + eval_points: List of points where the functions are evaluated. If `aligned` is `True`, then a list of lists of points must be passed, with one list per sample. - aligned (bool, optional): Whether the input points are + aligned: Whether the input points are the same for each sample, or an array of points per sample is passed. Returns: - (numpy.darray): Numpy 3d array with shape `(n_samples, - len(eval_points), dim_codomain)` with the result of the - evaluation. The entry (i,j,k) will contain the value k-th image - dimension of the i-th sample, at the j-th evaluation point. + Numpy 3d array with shape `(n_samples, + len(eval_points), dim_codomain)` with the result of the + evaluation. The entry (i,j,k) will contain the value k-th image + dimension of the i-th sample, at the j-th evaluation point. """ pass - def evaluate(self, eval_points, *, derivative=0, extrapolation=None, - grid=False, aligned=True): - """Evaluate the object or its derivatives at a list of values or - a grid. + def evaluate( + self, + eval_points: np.ndarray, + *, + derivative: int = 0, + extrapolation: Optional[Union[str, Evaluator]] = None, + grid: bool = False, + aligned: bool = True, + ) -> np.ndarray: + """Evaluate the object at a list of values or a grid. 
Args: - eval_points (array_like): List of points where the functions are + eval_points: List of points where the functions are evaluated. If ``grid`` is ``True``, a list of axes, one per :term:`domain` dimension, must be passed instead. If ``aligned`` is ``True``, then a list of lists (of points or axes, as explained) must be passed, with one list per sample. - extrapolation (str or Extrapolation, optional): Controls the + derivative: Deprecated. Order of the derivative to evaluate. + extrapolation: Controls the extrapolation mode for elements outside the domain range. By default it is used the mode defined during the instance of the object. - grid (bool, optional): Whether to evaluate the results on a grid + grid: Whether to evaluate the results on a grid spanned by the input arrays, or at points specified by the input arrays. If true the eval_points should be a list of size dim_domain with the corresponding times for each axis. The return matrix has shape n_samples x len(t1) x len(t2) x ... x len(t_dim_domain) x dim_codomain. If the domain dimension is 1 the parameter has no efect. Defaults to False. - aligned (bool, optional): Whether the input points are - the same for each sample, or an array of points per sample is - passed. + aligned: Whether the input points are the same for each sample, + or an array of points per sample is passed. Returns: - (np.darray): Matrix whose rows are the values of the each + Matrix whose rows are the values of the each function at the values specified in eval_points. """ if derivative != 0: - warnings.warn("Parameter derivative is deprecated. Use the " - "derivative function instead.", DeprecationWarning) + warnings.warn( + "Parameter derivative is deprecated. Use the " + "derivative function instead.", + DeprecationWarning, + ) return self.derivative(order=derivative)( eval_points, extrapolation=extrapolation, grid=grid, - aligned=aligned) + aligned=aligned, + ) if grid: # Evaluation of a grid performed in auxiliar function - return _evaluate_grid(eval_points, - evaluate_method=self.evaluate, - n_samples=self.n_samples, - dim_domain=self.dim_domain, - dim_codomain=self.dim_codomain, - extrapolation=extrapolation, - aligned=aligned) + return _evaluate_grid( + eval_points, + evaluate_method=self.evaluate, + n_samples=self.n_samples, + dim_domain=self.dim_domain, + dim_codomain=self.dim_codomain, + extrapolation=extrapolation, + aligned=aligned, + ) if extrapolation is None: extrapolation = self.extrapolation @@ -353,270 +422,301 @@ def evaluate(self, eval_points, *, derivative=0, extrapolation=None, extrapolation = _parse_extrapolation(extrapolation) # Convert to array and check dimensions of eval points - eval_points = _reshape_eval_points(eval_points, - aligned=aligned, - n_samples=self.n_samples, - dim_domain=self.dim_domain) + eval_points = _reshape_eval_points( + eval_points, + aligned=aligned, + n_samples=self.n_samples, + dim_domain=self.dim_domain, + ) - # Check if extrapolation should be applied if extrapolation is not None: - index_matrix = self._extrapolation_index(eval_points) - extrapolate = index_matrix.any() - else: - extrapolate = False - - if not extrapolate: # Direct evaluation + index_matrix = self._extrapolation_index(eval_points) - res = self._evaluate( - eval_points, aligned=aligned) + if index_matrix.any(): - else: - # Partition of eval points - if aligned: + # Partition of eval points + if aligned: - index_ext = index_matrix - index_ev = ~index_matrix + index_ext = index_matrix + index_ev = ~index_matrix - eval_points_extrapolation 
= eval_points[index_ext] - eval_points_evaluation = eval_points[index_ev] + eval_points_extrapolation = eval_points[index_ext] + eval_points_evaluation = eval_points[index_ev] - else: - index_ext = np.logical_or.reduce(index_matrix, axis=0) - eval_points_extrapolation = eval_points[:, index_ext] + else: + index_ext = np.logical_or.reduce(index_matrix, axis=0) + eval_points_extrapolation = eval_points[:, index_ext] - index_ev = np.logical_or.reduce(~index_matrix, axis=0) - eval_points_evaluation = eval_points[:, index_ev] + index_ev = np.logical_or.reduce(~index_matrix, axis=0) + eval_points_evaluation = eval_points[:, index_ev] - # Direct evaluation - res_evaluation = self._evaluate( - eval_points_evaluation, - aligned=aligned) + # Direct evaluation + res_evaluation = self._evaluate( + eval_points_evaluation, + aligned=aligned, + ) - res_extrapolation = extrapolation.evaluate( - self, - eval_points_extrapolation, - aligned=aligned) + res_extrapolation = extrapolation.evaluate( + self, + eval_points_extrapolation, + aligned=aligned, + ) - res = self._join_evaluation(index_matrix, index_ext, index_ev, - res_extrapolation, res_evaluation) + return self._join_evaluation( + index_matrix, + index_ext, + index_ev, + res_extrapolation, + res_evaluation, + ) - return res + return self._evaluate( + eval_points, + aligned=aligned, + ) - def __call__(self, eval_points, *, derivative=0, extrapolation=None, - grid=False, aligned=True): - """Evaluate the object or its derivatives at a list of values or a + def __call__( + self, + eval_points: np.ndarray, + *, + derivative: int = 0, + extrapolation: Optional[Union[str, Evaluator]] = None, + grid: bool = False, + aligned: bool = True, + ) -> np.ndarray: + """Evaluate the :term:`functional object`. + + Evaluate the object or its derivatives at a list of values or a grid. This method is a wrapper of :meth:`evaluate`. Args: - eval_points (array_like): List of points where the functions are + eval_points: List of points where the functions are evaluated. If a matrix of shape nsample x eval_points is given each sample is evaluated at the values in the corresponding row in eval_points. - derivative (int, optional): Order of the derivative. Defaults to 0. - extrapolation (str or Extrapolation, optional): Controls the + derivative: Order of the derivative. Defaults to 0. + extrapolation: Controls the extrapolation mode for elements outside the domain range. By default it is used the mode defined during the instance of the object. - grid (bool, optional): Whether to evaluate the results on a grid + grid: Whether to evaluate the results on a grid spanned by the input arrays, or at points specified by the input arrays. If true the eval_points should be a list of size dim_domain with the corresponding times for each axis. The return matrix has shape n_samples x len(t1) x len(t2) x ... x len(t_dim_domain) x dim_codomain. If the domain dimension is 1 the parameter has no efect. Defaults to False. + aligned: Whether the input points are the same for each sample, + or an array of points per sample is passed. Returns: - (np.ndarray): Matrix whose rows are the values of the each + Matrix whose rows are the values of the each function at the values specified in eval_points. 
""" - return self.evaluate(eval_points, derivative=derivative, - extrapolation=extrapolation, grid=grid, - aligned=aligned) + return self.evaluate( + eval_points, + derivative=derivative, + extrapolation=extrapolation, + grid=grid, + aligned=aligned, + ) @abstractmethod - def derivative(self, order=1): + def derivative(self: T, order: int = 1) -> T: """Differentiate a FData object. - Args: - order (int, optional): Order of the derivative. Defaults to one. + order: Order of the derivative. Defaults to one. Returns: - :class:`FData`: Functional object containg the derivative. + Functional object containg the derivative. + """ pass @abstractmethod - def shift(self, shifts, *, restrict_domain=False, extrapolation=None, - discretization_points=None, **kwargs): + def shift( + self: T, + shifts: Union[float, np.ndarray], + *, + restrict_domain: bool = False, + extrapolation: Optional[Union[str, Evaluator]] = None, + eval_points: np.ndarray = None, + **kwargs: Any, + ) -> T: """Perform a shift of the curves. Args: - shifts (array_like or numeric): List with the shift corresponding + shifts: List with the shift corresponding for each sample or numeric with the shift to apply to all samples. - restrict_domain (bool, optional): If True restricts the domain to + restrict_domain: If True restricts the domain to avoid evaluate points outside the domain using extrapolation. Defaults uses extrapolation. - extrapolation (str or Extrapolation, optional): Controls the + extrapolation: Controls the extrapolation mode for elements outside the domain range. By default uses the method defined in fd. See extrapolation to more information. - discretization_points (array_like, optional): Set of points where + eval_points: Set of points where the functions are evaluated to obtain the discrete representation of the object to operate. If an empty list is passed it calls np.linspace with bounds equal to the ones defined in fd.domain_range and the number of points the maximum between 201 and 10 times the number of basis plus 1. + kwargs: Additional arguments. Returns: :class:`FData` with the shifted functional data. + """ pass - def plot(self, *args, **kwargs): + def plot(self, *args: Any, **kwargs: Any) -> Any: """Plot the FDatGrid object. Args: - chart (figure object, axe or list of axes, optional): figure over - with the graphs are plotted or axis over where the graphs are - plotted. If None and ax is also None, the figure is - initialized. - derivative (int or tuple, optional): Order of derivative to be - plotted. In case of surfaces a tuple with the order of - derivation in each direction can be passed. See - :func:`evaluate` to obtain more information. Defaults 0. - fig (figure object, optional): figure over with the graphs are - plotted in case ax is not specified. If None and ax is also - None, the figure is initialized. - ax (list of axis objects, optional): axis over where the graphs are - plotted. If None, see param fig. - n_rows (int, optional): designates the number of rows of the figure - to plot the different dimensions of the image. Only specified - if fig and ax are None. - n_cols (int, optional): designates the number of columns of the - figure to plot the different dimensions of the image. Only - specified if fig and ax are None. - n_points (int or tuple, optional): Number of points to evaluate in - the plot. In case of surfaces a tuple of length 2 can be pased - with the number of points to plot in each axis, otherwise the - same number of points will be used in the two axes. 
By default - in unidimensional plots will be used 501 points; in surfaces - will be used 30 points per axis, wich makes a grid with 900 - points. - domain_range (tuple or list of tuples, optional): Range where the - function will be plotted. In objects with unidimensional - :term:`domain` the domain range should be a tuple with the - bounds of the interval; in the case of surfaces a list with 2 - tuples with the ranges for each dimension. Default uses the - domain range of the functional object. - group (list of int): contains integers from [0 to number of - labels) indicating to which group each sample belongs to. Then, - the samples with the same label are plotted in the same color. - If None, the default value, each sample is plotted in the color - assigned by matplotlib.pyplot.rcParams['axes.prop_cycle']. - group_colors (list of colors): colors in which groups are - represented, there must be one for each group. If None, each - group is shown with distict colors in the "Greys" colormap. - group_names (list of str): name of each of the groups which appear - in a legend, there must be one for each one. Defaults to None - and the legend is not shown. - **kwargs: if dim_domain is 1, keyword arguments to be passed to - the matplotlib.pyplot.plot function; if dim_domain is 2, - keyword arguments to be passed to the - matplotlib.pyplot.plot_surface function. + args: positional arguments for :func:`plot_graph`. + kwargs: keyword arguments for :func:`plot_graph`. Returns: fig (figure object): figure object in which the graphs are plotted. """ - from ..exploratory.visualization.representation import ( - plot_graph) + from ..exploratory.visualization.representation import plot_graph return plot_graph(self, *args, **kwargs) @abstractmethod - def copy(self, **kwargs): + def copy(self: T, **kwargs: Any) -> T: """Make a copy of the object. Args: kwargs: named args with attributes to be changed in the new copy. Returns: - FData: A copy of the FData object. + A copy of the FData object. """ pass @abstractmethod - def sum(self, *, axis=None, out=None, keepdims=False, skipna=False, - min_count=0): + def sum( + self: T, + *, + axis: None = None, + out: None = None, + keepdims: bool = False, + skipna: bool = False, + min_count: int = 0, + ) -> T: """Compute the sum of all the samples. + Args: + axis: Used for compatibility with numpy. Must be None or 0. + out: Used for compatibility with numpy. Must be None. + keepdims: Used for compatibility with numpy. Must be False. + skipna: Wether the NaNs are ignored or not. + min_count: Number of valid (non NaN) data to have in order + for the a variable to not be NaN when `skipna` is + `True`. + Returns: - FData : A FData object with just one sample representing + A FData object with just one sample representing the sum of all the samples in the original object. """ - if ((axis is not None and axis != 0) or - out is not None or keepdims is not False): + if ( + (axis is not None and axis != 0) + or out is not None + or keepdims is not False + ): raise NotImplementedError( - "Not implemented for that parameter combination") + "Not implemented for that parameter combination", + ) + + return self + + def mean( + self: T, + *, + axis: None = None, + dtype: None = None, + out: None = None, + keepdims: bool = False, + skipna: bool = False, + ) -> T: + """Compute the mean of all the samples. + + Args: + axis: Used for compatibility with numpy. Must be None or 0. + dtype: Used for compatibility with numpy. Must be None. + out: Used for compatibility with numpy. 
Must be None. + keepdims: Used for compatibility with numpy. Must be False. + skipna: Wether the NaNs are ignored or not. - def mean(self, *, axis=None, dtype=None, out=None, keepdims=False, - skipna=False): + Returns: + A FData object with just one sample representing + the mean of all the samples in the original object. + """ if dtype is not None: raise NotImplementedError( - "Not implemented for that parameter combination") + "Not implemented for that parameter combination", + ) - return (self.sum(axis=axis, out=out, keepdims=keepdims, skipna=skipna) - / self.n_samples) + return ( + self.sum(axis=axis, out=out, keepdims=keepdims, skipna=skipna) + / self.n_samples + ) @abstractmethod - def to_grid(self, grid_points=None): + def to_grid(self, grid_points: np.ndarray = None) -> 'FDataGrid': """Return the discrete representation of the object. Args: - grid_points (array_like, optional): Points per axis - where the function is going to be evaluated. + grid_points: Points per axis + where the function is going to be evaluated. Returns: - FDataGrid: Discrete representation of the functional data - object. - """ + Discrete representation of the functional data + object. + """ pass @abstractmethod - def to_basis(self, basis, eval_points=None, **kwargs): + def to_basis( + self, + basis: 'Basis', + **kwargs: Any, + ) -> 'FDataBasis': """Return the basis representation of the object. Args: - basis(Basis): basis object in which the functional data are + basis: basis object in which the functional data are going to be represented. - **kwargs: keyword arguments to be passed to + kwargs: keyword arguments to be passed to FDataBasis.from_data(). Returns: - FDataBasis: Basis representation of the funtional data + Basis representation of the funtional data object. - """ + """ pass @abstractmethod - def concatenate(self, *others, as_coordinates=False): + def concatenate(self: T, *others: T, as_coordinates: bool = False) -> T: """Join samples from a similar FData object. Joins samples from another FData object if it has the same dimensions and has compatible representations. Args: - others (:class:`FData`): other FData objects. - as_coordinates (boolean, optional): If False concatenates as + others: other FData objects. + as_coordinates: If False concatenates as new samples, else, concatenates the other functions as new components of the image. Defaults to False. @@ -628,31 +728,37 @@ def concatenate(self, *others, as_coordinates=False): pass @abstractmethod - def compose(self, fd, *, eval_points=None, **kwargs): + def compose( + self: T, + fd: T, + *, + eval_points: np.ndarray = None, + **kwargs: Any, + ) -> T: """Composition of functions. Performs the composition of functions. Args: - fd (:class:`FData`): FData object to make the composition. Should + fd: FData object to make the composition. Should have the same number of samples and image dimension equal to the domain dimension of the object composed. - eval_points (array_like): Points to perform the evaluation. - **kwargs: Named arguments to be passed to the composition method of + eval_points: Points to perform the evaluation. + kwargs: Named arguments to be passed to the composition method of the specific functional object. 
""" pass @abstractmethod - def __getitem__(self, key): + def __getitem__(self: T, key: Union[int, slice]) -> T: """Return self[key].""" - pass - def equals(self, other): + def equals(self, other: Any) -> bool: + """Whole object equality.""" return ( - type(self) == type(other) + isinstance(other, type(self)) # noqa: WPS222 and self.extrapolation == other.extrapolation and self.dataset_name == other.dataset_name and self.argument_names == other.argument_names @@ -660,93 +766,85 @@ def equals(self, other): ) @abstractmethod - def __eq__(self, key): + def __eq__(self, other: Any) -> np.ndarray: pass - def __ne__(self, other): - """ - Return for `self != other` (element-wise in-equality). - """ + def __ne__(self, other: Any) -> np.ndarray: + """Return for `self != other` (element-wise in-equality).""" result = self.__eq__(other) if result is NotImplemented: return NotImplemented return ~result - def _copy_op(self, other, **kwargs): + def _copy_op( + self: T, + other: Union[T, np.ndarray, float], + **kwargs: Any, + ) -> T: - base_copy = (other if isinstance(other, type(self)) - and self.n_samples == 1 and other.n_samples != 1 - else self) + base_copy = ( + other if isinstance(other, type(self)) + and self.n_samples == 1 and other.n_samples != 1 + else self + ) return base_copy.copy(**kwargs) @abstractmethod - def __add__(self, other): + def __add__(self: T, other: Union[T, np.ndarray, float]) -> T: """Addition for FData object.""" - pass @abstractmethod - def __radd__(self, other): + def __radd__(self: T, other: Union[T, np.ndarray, float]) -> T: """Addition for FData object.""" - pass @abstractmethod - def __sub__(self, other): + def __sub__(self: T, other: Union[T, np.ndarray, float]) -> T: """Subtraction for FData object.""" - pass @abstractmethod - def __rsub__(self, other): + def __rsub__(self: T, other: Union[T, np.ndarray, float]) -> T: """Right subtraction for FData object.""" - pass @abstractmethod - def __mul__(self, other): + def __mul__(self: T, other: Union[np.ndarray, float]) -> T: """Multiplication for FData object.""" - pass @abstractmethod - def __rmul__(self, other): + def __rmul__(self: T, other: Union[np.ndarray, float]) -> T: """Multiplication for FData object.""" - pass @abstractmethod - def __truediv__(self, other): + def __truediv__(self: T, other: Union[np.ndarray, float]) -> T: """Division for FData object.""" - pass @abstractmethod - def __rtruediv__(self, other): + def __rtruediv__(self: T, other: Union[np.ndarray, float]) -> T: """Right division for FData object.""" - pass - def __iter__(self): - """Iterate over the samples""" - - for i in range(self.n_samples): - yield self[i] - - def __len__(self): - """Returns the number of samples of the FData object.""" + def __iter__(self: T) -> Iterator[T]: + """Iterate over the samples.""" + yield from (self[i] for i in range(self.n_samples)) + def __len__(self) -> int: + """Return the number of samples of the FData object.""" return self.n_samples ##################################################################### # Numpy methods ##################################################################### - def __array__(self, *args, **kwargs): - """Returns a numpy array with the objects""" - + def __array__(self, *args: Any, **kwargs: Any) -> np.ndarray: + """Return a numpy array with the objects.""" # This is to prevent numpy to access inner dimensions array = np.empty(shape=len(self), dtype=np.object_) @@ -759,60 +857,74 @@ def __array__(self, *args, **kwargs): # Pandas ExtensionArray methods 
##################################################################### @property - def ndim(self): + def ndim(self) -> int: """ - Return number of dimensions of the functional data. It is - always 1, as each observation is considered a "scalar" object. + Return number of dimensions of the functional data. + + It is always 1, as each observation is considered a "scalar" object. Returns: - int: Number of dimensions of the functional data. + Number of dimensions of the functional data. """ return 1 @classmethod - def _from_sequence(cls, scalars, dtype=None, copy=False): - if isinstance(scalars, cls): - scalars = [scalars] + def _from_sequence( + cls, + scalars: Union['FData', Sequence['FData']], + dtype: Any = None, + copy: bool = False, + ) -> 'FData': + + scalars_seq: Sequence['FData'] = ( + [scalars] if isinstance(scalars, cls) else scalars + ) if copy: - scalars = [f.copy() for f in scalars] + scalars_seq = [f.copy() for f in scalars_seq] if dtype is None: - first_element = next(s for s in scalars if s is not pandas.NA) + first_element = next(s for s in scalars_seq if s is not pandas.NA) dtype = first_element.dtype - scalars = [s if s is not pandas.NA else dtype._na_repr() - for s in scalars] + scalars_seq = [ + s if s is not pandas.NA else dtype._na_repr() # noqa: WPS437 + for s in scalars_seq + ] - if len(scalars) == 0: - scalars = [dtype._na_repr()[0:0]] + if len(scalars_seq) == 0: + scalars_seq = [dtype._na_repr()[:0]] # noqa: WPS437 - return cls._concat_same_type(scalars) + return cls._concat_same_type(scalars_seq) @classmethod - def _from_factorized(cls, values, original): - raise NotImplementedError("Factorization does not make sense for " - "functional data") + def _from_factorized(cls, values: Any, original: Any) -> NoReturn: + raise NotImplementedError( + "Factorization does not make sense for functional data", + ) - def take(self, indices, allow_fill=False, fill_value=None, axis=0): + def take( + self: T, + indices: Sequence[int], + allow_fill: bool = False, + fill_value: Optional[T] = None, + axis: int = 0, + ) -> T: """Take elements from an array. Parameters: - indices (sequence of integers): + indices: Indices to be taken. - allow_fill (bool, default False): How to handle negative values - in `indices`. - + allow_fill: How to handle negative values in `indices`. * False: negative values in `indices` indicate positional indices from the right (the default). This is similar to :func:`numpy.take`. * True: negative values in `indices` indicate missing values. These values are set to `fill_value`. Any other negative values raise a ``ValueError``. - - fill_value (any, optional): - Fill value to use for NA-indices when `allow_fill` is True. + fill_value: Fill value to use for NA-indices + when `allow_fill` is True. This may be ``None``, in which case the default NA value for the type, ``self.dtype.na_value``, is used. For many ExtensionArrays, there will be two representations of @@ -820,6 +932,7 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=0): physical NA value. `fill_value` should be the user-facing version, and the implementation should handle translating that to the physical version for processing the take if necessary. + axis: Parameter for compatibility with numpy. Must be always 0. Returns: FData @@ -854,23 +967,27 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=0): # fill value should always be translated from the scalar # type for the array, to the physical storage type for # the data, before passing to take. 
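        # For reference: with allow_fill=True, pandas' take maps index -1
        # to fill_value (other negative values raise ValueError), while
        # with allow_fill=False negative indices count from the end, as
        # in numpy.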
- result = take(data, indices, fill_value=fill_value, - allow_fill=allow_fill) + result = take( + data, + indices, + fill_value=fill_value, + allow_fill=allow_fill, + ) return self._from_sequence(result, dtype=self.dtype) @classmethod def _concat_same_type( - cls, - to_concat - ): + cls, + to_concat: Sequence[T], + ) -> T: """ - Concatenate multiple array + Concatenate multiple array. Parameters: - to_concat (sequence of FData) + to_concat: Sequence of FData objects to concat. Returns: - FData + Concatenation of the objects. """ if isinstance(to_concat, cls): @@ -878,36 +995,39 @@ def _concat_same_type( return concatenate(to_concat) - def astype(self, dtype, copy=True): + def astype(self, dtype: Any, copy: bool = True) -> Any: + """Cast to a new dtype.""" if isinstance(dtype, type(self.dtype)): + new_obj = self if copy: - self = self.copy() - return self + new_obj = self.copy() + return new_obj return super().astype(dtype) - def _reduce(self, name, skipna=True, **kwargs): + def _reduce(self, name: str, skipna: bool = True, **kwargs: Any) -> Any: meth = getattr(self, name, None) if meth: return meth(skipna=skipna, **kwargs) - else: - msg = (f"'{type(self).__name__}' does not implement " - f"reduction '{name}'") - raise TypeError(msg) + + raise TypeError( + f"'{type(self).__name__}' does not implement " + f"reduction '{name}'", + ) -def concatenate(objects, as_coordinates=False): +def concatenate(functions: Iterable[T], as_coordinates: bool = False) -> T: """ Join samples from an iterable of similar FData objects. Joins samples of FData objects if they have the same dimensions and sampling points. Args: - objects (list of :obj:`FDataBasis`): Objects to be concatenated. - as_coordinates (boolean, optional): If False concatenates as + objects: Objects to be concatenated. + as_coordinates: If False concatenates as new samples, else, concatenates the other functions as new components of the image. Defaults to False. Returns: - :obj:`FData`: FData object with the samples from the + FData object with the samples from the original objects. Raises: ValueError: In case the provided list of FData objects is @@ -916,11 +1036,12 @@ def concatenate(objects, as_coordinates=False): By the moment, only unidimensional objects are supported in basis representation. """ - objects = iter(objects) - first = next(objects, None) + functions = iter(functions) + first = next(functions, None) if first is None: - raise ValueError("At least one FData object must be provided " - "to concatenate.") + raise ValueError( + "At least one FData object must be provided to concatenate.", + ) - return first.concatenate(*objects, as_coordinates=as_coordinates) + return first.concatenate(*functions, as_coordinates=as_coordinates) diff --git a/skfda/representation/evaluator.py b/skfda/representation/evaluator.py index 3740ce987..78d4b0a12 100644 --- a/skfda/representation/evaluator.py +++ b/skfda/representation/evaluator.py @@ -2,6 +2,10 @@ object for extrapolation and evaluation of FDataGrids""" from abc import ABC, abstractmethod +from typing import Any, Callable + +import numpy as np +from typing_extensions import Protocol class Evaluator(ABC): @@ -21,7 +25,13 @@ class Evaluator(ABC): """ @abstractmethod - def evaluate(self, fdata, eval_points, *, aligned=True): + def evaluate( + self, + fdata: Callable[[np.ndarray], np.ndarray], + eval_points: np.ndarray, + *, + aligned: bool = True, + ) -> np.ndarray: """Evaluation method. Evaluates the samples at evaluation points. 
The evaluation @@ -44,14 +54,47 @@ def evaluate(self, fdata, eval_points, *, aligned=True): """ pass - def __repr__(self): + def __repr__(self) -> str: return f"{type(self)}()" - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: """Equality operator between evaluators.""" return type(self) == type(other) +class EvaluateFunction(Protocol): + """ Callback of an evaluation function.""" + + def __call__( + self, + fdata: Callable[[np.ndarray], np.ndarray], + eval_points: np.ndarray, + *, + aligned: bool = True, + ) -> np.ndarray: + """Evaluation method. + + Evaluates the samples at evaluation points. The evaluation + call will receive a 2-d array with the evaluation points, or + a 3-d array with the evaluation points per sample if ``aligned`` + is ``False``. + + Args: + eval_points (numpy.ndarray): Numpy array with shape + ``(number_eval_points, dim_domain)`` with the + evaluation points. + + Returns: + (numpy.darray): Numpy 3d array with shape + ``(n_samples, number_eval_points, dim_codomain)`` with the + result of the evaluation. The entry ``(i,j,k)`` will contain + the value k-th image dimension of the i-th sample, at the + j-th evaluation point. + + """ + pass + + class GenericEvaluator(Evaluator): """Generic Evaluator. @@ -61,8 +104,14 @@ class GenericEvaluator(Evaluator): """ - def __init__(self, evaluate_function): + def __init__(self, evaluate_function: EvaluateFunction) -> None: self.evaluate_function = evaluate_function - def evaluate(self, fdata, eval_points, *, aligned=True): + def evaluate( + self, + fdata: Callable[[np.ndarray], np.ndarray], + eval_points: np.ndarray, + *, + aligned: bool = True, + ) -> np.ndarray: return self.evaluate_function(fdata, eval_points, aligned=aligned) diff --git a/skfda/representation/extrapolation.py b/skfda/representation/extrapolation.py index fa6a057c3..e9e761546 100644 --- a/skfda/representation/extrapolation.py +++ b/skfda/representation/extrapolation.py @@ -4,6 +4,8 @@ """ +from typing import Optional, Union + import numpy as np from .evaluator import Evaluator @@ -201,7 +203,9 @@ def __eq__(self, other): or np.isnan(self.fill_value) and np.isnan(other.fill_value)) -def _parse_extrapolation(extrapolation): +def _parse_extrapolation( + extrapolation: Optional[Union[str, Evaluator]], +) -> Optional[Evaluator]: """Parse the argument `extrapolation` of `FData`. If extrapolation is None returns the default extrapolator. From 0717d877d00a5f62381df79bae645ce8b7f20ecd Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 21 Dec 2020 13:45:50 +0100 Subject: [PATCH 197/210] Fix style. 
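
The annotations being cleaned up here rely on the `self: T` pattern,
with T = TypeVar('T', bound='FData'), so that methods declared on the
abstract base are typed as returning the concrete subclass. A minimal
sketch of the idea (the class names are illustrative, not part of
skfda):

    from abc import ABC, abstractmethod
    from typing import TypeVar

    T = TypeVar('T', bound='Base')

    class Base(ABC):
        @abstractmethod
        def copy(self: T) -> T:
            """Return a copy, typed as the subclass."""

    class Child(Base):
        def copy(self) -> 'Child':
            return Child()

    c: Child = Child().copy()  # mypy infers Child, not Base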
---
 setup.cfg                                | 5 ++++
 skfda/representation/_functional_data.py | 38 +++++++++++++-----------
 2 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 183436e12..f9ef6b6b1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -33,6 +33,8 @@ ignore =
    WPS111,
    # Trailing underscores are a scikit-learn convention
    WPS120,
+    # The number of imported things may be large, especially for typing
+    WPS235,
    # We like local imports, thanks
    WPS300,
    # Dotted imports are ok
@@ -43,6 +45,8 @@ ignore =
    WPS326,
    # We allow multiline conditions
    WPS337,
+    # We order methods differently
+    WPS338,
    # We need multiline loops
    WPS352,
    # All keywords are beautiful
@@ -87,6 +91,7 @@ allowed-domain-names = data, obj, result, value, values, var

 # Needs to be tuned
 max-arguments = 10
+max-attributes = 10
 max-line-complexity = 25
 max-methods = 30
 max-local-variables = 15
diff --git a/skfda/representation/_functional_data.py b/skfda/representation/_functional_data.py
index 03565fba2..1d29301bc 100644
--- a/skfda/representation/_functional_data.py
+++ b/skfda/representation/_functional_data.py
@@ -32,9 +32,13 @@
 T = TypeVar('T', bound='FData')

 DomainRange = Tuple[Tuple[float, float], ...]
+LabelTuple = Tuple[Optional[str], ...]


-class FData(ABC, pandas.api.extensions.ExtensionArray):  # type: ignore
+class FData(  # noqa: WPS214
+    ABC,
+    pandas.api.extensions.ExtensionArray,  # type: ignore
+):
    """Defines the structure of a functional data object.

    Attributes:
@@ -56,10 +60,10 @@ def __init__(
        extrapolation: Evaluator,
        dataset_name: Optional[str] = None,
        dataset_label: Optional[str] = None,
-        axes_labels: Optional[Tuple[Optional[str], ...]] = None,
-        argument_names: Optional[Tuple[Optional[str], ...]] = None,
-        coordinate_names: Optional[Tuple[Optional[str], ...]] = None,
-        sample_names: Optional[Tuple[Optional[str], ...]] = None,
+        axes_labels: Optional[LabelTuple] = None,
+        argument_names: Optional[LabelTuple] = None,
+        coordinate_names: Optional[LabelTuple] = None,
+        sample_names: Optional[LabelTuple] = None,
    ) -> None:

        self.extrapolation = extrapolation
@@ -68,11 +72,11 @@ def __init__(
        if dataset_label is not None:
            self.dataset_label = dataset_label

-        self.argument_names = argument_names
-        self.coordinate_names = coordinate_names
+        self.argument_names = argument_names  # type: ignore
+        self.coordinate_names = coordinate_names  # type: ignore
        if axes_labels is not None:
            self.axes_labels = axes_labels
-        self.sample_names = sample_names
+        self.sample_names = sample_names  # type: ignore

    @property
    def dataset_label(self) -> Optional[str]:
@@ -93,13 +97,13 @@ def dataset_label(self, name: Optional[str]) -> None:
        self.dataset_name = name

    @property
-    def argument_names(self) -> Tuple[Optional[str], ...]:
+    def argument_names(self) -> LabelTuple:
        return self._argument_names

    @argument_names.setter
    def argument_names(
        self,
-        names: Optional[Tuple[Optional[str], ...]],
+        names: Optional[LabelTuple],
    ) -> None:
        if names is None:
            names = (None,) * self.dim_domain
@@ -114,13 +118,13 @@ def argument_names(
        self._argument_names = names

    @property
-    def coordinate_names(self) -> Tuple[Optional[str], ...]:
+    def coordinate_names(self) -> LabelTuple:
        return self._coordinate_names

    @coordinate_names.setter
    def coordinate_names(
        self,
-        names: Optional[Tuple[Optional[str], ...]],
+        names: Optional[LabelTuple],
    ) -> None:
        if names is None:
            names = (None,) * self.dim_codomain
@@ -135,7 +139,7 @@ def coordinate_names(
        self._coordinate_names = names

    @property
-    def axes_labels(self) -> Tuple[Optional[str], ...]:
+    def axes_labels(self) -> LabelTuple:
        warnings.warn(
            "Parameter axes_labels is deprecated. Use the "
            "parameters argument_names and "
@@ -146,7 +150,7 @@ def axes_labels(self) -> Tuple[Optional[str], ...]:
        return self.argument_names + self.coordinate_names

    @axes_labels.setter
-    def axes_labels(self, labels: Tuple[Optional[str], ...]) -> None:
+    def axes_labels(self, labels: LabelTuple) -> None:
        """Set the list of labels."""

        if labels is not None:
@@ -174,11 +178,11 @@ def axes_labels(self, labels: Tuple[Optional[str], ...]) -> None:
            self.coordinate_names = labels_array[self.dim_domain:]

    @property
-    def sample_names(self) -> Tuple[Optional[str], ...]:
+    def sample_names(self) -> LabelTuple:
        return self._sample_names

    @sample_names.setter
-    def sample_names(self, names: Optional[Tuple[Optional[str], ...]]) -> None:
+    def sample_names(self, names: Optional[LabelTuple]) -> None:
        if names is None:
            names = (None,) * self.n_samples
        else:
@@ -601,7 +605,7 @@ def copy(self: T, **kwargs: Any) -> T:
        """
        pass

-    @abstractmethod
+    @abstractmethod  # noqa: WPS125
    def sum(
        self: T,
        *,

From 4615bcdf70f424f0cfdc85f709b713f5ae3f4977 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Mon, 21 Dec 2020 18:09:50 +0100
Subject: [PATCH 198/210] Fix style errors.

---
 .github/workflows/mypy.yml               |  1 +
 skfda/_utils/_utils.py                   | 24 ++++++------
 skfda/representation/_functional_data.py |  2 +-
 skfda/representation/evaluator.py        | 49 +++++++++++++++---------
 4 files changed, 46 insertions(+), 30 deletions(-)

diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml
index 0e90e4224..82b4b8d1c 100644
--- a/.github/workflows/mypy.yml
+++ b/.github/workflows/mypy.yml
@@ -17,3 +17,4 @@ jobs:
          # Change reporter level if you need.
          # GitHub Status Check won't become failure with warning.
          level: warning
+          mypy_flags: ''
\ No newline at end of file
diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py
index e4d023cb1..528c93c36 100644
--- a/skfda/_utils/_utils.py
+++ b/skfda/_utils/_utils.py
@@ -2,12 +2,14 @@

 import functools
 import numbers
-from typing import Optional, Sequence, Union
+from typing import Any, Optional, Sequence, Union

 import numpy as np
 import scipy.integrate
 from pandas.api.indexers import check_array_indexer

+from ..representation.evaluator import Evaluator
+
 RandomStateLike = Optional[Union[int, np.random.RandomState]]

@@ -293,13 +295,13 @@ def _one_grid_to_points(axes, *, dim_domain):
 def _evaluate_grid(
     axes: Sequence[np.ndarray],
     *,
-    evaluate_method,
-    n_samples,
-    dim_domain,
-    dim_codomain,
-    extrapolation=None,
-    aligned=True
-):
+    evaluate_method: Any,
+    n_samples: int,
+    dim_domain: int,
+    dim_codomain: int,
+    extrapolation: Optional[Evaluator]=None,
+    aligned: bool=True,
+) -> np.ndarray:
    """Evaluate the functional object in the cartesian grid.

    This method is called internally by :meth:`evaluate` when the argument
@@ -325,15 +327,15 @@ def _evaluate_grid(
    Args:
        axes: List of axes to generate the grid where the object will be
            evaluated.
-        extrapolation (str or Extrapolation, optional): Controls the
+        extrapolation: Controls the
            extrapolation mode for elements outside the domain range. By
            default, the mode defined during the instantiation of the
            object is used.
-        aligned (bool, optional): If False evaluates each sample
+        aligned: If False evaluates each sample
            in a different grid.

    Returns:
-        (numpy.darray): Numpy array with dim_domain + 1 dimensions with
+        Numpy array with dim_domain + 1 dimensions with
        the result of the evaluation.

    Raises:
diff --git a/skfda/representation/_functional_data.py b/skfda/representation/_functional_data.py
index 1d29301bc..5843778fb 100644
--- a/skfda/representation/_functional_data.py
+++ b/skfda/representation/_functional_data.py
@@ -606,7 +606,7 @@ def copy(self: T, **kwargs: Any) -> T:
        pass

    @abstractmethod  # noqa: WPS125
-    def sum(
+    def sum(  # noqa: WPS125
        self: T,
        *,
        axis: None = None,
diff --git a/skfda/representation/evaluator.py b/skfda/representation/evaluator.py
index 78d4b0a12..24ad1f1bc 100644
--- a/skfda/representation/evaluator.py
+++ b/skfda/representation/evaluator.py
@@ -1,5 +1,9 @@
-"""This module contains the structure of the evaluator, the core of the FData
-object for extrapolation and evaluation of FDataGrids"""
+"""
+This module contains the structure of the evaluator.
+
+The evaluator is the core of the FData object for extrapolation and
+evaluation of FDataGrids.
+"""
 from abc import ABC, abstractmethod
 from typing import Any, Callable

@@ -9,7 +13,8 @@


 class Evaluator(ABC):
-    """Structure of an evaluator.
+    """
+    Structure of an evaluator.

    An evaluator defines how to evaluate points of a functional object, it
    can be used as extrapolator to evaluate points outside the :term:`domain`
@@ -22,8 +27,8 @@ class Evaluator(ABC):
    Should implement the methods :func:`evaluate` and
    :func:`evaluate_composed`.

-    """
+    """
+
    @abstractmethod
    def evaluate(
        self,
        fdata: Callable[[np.ndarray], np.ndarray],
        eval_points: np.ndarray,
        *,
        aligned: bool = True,
    ) -> np.ndarray:
-        """Evaluation method.
+        """
+        Evaluate the samples at evaluation points.

-        Evaluates the samples at evaluation points. The evaluation
-        call will receive a 2-d array with the evaluation points, or
-        a 3-d array with the evaluation points per sample if ``aligned``
-        is ``False``.
+        The evaluation call will receive a 2-d array with the
+        evaluation points, or a 3-d array with the evaluation points per
+        sample if ``aligned`` is ``False``.

        Args:
-            eval_points (numpy.ndarray): Numpy array with shape
+            fdata: Object to evaluate.
+            eval_points: Numpy array with shape
                ``(number_eval_points, dim_domain)`` with the
                evaluation points.
+            aligned: Whether the input points are
+                the same for each sample, or an array of points per sample is
+                passed.

        Returns:
            (numpy.darray): Numpy 3d array with shape
@@ -59,11 +68,11 @@ def __repr__(self) -> str:
        return f"{type(self)}()"

    def __eq__(self, other: Any) -> bool:
        """Equality operator between evaluators."""
-        return type(self) == type(other)
+        return isinstance(other, type(self))


 class EvaluateFunction(Protocol):
-    """ Callback of an evaluation function."""
+    """Callback of an evaluation function."""

    def __call__(
        self,
        fdata: Callable[[np.ndarray], np.ndarray],
        eval_points: np.ndarray,
        *,
        aligned: bool = True,
    ) -> np.ndarray:
-        """Evaluation method.
+        """
+        Evaluate the samples at evaluation points.

-        Evaluates the samples at evaluation points. The evaluation
-        call will receive a 2-d array with the evaluation points, or
-        a 3-d array with the evaluation points per sample if ``aligned``
-        is ``False``.
+        The evaluation call will receive a 2-d array with the
+        evaluation points, or a 3-d array with the evaluation points per
+        sample if ``aligned`` is ``False``.

        Args:
+            fdata: Object to evaluate.
            eval_points (numpy.ndarray): Numpy array with shape
                ``(number_eval_points, dim_domain)`` with the
                evaluation points.
+            aligned: Whether the input points are
+                the same for each sample, or an array of points per sample is
+                passed.

        Returns:
            (numpy.darray): Numpy 3d array with shape
@@ -107,7 +120,7 @@ class GenericEvaluator(Evaluator):

    def __init__(self, evaluate_function: EvaluateFunction) -> None:
        self.evaluate_function = evaluate_function

-    def evaluate(
+    def evaluate(  # noqa: D102
        self,
        fdata: Callable[[np.ndarray], np.ndarray],
        eval_points: np.ndarray,

From c143a28c5ac957fa8afe5d26543c912479694556 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Mon, 21 Dec 2020 19:03:42 +0100
Subject: [PATCH 199/210] Fix style.

---
 skfda/_utils/_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py
index 528c93c36..ba6808e3d 100644
--- a/skfda/_utils/_utils.py
+++ b/skfda/_utils/_utils.py
@@ -299,8 +299,8 @@ def _evaluate_grid(
     n_samples: int,
     dim_domain: int,
     dim_codomain: int,
-    extrapolation: Optional[Evaluator]=None,
-    aligned: bool=True,
+    extrapolation: Optional[Union[str, Evaluator]] = None,
+    aligned: bool = True,
 ) -> np.ndarray:
    """Evaluate the functional object in the cartesian grid.

From ae8f4b5c7e3dd0cac7604fe5367690a222cdffe5 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Sun, 27 Dec 2020 17:31:05 +0100
Subject: [PATCH 200/210] Add tests for the Tensor basis.

---
 .../registration/_shift_registration.py |   7 +-
 tests/test_basis.py                      | 118 +++++++++++++++++-
 2 files changed, 115 insertions(+), 10 deletions(-)

diff --git a/skfda/preprocessing/registration/_shift_registration.py b/skfda/preprocessing/registration/_shift_registration.py
index f7c4c7a20..19aae9f6c 100644
--- a/skfda/preprocessing/registration/_shift_registration.py
+++ b/skfda/preprocessing/registration/_shift_registration.py
@@ -3,13 +3,12 @@
 # Pablo Marcos Manchón
 # pablo.marcosm@protonmail.com

+import numpy as np
 from scipy.integrate import simps
 from sklearn.utils.validation import check_is_fitted

-import numpy as np
-
 from ... import FData, FDataGrid
-from ..._utils import constants, check_is_univariate
+from ..._utils import check_is_univariate, constants
 from .base import RegistrationTransformer

@@ -155,7 +154,7 @@ def _compute_deltas(self, fd, template):
        domain_range = fd.domain_range[0]

        # Initial estimation of the shifts
-        if self.initial is "zeros":
+        if self.initial == "zeros":
            delta = np.zeros(fd.n_samples)

        elif len(self.initial) != fd.n_samples:
diff --git a/tests/test_basis.py b/tests/test_basis.py
index 0f9ef2d31..61b75af8e 100644
--- a/tests/test_basis.py
+++ b/tests/test_basis.py
@@ -1,13 +1,21 @@
-from skfda import concatenate
-import skfda
-from skfda.misc import inner_product, inner_product_matrix
-from skfda.representation.basis import (Basis, FDataBasis, Constant, Monomial,
-                                        BSpline, Fourier)
-from skfda.representation.grid import FDataGrid
+import itertools
 import unittest

 import numpy as np

+import skfda
+from skfda import concatenate
+from skfda.misc import inner_product, inner_product_matrix
+from skfda.representation.basis import (
+    Basis,
+    BSpline,
+    Constant,
+    FDataBasis,
+    Fourier,
+    Monomial,
+)
+from skfda.representation.grid import FDataGrid
+

 class TestBasis(unittest.TestCase):

@@ -408,6 +416,104 @@ def test_vector_valued(self):
            X.coordinates[1].to_basis(basis_dim).coefficients)


+class TestTensorBasis(unittest.TestCase):
+
+    def setUp(self) -> None:
+        """Create original and tensor bases."""
+        self.n_x = 4
+        self.n_y = 3
+        self.n_z = 5
+
+        self.n = self.n_x * self.n_y * self.n_z
+
+        self.dims = (self.n_x, self.n_y, self.n_z)
+
+        self.basis_x = skfda.representation.basis.Monomial(n_basis=self.n_x)
+        self.basis_y = skfda.representation.basis.Fourier(n_basis=self.n_y)
+        self.basis_z = skfda.representation.basis.BSpline(n_basis=self.n_z)
+
+        self.basis = skfda.representation.basis.Tensor([
+            self.basis_x,
+            self.basis_y,
+            self.basis_z,
+        ])
+
+    def test_tensor_order(self) -> None:
+        """
+        Check the order of the elements in the tensor basis.
+
+        The order should be:
+
+        a_1 b_1 c_1, a_1 b_1 c_2, ..., a_1 b_1 c_n,
+        a_1 b_2 c_1, a_1 b_2 c_2, ..., a_1 b_2 c_n,
+        .
+        .
+        .
+        a_1 b_m c_1, a_1 b_m c_2, ..., a_1 b_m c_n,
+        a_2 b_1 c_1, a_2 b_1 c_2, ..., a_2 b_1 c_n,
+        .
+        .
+        .
+
+        where the bases of the original spaces are A, B and C.
+
+        """
+        x_vals = [0, 0.3, 0.7]
+        y_vals = [0.2, 0.5, 0.9]
+        z_vals = [0.1, 0.4, 0.8]
+
+        for t in itertools.product(x_vals, y_vals, z_vals):
+
+            val_x = self.basis_x(t[0])
+            val_y = self.basis_y(t[1])
+            val_z = self.basis_z(t[2])
+            val = self.basis([t])
+
+            for x in range(self.n_x):
+                for y in range(self.n_y):
+                    for z in range(self.n_z):
+
+                        index = (
+                            x * self.n_y * self.n_z
+                            + y * self.n_z
+                            + z
+                        )
+
+                        index2 = np.ravel_multi_index(
+                            [x, y, z],
+                            dims=self.dims)
+
+                        self.assertEqual(index, index2)
+
+                        self.assertAlmostEqual(
+                            val[index],
+                            val_x[x] * val_y[y] * val_z[z],
+                        )
+
+    def test_tensor_gram_matrix(self) -> None:
+        """Check that the Gram matrix is right."""
+        gram_x = self.basis_x.gram_matrix()
+        gram_y = self.basis_y.gram_matrix()
+        gram_z = self.basis_z.gram_matrix()
+
+        gram = self.basis.gram_matrix()
+
+        for i in range(self.n):
+            for j in range(self.n):
+                left = np.unravel_index(i, shape=self.dims)
+                right = np.unravel_index(j, shape=self.dims)
+
+                value_gram = gram[i, j]
+                value_gram_x = gram_x[left[0], right[0]]
+                value_gram_y = gram_y[left[1], right[1]]
+                value_gram_z = gram_z[left[2], right[2]]
+
+                self.assertAlmostEqual(
+                    value_gram,
+                    value_gram_x * value_gram_y * value_gram_z,
+                )
+
+
 if __name__ == '__main__':
    print()
    unittest.main()

From f8dea0773b333d2a99897d43b5d7181a657e76d4 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Sun, 27 Dec 2020 17:50:01 +0100
Subject: [PATCH 201/210] Add typing to Tensor basis.

---
 skfda/representation/basis/_tensor_basis.py | 28 ++++++++-------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/skfda/representation/basis/_tensor_basis.py b/skfda/representation/basis/_tensor_basis.py
index 9c20f88f8..7ad86472a 100644
--- a/skfda/representation/basis/_tensor_basis.py
+++ b/skfda/representation/basis/_tensor_basis.py
@@ -1,15 +1,15 @@
 import itertools
+from typing import Any, Iterable, Tuple

 import numpy as np

-from ..._utils import _same_domain
 from ._basis import Basis


 class Tensor(Basis):
    r"""Tensor basis.

-    Basis for multivariate functions constructed as a tensor product of 
+    Basis for multivariate functions constructed as a tensor product of
    :math:`\mathbb{R} \to \mathbb{R}` bases.

@@ -60,30 +60,28 @@ class Tensor(Basis):

    """

-    def __init__(self, basis_list):
+    def __init__(self, basis_list: Iterable[Basis]):

-        basis_list = tuple(basis_list)
+        self._basis_list = tuple(basis_list)

        if not all(b.dim_domain == 1 and b.dim_codomain == 1
-                   for b in basis_list):
+                   for b in self._basis_list):
            raise ValueError("The basis functions must be "
                             "univariate and scalar valued")

-        self._basis_list = basis_list
-
        super().__init__(
            domain_range=[b.domain_range[0] for b in basis_list],
            n_basis=np.prod([b.n_basis for b in basis_list]))

    @property
-    def basis_list(self):
+    def basis_list(self) -> Tuple[Basis, ...]:
        return self._basis_list

    @property
-    def dim_domain(self):
+    def dim_domain(self) -> int:
        return len(self.basis_list)

-    def _evaluate(self, eval_points):
+    def _evaluate(self, eval_points: np.ndarray) -> np.ndarray:

        matrix = np.zeros((self.n_basis, len(eval_points), self.dim_codomain))

@@ -96,11 +94,7 @@ def _evaluate(self, eval_points):

        return matrix

-    def _derivative_basis_and_coefs(self, coefs, order=1):
-
-        pass
-
-    def _gram_matrix(self):
+    def _gram_matrix(self) -> np.ndarray:

        gram_matrices = [b.gram_matrix().ravel() for b in self.basis_list]

@@ -111,8 +105,8 @@ def _gram_matrix(self):

        return gram.reshape((self.n_basis, self.n_basis))

-    def __eq__(self, other):
+    def __eq__(self, other: Any) -> bool:
        return super().__eq__(other) and self.basis_list == other.basis_list

-    def __hash__(self):
+    def __hash__(self) -> int:
        return hash((super().__hash__(), self.basis_list))

From 40c04926c83626ea6c0b738450bd6c078486a149 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Sun, 27 Dec 2020 19:04:08 +0100
Subject: [PATCH 202/210] Fix tensor Gram matrix.

---
 skfda/representation/basis/_tensor_basis.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/skfda/representation/basis/_tensor_basis.py b/skfda/representation/basis/_tensor_basis.py
index 7ad86472a..31a995905 100644
--- a/skfda/representation/basis/_tensor_basis.py
+++ b/skfda/representation/basis/_tensor_basis.py
@@ -85,8 +85,10 @@ def _evaluate(self, eval_points: np.ndarray) -> np.ndarray:

        matrix = np.zeros((self.n_basis, len(eval_points), self.dim_codomain))

-        basis_evaluations = [b._evaluate(eval_points[:, i:i + 1])
-                             for i, b in enumerate(self.basis_list)]
+        basis_evaluations = [
+            b._evaluate(eval_points[:, i:i + 1])
+            for i, b in enumerate(self.basis_list)
+        ]

        for i, ev in enumerate(itertools.product(*basis_evaluations)):

@@ -96,14 +98,17 @@ def _evaluate(self, eval_points: np.ndarray) -> np.ndarray:

    def _gram_matrix(self) -> np.ndarray:

-        gram_matrices = [b.gram_matrix().ravel() for b in self.basis_list]
+        gram_matrices = [b.gram_matrix() for b in self.basis_list]

        gram = gram_matrices[0]

        for g in gram_matrices[1:]:
-            gram = np.outer(gram, g).ravel()
+            n_rows = len(gram) * len(g)
+            gram = np.multiply.outer(gram, g)
+            gram = np.moveaxis(gram, [1, 2], [2, 1])
+            gram = gram.reshape(n_rows, n_rows)

-        return gram.reshape((self.n_basis, self.n_basis))
+        return gram

From 5e9ec2eaf069ec2e3f3ca83037b27c2f4cde8c75 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Sun, 27 Dec 2020 19:58:20 +0100
Subject: [PATCH 203/210] =?UTF-8?q?Fix=20error=20due=20to=20FinDiff=20`?=
 =?UTF-8?q?=C3=ACnt`=20assert.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 skfda/representation/grid.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py
index e0fc2e186..040520446 100644
--- a/skfda/representation/grid.py
+++ b/skfda/representation/grid.py
@@ -8,22 +8,25 @@

 import copy
 import numbers
-from typing import Any
 import warnings
+from typing import Any

 import findiff
+import numpy as np
 import pandas.api.extensions
 import scipy.stats.mstats

-import numpy as np
-
+from .._utils import (
+    _check_array_key,
+    _domain_range,
+    _int_to_real,
+    _tuple_of_arrays,
+    constants,
+)
 from . import basis as fdbasis
-from .._utils import (_tuple_of_arrays, constants,
-                      _domain_range, _int_to_real, _check_array_key)
 from ._functional_data import FData
 from .interpolation import SplineInterpolation

-
 __author__ = "Miguel Carbajo Berrocal"
 __email__ = "miguel.carbajo@estudiante.uam.es"

@@ -428,7 +431,7 @@ def derivative(self, *, order=1):
        if order_list.ndim != 1 or len(order_list) != self.dim_domain:
            raise ValueError("The order for each partial should be specified.")

-        operator = findiff.FinDiff(*[(1 + i, p, o)
+        operator = findiff.FinDiff(*[(1 + i, p, int(o))
                                     for i, (p, o) in enumerate(
                                         zip(self.grid_points, order_list))])
        data_matrix = operator(self.data_matrix.astype(float))

From 6a284fc27b791564c62ba070b465d0470fa6f605 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Tue, 29 Dec 2020 20:46:04 +0100
Subject: [PATCH 204/210] Fix style errors.

---
 setup.cfg                                   |  2 +-
 skfda/representation/basis/_tensor_basis.py | 19 ++++++----
 tests/test_basis.py                         | 41 ++++++++++++---------
 3 files changed, 35 insertions(+), 27 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index f9ef6b6b1..4c2082f20 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -87,7 +87,7 @@ rst-directives =
 rst-roles =
    attr,class,func,meth,mod,obj,ref,term,

-allowed-domain-names = data, obj, result, value, values, var
+allowed-domain-names = data, obj, result, val, value, values, var

 # Needs to be tuned
 max-arguments = 10
diff --git a/skfda/representation/basis/_tensor_basis.py b/skfda/representation/basis/_tensor_basis.py
index 31a995905..ae625a1a9 100644
--- a/skfda/representation/basis/_tensor_basis.py
+++ b/skfda/representation/basis/_tensor_basis.py
@@ -19,7 +19,6 @@ class Tensor(Basis):
        n_basis (int): number of functions in the basis.

    Examples:
-
        Defines a tensor basis over the interval :math:`[0, 5] \times [0, 3]`
        consisting of the functions
@@ -64,14 +63,18 @@ class Tensor(Basis):

    def __init__(self, basis_list: Iterable[Basis]):

        self._basis_list = tuple(basis_list)

-        if not all(b.dim_domain == 1 and b.dim_codomain == 1
-                   for b in self._basis_list):
-            raise ValueError("The basis functions must be "
-                             "univariate and scalar valued")
+        if not all(
+            b.dim_domain == 1 and b.dim_codomain == 1
+            for b in self._basis_list
+        ):
+            raise ValueError(
+                "The basis functions must be univariate and scalar valued",
+            )

        super().__init__(
            domain_range=[b.domain_range[0] for b in basis_list],
-            n_basis=np.prod([b.n_basis for b in basis_list]))
+            n_basis=np.prod([b.n_basis for b in basis_list]),
+        )

    @property
    def basis_list(self) -> Tuple[Basis, ...]:
@@ -86,13 +89,13 @@ def _evaluate(self, eval_points: np.ndarray) -> np.ndarray:
        matrix = np.zeros((self.n_basis, len(eval_points), self.dim_codomain))

        basis_evaluations = [
-            b._evaluate(eval_points[:, i:i + 1])
+            b(eval_points[:, i:i + 1])
            for i, b in enumerate(self.basis_list)
        ]

        for i, ev in enumerate(itertools.product(*basis_evaluations)):

-            matrix[i, :, 0] = np.prod(ev, axis=0)
+            matrix[i, :, :] = np.prod(ev, axis=0)

        return matrix
diff --git a/tests/test_basis.py b/tests/test_basis.py
index 61b75af8e..220441cef 100644
--- a/tests/test_basis.py
+++ b/tests/test_basis.py
@@ -1,3 +1,5 @@
+"""Tests of basis functions."""
+
 import itertools
 import unittest
@@ -5,9 +7,8 @@

 import skfda
 from skfda import concatenate
-from skfda.misc import inner_product, inner_product_matrix
+from skfda.misc import inner_product_matrix
 from skfda.representation.basis import (
-    Basis,
    BSpline,
    Constant,
    FDataBasis,
    Fourier,
    Monomial,
 )
 from skfda.representation.grid import FDataGrid
@@ -417,6 +418,7 @@ def test_vector_valued(self):

 class TestTensorBasis(unittest.TestCase):
+    """Tests for the Tensor basis."""

    def setUp(self) -> None:
        """Create original and tensor bases."""
@@ -469,26 +471,29 @@ def test_tensor_order(self) -> None:
            val_z = self.basis_z(t[2])
            val = self.basis([t])

-            for x in range(self.n_x):
-                for y in range(self.n_y):
-                    for z in range(self.n_z):
+            for x, y, z in itertools.product(
+                range(self.n_x),
+                range(self.n_y),
+                range(self.n_z)
+            ):

-                        index = (
-                            x * self.n_y * self.n_z
-                            + y * self.n_z
-                            + z
-                        )
+                index = (
+                    x * self.n_y * self.n_z
+                    + y * self.n_z
+                    + z
+                )

-                        index2 = np.ravel_multi_index(
-                            [x, y, z],
-                            dims=self.dims)
+                index2 = np.ravel_multi_index(
+                    [x, y, z],
+                    dims=self.dims,
+                )

-                        self.assertEqual(index, index2)
+                self.assertEqual(index, index2)

-                        self.assertAlmostEqual(
-                            val[index],
-                            val_x[x] * val_y[y] * val_z[z],
-                        )
+                self.assertAlmostEqual(
+                    val[index],
+                    val_x[x] * val_y[y] * val_z[z],
+                )

    def test_tensor_gram_matrix(self) -> None:
        """Check that the Gram matrix is right."""
        gram_x = self.basis_x.gram_matrix()

From faaef7a6840492c293a95321bf5c53cb7cc95ee8 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Tue, 29 Dec 2020 20:52:12 +0100
Subject: [PATCH 205/210] Style fixes.
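
Only indentation changes here; the behaviour of the tensor tests is
untouched. For reference, the code fixed in the previous commits boils
down to two NumPy identities, checked below with made-up sizes (this
snippet is illustrative and not part of the test suite):

    import numpy as np

    dims = (4, 3, 5)  # stand-ins for n_x, n_y, n_z

    # The flat index used in test_tensor_order is row-major raveling.
    x, y, z = 2, 1, 3
    index = x * dims[1] * dims[2] + y * dims[2] + z
    assert index == np.ravel_multi_index([x, y, z], dims=dims)

    # The outer/moveaxis/reshape steps in Tensor._gram_matrix compute a
    # Kronecker product of the component Gram matrices.
    gram_x = np.arange(1.0, 5.0).reshape(2, 2)
    gram_y = np.arange(1.0, 10.0).reshape(3, 3)
    gram = np.multiply.outer(gram_x, gram_y)
    gram = np.moveaxis(gram, [1, 2], [2, 1])
    gram = gram.reshape(6, 6)
    assert np.allclose(gram, np.kron(gram_x, gram_y))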
---
 tests/test_basis.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_basis.py b/tests/test_basis.py
index 220441cef..276bd23fc 100644
--- a/tests/test_basis.py
+++ b/tests/test_basis.py
@@ -472,9 +472,9 @@ def test_tensor_order(self) -> None:
            val = self.basis([t])

            for x, y, z in itertools.product(
-                range(self.n_x),
-                range(self.n_y),
-                range(self.n_z)
+                    range(self.n_x),
+                    range(self.n_y),
+                    range(self.n_z),
            ):

From e89156e155ef270295988e6fe45bc115e193f007 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Wed, 30 Dec 2020 20:05:57 +0100
Subject: [PATCH 206/210] Fix equality for new Pandas version.

---
 skfda/representation/basis/_fdatabasis.py |  8 +++++---
 skfda/representation/grid.py              | 23 ++++++++++++++---------
 tests/test_pandas_fdatabasis.py           | 15 ++++++++++-----
 tests/test_pandas_fdatagrid.py            | 13 +++++++++----
 4 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py
index 3fa04c0f9..0fb3a514b 100644
--- a/skfda/representation/basis/_fdatabasis.py
+++ b/skfda/representation/basis/_fdatabasis.py
@@ -627,14 +627,16 @@ def equals(self, other):

    def __eq__(self, other):
        """Elementwise equality of FDataBasis"""
-        if type(self) != type(other) or self.dtype != other.dtype:
+        if not isinstance(self, type(other)) or self.dtype != other.dtype:
+            if other is pandas.NA:
+                return self.isna()
            if pandas.api.types.is_list_like(other) and not isinstance(
-                    other, (pandas.Series, pandas.Index)):
+                    other, (pandas.Series, pandas.Index, pandas.DataFrame)):
                return np.concatenate([x == y for x, y in zip(self, other)])
            else:
                return NotImplemented

-        if len(self) != len(other):
+        if len(self) != len(other) and len(self) != 1 and len(other) != 1:
            raise ValueError(f"Different lengths: "
                             f"len(self)={len(self)} and "
                             f"len(other)={len(other)}")
diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py
index e0fc2e186..64d7f671b 100644
--- a/skfda/representation/grid.py
+++ b/skfda/representation/grid.py
@@ -8,22 +8,25 @@

 import copy
 import numbers
-from typing import Any
 import warnings
+from typing import Any

 import findiff
+import numpy as np
 import pandas.api.extensions
 import scipy.stats.mstats

-import numpy as np
-
+from .._utils import (
+    _check_array_key,
+    _domain_range,
+    _int_to_real,
+    _tuple_of_arrays,
+    constants,
+)
 from . import basis as fdbasis
-from .._utils import (_tuple_of_arrays, constants,
-                      _domain_range, _int_to_real, _check_array_key)
 from ._functional_data import FData
 from .interpolation import SplineInterpolation

-
 __author__ = "Miguel Carbajo Berrocal"
 __email__ = "miguel.carbajo@estudiante.uam.es"

@@ -564,14 +567,16 @@ def equals(self, other):

    def __eq__(self, other):
        """Elementwise equality of FDataGrid"""
-        if type(self) != type(other) or self.dtype != other.dtype:
+        if not isinstance(self, type(other)) or self.dtype != other.dtype:
+            if other is pandas.NA:
+                return self.isna()
            if pandas.api.types.is_list_like(other) and not isinstance(
-                    other, (pandas.Series, pandas.Index)):
+                    other, (pandas.Series, pandas.Index, pandas.DataFrame)):
                return np.concatenate([x == y for x, y in zip(self, other)])
            else:
                return NotImplemented

-        if len(self) != len(other):
+        if len(self) != len(other) and len(self) != 1 and len(other) != 1:
            raise ValueError(f"Different lengths: "
                             f"len(self)={len(self)} and "
                             f"len(other)={len(other)}")
diff --git a/tests/test_pandas_fdatabasis.py b/tests/test_pandas_fdatabasis.py
index a43857665..c01f41ff5 100644
--- a/tests/test_pandas_fdatabasis.py
+++ b/tests/test_pandas_fdatabasis.py
@@ -1,13 +1,13 @@
 import operator

-import skfda
-from skfda.representation.basis import Monomial, Fourier, BSpline
-from pandas import Series
+import numpy as np
 import pandas
-from pandas.tests.extension import base
 import pytest
+from pandas import Series
+from pandas.tests.extension import base

-import numpy as np
+import skfda
+from skfda.representation.basis import BSpline, Fourier, Monomial

 ##############################################################################
@@ -335,6 +335,11 @@ def test_copy(self, dtype):
    def test_view(self, dtype):
        pass

+    # Pending https://github.com/pandas-dev/pandas/issues/38812 resolution
+    @pytest.mark.skip(reason="Bugged")
+    def test_contains(self, data, data_missing):
+        pass
+

 class TestArithmeticOps(base.BaseArithmeticOpsTests):
diff --git a/tests/test_pandas_fdatagrid.py b/tests/test_pandas_fdatagrid.py
index b459f13fb..48cc14f09 100644
--- a/tests/test_pandas_fdatagrid.py
+++ b/tests/test_pandas_fdatagrid.py
@@ -1,12 +1,12 @@
 import operator

-import skfda
-from pandas import Series
+import numpy as np
 import pandas
-from pandas.tests.extension import base
 import pytest
+from pandas import Series
+from pandas.tests.extension import base

-import numpy as np
+import skfda

 ##############################################################################
@@ -343,6 +343,11 @@ def test_copy(self, dtype):
    def test_view(self, dtype):
        pass

+    # Pending https://github.com/pandas-dev/pandas/issues/38812 resolution
+    @pytest.mark.skip(reason="Bugged")
+    def test_contains(self, data, data_missing):
+        pass
+

 class TestArithmeticOps(base.BaseArithmeticOpsTests):

From 66036e5baca72118369c0aa72b5a2202ff466abc Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Wed, 30 Dec 2020 20:23:55 +0100
Subject: [PATCH 207/210] Fix style errors.
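
Formatting only; the dispatch logic from the previous commit is
unchanged: comparison against pandas.NA yields the missing-value mask,
generic list-likes are compared pairwise, and otherwise lengths must
match unless one operand has length one, mirroring NumPy broadcasting.
A distilled restatement of the length rule (a toy helper, not the
actual method):

    def _lengths_comparable(n_self: int, n_other: int) -> bool:
        """Lengths suitable for elementwise comparison."""
        return n_self == n_other or n_self == 1 or n_other == 1

    assert _lengths_comparable(5, 5)
    assert _lengths_comparable(1, 5)      # length-1 operand broadcasts
    assert not _lengths_comparable(2, 5)  # FData.__eq__ raises ValueError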
---
 setup.cfg                                 | 4 ++--
 skfda/representation/basis/_fdatabasis.py | 3 ++-
 skfda/representation/grid.py              | 3 ++-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index f9ef6b6b1..2ba8321fa 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -75,8 +75,8 @@ per-file-ignores =
    # There are many datasets
    _real_datasets.py: WPS202

-    # Tests benefit from magic numbers
-    test_*.py: WPS432
+    # Tests benefit from magic numbers and fixtures
+    test_*.py: WPS432, WPS442

 rst-directives =
    # These are sorted alphabetically - but that does not matter
diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py
index 0fb3a514b..b490e6ac7 100644
--- a/skfda/representation/basis/_fdatabasis.py
+++ b/skfda/representation/basis/_fdatabasis.py
@@ -631,7 +631,8 @@ def __eq__(self, other):
            if other is pandas.NA:
                return self.isna()
            if pandas.api.types.is_list_like(other) and not isinstance(
-                    other, (pandas.Series, pandas.Index, pandas.DataFrame)):
+                other, (pandas.Series, pandas.Index, pandas.DataFrame)
+            ):
                return np.concatenate([x == y for x, y in zip(self, other)])
            else:
                return NotImplemented
diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py
index 64d7f671b..73bf6e423 100644
--- a/skfda/representation/grid.py
+++ b/skfda/representation/grid.py
@@ -571,7 +571,8 @@ def __eq__(self, other):
            if other is pandas.NA:
                return self.isna()
            if pandas.api.types.is_list_like(other) and not isinstance(
-                    other, (pandas.Series, pandas.Index, pandas.DataFrame)):
+                other, (pandas.Series, pandas.Index, pandas.DataFrame)
+            ):
                return np.concatenate([x == y for x, y in zip(self, other)])
            else:
                return NotImplemented

From 4897ab879a64e369321bcaf32206028c52eed809 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Wed, 30 Dec 2020 20:33:51 +0100
Subject: [PATCH 208/210] Last style fixes.

---
 skfda/representation/basis/_fdatabasis.py | 2 +-
 skfda/representation/grid.py              | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py
index b490e6ac7..7c324bad7 100644
--- a/skfda/representation/basis/_fdatabasis.py
+++ b/skfda/representation/basis/_fdatabasis.py
@@ -631,7 +631,7 @@ def __eq__(self, other):
            if other is pandas.NA:
                return self.isna()
            if pandas.api.types.is_list_like(other) and not isinstance(
-                other, (pandas.Series, pandas.Index, pandas.DataFrame)
+                other, (pandas.Series, pandas.Index, pandas.DataFrame),
            ):
                return np.concatenate([x == y for x, y in zip(self, other)])
            else:
                return NotImplemented
diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py
index 73bf6e423..b75caf0b2 100644
--- a/skfda/representation/grid.py
+++ b/skfda/representation/grid.py
@@ -571,7 +571,7 @@ def __eq__(self, other):
            if other is pandas.NA:
                return self.isna()
            if pandas.api.types.is_list_like(other) and not isinstance(
-                other, (pandas.Series, pandas.Index, pandas.DataFrame)
+                other, (pandas.Series, pandas.Index, pandas.DataFrame),
            ):
                return np.concatenate([x == y for x, y in zip(self, other)])
            else:
                return NotImplemented

From 35a4ae4c1cc564f7e92283f35c732d19f1517264 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Wed, 30 Dec 2020 21:04:06 +0100
Subject: [PATCH 209/210] Revert casting of order.
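
For context, derivative() passes each grid axis together with its
partial-derivative order straight to findiff, as in
FinDiff((1, grid_points_0, order_0), (2, grid_points_1, order_1), ...).
A minimal single-axis sketch (hypothetical data; assumes the findiff
coordinate-array form used in the diff below):

    import findiff
    import numpy as np

    grid = np.linspace(0, 1, 1000)
    values = np.sin(2 * np.pi * grid)[np.newaxis, ...]  # one sample

    # Axis 0 indexes samples, so differentiate along axis 1.
    d_dx = findiff.FinDiff(1, grid, 1)
    deriv = d_dx(values)
    expected = 2 * np.pi * np.cos(2 * np.pi * grid)[np.newaxis, ...]
    assert np.allclose(deriv, expected, atol=1e-2)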
---
 skfda/representation/grid.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py
index 040520446..25f044f3d 100644
--- a/skfda/representation/grid.py
+++ b/skfda/representation/grid.py
@@ -431,7 +431,7 @@ def derivative(self, *, order=1):
        if order_list.ndim != 1 or len(order_list) != self.dim_domain:
            raise ValueError("The order for each partial should be specified.")

-        operator = findiff.FinDiff(*[(1 + i, p, int(o))
+        operator = findiff.FinDiff(*[(1 + i, p, o)
                                     for i, (p, o) in enumerate(
                                         zip(self.grid_points, order_list))])
        data_matrix = operator(self.data_matrix.astype(float))

From 1340df45059affcf32cd95f219fd8fbf13ddae1e Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Thu, 31 Dec 2020 12:31:51 +0100
Subject: [PATCH 210/210] Bump version number.

---
 VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/VERSION b/VERSION
index bd73f4707..2eb3c4fe4 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.4
+0.5
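
For completeness, a short usage sketch of the Tensor basis added and
fixed over this series (assumes the 0.5 API shown in the diffs above;
sizes and evaluation points are arbitrary):

    import numpy as np
    import skfda

    basis = skfda.representation.basis.Tensor([
        skfda.representation.basis.Monomial(n_basis=2),
        skfda.representation.basis.Fourier(n_basis=3),
    ])

    assert basis.n_basis == 6     # 2 * 3: product of component sizes
    assert basis.dim_domain == 2  # one domain dimension per component

    # Evaluate all six tensor-product functions at one 2-d point.
    values = basis(np.array([[0.5, 0.25]]))
    assert values.shape == (6, 1, 1)  # (n_basis, n_points, dim_codomain)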