From 8383f09343e7b238d53f0de9fd951f1d9563d3b6 Mon Sep 17 00:00:00 2001
From: Alvaro
Date: Tue, 26 Oct 2021 23:16:23 +0200
Subject: [PATCH 03/50] Per class feature constructor

---
 skfda/_utils/__init__.py                      |   1 +
 skfda/_utils/_utils.py                        |  16 ++-
 .../feature_extraction/__init__.py            |   1 +
 .../_per_class_feature_transformer.py         | 105 ++++++++++++++++++
 4 files changed, 122 insertions(+), 1 deletion(-)
 create mode 100644 skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py

diff --git a/skfda/_utils/__init__.py b/skfda/_utils/__init__.py
index fc9972af9..dc194da54 100644
--- a/skfda/_utils/__init__.py
+++ b/skfda/_utils/__init__.py
@@ -13,6 +13,7 @@
     _classifier_fit_depth_methods,
     _classifier_get_classes,
     _classifier_get_depth_methods,
+    _classifier_fit_feature_transformer,
     _compute_dependence,
     _DependenceMeasure,
     _evaluate_grid,
diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py
index f718a55c5..9a8f42d3d 100644
--- a/skfda/_utils/_utils.py
+++ b/skfda/_utils/_utils.py
@@ -23,7 +23,7 @@
 import scipy.integrate
 from numpy import ndarray
 from pandas.api.indexers import check_array_indexer
-from sklearn.base import clone
+from sklearn.base import TransformerMixin, clone
 from sklearn.preprocessing import LabelEncoder
 from sklearn.utils.multiclass import check_classification_targets
 from 
typing_extensions import Literal, Protocol @@ -729,6 +729,20 @@ def _classifier_fit_depth_methods( return classes, class_depth_methods_ +def _classifier_fit_feature_transformer( + X: T, + y: ndarray, + transformer: TransformerMixin +) -> Tuple[ndarray, Sequence[TransformerMixin]]: + classes, y_ind = _classifier_get_classes(y) + + class_feature_transformers = [ + clone(transformer).fit(X[y_ind == cur_class], y[y_ind == cur_class]) + for cur_class in range(classes.size) + ] + + return classes, class_feature_transformers + _DependenceMeasure = Callable[[np.ndarray, np.ndarray], np.ndarray] diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py index 16355e236..8c8f9895b 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py @@ -1,3 +1,4 @@ """Feature extraction.""" from ._ddg_transformer import DDGTransformer +from ._per_class_feature_transformer import PerClassFeatureTransformer from ._fpca import FPCA diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py new file mode 100644 index 000000000..cc646a5fb --- /dev/null +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py @@ -0,0 +1,105 @@ +"""Feature extraction transformers for dimensionality reduction.""" +from __future__ import annotations +import numpy as np +from typing import TypeVar +from sklearn.base import TransformerMixin +from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted +from ....representation.grid import FData +from ...._utils import _classifier_fit_feature_transformer + +T = TypeVar("T", bound=FData) + +class PerClassFeatureTransformer(TransformerMixin): + + def __init__( + self, + transformer: TransformerMixin + ) -> None: + self.transformer= transformer + self._validate_transformer() + + def _validate_transformer( + self + ) -> None: + """ + Checks that the transformer passed is scikit-learn-like and that uses target data in fit + + Args: + None + + Returns: + None + """ + if not (hasattr(self.transformer, "fit") or hasattr(self.transformer, "fit_transform")) or not hasattr( + self.transformer, "transform" + ): + raise TypeError( + "Transformer should implement fit and " + "transform. '%s' (type %s) doesn't" % (self.transformer, type(self.transformer)) + ) + + tags = self.transformer._get_tags() + + if not(tags['stateless'] and tags['requires_y']): + raise TypeError( + "Transformer should use target data in fit." + " '%s' (type %s) doesn't" % (self.transformer, type(self.transformer)) + ) + + + def fit( + self, + X: T, + y: np.ndarray + ) -> PerClassFeatureTransformer: + """ + Fit the model on each class using X as training data and y as target values. + + Args: + X: FDataGrid with the training data. + y: Target values of shape = (n_samples). + + Returns: + self + """ + classes, class_feature_transformers = _classifier_fit_feature_transformer( + X, y, self.transformer + ) + + self._classes = classes + self._class_feature_transformers_ = class_feature_transformers + + return self + + + def transform(self, X: T) -> np.ndarray: + """ + Transform the provided data using the already fitted transformer. + + Args: + X: FDataGrid with the test samples. + + Returns: + Array of shape (n_samples, G). 
+ """ + sklearn_check_is_fitted(self) + + return [ + feature_transformer.transform(X) + for feature_transformer in self._class_feature_transformers_ + ] + + + def fit_transform(self, X: T, y: np.ndarray) -> np.ndarray: + """ + Fits and transforms the provided data + using the transformer specified when initializing the class. + + Args: + X: FDataGrid with the samples. + y: Target values of shape = (n_samples) + + Returns: + Array of shape (n_samples, G). + """ + return self.fit(X, y).transform(X) From b39ac3840f86536f2080689222dac4329891668a Mon Sep 17 00:00:00 2001 From: Alvaro Date: Tue, 2 Nov 2021 16:00:48 +0100 Subject: [PATCH 04/50] Fda Feature Union + tests --- .../feature_extraction/__init__.py | 1 + .../feature_extraction/_fda_feature_union.py | 123 ++++++++++++++++++ .../_per_class_feature_transformer.py | 47 ++++++- tests/test_fda_feature_union.py | 44 +++++++ tests/test_per_class_feature_construction.py | 41 ++++++ 5 files changed, 255 insertions(+), 1 deletion(-) create mode 100644 skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py create mode 100644 tests/test_fda_feature_union.py create mode 100644 tests/test_per_class_feature_construction.py diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py index 8c8f9895b..ec7613a5a 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py @@ -1,4 +1,5 @@ """Feature extraction.""" from ._ddg_transformer import DDGTransformer from ._per_class_feature_transformer import PerClassFeatureTransformer +from ._fda_feature_union import FdaFeatureUnion from ._fpca import FPCA diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py new file mode 100644 index 000000000..604adb3b8 --- /dev/null +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py @@ -0,0 +1,123 @@ +"""Feature extraction union for dimensionality reduction.""" +from __future__ import annotations +from typing import Any +from numpy import ndarray +from pandas import DataFrame +from sklearn.pipeline import FeatureUnion +from ....representation.grid import FDataGrid +from ....representation.basis import FDataBasis + +class FdaFeatureUnion(FeatureUnion): + """Concatenates results of multiple functional transformer objects. + + This estimator applies a list of transformer objects in parallel to the + input data, then concatenates the results (They can be either FDataGrid + and FDataBasis objects or multivariate data itself).This is useful to + combine several feature extraction mechanisms into a single transformer. + Parameters of the transformers may be set using its name and the parameter + name separated by a '__'. A transformer may be replaced entirely by + setting the parameter with its name to another transformer, + or removed by setting to 'drop'. + + Parameters: + transformer_list: + List of tuple containing `(str, transformer)`. The first element + of the tuple is name affected to the transformer while the + second element is a scikit-learn transformer instance. + The transformer instance can also be `"drop"` for it to be + ignored. + n_jobs: + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. 
+ The default value is None + transformer_weights: + Multiplicative weights for features per transformer. + Keys are transformer names, values the weights. + Raises ValueError if key not present in ``transformer_list``. + verbose: + If True, the time elapsed while fitting each transformer will be + printed as it is completed. + np_array_output: + indicates if the transformed data is requested to be a NumPy array + output. By default the value is False. + + Examples: + Firstly we will import the Berkeley Growth Study data set + >>> from skfda.datasets import fetch_growth + >>> X, y= fetch_growth(return_X_y=True, as_frame=True) + >>> X = X.iloc[:, 0].values + + Then we need to import the transformers we want to use + >>> from skfda.preprocessing.dim_reduction.feature_extraction import FPCA + >>> from skfda.representation import EvaluationTransformer + + Finally we import the union and apply fit and transform + >>> from skfda.preprocessing.dim_reduction.feature_extraction._fda_feature_union + ... import FdaFeatureUnion + >>> union = FdaFeatureUnion([ + ... ("Eval", EvaluationTransformer()), + ... ("fpca", FPCA()), ], np_array_output=True) + >>> union.fit_transform(X) + """ + def __init__( + self, + transformer_list, + *, + n_jobs=None, + transformer_weights=None, + verbose=False, + np_array_output=False + ) -> None : + self.np_array_output = np_array_output + super().__init__(transformer_list, n_jobs=n_jobs, transformer_weights = transformer_weights, verbose=verbose) + + + + def _hstack(self, Xs) -> (ndarray | DataFrame | Any): + + if (self.np_array_output): + for i in Xs: + if(isinstance(i, FDataGrid) or isinstance(i, FDataBasis)): + raise TypeError( + "There are transformed instances of FDataGrid or FDataBasis" + " that can't be concatenated on a NumPy array." + ) + return super()._hstack(Xs) + + first_grid = True + first_basis = True + for j in Xs: + if isinstance(j, FDataGrid): + if first_grid: + curves = j + first_grid = False + else: + curves = curves.concatenate(j) + elif isinstance(j, FDataBasis): + if first_basis: + target = j + first_basis = False + else: + target = target.concatenate(j) + else: + raise TypeError( + "Transformed instance is not of type FDataGrid or FDataBasis." + "It is %s" %(type(j)) + ) + + feature_name = curves.dataset_name.lower() + " transformed" + target_name = "transformed target" + if first_grid: # There are only FDataBasis + return DataFrame({ + target_name:target + }) + elif first_basis: # There are only FDataGrids + return DataFrame({ + feature_name:curves + }) + else: + return DataFrame({ + feature_name : curves, + target_name: target, + }) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py index cc646a5fb..bb61aa3ad 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py @@ -6,10 +6,55 @@ from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted from ....representation.grid import FData from ...._utils import _classifier_fit_feature_transformer - T = TypeVar("T", bound=FData) class PerClassFeatureTransformer(TransformerMixin): + r"""Per class feature transformer for functional data. + + This class takes a transformer and performs the following map: + + .. 
math:: + \mathcal{X} &\rightarrow \mathbb{R}^G \\ + x &\rightarrow \textbf{t} = (T_1(x), T_2(x),...,T_k(x)) + + Where :math:`T_i(x)` is the transformation :math:`x` with respect to + the data in the :math:`i`-th group. + + Note that :math:`\mathcal{X}` is possibly multivariate, that is, + :math:`\mathcal{X} = \mathcal{X}_1 \times ... \times \mathcal{X}_p`. + + Parameters: + transformer: + The transformer that we want to apply to the given data. + It should use target data while fitting. + This is checked by looking at the 'stateless' and 'requires_y' tags + Examples: + Firstly, we will import and split the Berkeley Growth Study dataset + + >>> from skfda.datasets import fetch_growth + >>> from sklearn.model_selection import train_test_split + >>> X, y = fetch_growth(return_X_y=True, as_frame=True) + >>> X = X.iloc[:, 0].values + >>> y = y.values.codes + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, test_size=0.25, stratify=y, random_state=0) + + >>> from skfda.preprocessing.dim_reduction.feature_extraction + ... import PerClassFeatureTransformer + + Then we will need to select a fda transformer, and so we will + use RecursiveMaximaHunting + + >>> from skfda.preprocessing.dim_reduction.variable_selection + ... import RecursiveMaximaHunting + + Finally we need to fit the data and transform it + + >>> t.fit(X_train, y_train) + >>> x_transformed = t.transform(X_test) + + x_transformed will be a vector with the transformed data + """ def __init__( self, diff --git a/tests/test_fda_feature_union.py b/tests/test_fda_feature_union.py new file mode 100644 index 000000000..a33571488 --- /dev/null +++ b/tests/test_fda_feature_union.py @@ -0,0 +1,44 @@ +"""Test to check the Fda Feature Union module""" +from pandas.core.frame import DataFrame +from skfda.preprocessing.dim_reduction.feature_extraction._fda_feature_union import FdaFeatureUnion +from skfda.preprocessing.dim_reduction.feature_extraction import FPCA +from skfda.preprocessing.smoothing.kernel_smoothers import NadarayaWatsonSmoother +from skfda.representation import EvaluationTransformer +from skfda.misc.operators import SRSF +from skfda.datasets import fetch_growth +import unittest + + +class TestFdaFeatureUnion(unittest.TestCase): + def setUp(self) -> None: + X, y= fetch_growth(return_X_y=True, as_frame=True) + self.X = X.iloc[:, 0].values + + def test_incompatible_array_output(self): + + u = FdaFeatureUnion([("EvaluationT", EvaluationTransformer()), ("fpca", FPCA()), ], np_array_output=False) + self.assertRaises(TypeError, u.fit_transform, self.X) + + def test_incompatible_FDataGrid_output(self): + + u = FdaFeatureUnion([("EvaluationT", EvaluationTransformer()), ("srsf",SRSF()), ], np_array_output=True) + self.assertRaises(TypeError, u.fit_transform, self.X) + + def test_correct_transformation_concat(self): + u = FdaFeatureUnion([("srsf1",SRSF()), ("smooth",NadarayaWatsonSmoother())]) + created_frame = u.fit_transform(self.X) + + t1 = SRSF().fit_transform(self.X) + t2 = NadarayaWatsonSmoother().fit_transform(self.X) + t = t1.concatenate(t2) + + true_frame = DataFrame({ + t.dataset_name.lower() + " transformed": t + }) + + self.assertEqual(True, true_frame.equals(created_frame)) + + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_per_class_feature_construction.py b/tests/test_per_class_feature_construction.py new file mode 100644 index 000000000..5096f6acb --- /dev/null +++ b/tests/test_per_class_feature_construction.py @@ -0,0 +1,41 @@ +"""Test to check the per class feature transformer 
module""" +from skfda.preprocessing.dim_reduction.feature_extraction._per_class_feature_transformer import PerClassFeatureTransformer +from skfda.preprocessing.dim_reduction.variable_selection import RecursiveMaximaHunting +from skfda.ml.classification import KNeighborsClassifier +from skfda.preprocessing.dim_reduction.feature_extraction import FPCA +from skfda.datasets import fetch_growth +from skfda._utils import _classifier_get_classes + +import unittest + +import numpy as np + + +class TestPCFT(unittest.TestCase): + + # This test fails because the transformers do not have yet tags implemented + def test_transform(self): + + X, y = fetch_growth(return_X_y=True, as_frame=True) + X = X.iloc[:, 0].values + y = y.values.codes + t = PerClassFeatureTransformer(RecursiveMaximaHunting()) + t.fit_transform(X, y) + transformed = t.transform(X) + + classes, y_ind = _classifier_get_classes(y) + for cur_class in range(classes.size): + feature_transformer = RecursiveMaximaHunting().fit(X[y_ind == cur_class], y[y_ind == cur_class]) + a = feature_transformer.transform(X) + np.testing.assert_array_equal(transformed[cur_class], a) + + def test_not_transformer_argument(self): + self.assertRaises(TypeError, PerClassFeatureTransformer, KNeighborsClassifier()) + + def test_not_taget_required_fitting(self): + self.assertRaises(TypeError, PerClassFeatureTransformer, FPCA()) + + + +if __name__ == '__main__': + unittest.main() From 4f2f5118ac14c84ebb540d0339a1b3e1dc116cf5 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Tue, 2 Nov 2021 16:34:54 +0100 Subject: [PATCH 05/50] Skiped test correction --- tests/test_per_class_feature_construction.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_per_class_feature_construction.py b/tests/test_per_class_feature_construction.py index 5096f6acb..f286e9782 100644 --- a/tests/test_per_class_feature_construction.py +++ b/tests/test_per_class_feature_construction.py @@ -7,13 +7,14 @@ from skfda._utils import _classifier_get_classes import unittest - +import pytest import numpy as np class TestPCFT(unittest.TestCase): # This test fails because the transformers do not have yet tags implemented + @pytest.mark.skip(reason="Tags are not yet implemented on transformers") def test_transform(self): X, y = fetch_growth(return_X_y=True, as_frame=True) From a448f94d8699ebd1bc8e2e095b6877f000971187 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Tue, 2 Nov 2021 23:51:41 +0100 Subject: [PATCH 06/50] Style errors fixing --- skfda/_utils/__init__.py | 2 +- skfda/_utils/_utils.py | 5 +- .../feature_extraction/__init__.py | 2 +- .../feature_extraction/_fda_feature_union.py | 79 +++++++++-------- .../_per_class_feature_transformer.py | 84 +++++++++++-------- tests/test_fda_feature_union.py | 53 +++++++----- tests/test_per_class_feature_construction.py | 36 ++++---- 7 files changed, 151 insertions(+), 110 deletions(-) diff --git a/skfda/_utils/__init__.py b/skfda/_utils/__init__.py index dc194da54..ee90da10e 100644 --- a/skfda/_utils/__init__.py +++ b/skfda/_utils/__init__.py @@ -13,10 +13,10 @@ _classifier_fit_depth_methods, _classifier_get_classes, _classifier_get_depth_methods, - _classifier_fit_feature_transformer, _compute_dependence, _DependenceMeasure, _evaluate_grid, + _fit_feature_transformer, _int_to_real, _pairwise_symmetric, _reshape_eval_points, diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index 9a8f42d3d..b4b3d381b 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -729,10 +729,11 @@ def _classifier_fit_depth_methods( return 
classes, class_depth_methods_ -def _classifier_fit_feature_transformer( + +def _fit_feature_transformer( X: T, y: ndarray, - transformer: TransformerMixin + transformer: TransformerMixin, ) -> Tuple[ndarray, Sequence[TransformerMixin]]: classes, y_ind = _classifier_get_classes(y) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py index ec7613a5a..1167a18a8 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py @@ -1,5 +1,5 @@ """Feature extraction.""" from ._ddg_transformer import DDGTransformer -from ._per_class_feature_transformer import PerClassFeatureTransformer from ._fda_feature_union import FdaFeatureUnion from ._fpca import FPCA +from ._per_class_feature_transformer import PerClassFeatureTransformer diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py index 604adb3b8..7867cdeb1 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py @@ -1,18 +1,19 @@ """Feature extraction union for dimensionality reduction.""" from __future__ import annotations -from typing import Any -from numpy import ndarray + from pandas import DataFrame from sklearn.pipeline import FeatureUnion -from ....representation.grid import FDataGrid + from ....representation.basis import FDataBasis +from ....representation.grid import FDataGrid + class FdaFeatureUnion(FeatureUnion): """Concatenates results of multiple functional transformer objects. This estimator applies a list of transformer objects in parallel to the - input data, then concatenates the results (They can be either FDataGrid - and FDataBasis objects or multivariate data itself).This is useful to + input data, then concatenates the results (They can be either FDataGrid + and FDataBasis objects or multivariate data itself).This is useful to combine several feature extraction mechanisms into a single transformer. Parameters of the transformers may be set using its name and the parameter name separated by a '__'. A transformer may be replaced entirely by @@ -28,7 +29,8 @@ class FdaFeatureUnion(FeatureUnion): ignored. n_jobs: Number of jobs to run in parallel. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` + context. ``-1`` means using all processors. The default value is None transformer_weights: @@ -38,28 +40,29 @@ class FdaFeatureUnion(FeatureUnion): verbose: If True, the time elapsed while fitting each transformer will be printed as it is completed. - np_array_output: + np_array_output: indicates if the transformed data is requested to be a NumPy array output. By default the value is False. - + Examples: Firstly we will import the Berkeley Growth Study data set >>> from skfda.datasets import fetch_growth >>> X, y= fetch_growth(return_X_y=True, as_frame=True) >>> X = X.iloc[:, 0].values - + Then we need to import the transformers we want to use >>> from skfda.preprocessing.dim_reduction.feature_extraction import FPCA >>> from skfda.representation import EvaluationTransformer - + Finally we import the union and apply fit and transform - >>> from skfda.preprocessing.dim_reduction.feature_extraction._fda_feature_union - ... 
import FdaFeatureUnion + >>> from skfda.preprocessing.dim_reduction.feature_extraction. + ... _fda_feature_union import FdaFeatureUnion >>> union = FdaFeatureUnion([ ... ("Eval", EvaluationTransformer()), - ... ("fpca", FPCA()), ], np_array_output=True) + ... ("fpca", FPCA()), ], np_array_output=True) >>> union.fit_transform(X) """ + def __init__( self, transformer_list, @@ -67,22 +70,26 @@ def __init__( n_jobs=None, transformer_weights=None, verbose=False, - np_array_output=False - ) -> None : + np_array_output=False, + ) -> None: self.np_array_output = np_array_output - super().__init__(transformer_list, n_jobs=n_jobs, transformer_weights = transformer_weights, verbose=verbose) - - + super().__init__( + transformer_list, + n_jobs=n_jobs, + transformer_weights=transformer_weights, + verbose=verbose, + ) - def _hstack(self, Xs) -> (ndarray | DataFrame | Any): + def _hstack(self, Xs): if (self.np_array_output): for i in Xs: - if(isinstance(i, FDataGrid) or isinstance(i, FDataBasis)): + if isinstance(i, FDataGrid or FDataBasis): raise TypeError( - "There are transformed instances of FDataGrid or FDataBasis" - " that can't be concatenated on a NumPy array." - ) + "There are transformed instances of FDataGrid or " + "FDataBasis that can't be concatenated on a NumPy " + "array.", + ) return super()._hstack(Xs) first_grid = True @@ -100,24 +107,24 @@ def _hstack(self, Xs) -> (ndarray | DataFrame | Any): first_basis = False else: target = target.concatenate(j) - else: + else: raise TypeError( - "Transformed instance is not of type FDataGrid or FDataBasis." - "It is %s" %(type(j)) + "Transformed instance is not of type FDataGrid or" + " FDataBasis. It is " + type(j), ) feature_name = curves.dataset_name.lower() + " transformed" - target_name = "transformed target" - if first_grid: # There are only FDataBasis - return DataFrame({ - target_name:target - }) - elif first_basis: # There are only FDataGrids + target_name = "transformed target" + if first_grid: # There are only FDataBasis return DataFrame({ - feature_name:curves + target_name: target, }) - else: + elif first_basis: # There are only FDataGrids return DataFrame({ - feature_name : curves, - target_name: target, + feature_name: curves, }) + + return DataFrame({ + feature_name: curves, + target_name: target, + }) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py index bb61aa3ad..55a9270be 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py @@ -1,13 +1,18 @@ """Feature extraction transformers for dimensionality reduction.""" from __future__ import annotations -import numpy as np + from typing import TypeVar + +import numpy as np from sklearn.base import TransformerMixin from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted + +from ...._utils import _fit_feature_transformer from ....representation.grid import FData -from ...._utils import _classifier_fit_feature_transformer + T = TypeVar("T", bound=FData) + class PerClassFeatureTransformer(TransformerMixin): r"""Per class feature transformer for functional data. @@ -19,7 +24,7 @@ class PerClassFeatureTransformer(TransformerMixin): Where :math:`T_i(x)` is the transformation :math:`x` with respect to the data in the :math:`i`-th group. 
- + Note that :math:`\mathcal{X}` is possibly multivariate, that is, :math:`\mathcal{X} = \mathcal{X}_1 \times ... \times \mathcal{X}_p`. @@ -41,7 +46,7 @@ class PerClassFeatureTransformer(TransformerMixin): >>> from skfda.preprocessing.dim_reduction.feature_extraction ... import PerClassFeatureTransformer - + Then we will need to select a fda transformer, and so we will use RecursiveMaximaHunting @@ -53,21 +58,22 @@ class PerClassFeatureTransformer(TransformerMixin): >>> t.fit(X_train, y_train) >>> x_transformed = t.transform(X_test) - x_transformed will be a vector with the transformed data + x_transformed will be a vector with the transformed data """ def __init__( self, - transformer: TransformerMixin + transformer: TransformerMixin, ) -> None: - self.transformer= transformer + self.transformer = transformer self._validate_transformer() - + def _validate_transformer( - self + self, ) -> None: """ - Checks that the transformer passed is scikit-learn-like and that uses target data in fit + Check that the transformer passed is\ + scikit-learn-like and that uses target data in fit. Args: None @@ -75,30 +81,36 @@ def _validate_transformer( Returns: None """ - if not (hasattr(self.transformer, "fit") or hasattr(self.transformer, "fit_transform")) or not hasattr( - self.transformer, "transform" - ): - raise TypeError( - "Transformer should implement fit and " - "transform. '%s' (type %s) doesn't" % (self.transformer, type(self.transformer)) - ) - + if not (hasattr(self.transformer, "fit") + and hasattr(self.transformer, "fit_transform") + and hasattr(self.transformer, "transform") + ): + + raise TypeError( + "Transformer should implement fit and " + "transform. " + str(self.transformer) + + " (type " + str(type(self.transformer)) + ")" + " doesn't", + ) + tags = self.transformer._get_tags() - - if not(tags['stateless'] and tags['requires_y']): - raise TypeError( - "Transformer should use target data in fit." - " '%s' (type %s) doesn't" % (self.transformer, type(self.transformer)) - ) - - + + if not (tags['stateless'] and tags['requires_y']): + raise TypeError( + "Transformer should use target data in fit." + + str(self.transformer) + + " (type " + str(type(self.transformer)) + ")" + " doesn't", + ) + def fit( self, X: T, - y: np.ndarray + y: np.ndarray, ) -> PerClassFeatureTransformer: """ - Fit the model on each class using X as training data and y as target values. + Fit the model on each class using X as\ + training data and y as target values. Args: X: FDataGrid with the training data. @@ -107,16 +119,17 @@ def fit( Returns: self """ - classes, class_feature_transformers = _classifier_fit_feature_transformer( - X, y, self.transformer + classes, class_feature_transformers = _fit_feature_transformer( + X, + y, + self.transformer, ) - + self._classes = classes self._class_feature_transformers_ = class_feature_transformers return self - def transform(self, X: T) -> np.ndarray: """ Transform the provided data using the already fitted transformer. @@ -128,16 +141,15 @@ def transform(self, X: T) -> np.ndarray: Array of shape (n_samples, G). """ sklearn_check_is_fitted(self) - + return [ - feature_transformer.transform(X) + feature_transformer.transform(X) for feature_transformer in self._class_feature_transformers_ ] - def fit_transform(self, X: T, y: np.ndarray) -> np.ndarray: """ - Fits and transforms the provided data + Fits and transforms the provided data\ using the transformer specified when initializing the class. 
Args: diff --git a/tests/test_fda_feature_union.py b/tests/test_fda_feature_union.py index a33571488..03ea49b30 100644 --- a/tests/test_fda_feature_union.py +++ b/tests/test_fda_feature_union.py @@ -1,31 +1,45 @@ -"""Test to check the Fda Feature Union module""" +"""Test to check the Fda Feature Union module.""" + +import unittest + from pandas.core.frame import DataFrame -from skfda.preprocessing.dim_reduction.feature_extraction._fda_feature_union import FdaFeatureUnion + +from skfda.datasets import fetch_growth +from skfda.misc.operators import SRSF from skfda.preprocessing.dim_reduction.feature_extraction import FPCA -from skfda.preprocessing.smoothing.kernel_smoothers import NadarayaWatsonSmoother +from skfda.preprocessing.smoothing.kernel_smoothers\ + import NadarayaWatsonSmoother +from skfda.preprocessing.dim_reduction.feature_extraction._fda_feature_union\ + import FdaFeatureUnion from skfda.representation import EvaluationTransformer -from skfda.misc.operators import SRSF -from skfda.datasets import fetch_growth -import unittest class TestFdaFeatureUnion(unittest.TestCase): + def setUp(self) -> None: - X, y= fetch_growth(return_X_y=True, as_frame=True) + X = fetch_growth(return_X_y=True, as_frame=True)[0] self.X = X.iloc[:, 0].values - + def test_incompatible_array_output(self): - - u = FdaFeatureUnion([("EvaluationT", EvaluationTransformer()), ("fpca", FPCA()), ], np_array_output=False) + + u = FdaFeatureUnion( + [("EvaluationT", EvaluationTransformer()), ("fpca", FPCA())], + np_array_output=False, + ) self.assertRaises(TypeError, u.fit_transform, self.X) - - def test_incompatible_FDataGrid_output(self): - - u = FdaFeatureUnion([("EvaluationT", EvaluationTransformer()), ("srsf",SRSF()), ], np_array_output=True) + + def test_incompatible_fdatagrid_output(self): + + u = FdaFeatureUnion( + [("EvaluationT", EvaluationTransformer()), ("srsf", SRSF())], + np_array_output=True, + ) self.assertRaises(TypeError, u.fit_transform, self.X) - + def test_correct_transformation_concat(self): - u = FdaFeatureUnion([("srsf1",SRSF()), ("smooth",NadarayaWatsonSmoother())]) + u = FdaFeatureUnion( + [("srsf1", SRSF()), ("smooth", NadarayaWatsonSmoother())], + ) created_frame = u.fit_transform(self.X) t1 = SRSF().fit_transform(self.X) @@ -33,12 +47,11 @@ def test_correct_transformation_concat(self): t = t1.concatenate(t2) true_frame = DataFrame({ - t.dataset_name.lower() + " transformed": t + t.dataset_name.lower() + " transformed": t, }) + result = True + self.assertEqual(result, true_frame.equals(created_frame)) - self.assertEqual(True, true_frame.equals(created_frame)) - - if __name__ == '__main__': unittest.main() diff --git a/tests/test_per_class_feature_construction.py b/tests/test_per_class_feature_construction.py index f286e9782..c51271602 100644 --- a/tests/test_per_class_feature_construction.py +++ b/tests/test_per_class_feature_construction.py @@ -1,22 +1,24 @@ """Test to check the per class feature transformer module""" -from skfda.preprocessing.dim_reduction.feature_extraction._per_class_feature_transformer import PerClassFeatureTransformer -from skfda.preprocessing.dim_reduction.variable_selection import RecursiveMaximaHunting +from skfda.datasets import fetch_growth from skfda.ml.classification import KNeighborsClassifier +from skfda.preprocessing.dim_reduction.feature_extraction.\ + _per_class_feature_transformer import PerClassFeatureTransformer +from skfda.preprocessing.dim_reduction.variable_selection \ + import RecursiveMaximaHunting from 
skfda.preprocessing.dim_reduction.feature_extraction import FPCA -from skfda.datasets import fetch_growth from skfda._utils import _classifier_get_classes -import unittest -import pytest import numpy as np +import pytest +import unittest class TestPCFT(unittest.TestCase): - + # This test fails because the transformers do not have yet tags implemented @pytest.mark.skip(reason="Tags are not yet implemented on transformers") - def test_transform(self): - + def test_transform(self): + X, y = fetch_growth(return_X_y=True, as_frame=True) X = X.iloc[:, 0].values y = y.values.codes @@ -26,17 +28,23 @@ def test_transform(self): classes, y_ind = _classifier_get_classes(y) for cur_class in range(classes.size): - feature_transformer = RecursiveMaximaHunting().fit(X[y_ind == cur_class], y[y_ind == cur_class]) + feature_transformer = RecursiveMaximaHunting().fit( + X[y_ind == cur_class], + y[y_ind == cur_class], + ) a = feature_transformer.transform(X) np.testing.assert_array_equal(transformed[cur_class], a) - + def test_not_transformer_argument(self): - self.assertRaises(TypeError, PerClassFeatureTransformer, KNeighborsClassifier()) - + self.assertRaises( + TypeError, + PerClassFeatureTransformer, + KNeighborsClassifier(), + ) + def test_not_taget_required_fitting(self): self.assertRaises(TypeError, PerClassFeatureTransformer, FPCA()) - - + if __name__ == '__main__': unittest.main() From c49b72b5c9cd3705f6debb5744537b7300a557c2 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Wed, 3 Nov 2021 19:33:26 +0100 Subject: [PATCH 07/50] Changes on feature transformer --- .../_per_class_feature_transformer.py | 69 +++++++++++++++---- ... => test_per_class_feature_transformer.py} | 35 ++++++---- 2 files changed, 77 insertions(+), 27 deletions(-) rename tests/{test_per_class_feature_construction.py => test_per_class_feature_transformer.py} (72%) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py index 55a9270be..6bd35a403 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py @@ -1,14 +1,16 @@ """Feature extraction transformers for dimensionality reduction.""" from __future__ import annotations -from typing import TypeVar +from typing import TypeVar, Union -import numpy as np +from numpy import ndarray +from pandas import DataFrame, concat from sklearn.base import TransformerMixin from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted from ...._utils import _fit_feature_transformer -from ....representation.grid import FData +from ....representation.basis import FDataBasis +from ....representation.grid import FData, FDataGrid T = TypeVar("T", bound=FData) @@ -29,10 +31,13 @@ class PerClassFeatureTransformer(TransformerMixin): :math:`\mathcal{X} = \mathcal{X}_1 \times ... \times \mathcal{X}_p`. Parameters: - transformer: + transformer: TransformerMixin The transformer that we want to apply to the given data. It should use target data while fitting. This is checked by looking at the 'stateless' and 'requires_y' tags + np_array_output: bool + indicates if the transformed data is requested to be a NumPy array + output. By default the value is False. 
    Examples:
        Firstly, we will import and split the Berkeley Growth Study dataset

@@ -52,6 +57,8 @@ class PerClassFeatureTransformer(TransformerMixin):
 
         >>> from skfda.preprocessing.dim_reduction.variable_selection
         ...     import RecursiveMaximaHunting
+        >>> t = PerClassFeatureTransformer(RecursiveMaximaHunting(),
+        ...     np_array_output=True)
 
         Finally we need to fit the data and transform it
 
@@ -64,8 +71,11 @@ class PerClassFeatureTransformer(TransformerMixin):
     def __init__(
         self,
         transformer: TransformerMixin,
+        *,
+        np_array_output=False,
     ) -> None:
         self.transformer = transformer
+        self.np_array_output = np_array_output
         self._validate_transformer()
 
     def _validate_transformer(
@@ -82,10 +92,9 @@ def _validate_transformer(
             None
         """
         if not (hasattr(self.transformer, "fit")
-                and hasattr(self.transformer, "fit_transform")
                 and hasattr(self.transformer, "transform")
+                or hasattr(self.transformer, "fit_transform")
                 ):
-
             raise TypeError(
                 "Transformer should implement fit and "
                 "transform. " + str(self.transformer)
@@ -106,7 +115,7 @@ def _validate_transformer(
     def fit(
         self,
         X: T,
-        y: np.ndarray,
+        y: ndarray,
     ) -> PerClassFeatureTransformer:
         """
         Fit the model on each class using X as\
@@ -130,7 +139,7 @@ def fit(
 
         return self
 
-    def transform(self, X: T) -> np.ndarray:
+    def transform(self, X: T) -> Union[DataFrame, ndarray]:
         """
         Transform the provided data using the already fitted transformer.
 
@@ -138,16 +147,49 @@ def transform(self, X: T) -> np.ndarray:
             X: FDataGrid with the test samples.
 
         Returns:
-            Array of shape (n_samples, G).
+            Either array of shape (n_samples, G) or a DataFrame \
+            including the transformed data.
         """
         sklearn_check_is_fitted(self)
-
-        return [
+        transformed_data = [
             feature_transformer.transform(X)
             for feature_transformer in self._class_feature_transformers_
         ]
 
-    def fit_transform(self, X: T, y: np.ndarray) -> np.ndarray:
+        if self.np_array_output:
+            for i in transformed_data:
+                if isinstance(i, FDataGrid or FDataBasis):
+                    raise TypeError(
+                        "There are transformed instances of FDataGrid or "
+                        "FDataBasis that can't be concatenated on a NumPy "
+                        "array.",
+                    )
+            return transformed_data
+
+        if not isinstance(transformed_data[0], FDataGrid or FDataBasis):
+            raise TypeError(
+                "Transformed instance is not of type FDataGrid or"
+                " FDataBasis. It is " + type(transformed_data[0]),
+            )
+
+        frames = [DataFrame(
+            {transformed_data[0].dataset_name.lower(): transformed_data[0]},
+        )]
+
+        for j in transformed_data[1:]:
+            if isinstance(j, FDataGrid or FDataBasis):
+                frames.append(
+                    DataFrame({j.dataset_name.lower(): j}),
+                )
+            else:
+                raise TypeError(
+                    "Transformed instance is not of type FDataGrid or"
+                    " FDataBasis. It is " + type(j),
+                )
+
+        return concat(frames, axis=1)
+
+    def fit_transform(self, X: T, y: ndarray) -> Union[DataFrame, ndarray]:
         """
         Fits and transforms the provided data\
         using the transformer specified when initializing the class.
@@ -157,6 +199,7 @@ def fit_transform(self, X: T, y: np.ndarray) -> np.ndarray:
             y: Target values of shape = (n_samples)
 
         Returns:
-            Array of shape (n_samples, G).
+            Either array of shape (n_samples, G) or a DataFrame \
+            including the transformed data.
""" return self.fit(X, y).transform(X) diff --git a/tests/test_per_class_feature_construction.py b/tests/test_per_class_feature_transformer.py similarity index 72% rename from tests/test_per_class_feature_construction.py rename to tests/test_per_class_feature_transformer.py index c51271602..84fe9186b 100644 --- a/tests/test_per_class_feature_construction.py +++ b/tests/test_per_class_feature_transformer.py @@ -1,28 +1,35 @@ -"""Test to check the per class feature transformer module""" -from skfda.datasets import fetch_growth -from skfda.ml.classification import KNeighborsClassifier -from skfda.preprocessing.dim_reduction.feature_extraction.\ - _per_class_feature_transformer import PerClassFeatureTransformer -from skfda.preprocessing.dim_reduction.variable_selection \ - import RecursiveMaximaHunting -from skfda.preprocessing.dim_reduction.feature_extraction import FPCA -from skfda._utils import _classifier_get_classes +"""Test to check the per class feature transformer module.""" + +import unittest import numpy as np import pytest -import unittest + +from skfda._utils import _classifier_get_classes +from skfda.datasets import fetch_growth +from skfda.ml.classification import KNeighborsClassifier +from skfda.preprocessing.dim_reduction.feature_extraction import ( + FPCA, + PerClassFeatureTransformer, +) +from skfda.preprocessing.dim_reduction.variable_selection import ( + RecursiveMaximaHunting, +) class TestPCFT(unittest.TestCase): # This test fails because the transformers do not have yet tags implemented @pytest.mark.skip(reason="Tags are not yet implemented on transformers") - def test_transform(self): + def test_transform(self) -> None: X, y = fetch_growth(return_X_y=True, as_frame=True) X = X.iloc[:, 0].values y = y.values.codes - t = PerClassFeatureTransformer(RecursiveMaximaHunting()) + t = PerClassFeatureTransformer( + RecursiveMaximaHunting(), + np_array_output=True, + ) t.fit_transform(X, y) transformed = t.transform(X) @@ -35,14 +42,14 @@ def test_transform(self): a = feature_transformer.transform(X) np.testing.assert_array_equal(transformed[cur_class], a) - def test_not_transformer_argument(self): + def test_not_transformer_argument(self) -> None: self.assertRaises( TypeError, PerClassFeatureTransformer, KNeighborsClassifier(), ) - def test_not_taget_required_fitting(self): + def test_not_taget_required_fitting(self) -> None: self.assertRaises(TypeError, PerClassFeatureTransformer, FPCA()) From 0491928ef61c835a93946ec43aaf6fde79981a93 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Wed, 3 Nov 2021 19:42:29 +0100 Subject: [PATCH 08/50] Fix branch with issue 376 --- .../feature_extraction/__init__.py | 1 - .../feature_extraction/_fda_feature_union.py | 130 ------------------ tests/test_fda_feature_union.py | 57 -------- 3 files changed, 188 deletions(-) delete mode 100644 skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py delete mode 100644 tests/test_fda_feature_union.py diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py index 1167a18a8..74f8aaba6 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py @@ -1,5 +1,4 @@ """Feature extraction.""" from ._ddg_transformer import DDGTransformer -from ._fda_feature_union import FdaFeatureUnion from ._fpca import FPCA from ._per_class_feature_transformer import PerClassFeatureTransformer diff --git 
a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py deleted file mode 100644 index 7867cdeb1..000000000 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py +++ /dev/null @@ -1,130 +0,0 @@ -"""Feature extraction union for dimensionality reduction.""" -from __future__ import annotations - -from pandas import DataFrame -from sklearn.pipeline import FeatureUnion - -from ....representation.basis import FDataBasis -from ....representation.grid import FDataGrid - - -class FdaFeatureUnion(FeatureUnion): - """Concatenates results of multiple functional transformer objects. - - This estimator applies a list of transformer objects in parallel to the - input data, then concatenates the results (They can be either FDataGrid - and FDataBasis objects or multivariate data itself).This is useful to - combine several feature extraction mechanisms into a single transformer. - Parameters of the transformers may be set using its name and the parameter - name separated by a '__'. A transformer may be replaced entirely by - setting the parameter with its name to another transformer, - or removed by setting to 'drop'. - - Parameters: - transformer_list: - List of tuple containing `(str, transformer)`. The first element - of the tuple is name affected to the transformer while the - second element is a scikit-learn transformer instance. - The transformer instance can also be `"drop"` for it to be - ignored. - n_jobs: - Number of jobs to run in parallel. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` - context. - ``-1`` means using all processors. - The default value is None - transformer_weights: - Multiplicative weights for features per transformer. - Keys are transformer names, values the weights. - Raises ValueError if key not present in ``transformer_list``. - verbose: - If True, the time elapsed while fitting each transformer will be - printed as it is completed. - np_array_output: - indicates if the transformed data is requested to be a NumPy array - output. By default the value is False. - - Examples: - Firstly we will import the Berkeley Growth Study data set - >>> from skfda.datasets import fetch_growth - >>> X, y= fetch_growth(return_X_y=True, as_frame=True) - >>> X = X.iloc[:, 0].values - - Then we need to import the transformers we want to use - >>> from skfda.preprocessing.dim_reduction.feature_extraction import FPCA - >>> from skfda.representation import EvaluationTransformer - - Finally we import the union and apply fit and transform - >>> from skfda.preprocessing.dim_reduction.feature_extraction. - ... _fda_feature_union import FdaFeatureUnion - >>> union = FdaFeatureUnion([ - ... ("Eval", EvaluationTransformer()), - ... 
("fpca", FPCA()), ], np_array_output=True) - >>> union.fit_transform(X) - """ - - def __init__( - self, - transformer_list, - *, - n_jobs=None, - transformer_weights=None, - verbose=False, - np_array_output=False, - ) -> None: - self.np_array_output = np_array_output - super().__init__( - transformer_list, - n_jobs=n_jobs, - transformer_weights=transformer_weights, - verbose=verbose, - ) - - def _hstack(self, Xs): - - if (self.np_array_output): - for i in Xs: - if isinstance(i, FDataGrid or FDataBasis): - raise TypeError( - "There are transformed instances of FDataGrid or " - "FDataBasis that can't be concatenated on a NumPy " - "array.", - ) - return super()._hstack(Xs) - - first_grid = True - first_basis = True - for j in Xs: - if isinstance(j, FDataGrid): - if first_grid: - curves = j - first_grid = False - else: - curves = curves.concatenate(j) - elif isinstance(j, FDataBasis): - if first_basis: - target = j - first_basis = False - else: - target = target.concatenate(j) - else: - raise TypeError( - "Transformed instance is not of type FDataGrid or" - " FDataBasis. It is " + type(j), - ) - - feature_name = curves.dataset_name.lower() + " transformed" - target_name = "transformed target" - if first_grid: # There are only FDataBasis - return DataFrame({ - target_name: target, - }) - elif first_basis: # There are only FDataGrids - return DataFrame({ - feature_name: curves, - }) - - return DataFrame({ - feature_name: curves, - target_name: target, - }) diff --git a/tests/test_fda_feature_union.py b/tests/test_fda_feature_union.py deleted file mode 100644 index 03ea49b30..000000000 --- a/tests/test_fda_feature_union.py +++ /dev/null @@ -1,57 +0,0 @@ -"""Test to check the Fda Feature Union module.""" - -import unittest - -from pandas.core.frame import DataFrame - -from skfda.datasets import fetch_growth -from skfda.misc.operators import SRSF -from skfda.preprocessing.dim_reduction.feature_extraction import FPCA -from skfda.preprocessing.smoothing.kernel_smoothers\ - import NadarayaWatsonSmoother -from skfda.preprocessing.dim_reduction.feature_extraction._fda_feature_union\ - import FdaFeatureUnion -from skfda.representation import EvaluationTransformer - - -class TestFdaFeatureUnion(unittest.TestCase): - - def setUp(self) -> None: - X = fetch_growth(return_X_y=True, as_frame=True)[0] - self.X = X.iloc[:, 0].values - - def test_incompatible_array_output(self): - - u = FdaFeatureUnion( - [("EvaluationT", EvaluationTransformer()), ("fpca", FPCA())], - np_array_output=False, - ) - self.assertRaises(TypeError, u.fit_transform, self.X) - - def test_incompatible_fdatagrid_output(self): - - u = FdaFeatureUnion( - [("EvaluationT", EvaluationTransformer()), ("srsf", SRSF())], - np_array_output=True, - ) - self.assertRaises(TypeError, u.fit_transform, self.X) - - def test_correct_transformation_concat(self): - u = FdaFeatureUnion( - [("srsf1", SRSF()), ("smooth", NadarayaWatsonSmoother())], - ) - created_frame = u.fit_transform(self.X) - - t1 = SRSF().fit_transform(self.X) - t2 = NadarayaWatsonSmoother().fit_transform(self.X) - t = t1.concatenate(t2) - - true_frame = DataFrame({ - t.dataset_name.lower() + " transformed": t, - }) - result = True - self.assertEqual(result, true_frame.equals(created_frame)) - - -if __name__ == '__main__': - unittest.main() From f5c12f965fc1d654f9a7dfaf6dd4ffbc15c87f7b Mon Sep 17 00:00:00 2001 From: Alvaro Date: Tue, 9 Nov 2021 00:38:34 +0100 Subject: [PATCH 09/50] Some fixes --- .../feature_extraction/__init__.py | 2 +- .../_per_class_feature_transformer.py | 51 
++++++++----------- tests/test_per_class_feature_transformer.py | 41 +++++++-------- 3 files changed, 40 insertions(+), 54 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py index 74f8aaba6..0b399631f 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py @@ -1,4 +1,4 @@ """Feature extraction.""" from ._ddg_transformer import DDGTransformer from ._fpca import FPCA -from ._per_class_feature_transformer import PerClassFeatureTransformer +from ._per_class_feature_transformer import PerClassTransformer diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py index 6bd35a403..8f86b7070 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py @@ -1,10 +1,11 @@ """Feature extraction transformers for dimensionality reduction.""" from __future__ import annotations +import warnings from typing import TypeVar, Union from numpy import ndarray -from pandas import DataFrame, concat +from pandas import DataFrame from sklearn.base import TransformerMixin from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted @@ -15,7 +16,7 @@ T = TypeVar("T", bound=FData) -class PerClassFeatureTransformer(TransformerMixin): +class PerClassTransformer(TransformerMixin): r"""Per class feature transformer for functional data. This class takes a transformer and performs the following map: @@ -35,7 +36,7 @@ class PerClassFeatureTransformer(TransformerMixin): The transformer that we want to apply to the given data. It should use target data while fitting. This is checked by looking at the 'stateless' and 'requires_y' tags - np_array_output: bool + array_output: bool indicates if the transformed data is requested to be a NumPy array output. By default the value is False. Examples: @@ -50,15 +51,15 @@ class PerClassFeatureTransformer(TransformerMixin): ... X, y, test_size=0.25, stratify=y, random_state=0) >>> from skfda.preprocessing.dim_reduction.feature_extraction - ... import PerClassFeatureTransformer + ... import PerClassTransformer Then we will need to select a fda transformer, and so we will use RecursiveMaximaHunting >>> from skfda.preprocessing.dim_reduction.variable_selection ... import RecursiveMaximaHunting - >>> t = PerClassFeatureTransformer(RecursiveMaximaHunting(), - ... np_array_output=True) + >>> t = PerClassTransformer(RecursiveMaximaHunting(), + ... 
array_output=True) Finally we need to fit the data and transform it @@ -72,11 +73,10 @@ def __init__( self, transformer: TransformerMixin, *, - np_array_output=False, + array_output=False, ) -> None: self.transformer = transformer - self.np_array_output = np_array_output - self._validate_transformer() + self.array_output = array_output def _validate_transformer( self, @@ -93,7 +93,7 @@ def _validate_transformer( """ if not (hasattr(self.transformer, "fit") and hasattr(self.transformer, "transform") - or hasattr(self.transformer, "fit_transform") + and hasattr(self.transformer, "fit_transform") ): raise TypeError( "Transformer should implement fit and " @@ -104,8 +104,8 @@ def _validate_transformer( tags = self.transformer._get_tags() - if not (tags['stateless'] and tags['requires_y']): - raise TypeError( + if tags['stateless'] and not tags['requires_y']: + warnings.warn( "Transformer should use target data in fit." + str(self.transformer) + " (type " + str(type(self.transformer)) + ")" @@ -116,7 +116,7 @@ def fit( self, X: T, y: ndarray, - ) -> PerClassFeatureTransformer: + ) -> PerClassTransformer: """ Fit the model on each class using X as\ training data and y as target values. @@ -128,6 +128,7 @@ def fit( Returns: self """ + self._validate_transformer() classes, class_feature_transformers = _fit_feature_transformer( X, y, @@ -156,7 +157,7 @@ def transform(self, X: T) -> Union[DataFrame, ndarray]: for feature_transformer in self._class_feature_transformers_ ] - if self.np_array_output: + if self.array_output: for i in transformed_data: if isinstance(i, FDataGrid or FDataBasis): raise TypeError( @@ -166,28 +167,16 @@ def transform(self, X: T) -> Union[DataFrame, ndarray]: ) return transformed_data - if not isinstance(transformed_data[0], FDataGrid or FDataBasis): - raise TypeError( - "Transformed instance is not of type FDataGrid or" - " FDataBasis. It is " + type(transformed_data[0]), - ) - - frames = [DataFrame( - {transformed_data[0].dataset_name.lower(): transformed_data[0]}, - )] - - for j in transformed_data[1:]: - if isinstance(j, FDataGrid or FDataBasis): - frames.append( - DataFrame({j.dataset_name.lower(): j}), - ) - else: + for j in transformed_data: + if not isinstance(j, FDataGrid or FDataBasis): raise TypeError( "Transformed instance is not of type FDataGrid or" " FDataBasis. 
It is " + type(j), ) - return concat(frames, axis=1) + return DataFrame( + {'Transformed data': transformed_data}, + ) def fit_transform(self, X: T, y: ndarray) -> Union[DataFrame, ndarray]: """ diff --git a/tests/test_per_class_feature_transformer.py b/tests/test_per_class_feature_transformer.py index 84fe9186b..35542dac0 100644 --- a/tests/test_per_class_feature_transformer.py +++ b/tests/test_per_class_feature_transformer.py @@ -3,55 +3,52 @@ import unittest import numpy as np -import pytest from skfda._utils import _classifier_get_classes from skfda.datasets import fetch_growth from skfda.ml.classification import KNeighborsClassifier from skfda.preprocessing.dim_reduction.feature_extraction import ( - FPCA, - PerClassFeatureTransformer, + PerClassTransformer, ) from skfda.preprocessing.dim_reduction.variable_selection import ( RecursiveMaximaHunting, ) -class TestPCFT(unittest.TestCase): +class TestPCT(unittest.TestCase): + def setUp(self) -> None: + X, y = fetch_growth(return_X_y=True, as_frame=True) + self.X = X.iloc[:, 0].values + self.y = y.values.codes - # This test fails because the transformers do not have yet tags implemented - @pytest.mark.skip(reason="Tags are not yet implemented on transformers") def test_transform(self) -> None: - X, y = fetch_growth(return_X_y=True, as_frame=True) - X = X.iloc[:, 0].values - y = y.values.codes - t = PerClassFeatureTransformer( + t = PerClassTransformer( RecursiveMaximaHunting(), - np_array_output=True, + array_output=True, ) - t.fit_transform(X, y) - transformed = t.transform(X) + t.fit_transform(self.X, self.y) + transformed = t.transform(self.X) - classes, y_ind = _classifier_get_classes(y) + classes, y_ind = _classifier_get_classes(self.y) for cur_class in range(classes.size): feature_transformer = RecursiveMaximaHunting().fit( - X[y_ind == cur_class], - y[y_ind == cur_class], + self.X[y_ind == cur_class], + self.y[y_ind == cur_class], ) - a = feature_transformer.transform(X) + a = feature_transformer.transform(self.X) np.testing.assert_array_equal(transformed[cur_class], a) def test_not_transformer_argument(self) -> None: + + t = PerClassTransformer(KNeighborsClassifier()) self.assertRaises( TypeError, - PerClassFeatureTransformer, - KNeighborsClassifier(), + t.fit, + self.X, + self.y, ) - def test_not_taget_required_fitting(self) -> None: - self.assertRaises(TypeError, PerClassFeatureTransformer, FPCA()) - if __name__ == '__main__': unittest.main() From d020ec346abbfae771aef503fb4a1983bb568f51 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Tue, 9 Nov 2021 17:27:28 +0100 Subject: [PATCH 10/50] Issue 376 corrections --- skfda/_utils/_utils.py | 2 +- .../feature_extraction/__init__.py | 2 +- ...ansformer.py => _per_class_transformer.py} | 29 ++++++++++++------- ...ormer.py => test_per_class_transformer.py} | 9 ++++-- 4 files changed, 27 insertions(+), 15 deletions(-) rename skfda/preprocessing/dim_reduction/feature_extraction/{_per_class_feature_transformer.py => _per_class_transformer.py} (88%) rename tests/{test_per_class_feature_transformer.py => test_per_class_transformer.py} (85%) diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index a9f474ea2..f1419a75c 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -35,8 +35,8 @@ DomainRangeLike, GridPoints, GridPointsLike, - NDArrayInt, NDArrayFloat, + NDArrayInt, ) from ..representation.extrapolation import ExtrapolationLike diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py 
b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py index 0b399631f..2bcf01bf5 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py @@ -1,4 +1,4 @@ """Feature extraction.""" from ._ddg_transformer import DDGTransformer from ._fpca import FPCA -from ._per_class_feature_transformer import PerClassTransformer +from ._per_class_transformer import PerClassTransformer diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py similarity index 88% rename from skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py rename to skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py index 67e6d99fb..da6da0d02 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py @@ -40,33 +40,41 @@ class PerClassTransformer(TransformerMixin): indicates if the transformed data is requested to be a NumPy array output. By default the value is False. Examples: - Firstly, we will import and split the Berkeley Growth Study dataset + Firstly, we will import the Berkeley Growth Study dataset >>> from skfda.datasets import fetch_growth - >>> from sklearn.model_selection import train_test_split >>> X, y = fetch_growth(return_X_y=True, as_frame=True) >>> X = X.iloc[:, 0].values >>> y = y.values.codes - >>> X_train, X_test, y_train, y_test = train_test_split( - ... X, y, test_size=0.25, stratify=y, random_state=0) >>> from skfda.preprocessing.dim_reduction.feature_extraction ... import PerClassTransformer Then we will need to select a fda transformer, and so we will - use RecursiveMaximaHunting + use RecursiveMaximaHunting. We need to fit the data and transform it >>> from skfda.preprocessing.dim_reduction.variable_selection ... import RecursiveMaximaHunting >>> t = PerClassTransformer(RecursiveMaximaHunting(), ... array_output=True) + >>> x_transformed = t.fit_transform(X, y) + + x_transformed will be a vector with the transformed data. + We will split the generated data and fit a KNN classifier. - Finally we need to fit the data and transform it + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.neighbors import KNeighborsClassifier + >>> X_train, X_test, y_train, y_test = train_test_split( + ... x_transformed, y, test_size=0.25, stratify=y, random_state=0) + >>> neigh = KNeighborsClassifier() + >>> neigh.fit(X_train, y_train) - >>> t.fit(X_train, y_train) - >>> x_transformed = t.transform(X_test) + Finally we can predict and check the score + >>> neigh.predict(X_test) + [0 0 1 0 1 1 1 0 0 0 0 1 1 0 0 0 0 1 1 1 1 1 1 1] - x_transformed will be a vector with the transformed data + >>> neigh.score(X_test, y_test) + 0.958 """ def __init__( @@ -107,6 +115,7 @@ def _validate_transformer( if tags['stateless'] and not tags['requires_y']: warnings.warn( "Transformer should use target data in fit." 
+ + "requires_y tag should be enabled and stateless disabled" + str(self.transformer) + " (type " + str(type(self.transformer)) + ")" " doesn't", @@ -153,7 +162,7 @@ def transform(self, X: T) -> Union[DataFrame, np.ndarray]: """ sklearn_check_is_fitted(self) - transformed_data = np.empty((93, 0)) + transformed_data = np.empty((len(X), 0)) for feature_transformer in self._class_feature_transformers_: elem = feature_transformer.transform(X) data = np.array(elem) diff --git a/tests/test_per_class_feature_transformer.py b/tests/test_per_class_transformer.py similarity index 85% rename from tests/test_per_class_feature_transformer.py rename to tests/test_per_class_transformer.py index bce891fc5..f2ef65326 100644 --- a/tests/test_per_class_feature_transformer.py +++ b/tests/test_per_class_transformer.py @@ -1,4 +1,4 @@ -"""Test to check the per class feature transformer module.""" +"""Test to check the per class transformer module.""" import unittest @@ -16,13 +16,16 @@ class TestPCT(unittest.TestCase): + """Tests for PCT module.""" + def setUp(self) -> None: + """Fetch the Berkeley Growth Study dataset.""" X, y = fetch_growth(return_X_y=True, as_frame=True) self.X = X.iloc[:, 0].values self.y = y.values.codes def test_transform(self) -> None: - + """Check the data transformation is done correctly.""" t = PerClassTransformer( RecursiveMaximaHunting(), array_output=True, @@ -43,7 +46,7 @@ def test_transform(self) -> None: np.testing.assert_array_equal(transformed, manual) def test_not_transformer_argument(self) -> None: - + """Check that invalid arguments in fit raise exception.""" t = PerClassTransformer(KNeighborsClassifier()) self.assertRaises( TypeError, From d245b1cfe20dc2299742ae4c3840daa7ed0d73f2 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Wed, 10 Nov 2021 19:38:36 +0100 Subject: [PATCH 11/50] Corrections --- .../_per_class_transformer.py | 43 ++++++++++--------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py index da6da0d02..ec2a76f68 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py @@ -47,16 +47,20 @@ class PerClassTransformer(TransformerMixin): >>> X = X.iloc[:, 0].values >>> y = y.values.codes - >>> from skfda.preprocessing.dim_reduction.feature_extraction - ... import PerClassTransformer + >>> from skfda.preprocessing.dim_reduction.feature_extraction import ( + ... PerClassTransformer, + ... ) Then we will need to select a fda transformer, and so we will use RecursiveMaximaHunting. We need to fit the data and transform it - >>> from skfda.preprocessing.dim_reduction.variable_selection - ... import RecursiveMaximaHunting - >>> t = PerClassTransformer(RecursiveMaximaHunting(), - ... array_output=True) + >>> from skfda.preprocessing.dim_reduction.variable_selection import ( + ... RecursiveMaximaHunting, + ... ) + >>> t = PerClassTransformer( + ... RecursiveMaximaHunting(), + ... array_output=True, + ... ) >>> x_transformed = t.fit_transform(X, y) x_transformed will be a vector with the transformed data. @@ -65,7 +69,12 @@ class PerClassTransformer(TransformerMixin): >>> from sklearn.model_selection import train_test_split >>> from sklearn.neighbors import KNeighborsClassifier >>> X_train, X_test, y_train, y_test = train_test_split( - ... 
x_transformed, y, test_size=0.25, stratify=y, random_state=0) + ... x_transformed, + ... y, + ... test_size=0.25, + ... stratify=y, + ... random_state=0, + ... ) >>> neigh = KNeighborsClassifier() >>> neigh.fit(X_train, y_train) @@ -99,10 +108,11 @@ def _validate_transformer( Returns: None """ - if not (hasattr(self.transformer, "fit") - and hasattr(self.transformer, "transform") - and hasattr(self.transformer, "fit_transform") - ): + if not ( + hasattr(self.transformer, "fit") + and hasattr(self.transformer, "transform") + and hasattr(self.transformer, "fit_transform") + ): raise TypeError( "Transformer should implement fit and " "transform. " + str(self.transformer) @@ -112,7 +122,7 @@ def _validate_transformer( tags = self.transformer._get_tags() - if tags['stateless'] and not tags['requires_y']: + if tags['stateless'] or not tags['requires_y']: warnings.warn( "Transformer should use target data in fit." + "requires_y tag should be enabled and stateless disabled" @@ -170,7 +180,7 @@ def transform(self, X: T) -> Union[DataFrame, np.ndarray]: if self.array_output: for i in transformed_data: - if isinstance(i, FDataGrid or FDataBasis): + if isinstance(i, (FDataGrid, FDataBasis)): raise TypeError( "There are transformed instances of FDataGrid or " "FDataBasis that can't be concatenated on a NumPy " @@ -178,13 +188,6 @@ def transform(self, X: T) -> Union[DataFrame, np.ndarray]: ) return np.array(transformed_data) - for j in transformed_data: - if not isinstance(j, FDataGrid or FDataBasis): - raise TypeError( - "Transformed instance is not of type FDataGrid or" - " FDataBasis. It is " + type(j), - ) - return DataFrame( {'Transformed data': transformed_data}, ) From 1f02e8098422c027692b9bd0bcd88df3c9886c44 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Thu, 11 Nov 2021 22:41:45 +0100 Subject: [PATCH 12/50] Mypy warnings ignored --- .../feature_extraction/_per_class_transformer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py index ec2a76f68..1b09e14e4 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py @@ -109,9 +109,9 @@ def _validate_transformer( None """ if not ( - hasattr(self.transformer, "fit") - and hasattr(self.transformer, "transform") - and hasattr(self.transformer, "fit_transform") + hasattr(self.transformer, "fit") # noqa: WPS421 + and hasattr(self.transformer, "transform") # noqa: WPS421 + and hasattr(self.transformer, "fit_transform") # noqa: WPS421 ): raise TypeError( "Transformer should implement fit and " @@ -120,7 +120,7 @@ def _validate_transformer( " doesn't", ) - tags = self.transformer._get_tags() + tags = self.transformer._get_tags() # noqa: WPS437 if tags['stateless'] or not tags['requires_y']: warnings.warn( From fde8ab00cd672e40347ecd358aad716a812513db Mon Sep 17 00:00:00 2001 From: dSerna4 <91683791+dSerna4@users.noreply.github.com> Date: Sat, 13 Nov 2021 19:54:40 +0100 Subject: [PATCH 13/50] Added _logistic_regression.py --- __init__.py | 12 +++ _logistic_regression.py | 220 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 232 insertions(+) create mode 100644 __init__.py create mode 100644 _logistic_regression.py diff --git a/__init__.py b/__init__.py new file mode 100644 index 000000000..213410993 --- /dev/null +++ b/__init__.py @@ -0,0 +1,12 @@ 
+"""Classification.""" +from ._centroid_classifiers import DTMClassifier, NearestCentroid +from ._depth_classifiers import ( + DDClassifier, + DDGClassifier, + MaximumDepthClassifier, +) +from ._neighbors_classifiers import ( + KNeighborsClassifier, + RadiusNeighborsClassifier, +) +from ._logistic_regression import LogisticRegression diff --git a/_logistic_regression.py b/_logistic_regression.py new file mode 100644 index 000000000..9194f32b3 --- /dev/null +++ b/_logistic_regression.py @@ -0,0 +1,220 @@ +from __future__ import annotations + +from typing import Callable, Tuple + +from numpy import append, array, ndarray, zeros +from numpy.core.fromnumeric import argmax, mean +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.linear_model import LogisticRegression as mvLogisticRegression +from sklearn.utils.validation import check_is_fitted + +from ..._utils import _classifier_get_classes +from ...representation import FData, FDataGrid + + +class LogisticRegression( + BaseEstimator, # type: ignore + ClassifierMixin, # type: ignore +): + r"""Logistic Regression classifier for functional data. + + This class implements the sequential “greedy” algorithm + for functional logistic regression proposed in + https://arxiv.org/abs/1812.00721. + + .. warning:: + For now, only functional data whith one dimensional domains + are supported. + + Args: + p (int): number of points (and coefficients) to be selected by + the algoritm. + + Attributes: + points\_: A list containing the selected points. + coef\_: A list containing the coefficient for each selected point. + intercept\_: Independent term. + + Examples: + >>> from numpy import array + >>> from skfda.datasets import make_gaussian_process + >>> from skfda.ml.classification import LogisticRegression + + >>> fd1 = make_gaussian_process(n_samples = 50, n_features = 100, + ... noise = 0.5, random_state = 0) + >>> fd2 = make_gaussian_process(n_samples=50, n_features = 100, + ... mean = array([1]*100), noise = 0.5, + ... random_state=0) + + >>> fd = fd1.concatenate(fd2) + >>> y = 50*[0] + 50*[1] + + >>> lr = LogisticRegression(p=2) + >>> _ = lr.fit(fd[::2], y[::2]) + >>> lr.coef_.round(2) + array([[ 2.41, 1.68]]) + >>> lr.points_.round(2) + array([ 0.11, 0. 
]) + >>> lr.score(fd[1::2],y[1::2]) + 0.92 + + """ + + def __init__( + self, + p: int = 5, + ) -> None: + + self.p = p + + def fit( + self, + X: FData, + y: ndarray, + ) -> LogisticRegression: + + X, classes, y_ind = self._argcheck_X_y(X, y) + + self.classes_ = classes + + n_samples = len(y) + n_features = len(X.grid_points[0]) + + ts = zeros((self.p, )) # set of indexes of the selected points + + mvlr = mvLogisticRegression() # multivariate logistic regression + ts_values = [[] for _ in range(n_samples)] + + LL = zeros((n_features, )) + for q in range(self.p): + for t in range(n_features): + + x_mv = self._multivariate_append( + ts_values, + X.data_matrix[:, t, 0], + ) + mvlr.fit(x_mv, y_ind) + + # log-likelihood function at t + log_probs = mvlr.predict_log_proba(x_mv) + log_probs = array( + [log_probs[i, y[i]] for i in range(n_samples)], + ) + LL[t] = mean(log_probs) + + tmax = argmax(LL) + ts[q] = tmax + ts_values = self._multivariate_append( + ts_values, + X.data_matrix[:, tmax, 0], + ) + + # fit for the complete set of points + mvlr.fit(ts_values, y_ind) + self.coef_ = mvlr.coef_ + self.intercept_ = mvlr.intercept_ + self._mvlr = mvlr + + self._ts = ts + self.points_ = array( + [X.grid_points[0][int(t)] for t in ts], # noqa: WPS441 + ) + + return self + + def predict(self, X: FData) -> ndarray: + return self._wrapper(self._mvlr.predict, X) + + def predict_log_proba(self, X: FData) -> ndarray: + return self._wrapper(self._mvlr.predict_log_proba, X) + + def predict_proba(self, X: FData) -> ndarray: + return self._wrapper(self._mvlr.predict_proba, X) + + def _argcheck_X( + self, + X: FData, + ) -> FDataGrid: + + X = X.to_grid() + + dim = len(X.grid_points) + if dim > 1: + raise ValueError( + f'The dimension of the domain has to be one' + f'; got {dim} dimensions', + ) + + return X + + def _argcheck_X_y( + self, + X: FData, + y: ndarray, + ) -> Tuple[FDataGrid, ndarray, ndarray]: + + self._argcheck_X(X) + + classes, y_ind = _classifier_get_classes(y) + + if classes.size > 2: + raise ValueError( + f'The number of classes has to be two' + f'; got {classes.size} classes', + ) + + if (len(y) != len(X)): + raise ValueError( + "The number of samples on independent variables" + " and classes should be the same", + ) + + return (X, classes, y_ind) + + def _to_multivariate( + self, + ts: ndarray, + X: FData, + ) -> ndarray: + """Transform the data for multivariate logistic regression.""" + X = self._argcheck_X(X) + + return array([X.data_matrix[:, int(t), 0] for t in ts]).T + + def _multivariate_append( + self, + a: ndarray, + b: ndarray, + ) -> ndarray: + """Append two arrays in a particular manner. + + Args: + a: ndarray of shape (n, m). + b: ndarray of shape (n,). + + Returns: + Array of shape (n, m + 1) + """ + return append(a, b.reshape(-1, 1), axis=1) + + def _wrapper( + self, + method: Callable[[ndarray], ndarray], + X: FData, + ): + """Wrap multivariate logistic regression method. + + This function transforms functional data in order to pass + them to a multivariate logistic regression method. + + .. warning:: + This function can't be called before fit. 
+ """ + + check_is_fitted(self) + + X = self._argcheck_X(X) + + ts_values = self._to_multivariate(self._ts, X) + + return method(ts_values) From a6d38fdca65277c65b7ef93439860604b7ec1ef6 Mon Sep 17 00:00:00 2001 From: dSerna4 <91683791+dSerna4@users.noreply.github.com> Date: Sat, 13 Nov 2021 22:32:39 +0100 Subject: [PATCH 14/50] fixed some style errors --- __init__.py | 2 +- _logistic_regression.py | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/__init__.py b/__init__.py index 213410993..c7eada4b7 100644 --- a/__init__.py +++ b/__init__.py @@ -5,8 +5,8 @@ DDGClassifier, MaximumDepthClassifier, ) +from ._logistic_regression import LogisticRegression from ._neighbors_classifiers import ( KNeighborsClassifier, RadiusNeighborsClassifier, ) -from ._logistic_regression import LogisticRegression diff --git a/_logistic_regression.py b/_logistic_regression.py index 9194f32b3..22713ae1a 100644 --- a/_logistic_regression.py +++ b/_logistic_regression.py @@ -67,7 +67,7 @@ def __init__( self.p = p - def fit( + def fit( # noqa: D102 self, X: FData, y: ndarray, @@ -122,13 +122,16 @@ def fit( return self - def predict(self, X: FData) -> ndarray: + def predict(self, X: FData) -> ndarray: # noqa: D102 + check_is_fitted(self) return self._wrapper(self._mvlr.predict, X) - def predict_log_proba(self, X: FData) -> ndarray: + def predict_log_proba(self, X: FData) -> ndarray: # noqa: D102 + check_is_fitted(self) return self._wrapper(self._mvlr.predict_log_proba, X) - def predict_proba(self, X: FData) -> ndarray: + def predict_proba(self, X: FData) -> ndarray: # noqa: D102 + check_is_fitted(self) return self._wrapper(self._mvlr.predict_proba, X) def _argcheck_X( @@ -211,8 +214,6 @@ def _wrapper( This function can't be called before fit. """ - check_is_fitted(self) - X = self._argcheck_X(X) ts_values = self._to_multivariate(self._ts, X) From 897eb1efffc452d130a80427b87d94a67566a26c Mon Sep 17 00:00:00 2001 From: Alvaro Date: Thu, 18 Nov 2021 21:21:41 +0100 Subject: [PATCH 15/50] pytest passed --- .../feature_extraction/_per_class_transformer.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py index 1b09e14e4..2044cab97 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py @@ -76,13 +76,14 @@ class PerClassTransformer(TransformerMixin): ... random_state=0, ... ) >>> neigh = KNeighborsClassifier() - >>> neigh.fit(X_train, y_train) + >>> neigh = neigh.fit(X_train, y_train) Finally we can predict and check the score >>> neigh.predict(X_test) - [0 0 1 0 1 1 1 0 0 0 0 1 1 0 0 0 0 1 1 1 1 1 1 1] + array([0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1], dtype=int8) - >>> neigh.score(X_test, y_test) + >>> round(neigh.score(X_test, y_test), 3) 0.958 """ @@ -125,7 +126,7 @@ def _validate_transformer( if tags['stateless'] or not tags['requires_y']: warnings.warn( "Transformer should use target data in fit." 
- + "requires_y tag should be enabled and stateless disabled" + + " requires_y tag should be enabled and stateless disabled" + str(self.transformer) + " (type " + str(type(self.transformer)) + ")" " doesn't", From 755c31f29f4739ffb5819ebbe7f3ee1db030df38 Mon Sep 17 00:00:00 2001 From: dSerna4 <91683791+dSerna4@users.noreply.github.com> Date: Thu, 18 Nov 2021 21:24:02 +0100 Subject: [PATCH 16/50] Add suggested changes --- _logistic_regression.py | 144 ++++++--------- refs.bib | 385 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 443 insertions(+), 86 deletions(-) create mode 100644 refs.bib diff --git a/_logistic_regression.py b/_logistic_regression.py index 22713ae1a..c310810c4 100644 --- a/_logistic_regression.py +++ b/_logistic_regression.py @@ -2,14 +2,14 @@ from typing import Callable, Tuple -from numpy import append, array, ndarray, zeros -from numpy.core.fromnumeric import argmax, mean +import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.linear_model import LogisticRegression as mvLogisticRegression from sklearn.utils.validation import check_is_fitted from ..._utils import _classifier_get_classes -from ...representation import FData, FDataGrid +from ...representation import FDataGrid +from ...representation._typing import NDArrayAny, NDArrayInt class LogisticRegression( @@ -20,17 +20,18 @@ class LogisticRegression( This class implements the sequential “greedy” algorithm for functional logistic regression proposed in - https://arxiv.org/abs/1812.00721. + :footcite:ts:`bueno+larraz_2021_functional`. .. warning:: - For now, only functional data whith one dimensional domains - are supported. + For now, only binary classification for functional + data with one dimensional domains is supported. Args: - p (int): number of points (and coefficients) to be selected by - the algoritm. + p: number of points (and coefficients) to be selected by + the algorithm. Attributes: + classes\_: A list containing the name of the classes points\_: A list containing the selected points. coef\_: A list containing the coefficient for each selected point. intercept\_: Independent term. @@ -39,16 +40,21 @@ class LogisticRegression( >>> from numpy import array >>> from skfda.datasets import make_gaussian_process >>> from skfda.ml.classification import LogisticRegression - - >>> fd1 = make_gaussian_process(n_samples = 50, n_features = 100, - ... noise = 0.5, random_state = 0) - >>> fd2 = make_gaussian_process(n_samples=50, n_features = 100, - ... mean = array([1]*100), noise = 0.5, - ... random_state=0) - + >>> fd1 = make_gaussian_process( + ... n_samples=50, + ... n_features=100, + ... noise=0.5, + ... random_state=0, + ... ) + >>> fd2 = make_gaussian_process( + ... n_samples=50, + ... n_features = 100, + ... mean = np.array([1]*100), + ... noise = 0.5, + ... random_state=0 + ... ) >>> fd = fd1.concatenate(fd2) >>> y = 50*[0] + 50*[1] - >>> lr = LogisticRegression(p=2) >>> _ = lr.fit(fd[::2], y[::2]) >>> lr.coef_.round(2) @@ -58,6 +64,9 @@ class LogisticRegression( >>> lr.score(fd[1::2],y[1::2]) 0.92 + References: + .. 
footbibliography:: + """ def __init__( @@ -69,8 +78,8 @@ def __init__( def fit( # noqa: D102 self, - X: FData, - y: ndarray, + X: FDataGrid, + y: NDArrayAny, ) -> LogisticRegression: X, classes, y_ind = self._argcheck_X_y(X, y) @@ -80,83 +89,73 @@ def fit( # noqa: D102 n_samples = len(y) n_features = len(X.grid_points[0]) - ts = zeros((self.p, )) # set of indexes of the selected points + selected_indexes = np.zeros(self.p, dtype=np.intc) mvlr = mvLogisticRegression() # multivariate logistic regression - ts_values = [[] for _ in range(n_samples)] - LL = zeros((n_features, )) + x_mv = np.zeros((n_samples, self.p)) + LL = np.zeros(n_features) for q in range(self.p): for t in range(n_features): - x_mv = self._multivariate_append( - ts_values, - X.data_matrix[:, t, 0], - ) - mvlr.fit(x_mv, y_ind) + x_mv[:, q] = X.data_matrix[:, t, 0] + mvlr.fit(x_mv[:, :self.p + 1], y_ind) # log-likelihood function at t log_probs = mvlr.predict_log_proba(x_mv) - log_probs = array( + log_probs = np.array( [log_probs[i, y[i]] for i in range(n_samples)], ) - LL[t] = mean(log_probs) + LL[t] = np.mean(log_probs) - tmax = argmax(LL) - ts[q] = tmax - ts_values = self._multivariate_append( - ts_values, - X.data_matrix[:, tmax, 0], - ) + tmax = np.argmax(LL) + selected_indexes[q] = tmax + x_mv[:, q] = X.data_matrix[:, tmax, 0] # fit for the complete set of points - mvlr.fit(ts_values, y_ind) + mvlr.fit(x_mv, y_ind) + self.coef_ = mvlr.coef_ self.intercept_ = mvlr.intercept_ self._mvlr = mvlr - self._ts = ts - self.points_ = array( - [X.grid_points[0][int(t)] for t in ts], # noqa: WPS441 - ) + self._selected_indexes = selected_indexes + self.points_ = X.grid_points[0][selected_indexes] return self - def predict(self, X: FData) -> ndarray: # noqa: D102 + def predict(self, X: FDataGrid) -> NDArrayInt: # noqa: D102 check_is_fitted(self) return self._wrapper(self._mvlr.predict, X) - def predict_log_proba(self, X: FData) -> ndarray: # noqa: D102 + def predict_log_proba(self, X: FDataGrid) -> NDArrayInt: # noqa: D102 check_is_fitted(self) return self._wrapper(self._mvlr.predict_log_proba, X) - def predict_proba(self, X: FData) -> ndarray: # noqa: D102 + def predict_proba(self, X: FDataGrid) -> NDArrayInt: # noqa: D102 check_is_fitted(self) return self._wrapper(self._mvlr.predict_proba, X) - def _argcheck_X( + def _argcheck_X( # noqa: N802 self, - X: FData, + X: FDataGrid, ) -> FDataGrid: - X = X.to_grid() - - dim = len(X.grid_points) - if dim > 1: + if X.dim_domain > 1: raise ValueError( f'The dimension of the domain has to be one' - f'; got {dim} dimensions', + f'; got {X.dim_domain} dimensions', ) return X - def _argcheck_X_y( + def _argcheck_X_y( # noqa: N802 self, - X: FData, - y: ndarray, - ) -> Tuple[FDataGrid, ndarray, ndarray]: + X: FDataGrid, + y: NDArrayAny, + ) -> Tuple[FDataGrid, NDArrayAny, NDArrayAny]: - self._argcheck_X(X) + X = self._argcheck_X(X) classes, y_ind = _classifier_get_classes(y) @@ -174,37 +173,11 @@ def _argcheck_X_y( return (X, classes, y_ind) - def _to_multivariate( - self, - ts: ndarray, - X: FData, - ) -> ndarray: - """Transform the data for multivariate logistic regression.""" - X = self._argcheck_X(X) - - return array([X.data_matrix[:, int(t), 0] for t in ts]).T - - def _multivariate_append( - self, - a: ndarray, - b: ndarray, - ) -> ndarray: - """Append two arrays in a particular manner. - - Args: - a: ndarray of shape (n, m). - b: ndarray of shape (n,). 
- - Returns: - Array of shape (n, m + 1) - """ - return append(a, b.reshape(-1, 1), axis=1) - def _wrapper( self, - method: Callable[[ndarray], ndarray], - X: FData, - ): + method: Callable[[NDArrayAny], NDArrayAny], + X: FDataGrid, + ) -> NDArrayAny: """Wrap multivariate logistic regression method. This function transforms functional data in order to pass @@ -213,9 +186,8 @@ def _wrapper( .. warning:: This function can't be called before fit. """ - X = self._argcheck_X(X) - ts_values = self._to_multivariate(self._ts, X) + x_mv = X.data_matrix[:, self._selected_indexes, 0] - return method(ts_values) + return method(x_mv) diff --git a/refs.bib b/refs.bib new file mode 100644 index 000000000..058b6d588 --- /dev/null +++ b/refs.bib @@ -0,0 +1,385 @@ +@article{berrendero+cuevas+torrecilla_2016_hunting, + author = {Berrendero, J.R. and Cuevas, Antonio and Torrecilla, José}, + year = {2016}, + pages = {619 -- 638}, + title = {Variable selection in functional data classification: A maxima-hunting proposal}, + number = {2}, + volume = {26}, + journal = {Statistica Sinica}, + doi = {10.5705/ss.202014.0014} +} + +@article{berrendero+cuevas+torrecilla_2018_hilbert, + author = {José R. Berrendero and Antonio Cuevas and José L. Torrecilla}, + title = {On the Use of Reproducing Kernel Hilbert Spaces in Functional Classification}, + journal = {Journal of the American Statistical Association}, + volume = {113}, + number = {523}, + pages = {1210 -- 1218}, + year = {2018}, + publisher = {Taylor & Francis}, + doi = {10.1080/01621459.2017.1320287}, + URL = {https://doi.org/10.1080/01621459.2017.1320287} +} + +@inproceedings{breunig++_2000_outliers, + author = {Breunig, Markus and Kriegel, Hans-Peter and Ng, Raymond and Sander, Joerg}, + year = {2000}, + month = {06}, + pages = {93 -- 104}, + title = {LOF: Identifying Density-Based Local Outliers.}, + volume = {29}, + journal = {ACM Sigmod Record}, + doi = {10.1145/342009.335388} +} + +@article{bueno+larraz_2021_functional, + title={On functional logistic regression: some conceptual issues}, + author={Beatriz Bueno-Larraz and José R. Berrendero and Antonio Cuevas}, + year={2021}, + eprint={1812.00721}, + archivePrefix={arXiv}, + primaryClass={math.ST} +} + +@article{cuesta-albertos++_2015_ddg, + title = {The DDG-classifier in the functional setting}, + author = {J. A. Cuesta-Albertos and M. Febrero-Bande and M. Oviedo de la Fuente}, + journal = {TEST}, + year = {2015}, + volume = {26}, + pages = {119 -- 142} +} + +@article{cuevas++_2004_anova + author = {Cuevas, Antonio and Febrero-Bande, Manuel and Fraiman, Ricardo}, + year = {2004}, + month = {02}, + pages = {111 -- 122}, + title = {An ANOVA test for functional data}, + volume = {47}, + journal = {Computational Statistics & Data Analysis}, + doi = {10.1016/j.csda.2003.10.021} +} + +@article{dai+genton_2018_visualization, + author = {Wenlin Dai and Marc G. Genton}, + title = {Multivariate Functional Data Visualization and Outlier Detection}, + journal = {Journal of Computational and Graphical Statistics}, + volume = {27}, + number = {4}, + pages = {923 -- 934}, + year = {2018}, + publisher = {Taylor & Francis}, + doi = {10.1080/10618600.2018.1473781}, + URL = {https://doi.org/10.1080/10618600.2018.1473781} +} + +@inbook{ferraty+vieu_2006_nonparametric_knn, + author = {Frédéric Ferraty and Philippe Vieu}, + title = {Nonparametric Functional Data Analysis. 
Theory and Practice}, + chapter = {Functional Nonparametric Supervised Classification}, + pages = {116}, + publisher = {Springer-Verlag New York}, + year = {2006}, + isbn = {978-0-387-30369-7}, + doi = {10.1007/0-387-36620-2} +} + +@article{fraiman+muniz_2001_trimmed, + author = {Fraiman, Ricardo and Muniz, Graciela}, + year = {2001}, + month = {02}, + pages = {419 -- 440}, + title = {Trimmed means for functional data}, + volume = {10}, + journal = {TEST: An Official Journal of the Spanish Society of Statistics and Operations Research}, + doi = {10.1007/BF02595706} +} + +@article{gervini_2008_estimation, + author = {Gervini, Daniel}, + title = "{Robust functional estimation using the median and spherical principal components}", + journal = {Biometrika}, + volume = {95}, + number = {3}, + pages = {587 -- 600}, + year = {2008}, + month = {09}, + issn = {0006-3444}, + doi = {10.1093/biomet/asn031}, + url = {https://doi.org/10.1093/biomet/asn031} +} + +@article{ghosh+chaudhuri_2005_depth, + author = {Ghosh, Anil and Chaudhuri, Probal}, + year = {2005}, + month = {02}, + pages = {327 -- 350}, + title = {On Maximum Depth and Related Classifiers}, + volume = {32}, + journal = {Scandinavian Journal of Statistics}, + doi = {10.1111/j.1467-9469.2005.00423.x} +} + +@article{pini+stamm+vantini_2018_hotellings, + title = {Hotelling's T2 in separable Hilbert spaces}, + author = {Alessia Pini and Aymeric Stamm and Simone Vantini}, + journal = {Journal of Multivariate Analysis}, + year = {2018}, + month = {05}, + volume = {167}, + pages = {284 -- 305}, + doi = {10.1016/j.jmva.2018.05.007} +} + +@inbook{ramsay+silverman_2005_functional_bspline, + author = {James Ramsay and B. W. Silverman}, + title = {Functional Data Analysis}, + chapter = {From functional data to smooth functions}, + pages = {50 -- 51}, + publisher = {Springer-Verlag New York}, + year = {2005}, + isbn = {978-0-387-40080-8}, + doi = {110.1007/b98888} +} + +@inbook{ramsay+silverman_2005_functional_spline, + author = {James Ramsay and B. W. Silverman}, + title = {Functional Data Analysis}, + chapter = {Smoothing functional data with a roughness penalty}, + pages = {86 -- 87}, + publisher = {Springer-Verlag New York}, + year = {2005}, + isbn = {978-0-387-40080-8}, + doi = {110.1007/b98888} +} + +@inbook{ramsay+silverman_2005_functional_spline_squares, + author = {James Ramsay and B. W. Silverman}, + title = {Functional Data Analysis}, + chapter = {Smoothing functional data with a roughness penalty}, + pages = {89 -- 90}, + publisher = {Springer-Verlag New York}, + year = {2005}, + isbn = {978-0-387-40080-8}, + doi = {110.1007/b98888} +} + +@inbook{ramsay+silverman_2005_functional_shift, + author = {James Ramsay and B. W. Silverman}, + title = {Functional Data Analysis}, + chapter = {The registration and display of functional data}, + pages = {129 -- 132}, + publisher = {Springer-Verlag New York}, + year = {2005}, + isbn = {978-0-387-40080-8}, + doi = {110.1007/b98888} +} + +@inbook{ramsay+silverman_2005_functional_landmark, + author = {James Ramsay and B. W. Silverman}, + title = {Functional Data Analysis}, + chapter = {The registration and display of functional data}, + pages = {132 -- 136}, + publisher = {Springer-Verlag New York}, + year = {2005}, + isbn = {978-0-387-40080-8}, + doi = {110.1007/b98888} +} + +@inbook{ramsay+silverman_2005_functional_newton-raphson, + author = {James Ramsay and B. W. 
Silverman}, + title = {Functional Data Analysis}, + chapter = {The registration and display of functional data}, + pages = {142 -- 144}, + publisher = {Springer-Verlag New York}, + year = {2005}, + isbn = {978-0-387-40080-8}, + doi = {110.1007/b98888} +} + +@inbook{ramsay+silverman_2005_functional_discretizing, + author = {James Ramsay and B. W. Silverman}, + title = {Functional Data Analysis}, + chapter = {Principal components analysis for functional data}, + pages = {161}, + publisher = {Springer-Verlag New York}, + year = {2005}, + isbn = {978-0-387-40080-8}, + doi = {110.1007/b98888} +} + +@inbook{ramsay+silverman_2005_functional_basis, + author = {James Ramsay and B. W. Silverman}, + title = {Functional Data Analysis}, + chapter = {Principal components analysis for functional data}, + pages = {161 -- 164}, + publisher = {Springer-Verlag New York}, + year = {2005}, + isbn = {978-0-387-40080-8}, + doi = {110.1007/b98888} +} + +@inbook{srivastava+klassen_2016_analysis_elastic, + author = {Srivastava, Anuj and Klassen, Eric}, + title = {Functional and Shape Data Analysis}, + chapter = {Functional Data and Elastic Registration}, + pages = {73 -- 122}, + publisher = {Springer-Verlag New York}, + year = {2016}, + isbn = {978-1-4939-4018-9}, + doi = {10.1007/978-1-4939-4020-2} +} + +@inbook{srivastava+klassen_2016_analysis_square, + author = {Srivastava, Anuj and Klassen, Eric}, + title = {Functional and Shape Data Analysis}, + chapter = {Functional Data and Elastic Registration}, + pages = {91 -- 93}, + publisher = {Springer-Verlag New York}, + year = {2016}, + isbn = {978-1-4939-4018-9}, + doi = {10.1007/978-1-4939-4020-2} +} + +@inbook{srivastava+klassen_2016_analysis_amplitude, + author = {Srivastava, Anuj and Klassen, Eric}, + title = {Functional and Shape Data Analysis}, + chapter = {Functional Data and Elastic Registration}, + pages = {107 -- 109}, + publisher = {Springer-Verlag New York}, + year = {2016}, + isbn = {978-1-4939-4018-9}, + doi = {10.1007/978-1-4939-4020-2} +} + +@inbook{srivastava+klassen_2016_analysis_phase, + author = {Srivastava, Anuj and Klassen, Eric}, + title = {Functional and Shape Data Analysis}, + chapter = {Functional Data and Elastic Registration}, + pages = {109 -- 111}, + publisher = {Springer-Verlag New York}, + year = {2016}, + isbn = {978-1-4939-4018-9}, + doi = {10.1007/978-1-4939-4020-2} +} + +@inbook{srivastava+klassen_2016_analysis_probability, + author = {Srivastava, Anuj and Klassen, Eric}, + title = {Functional and Shape Data Analysis}, + chapter = {Functional Data and Elastic Registration}, + pages = {113 -- 117}, + publisher = {Springer-Verlag New York}, + year = {2016}, + isbn = {978-1-4939-4018-9}, + doi = {10.1007/978-1-4939-4020-2} +} + +@inbook{srivastava+klassen_2016_analysis_karcher, + author = {Srivastava, Anuj and Klassen, Eric}, + title = {Functional and Shape Data Analysis}, + chapter = {Statistical Modeling of Functional Data}, + pages = {273 -- 274}, + publisher = {Springer-Verlag New York}, + year = {2016}, + isbn = {978-1-4939-4018-9}, + doi = {10.1007/978-1-4939-4020-2} +} + +@inbook{srivastava+klassen_2016_analysis_orbit, + author = {Srivastava, Anuj and Klassen, Eric}, + title = {Functional and Shape Data Analysis}, + chapter = {Statistical Modeling of Functional Data}, + pages = {274 -- 277}, + publisher = {Springer-Verlag New York}, + year = {2016}, + isbn = {978-1-4939-4018-9}, + doi = {10.1007/978-1-4939-4020-2} +} + +@article{srivastava++_2011_ficher-rao, + author = {Srivastava, Anuj and Wu, Wei and Kurtek, Sebastian and 
Klassen, Eric and Marron, J.}, + year = {2011}, + journal={}, + title = {Registration of Functional Data Using Fisher-Rao Metric}, + pages = {5 -- 7}, + URL = {https://arxiv.org/abs/1103.3817v2} +} + +@article{srivastava++_2011_ficher-rao_karcher, + author = {Srivastava, Anuj and Wu, Wei and Kurtek, Sebastian and Klassen, Eric and Marron, J.}, + year = {2011}, + journal={}, + title = {Registration of Functional Data Using Fisher-Rao Metric}, + pages = {7 -- 10}, + URL = {https://arxiv.org/abs/1103.3817v2} +} + +@article{srivastava++_2011_ficher-rao_orbit, + author = {Srivastava, Anuj and Wu, Wei and Kurtek, Sebastian and Klassen, Eric and Marron, J.}, + year = {2011}, + journal={}, + title = {Registration of Functional Data Using Fisher-Rao Metric}, + pages = {9 -- 10}, + URL = {https://arxiv.org/abs/1103.3817v2} +} + +@article{sun+genton_2011_boxplots, + author = {Ying Sun and Marc G. Genton}, + title = {Functional Boxplots}, + journal = {Journal of Computational and Graphical Statistics}, + volume = {20}, + number = {2}, + pages = {316 -- 334}, + year = {2011}, + publisher = {Taylor & Francis}, + doi = {10.1198/jcgs.2011.09224}, + URL = {https://doi.org/10.1198/jcgs.2011.09224} +} + +@article{szekely+rizzo_2010_brownian, + author = {Gábor J. Székely and Maria L. Rizzo}, + title = {Brownian distance covariance}, + volume = {3}, + journal = {The Annals of Applied Statistics}, + number = {4}, + publisher = {Institute of Mathematical Statistics}, + pages = {1236 -- 1265}, + year = {2009}, + doi = {10.1214/09-AOAS312}, + URL = {https://doi.org/10.1214/09-AOAS312} +} + +@inproceedings{torrecilla+suarez_2016_hunting, + author = {Torrecilla, Jose L. and Su\'{a}rez, Alberto}, + title = {Feature Selection in Functional Data Classification with Recursive Maxima Hunting}, + year = {2016}, + volume = {29}, + publisher = {Curran Associates Inc.}, + booktitle = {Proceedings of the 30th International Conference on Neural Information Processing Systems}, + pages = {4835 -- 4843}, + series = {NIPS'16} +} + +@inbook{wasserman_2006_nonparametric_nw, + author = {Larry Wasserman}, + title = {All of Nonparametric Statistics}, + chapter = {Nonparametric Regression}, + pages = {71}, + publisher = {Springer-Verlag New York}, + year = {2006}, + isbn = {978-0-387-25145-5}, + doi = {10.1007/0-387-30623-4} +} + +@inbook{wasserman_2006_nonparametric_llr, + author = {Larry Wasserman}, + title = {All of Nonparametric Statistics}, + chapter = {Nonparametric Regression}, + pages = {77}, + publisher = {Springer-Verlag New York}, + year = {2006}, + isbn = {978-0-387-25145-5}, + doi = {10.1007/0-387-30623-4} +} \ No newline at end of file From a5165e75371f8d86c63632032080660d12b2a892 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Sun, 21 Nov 2021 00:14:41 +0100 Subject: [PATCH 17/50] More mypy corrections --- skfda/_utils/_utils.py | 2 +- .../feature_extraction/_per_class_transformer.py | 2 +- tests/test_per_class_transformer.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index f1419a75c..67aeb54d5 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -736,7 +736,7 @@ def _classifier_fit_depth_methods( def _fit_feature_transformer( X: Union[NDArrayInt, NDArrayFloat], y: Union[NDArrayInt, NDArrayFloat], - transformer: TransformerMixin, + transformer: TransformerMixin, # type: ignore ) -> Tuple[Union[NDArrayInt, NDArrayFloat], Sequence[TransformerMixin]]: classes, y_ind = _classifier_get_classes(y) diff --git 
a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py index 2044cab97..3e1415ef9 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py @@ -16,7 +16,7 @@ T = TypeVar("T", bound=FData) -class PerClassTransformer(TransformerMixin): +class PerClassTransformer(TransformerMixin): # type: ignore r"""Per class feature transformer for functional data. This class takes a transformer and performs the following map: diff --git a/tests/test_per_class_transformer.py b/tests/test_per_class_transformer.py index f2ef65326..a4c6d671d 100644 --- a/tests/test_per_class_transformer.py +++ b/tests/test_per_class_transformer.py @@ -27,7 +27,7 @@ def setUp(self) -> None: def test_transform(self) -> None: """Check the data transformation is done correctly.""" t = PerClassTransformer( - RecursiveMaximaHunting(), + RecursiveMaximaHunting(), # type: ignore array_output=True, ) t.fit_transform(self.X, self.y) @@ -36,7 +36,7 @@ def test_transform(self) -> None: manual = np.empty((93, 0)) classes, y_ind = _classifier_get_classes(self.y) for cur_class in range(classes.size): - feature_transformer = RecursiveMaximaHunting().fit( + feature_transformer = RecursiveMaximaHunting().fit( # type: ignore self.X[y_ind == cur_class], self.y[y_ind == cur_class], ) @@ -47,7 +47,7 @@ def test_transform(self) -> None: def test_not_transformer_argument(self) -> None: """Check that invalid arguments in fit raise exception.""" - t = PerClassTransformer(KNeighborsClassifier()) + t = PerClassTransformer(KNeighborsClassifier()) # type: ignore self.assertRaises( TypeError, t.fit, From 4a49a2b672b9ba83da010334342da84ce10b614f Mon Sep 17 00:00:00 2001 From: Alvaro Date: Sun, 21 Nov 2021 00:31:53 +0100 Subject: [PATCH 18/50] Another mypy error corrected --- .../dim_reduction/feature_extraction/_per_class_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py index 3e1415ef9..5e24aca16 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py @@ -91,7 +91,7 @@ def __init__( self, transformer: TransformerMixin, *, - array_output=False, + array_output: bool = False, ) -> None: self.transformer = transformer self.array_output = array_output From 3ec4e8b2912e6043414535ac8f575132e8b2dd8f Mon Sep 17 00:00:00 2001 From: dSerna4 <91683791+dSerna4@users.noreply.github.com> Date: Fri, 26 Nov 2021 18:26:57 +0100 Subject: [PATCH 19/50] Fix error in fit method and improve doctest --- _logistic_regression.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/_logistic_regression.py b/_logistic_regression.py index c310810c4..ec340d21f 100644 --- a/_logistic_regression.py +++ b/_logistic_regression.py @@ -43,26 +43,26 @@ class LogisticRegression( >>> fd1 = make_gaussian_process( ... n_samples=50, ... n_features=100, - ... noise=0.5, + ... noise=0.7, ... random_state=0, ... ) >>> fd2 = make_gaussian_process( ... n_samples=50, ... n_features = 100, - ... mean = np.array([1]*100), - ... noise = 0.5, + ... mean = array([1]*100), + ... noise = 0.7, ... random_state=0 ... 
) >>> fd = fd1.concatenate(fd2) >>> y = 50*[0] + 50*[1] - >>> lr = LogisticRegression(p=2) + >>> lr = LogisticRegression() >>> _ = lr.fit(fd[::2], y[::2]) >>> lr.coef_.round(2) - array([[ 2.41, 1.68]]) + array([[ 1.28, 1.17, 1.27, 1.27, 0.96]]) >>> lr.points_.round(2) - array([ 0.11, 0. ]) + array([ 0.11, 0.06, 0.07, 0.03, 0. ]) >>> lr.score(fd[1::2],y[1::2]) - 0.92 + 0.94 References: .. footbibliography:: @@ -99,10 +99,10 @@ def fit( # noqa: D102 for t in range(n_features): x_mv[:, q] = X.data_matrix[:, t, 0] - mvlr.fit(x_mv[:, :self.p + 1], y_ind) + mvlr.fit(x_mv[:, :q + 1], y_ind) # log-likelihood function at t - log_probs = mvlr.predict_log_proba(x_mv) + log_probs = mvlr.predict_log_proba(x_mv[:, :q + 1]) log_probs = np.array( [log_probs[i, y[i]] for i in range(n_samples)], ) From 59260e8e4360872093186a4660e9aba2d3bf6c58 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Tue, 7 Dec 2021 20:28:11 +0100 Subject: [PATCH 20/50] Closes #376 --- skfda/_utils/_utils.py | 6 ++- .../_per_class_transformer.py | 48 +++++++++++-------- tests/test_per_class_transformer.py | 2 +- 3 files changed, 33 insertions(+), 23 deletions(-) diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index 8a724e62e..ef789c967 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -48,6 +48,10 @@ from ..representation.basis import Basis T = TypeVar("T", bound=FData) +Input = TypeVar("Input") +Output = TypeVar("Output") +Target = TypeVar("Target") + def check_is_univariate(fd: FData) -> None: """Check if an FData is univariate and raises an error. @@ -749,7 +753,7 @@ def _classifier_fit_depth_methods( def _fit_feature_transformer( X: Union[NDArrayInt, NDArrayFloat], y: Union[NDArrayInt, NDArrayFloat], - transformer: TransformerMixin, # type: ignore + transformer: TransformerMixin[Input, Output, Target], ) -> Tuple[Union[NDArrayInt, NDArrayFloat], Sequence[TransformerMixin]]: classes, y_ind = _classifier_get_classes(y) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py index 5e24aca16..4435579e7 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py @@ -14,9 +14,12 @@ from ....representation.grid import FData, FDataGrid T = TypeVar("T", bound=FData) +Input = TypeVar("Input") +Output = TypeVar("Output") +Target = TypeVar("Target") -class PerClassTransformer(TransformerMixin): # type: ignore +class PerClassTransformer(TransformerMixin): r"""Per class feature transformer for functional data. This class takes a transformer and performs the following map: @@ -32,15 +35,15 @@ class PerClassTransformer(TransformerMixin): # type: ignore :math:`\mathcal{X} = \mathcal{X}_1 \times ... \times \mathcal{X}_p`. Parameters: - transformer: TransformerMixin + transformer: The transformer that we want to apply to the given data. It should use target data while fitting. This is checked by looking at the 'stateless' and 'requires_y' tags - array_output: bool - indicates if the transformed data is requested to be a NumPy array + array_output: + Indicates if the transformed data is requested to be a NumPy array output. By default the value is False. 
Examples: - Firstly, we will import the Berkeley Growth Study dataset + Firstly, we will import the Berkeley Growth Study dataset: >>> from skfda.datasets import fetch_growth >>> X, y = fetch_growth(return_X_y=True, as_frame=True) @@ -52,7 +55,7 @@ class PerClassTransformer(TransformerMixin): # type: ignore ... ) Then we will need to select a fda transformer, and so we will - use RecursiveMaximaHunting. We need to fit the data and transform it + use RecursiveMaximaHunting. We need to fit the data and transform it: >>> from skfda.preprocessing.dim_reduction.variable_selection import ( ... RecursiveMaximaHunting, @@ -63,7 +66,7 @@ class PerClassTransformer(TransformerMixin): # type: ignore ... ) >>> x_transformed = t.fit_transform(X, y) - x_transformed will be a vector with the transformed data. + ``x_transformed`` will be a vector with the transformed data. We will split the generated data and fit a KNN classifier. >>> from sklearn.model_selection import train_test_split @@ -78,7 +81,7 @@ class PerClassTransformer(TransformerMixin): # type: ignore >>> neigh = KNeighborsClassifier() >>> neigh = neigh.fit(X_train, y_train) - Finally we can predict and check the score + Finally we can predict and check the score: >>> neigh.predict(X_test) array([0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], dtype=int8) @@ -89,7 +92,7 @@ class PerClassTransformer(TransformerMixin): # type: ignore def __init__( self, - transformer: TransformerMixin, + transformer: TransformerMixin[Input, Output, Target], *, array_output: bool = False, ) -> None: @@ -100,8 +103,10 @@ def _validate_transformer( self, ) -> None: """ - Check that the transformer passed is\ - scikit-learn-like and that uses target data in fit. + Check that the transformer passed is valid. + + Check that it is scikit-learn-like and that + uses target data in fit. Args: None @@ -125,11 +130,10 @@ def _validate_transformer( if tags['stateless'] or not tags['requires_y']: warnings.warn( - "Transformer should use target data in fit." - + " requires_y tag should be enabled and stateless disabled" - + str(self.transformer) - + " (type " + str(type(self.transformer)) + ")" - " doesn't", + f"Parameter ``transformer`` with type" # noqa: WPS237 + f" {type(self.transformer)} should use class information." + f" It should have the ``requires_y`` tag set to ``True`` and" + f" the ``stateless`` tag set to ``False``", ) def fit( @@ -138,8 +142,9 @@ def fit( y: np.ndarray, ) -> PerClassTransformer: """ - Fit the model on each class using X as\ - training data and y as target values. + Fit the model on each class. + + It uses X as training data and y as target values. Args: X: FDataGrid with the training data. @@ -168,7 +173,7 @@ def transform(self, X: T) -> Union[DataFrame, np.ndarray]: X: FDataGrid with the test samples. Returns: - Eiter array of shape (n_samples, G) or a Data Frame \ + Eiter array of shape (n_samples, G) or a Data Frame including the transformed data. """ sklearn_check_is_fitted(self) @@ -199,8 +204,9 @@ def fit_transform( y: np.ndarray, ) -> Union[DataFrame, np.ndarray]: """ - Fits and transforms the provided data\ - using the transformer specified when initializing the class. + Fits and transforms the provided data. + + It uses the transformer specified when initializing the class. Args: X: FDataGrid with the samples. 
diff --git a/tests/test_per_class_transformer.py b/tests/test_per_class_transformer.py index a4c6d671d..666b914f9 100644 --- a/tests/test_per_class_transformer.py +++ b/tests/test_per_class_transformer.py @@ -15,7 +15,7 @@ ) -class TestPCT(unittest.TestCase): +class TestPerClassTransformer(unittest.TestCase): """Tests for PCT module.""" def setUp(self) -> None: From 3a10a7f2cd8523e53d6006e592fff9f93ddb6148 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 9 Dec 2021 18:55:45 +0100 Subject: [PATCH 21/50] Move logistic regression to where it belongs. --- __init__.py | 12 ------------ skfda/ml/classification/__init__.py | 1 + .../ml/classification/_logistic_regression.py | 0 3 files changed, 1 insertion(+), 12 deletions(-) delete mode 100644 __init__.py rename _logistic_regression.py => skfda/ml/classification/_logistic_regression.py (100%) diff --git a/__init__.py b/__init__.py deleted file mode 100644 index c7eada4b7..000000000 --- a/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -"""Classification.""" -from ._centroid_classifiers import DTMClassifier, NearestCentroid -from ._depth_classifiers import ( - DDClassifier, - DDGClassifier, - MaximumDepthClassifier, -) -from ._logistic_regression import LogisticRegression -from ._neighbors_classifiers import ( - KNeighborsClassifier, - RadiusNeighborsClassifier, -) diff --git a/skfda/ml/classification/__init__.py b/skfda/ml/classification/__init__.py index 2e5689d8b..c7eada4b7 100644 --- a/skfda/ml/classification/__init__.py +++ b/skfda/ml/classification/__init__.py @@ -5,6 +5,7 @@ DDGClassifier, MaximumDepthClassifier, ) +from ._logistic_regression import LogisticRegression from ._neighbors_classifiers import ( KNeighborsClassifier, RadiusNeighborsClassifier, diff --git a/_logistic_regression.py b/skfda/ml/classification/_logistic_regression.py similarity index 100% rename from _logistic_regression.py rename to skfda/ml/classification/_logistic_regression.py From 6c3828da5b01b0e605379432c846766424d9277b Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 9 Dec 2021 19:15:45 +0100 Subject: [PATCH 22/50] Change refs. --- docs/refs.bib | 9 ++ refs.bib | 385 -------------------------------------------------- 2 files changed, 9 insertions(+), 385 deletions(-) delete mode 100644 refs.bib diff --git a/docs/refs.bib b/docs/refs.bib index 86bcbaa32..e788da62e 100644 --- a/docs/refs.bib +++ b/docs/refs.bib @@ -33,6 +33,15 @@ @inproceedings{breunig++_2000_outliers doi = {10.1145/342009.335388} } +@article{bueno+larraz_2021_functional, + title={On functional logistic regression: some conceptual issues}, + author={Beatriz Bueno-Larraz and José R. Berrendero and Antonio Cuevas}, + year={2021}, + eprint={1812.00721}, + archivePrefix={arXiv}, + primaryClass={math.ST} +} + @article{cuesta-albertos++_2015_ddg, title = {The DDG-classifier in the functional setting}, author = {J. A. Cuesta-Albertos and M. Febrero-Bande and M. Oviedo de la Fuente}, diff --git a/refs.bib b/refs.bib deleted file mode 100644 index 058b6d588..000000000 --- a/refs.bib +++ /dev/null @@ -1,385 +0,0 @@ -@article{berrendero+cuevas+torrecilla_2016_hunting, - author = {Berrendero, J.R. and Cuevas, Antonio and Torrecilla, José}, - year = {2016}, - pages = {619 -- 638}, - title = {Variable selection in functional data classification: A maxima-hunting proposal}, - number = {2}, - volume = {26}, - journal = {Statistica Sinica}, - doi = {10.5705/ss.202014.0014} -} - -@article{berrendero+cuevas+torrecilla_2018_hilbert, - author = {José R. Berrendero and Antonio Cuevas and José L. 
Torrecilla}, - title = {On the Use of Reproducing Kernel Hilbert Spaces in Functional Classification}, - journal = {Journal of the American Statistical Association}, - volume = {113}, - number = {523}, - pages = {1210 -- 1218}, - year = {2018}, - publisher = {Taylor & Francis}, - doi = {10.1080/01621459.2017.1320287}, - URL = {https://doi.org/10.1080/01621459.2017.1320287} -} - -@inproceedings{breunig++_2000_outliers, - author = {Breunig, Markus and Kriegel, Hans-Peter and Ng, Raymond and Sander, Joerg}, - year = {2000}, - month = {06}, - pages = {93 -- 104}, - title = {LOF: Identifying Density-Based Local Outliers.}, - volume = {29}, - journal = {ACM Sigmod Record}, - doi = {10.1145/342009.335388} -} - -@article{bueno+larraz_2021_functional, - title={On functional logistic regression: some conceptual issues}, - author={Beatriz Bueno-Larraz and José R. Berrendero and Antonio Cuevas}, - year={2021}, - eprint={1812.00721}, - archivePrefix={arXiv}, - primaryClass={math.ST} -} - -@article{cuesta-albertos++_2015_ddg, - title = {The DDG-classifier in the functional setting}, - author = {J. A. Cuesta-Albertos and M. Febrero-Bande and M. Oviedo de la Fuente}, - journal = {TEST}, - year = {2015}, - volume = {26}, - pages = {119 -- 142} -} - -@article{cuevas++_2004_anova - author = {Cuevas, Antonio and Febrero-Bande, Manuel and Fraiman, Ricardo}, - year = {2004}, - month = {02}, - pages = {111 -- 122}, - title = {An ANOVA test for functional data}, - volume = {47}, - journal = {Computational Statistics & Data Analysis}, - doi = {10.1016/j.csda.2003.10.021} -} - -@article{dai+genton_2018_visualization, - author = {Wenlin Dai and Marc G. Genton}, - title = {Multivariate Functional Data Visualization and Outlier Detection}, - journal = {Journal of Computational and Graphical Statistics}, - volume = {27}, - number = {4}, - pages = {923 -- 934}, - year = {2018}, - publisher = {Taylor & Francis}, - doi = {10.1080/10618600.2018.1473781}, - URL = {https://doi.org/10.1080/10618600.2018.1473781} -} - -@inbook{ferraty+vieu_2006_nonparametric_knn, - author = {Frédéric Ferraty and Philippe Vieu}, - title = {Nonparametric Functional Data Analysis. 
Theory and Practice}, - chapter = {Functional Nonparametric Supervised Classification}, - pages = {116}, - publisher = {Springer-Verlag New York}, - year = {2006}, - isbn = {978-0-387-30369-7}, - doi = {10.1007/0-387-36620-2} -} - -@article{fraiman+muniz_2001_trimmed, - author = {Fraiman, Ricardo and Muniz, Graciela}, - year = {2001}, - month = {02}, - pages = {419 -- 440}, - title = {Trimmed means for functional data}, - volume = {10}, - journal = {TEST: An Official Journal of the Spanish Society of Statistics and Operations Research}, - doi = {10.1007/BF02595706} -} - -@article{gervini_2008_estimation, - author = {Gervini, Daniel}, - title = "{Robust functional estimation using the median and spherical principal components}", - journal = {Biometrika}, - volume = {95}, - number = {3}, - pages = {587 -- 600}, - year = {2008}, - month = {09}, - issn = {0006-3444}, - doi = {10.1093/biomet/asn031}, - url = {https://doi.org/10.1093/biomet/asn031} -} - -@article{ghosh+chaudhuri_2005_depth, - author = {Ghosh, Anil and Chaudhuri, Probal}, - year = {2005}, - month = {02}, - pages = {327 -- 350}, - title = {On Maximum Depth and Related Classifiers}, - volume = {32}, - journal = {Scandinavian Journal of Statistics}, - doi = {10.1111/j.1467-9469.2005.00423.x} -} - -@article{pini+stamm+vantini_2018_hotellings, - title = {Hotelling's T2 in separable Hilbert spaces}, - author = {Alessia Pini and Aymeric Stamm and Simone Vantini}, - journal = {Journal of Multivariate Analysis}, - year = {2018}, - month = {05}, - volume = {167}, - pages = {284 -- 305}, - doi = {10.1016/j.jmva.2018.05.007} -} - -@inbook{ramsay+silverman_2005_functional_bspline, - author = {James Ramsay and B. W. Silverman}, - title = {Functional Data Analysis}, - chapter = {From functional data to smooth functions}, - pages = {50 -- 51}, - publisher = {Springer-Verlag New York}, - year = {2005}, - isbn = {978-0-387-40080-8}, - doi = {110.1007/b98888} -} - -@inbook{ramsay+silverman_2005_functional_spline, - author = {James Ramsay and B. W. Silverman}, - title = {Functional Data Analysis}, - chapter = {Smoothing functional data with a roughness penalty}, - pages = {86 -- 87}, - publisher = {Springer-Verlag New York}, - year = {2005}, - isbn = {978-0-387-40080-8}, - doi = {110.1007/b98888} -} - -@inbook{ramsay+silverman_2005_functional_spline_squares, - author = {James Ramsay and B. W. Silverman}, - title = {Functional Data Analysis}, - chapter = {Smoothing functional data with a roughness penalty}, - pages = {89 -- 90}, - publisher = {Springer-Verlag New York}, - year = {2005}, - isbn = {978-0-387-40080-8}, - doi = {110.1007/b98888} -} - -@inbook{ramsay+silverman_2005_functional_shift, - author = {James Ramsay and B. W. Silverman}, - title = {Functional Data Analysis}, - chapter = {The registration and display of functional data}, - pages = {129 -- 132}, - publisher = {Springer-Verlag New York}, - year = {2005}, - isbn = {978-0-387-40080-8}, - doi = {110.1007/b98888} -} - -@inbook{ramsay+silverman_2005_functional_landmark, - author = {James Ramsay and B. W. Silverman}, - title = {Functional Data Analysis}, - chapter = {The registration and display of functional data}, - pages = {132 -- 136}, - publisher = {Springer-Verlag New York}, - year = {2005}, - isbn = {978-0-387-40080-8}, - doi = {110.1007/b98888} -} - -@inbook{ramsay+silverman_2005_functional_newton-raphson, - author = {James Ramsay and B. W. 
Silverman}, - title = {Functional Data Analysis}, - chapter = {The registration and display of functional data}, - pages = {142 -- 144}, - publisher = {Springer-Verlag New York}, - year = {2005}, - isbn = {978-0-387-40080-8}, - doi = {110.1007/b98888} -} - -@inbook{ramsay+silverman_2005_functional_discretizing, - author = {James Ramsay and B. W. Silverman}, - title = {Functional Data Analysis}, - chapter = {Principal components analysis for functional data}, - pages = {161}, - publisher = {Springer-Verlag New York}, - year = {2005}, - isbn = {978-0-387-40080-8}, - doi = {110.1007/b98888} -} - -@inbook{ramsay+silverman_2005_functional_basis, - author = {James Ramsay and B. W. Silverman}, - title = {Functional Data Analysis}, - chapter = {Principal components analysis for functional data}, - pages = {161 -- 164}, - publisher = {Springer-Verlag New York}, - year = {2005}, - isbn = {978-0-387-40080-8}, - doi = {110.1007/b98888} -} - -@inbook{srivastava+klassen_2016_analysis_elastic, - author = {Srivastava, Anuj and Klassen, Eric}, - title = {Functional and Shape Data Analysis}, - chapter = {Functional Data and Elastic Registration}, - pages = {73 -- 122}, - publisher = {Springer-Verlag New York}, - year = {2016}, - isbn = {978-1-4939-4018-9}, - doi = {10.1007/978-1-4939-4020-2} -} - -@inbook{srivastava+klassen_2016_analysis_square, - author = {Srivastava, Anuj and Klassen, Eric}, - title = {Functional and Shape Data Analysis}, - chapter = {Functional Data and Elastic Registration}, - pages = {91 -- 93}, - publisher = {Springer-Verlag New York}, - year = {2016}, - isbn = {978-1-4939-4018-9}, - doi = {10.1007/978-1-4939-4020-2} -} - -@inbook{srivastava+klassen_2016_analysis_amplitude, - author = {Srivastava, Anuj and Klassen, Eric}, - title = {Functional and Shape Data Analysis}, - chapter = {Functional Data and Elastic Registration}, - pages = {107 -- 109}, - publisher = {Springer-Verlag New York}, - year = {2016}, - isbn = {978-1-4939-4018-9}, - doi = {10.1007/978-1-4939-4020-2} -} - -@inbook{srivastava+klassen_2016_analysis_phase, - author = {Srivastava, Anuj and Klassen, Eric}, - title = {Functional and Shape Data Analysis}, - chapter = {Functional Data and Elastic Registration}, - pages = {109 -- 111}, - publisher = {Springer-Verlag New York}, - year = {2016}, - isbn = {978-1-4939-4018-9}, - doi = {10.1007/978-1-4939-4020-2} -} - -@inbook{srivastava+klassen_2016_analysis_probability, - author = {Srivastava, Anuj and Klassen, Eric}, - title = {Functional and Shape Data Analysis}, - chapter = {Functional Data and Elastic Registration}, - pages = {113 -- 117}, - publisher = {Springer-Verlag New York}, - year = {2016}, - isbn = {978-1-4939-4018-9}, - doi = {10.1007/978-1-4939-4020-2} -} - -@inbook{srivastava+klassen_2016_analysis_karcher, - author = {Srivastava, Anuj and Klassen, Eric}, - title = {Functional and Shape Data Analysis}, - chapter = {Statistical Modeling of Functional Data}, - pages = {273 -- 274}, - publisher = {Springer-Verlag New York}, - year = {2016}, - isbn = {978-1-4939-4018-9}, - doi = {10.1007/978-1-4939-4020-2} -} - -@inbook{srivastava+klassen_2016_analysis_orbit, - author = {Srivastava, Anuj and Klassen, Eric}, - title = {Functional and Shape Data Analysis}, - chapter = {Statistical Modeling of Functional Data}, - pages = {274 -- 277}, - publisher = {Springer-Verlag New York}, - year = {2016}, - isbn = {978-1-4939-4018-9}, - doi = {10.1007/978-1-4939-4020-2} -} - -@article{srivastava++_2011_ficher-rao, - author = {Srivastava, Anuj and Wu, Wei and Kurtek, Sebastian and 
Klassen, Eric and Marron, J.}, - year = {2011}, - journal={}, - title = {Registration of Functional Data Using Fisher-Rao Metric}, - pages = {5 -- 7}, - URL = {https://arxiv.org/abs/1103.3817v2} -} - -@article{srivastava++_2011_ficher-rao_karcher, - author = {Srivastava, Anuj and Wu, Wei and Kurtek, Sebastian and Klassen, Eric and Marron, J.}, - year = {2011}, - journal={}, - title = {Registration of Functional Data Using Fisher-Rao Metric}, - pages = {7 -- 10}, - URL = {https://arxiv.org/abs/1103.3817v2} -} - -@article{srivastava++_2011_ficher-rao_orbit, - author = {Srivastava, Anuj and Wu, Wei and Kurtek, Sebastian and Klassen, Eric and Marron, J.}, - year = {2011}, - journal={}, - title = {Registration of Functional Data Using Fisher-Rao Metric}, - pages = {9 -- 10}, - URL = {https://arxiv.org/abs/1103.3817v2} -} - -@article{sun+genton_2011_boxplots, - author = {Ying Sun and Marc G. Genton}, - title = {Functional Boxplots}, - journal = {Journal of Computational and Graphical Statistics}, - volume = {20}, - number = {2}, - pages = {316 -- 334}, - year = {2011}, - publisher = {Taylor & Francis}, - doi = {10.1198/jcgs.2011.09224}, - URL = {https://doi.org/10.1198/jcgs.2011.09224} -} - -@article{szekely+rizzo_2010_brownian, - author = {Gábor J. Székely and Maria L. Rizzo}, - title = {Brownian distance covariance}, - volume = {3}, - journal = {The Annals of Applied Statistics}, - number = {4}, - publisher = {Institute of Mathematical Statistics}, - pages = {1236 -- 1265}, - year = {2009}, - doi = {10.1214/09-AOAS312}, - URL = {https://doi.org/10.1214/09-AOAS312} -} - -@inproceedings{torrecilla+suarez_2016_hunting, - author = {Torrecilla, Jose L. and Su\'{a}rez, Alberto}, - title = {Feature Selection in Functional Data Classification with Recursive Maxima Hunting}, - year = {2016}, - volume = {29}, - publisher = {Curran Associates Inc.}, - booktitle = {Proceedings of the 30th International Conference on Neural Information Processing Systems}, - pages = {4835 -- 4843}, - series = {NIPS'16} -} - -@inbook{wasserman_2006_nonparametric_nw, - author = {Larry Wasserman}, - title = {All of Nonparametric Statistics}, - chapter = {Nonparametric Regression}, - pages = {71}, - publisher = {Springer-Verlag New York}, - year = {2006}, - isbn = {978-0-387-25145-5}, - doi = {10.1007/0-387-30623-4} -} - -@inbook{wasserman_2006_nonparametric_llr, - author = {Larry Wasserman}, - title = {All of Nonparametric Statistics}, - chapter = {Nonparametric Regression}, - pages = {77}, - publisher = {Springer-Verlag New York}, - year = {2006}, - isbn = {978-0-387-25145-5}, - doi = {10.1007/0-387-30623-4} -} \ No newline at end of file From dc3014f28a0abfff06927ce260136f7f525f6886 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Thu, 9 Dec 2021 19:18:08 +0100 Subject: [PATCH 23/50] Types added --- skfda/_utils/_utils.py | 8 ++++++-- .../feature_extraction/_per_class_transformer.py | 8 ++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index ef789c967..3ba8a4fd5 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -750,11 +750,15 @@ def _classifier_fit_depth_methods( return classes, class_depth_methods_ -def _fit_feature_transformer( +def _fit_feature_transformer( # noqa: WPS320 WPS234 X: Union[NDArrayInt, NDArrayFloat], y: Union[NDArrayInt, NDArrayFloat], transformer: TransformerMixin[Input, Output, Target], -) -> Tuple[Union[NDArrayInt, NDArrayFloat], Sequence[TransformerMixin]]: +) -> Tuple[ + Union[NDArrayInt, NDArrayFloat], + 
Sequence[TransformerMixin[Input, Output, Target]], +]: + classes, y_ind = _classifier_get_classes(y) class_feature_transformers = [ diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py index 4435579e7..b06c99894 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py @@ -6,20 +6,20 @@ import numpy as np from pandas import DataFrame -from sklearn.base import TransformerMixin from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted -from ...._utils import _fit_feature_transformer +from ...._utils import TransformerMixin, _fit_feature_transformer +from ....representation._typing import NDArrayInt from ....representation.basis import FDataBasis from ....representation.grid import FData, FDataGrid T = TypeVar("T", bound=FData) Input = TypeVar("Input") Output = TypeVar("Output") -Target = TypeVar("Target") +Target = TypeVar("Target", bound=NDArrayInt) -class PerClassTransformer(TransformerMixin): +class PerClassTransformer(TransformerMixin[Input, Output, Target]): r"""Per class feature transformer for functional data. This class takes a transformer and performs the following map: From dc07b9599cb40d52322639b71437a80018962152 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 9 Dec 2021 19:55:18 +0100 Subject: [PATCH 24/50] Add doc entry. --- docs/modules/ml/classification.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/modules/ml/classification.rst b/docs/modules/ml/classification.rst index c2d894fbf..73002a078 100644 --- a/docs/modules/ml/classification.rst +++ b/docs/modules/ml/classification.rst @@ -46,3 +46,12 @@ This module contains depth based estimators to perform classification. skfda.ml.classification.DDClassifier skfda.ml.classification.DDGClassifier skfda.ml.classification.MaximumDepthClassifier + +Logistic regression +----------------------- +Classifier based on logistic regression. + +.. autosummary:: + :toctree: autosummary + + skfda.ml.classification.LogisticRegression From ca01deed0a5eded191de7ece58bdcde8d43d6de9 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 9 Dec 2021 20:50:55 +0100 Subject: [PATCH 25/50] Fix documentation. --- docs/refs.bib | 22 +++++++++++++------ ...t_fpca_inverse_transform_outl_detection.py | 7 +++--- .../ml/classification/_logistic_regression.py | 2 +- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/docs/refs.bib b/docs/refs.bib index e788da62e..4029b7d6f 100644 --- a/docs/refs.bib +++ b/docs/refs.bib @@ -33,13 +33,21 @@ @inproceedings{breunig++_2000_outliers doi = {10.1145/342009.335388} } -@article{bueno+larraz_2021_functional, - title={On functional logistic regression: some conceptual issues}, - author={Beatriz Bueno-Larraz and José R. Berrendero and Antonio Cuevas}, - year={2021}, - eprint={1812.00721}, - archivePrefix={arXiv}, - primaryClass={math.ST} +@article{bueno++_2021_functional, + title = {On Functional Logistic Regression: Some Conceptual Issues}, + shorttitle = {On Functional Logistic Regression}, + author = {{Bueno-Larraz}, Beatriz and Berrendero, Jos{\'e} R. 
and Cuevas, Antonio}, + year = {2021}, + month = jul, + journal = {arXiv:1812.00721 [math, stat]}, + eprint = {1812.00721}, + eprinttype = {arxiv}, + primaryclass = {math, stat}, + url = {http://arxiv.org/abs/1812.00721}, + urldate = {2021-12-09}, + abstract = {The main ideas behind the classical multivariate logistic regression model make sense when translated to the functional setting, where the explanatory variable \$X\$ is a function and the response \$Y\$ is binary. However, some important technical issues appear (or are aggravated with respect to those of the multivariate case) due to the functional nature of the explanatory variable. First, the mere definition of the model can be questioned: while most approaches so far proposed rely on the \$L\_2\$-based model, we suggest an alternative (in some sense, more general) approach, based on the theory of Reproducing Kernel Hilbert Spaces (RKHS). The validity conditions of such RKHS-based model, as well as its relation with the \$L\_2\$-based one are investigated and made explicit in two formal results. Some relevant particular cases are considered as well. Second we show that, under very general conditions, the maximum likelihood (ML) of the logistic model parameters fail to exist in the functional case. Third, on a more positive side, we suggest an RKHS-based restricted version of the ML estimator. This is a methodological paper, aimed at a better understanding of the functional logistic model, rather than focusing on numerical and practical issues.}, + archiveprefix = {arXiv}, + keywords = {Mathematics - Statistics Theory} } @article{cuesta-albertos++_2015_ddg, diff --git a/examples/plot_fpca_inverse_transform_outl_detection.py b/examples/plot_fpca_inverse_transform_outl_detection.py index 6a592734c..99de96621 100644 --- a/examples/plot_fpca_inverse_transform_outl_detection.py +++ b/examples/plot_fpca_inverse_transform_outl_detection.py @@ -28,10 +28,11 @@ import matplotlib.pyplot as plt import numpy as np from scipy.stats import gaussian_kde -from skfda.preprocessing.dim_reduction.feature_extraction import FPCA -from skfda.misc.covariances import Exponential, Gaussian + from skfda.datasets import make_gaussian_process +from skfda.misc.covariances import Exponential, Gaussian from skfda.misc.metrics import l2_distance, l2_norm +from skfda.preprocessing.dim_reduction.feature_extraction import FPCA ############################################################################## # We proceed as follows: @@ -55,7 +56,7 @@ # # The train set is generated according to a Gaussian process # with a Gaussian (i.e. squared-exponential) covariance function. -grid_size = 5 * 10**3 +grid_size = 100 cov_clean = Gaussian(variance=2.0, length_scale=5.0) diff --git a/skfda/ml/classification/_logistic_regression.py b/skfda/ml/classification/_logistic_regression.py index ec340d21f..4981894d7 100644 --- a/skfda/ml/classification/_logistic_regression.py +++ b/skfda/ml/classification/_logistic_regression.py @@ -20,7 +20,7 @@ class LogisticRegression( This class implements the sequential “greedy” algorithm for functional logistic regression proposed in - :footcite:ts:`bueno+larraz_2021_functional`. + :footcite:ts:`bueno++_2021_functional`. .. warning:: For now, only binary classification for functional From bf8cb2235f096b04d4b6e25034116b59ae93a5d6 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Fri, 17 Dec 2021 20:58:35 +0100 Subject: [PATCH 26/50] Corrections. 
Closes #376 --- skfda/_utils/_utils.py | 2 +- .../_per_class_transformer.py | 97 +++++++++++++++---- 2 files changed, 81 insertions(+), 18 deletions(-) diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index 3ba8a4fd5..6ff4e61a8 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -763,7 +763,7 @@ def _fit_feature_transformer( # noqa: WPS320 WPS234 class_feature_transformers = [ clone(transformer).fit(X[y_ind == cur_class], y[y_ind == cur_class]) - for cur_class in range(classes.size) + for cur_class, _ in enumerate(classes) ] return classes, class_feature_transformers diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py index b06c99894..d14841c8e 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py @@ -11,9 +11,8 @@ from ...._utils import TransformerMixin, _fit_feature_transformer from ....representation._typing import NDArrayInt from ....representation.basis import FDataBasis -from ....representation.grid import FData, FDataGrid +from ....representation.grid import FDataGrid -T = TypeVar("T", bound=FData) Input = TypeVar("Input") Output = TypeVar("Output") Target = TypeVar("Target", bound=NDArrayInt) @@ -42,6 +41,7 @@ class PerClassTransformer(TransformerMixin[Input, Output, Target]): array_output: Indicates if the transformed data is requested to be a NumPy array output. By default the value is False. + Examples: Firstly, we will import the Berkeley Growth Study dataset: @@ -60,34 +60,91 @@ class PerClassTransformer(TransformerMixin[Input, Output, Target]): >>> from skfda.preprocessing.dim_reduction.variable_selection import ( ... RecursiveMaximaHunting, ... ) - >>> t = PerClassTransformer( + >>> t1 = PerClassTransformer( ... RecursiveMaximaHunting(), ... array_output=True, ... ) - >>> x_transformed = t.fit_transform(X, y) + >>> x_transformed1 = t1.fit_transform(X, y) - ``x_transformed`` will be a vector with the transformed data. + ``x_transformed1`` will be a vector with the transformed data. We will split the generated data and fit a KNN classifier. >>> from sklearn.model_selection import train_test_split >>> from sklearn.neighbors import KNeighborsClassifier - >>> X_train, X_test, y_train, y_test = train_test_split( - ... x_transformed, + >>> X_train1, X_test1, y_train1, y_test1 = train_test_split( + ... x_transformed1, ... y, ... test_size=0.25, ... stratify=y, ... random_state=0, ... ) - >>> neigh = KNeighborsClassifier() - >>> neigh = neigh.fit(X_train, y_train) + >>> neigh1 = KNeighborsClassifier() + >>> neigh1 = neigh1.fit(X_train1, y_train1) Finally we can predict and check the score: - >>> neigh.predict(X_test) + >>> neigh1.predict(X_test1) array([0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], dtype=int8) - >>> round(neigh.score(X_test, y_test), 3) + >>> round(neigh1.score(X_test1, y_test1), 3) 0.958 + + + + We can also use a transformer that returns a FData object + when predicting. + In our example we are going to use the Nadaraya Watson Smoother. + + >>> from skfda.preprocessing.smoothing.kernel_smoothers import ( + ... NadarayaWatsonSmoother, + ... ) + >>> t2 = PerClassTransformer( + ... NadarayaWatsonSmoother(), + ... ) + >>> x_transformed2 = t2.fit_transform(X, y) + + ``x_transformed2`` will be a DataFrame with the transformed data. 
+        Each row of the frame contains an FDataGrid describing a transformed
+        curve.
+        We need to convert the DataFrame into an FDataGrid with all the
+        samples, so we can train a classifier. We also need to duplicate
+        the targets, as we now have twice as many curves:
+
+        >>> for i, curve_grid in enumerate(x_transformed2.iloc[:,0].values):
+        ...     if i == 0:
+        ...         X_transformed_grid = curve_grid
+        ...     else:
+        ...         X_transformed_grid = X_transformed_grid.concatenate(
+        ...             curve_grid,
+        ...         )
+
+        >>> y = np.concatenate((y,y))
+
+
+        ``X_transformed_grid`` contains an FDataGrid with all the transformed
+        curves. Now we are able to use it to fit a KNN classifier.
+        Again we split the data into train and test.
+        >>> X_train2, X_test2, y_train2, y_test2 = train_test_split(
+        ...     X_transformed_grid,
+        ...     y,
+        ...     test_size=0.25,
+        ...     stratify=y,
+        ...     random_state=0,
+        ... )
+
+        This time we need a functional data classifier.
+        We fit the classifier and predict.
+        >>> from skfda.ml.classification import KNeighborsClassifier
+        >>> neigh2 = KNeighborsClassifier()
+        >>> neigh2 = neigh2.fit(X_train2, y_train2)
+        >>> neigh2.predict(X_test2)
+        array([1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1,
+        0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
+        1, 1, 1, 0, 0], dtype=int8)
+
+        >>> round(neigh2.score(X_test2, y_test2), 3)
+        0.957
+
     """
 
     def __init__(
@@ -138,9 +195,9 @@ def _validate_transformer(
 
     def fit(
         self,
-        X: T,
+        X: Input,
         y: np.ndarray,
-    ) -> PerClassTransformer:
+    ) -> PerClassTransformer[Input, Output, Target]:
         """
         Fit the model on each class.
 
@@ -165,7 +222,7 @@ def fit(
 
         return self
 
-    def transform(self, X: T) -> Union[DataFrame, np.ndarray]:
+    def transform(self, X: Input) -> Union[DataFrame, np.ndarray]:
         """
         Transform the provided data using the already fitted transformer.
 
@@ -178,10 +235,16 @@ def transform(self, X: T) -> Union[DataFrame, np.ndarray]: """ sklearn_check_is_fitted(self) - transformed_data = np.empty((len(X), 0)) - for feature_transformer in self._class_feature_transformers_: + for j, feature_transformer in enumerate( + self._class_feature_transformers_, + ): elem = feature_transformer.transform(X) data = np.array(elem) + if j == 0: + if self.array_output: + transformed_data = np.empty((len(X), 0)) + else: + transformed_data = np.empty((0)) transformed_data = np.hstack((transformed_data, data)) if self.array_output: @@ -200,7 +263,7 @@ def transform(self, X: T) -> Union[DataFrame, np.ndarray]: def fit_transform( self, - X: T, + X: Input, y: np.ndarray, ) -> Union[DataFrame, np.ndarray]: """ From d72a5873dbb5fb02f5fbb8937d53a0f586b15306 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Sat, 18 Dec 2021 22:39:15 +0100 Subject: [PATCH 27/50] Correction on fit method --- .../_per_class_transformer.py | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py index d14841c8e..9cfe581f6 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py @@ -235,27 +235,25 @@ def transform(self, X: Input) -> Union[DataFrame, np.ndarray]: """ sklearn_check_is_fitted(self) - for j, feature_transformer in enumerate( - self._class_feature_transformers_, - ): - elem = feature_transformer.transform(X) - data = np.array(elem) - if j == 0: - if self.array_output: - transformed_data = np.empty((len(X), 0)) - else: - transformed_data = np.empty((0)) - transformed_data = np.hstack((transformed_data, data)) + transformed_data = [ + feature_transformer.transform(X) + for feature_transformer in self._class_feature_transformers_ + ] if self.array_output: - for i in transformed_data: - if isinstance(i, (FDataGrid, FDataBasis)): + for i, data in enumerate(transformed_data): + if isinstance(data, (FDataGrid, FDataBasis)): raise TypeError( "There are transformed instances of FDataGrid or " "FDataBasis that can't be concatenated on a NumPy " "array.", ) - return np.array(transformed_data) + elif i == 0: + transformed_array = data + else: + transformed_array = np.hstack((transformed_array, data)) + + return transformed_array return DataFrame( {'Transformed data': transformed_data}, From 960b09112eb5b640c6c17c0bf9019d305d13bd4f Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 23 Dec 2021 16:40:14 +0100 Subject: [PATCH 28/50] Fix Zenodo json. 
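Python-style single-quoted strings and the trailing comma after the last
member are not valid JSON (RFC 8259 requires double-quoted strings), so the
metadata file is rewritten with proper quoting. A quick way to verify the
result (illustrative check, not part of the patch):

    import json

    # Raises a ValueError if the file is not valid JSON.
    json.load(open(".zenodo.json"))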
--- .zenodo.json | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/.zenodo.json b/.zenodo.json index 95f1fd8e4..7d9d7bf51 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -1,47 +1,47 @@ { - 'creators': [ + "creators": [ { - 'affiliation': 'Universidad Autónoma de Madrid', - 'name': 'Ramos-Carreño, Carlos', - 'orcid': '0000-0003-2566-7058' + "affiliation": "Universidad Autónoma de Madrid", + "name": "Ramos-Carreño, Carlos", + "orcid": "0000-0003-2566-7058" }, { - 'affiliation': 'Universidad Autónoma de Madrid', - 'name': 'Suárez, Alberto', - 'orcid': '0000-0003-4534-0909' + "affiliation": "Universidad Autónoma de Madrid", + "name": "Suárez, Alberto", + "orcid": "0000-0003-4534-0909" }, { - 'affiliation': 'Universidad Autónoma de Madrid', - 'name': 'Torrecilla, José Luis', - 'orcid': '0000-0003-3719-5190' + "affiliation": "Universidad Autónoma de Madrid", + "name": "Torrecilla, José Luis", + "orcid": "0000-0003-3719-5190" }, { - 'name': 'Carbajo Berrocal, Miguel' + "name": "Carbajo Berrocal, Miguel" }, { - 'name': 'Marcos Manchón, Pablo' + "name": "Marcos Manchón, Pablo" }, { - 'name': 'Pérez Manso, Pablo' + "name": "Pérez Manso, Pablo" }, { - 'name': 'Hernando Bernabé, Amanda' + "name": "Hernando Bernabé, Amanda" }, { - 'name': 'García Fernández, David' + "name": "García Fernández, David" }, { - 'name': 'Hong, Yujian' + "name": "Hong, Yujian" }, { - 'name': 'Rodríguez-Ponga Eyriès, Pedro Martín' + "name": "Rodríguez-Ponga Eyriès, Pedro Martín" }, { - 'name': 'Sánchez Romero, Álvaro' + "name": "Sánchez Romero, Álvaro" }, { - 'name': 'Petrunina, Elena' + "name": "Petrunina, Elena" } ], - 'license': 'BSD 3-Clause License', + "license": "BSD 3-Clause License" } \ No newline at end of file From 2d5065f3dd9adab95a5bb9c6d4401474b00f17f3 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Thu, 23 Dec 2021 18:21:36 +0100 Subject: [PATCH 29/50] hstack on fit method correction Closes #376 --- .../feature_extraction/_per_class_transformer.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py index 9cfe581f6..e101fb33c 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py @@ -241,19 +241,14 @@ def transform(self, X: Input) -> Union[DataFrame, np.ndarray]: ] if self.array_output: - for i, data in enumerate(transformed_data): + for data in transformed_data: if isinstance(data, (FDataGrid, FDataBasis)): raise TypeError( "There are transformed instances of FDataGrid or " "FDataBasis that can't be concatenated on a NumPy " "array.", ) - elif i == 0: - transformed_array = data - else: - transformed_array = np.hstack((transformed_array, data)) - - return transformed_array + return np.hstack(transformed_data) return DataFrame( {'Transformed data': transformed_data}, From 95f61fd74e6dd92d1c4ca0e2c40e80f5aae2b9f2 Mon Sep 17 00:00:00 2001 From: dSerna4 <91683791+dSerna4@users.noreply.github.com> Date: Fri, 24 Dec 2021 19:36:25 +0100 Subject: [PATCH 30/50] Add suggested changes --- skfda/ml/classification/_logistic_regression.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/skfda/ml/classification/_logistic_regression.py b/skfda/ml/classification/_logistic_regression.py index 4981894d7..7e7d7c101 100644 --- 
a/skfda/ml/classification/_logistic_regression.py +++ b/skfda/ml/classification/_logistic_regression.py @@ -20,15 +20,16 @@ class LogisticRegression( This class implements the sequential “greedy” algorithm for functional logistic regression proposed in - :footcite:ts:`bueno++_2021_functional`. + :footcite:ts:`bueno+larraz_2021_functional`. .. warning:: For now, only binary classification for functional data with one dimensional domains is supported. Args: - p: number of points (and coefficients) to be selected by - the algorithm. + p: + number of points (and coefficients) to be selected by + the algorithm. Attributes: classes\_: A list containing the name of the classes @@ -91,7 +92,8 @@ def fit( # noqa: D102 selected_indexes = np.zeros(self.p, dtype=np.intc) - mvlr = mvLogisticRegression() # multivariate logistic regression + # multivariate logistic regression + mvlr = mvLogisticRegression(penalty='l2') x_mv = np.zeros((n_samples, self.p)) LL = np.zeros(n_features) @@ -103,8 +105,8 @@ def fit( # noqa: D102 # log-likelihood function at t log_probs = mvlr.predict_log_proba(x_mv[:, :q + 1]) - log_probs = np.array( - [log_probs[i, y[i]] for i in range(n_samples)], + log_probs = np.concatenate( + (log_probs[y_ind == 0, 0], log_probs[y_ind == 1, 1]), ) LL[t] = np.mean(log_probs) From 87a8dc431a75325bb5c5c3f090ab7db3e1d93185 Mon Sep 17 00:00:00 2001 From: dSerna4 <91683791+dSerna4@users.noreply.github.com> Date: Fri, 24 Dec 2021 20:01:27 +0100 Subject: [PATCH 31/50] Update _logistic_regression.py --- skfda/ml/classification/_logistic_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/ml/classification/_logistic_regression.py b/skfda/ml/classification/_logistic_regression.py index 7e7d7c101..10d2b222a 100644 --- a/skfda/ml/classification/_logistic_regression.py +++ b/skfda/ml/classification/_logistic_regression.py @@ -20,7 +20,7 @@ class LogisticRegression( This class implements the sequential “greedy” algorithm for functional logistic regression proposed in - :footcite:ts:`bueno+larraz_2021_functional`. + :footcite:ts:`bueno++_2021_functional`. .. warning:: For now, only binary classification for functional From 193a74dd51cc23b6db22ce1e4ba37bd9579327f5 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Fri, 7 Jan 2022 14:27:01 +0100 Subject: [PATCH 32/50] Use actual values for sliders in MultipleDisplay. 
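The sliders previously ranged over sample positions (0 to n_samples - 1);
now they range over the actual values of the criterion. A selected value is
mapped back to a sample by keeping the sorted criterion values next to their
original sample indexes. A toy sketch of that mapping (illustration only,
with made-up variable names, not part of the diff):

    import numpy as np

    criterion = [0.3, 0.1, 0.2]  # one value per sample
    ordered_values, ordered_indexes = zip(
        *sorted(zip(criterion, range(len(criterion)))),
    )
    value = 0.2  # value coming from the slider widget
    # searchsorted locates the value among the sorted criterion values;
    # the parallel tuple of indexes recovers the original sample.
    sample = ordered_indexes[int(np.searchsorted(ordered_values, value))]
    # sample == 2: the curve whose criterion value is 0.2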
--- skfda/exploratory/visualization/_baseplot.py | 4 +- .../visualization/_multiple_display.py | 41 ++++++++++--------- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/skfda/exploratory/visualization/_baseplot.py b/skfda/exploratory/visualization/_baseplot.py index 633802e18..54f26628b 100644 --- a/skfda/exploratory/visualization/_baseplot.py +++ b/skfda/exploratory/visualization/_baseplot.py @@ -201,14 +201,14 @@ def _update_annotation( text = ( f"{sample_number}{sample_descr}: " - f"({xdata_graph:.2f}, {ydata_graph:.2f})" + f"({xdata_graph:.3g}, {ydata_graph:.3g})" ) tag.set_text(text) x_axis = axes.get_xlim() y_axis = axes.get_ylim() - label_xpos = 20 + label_xpos = -60 label_ypos = 20 if (xdata_graph - x_axis[0]) > (x_axis[1] - xdata_graph): label_xpos = -80 diff --git a/skfda/exploratory/visualization/_multiple_display.py b/skfda/exploratory/visualization/_multiple_display.py index d96620ce6..d592979df 100644 --- a/skfda/exploratory/visualization/_multiple_display.py +++ b/skfda/exploratory/visualization/_multiple_display.py @@ -86,13 +86,13 @@ def __init__( if d.n_samples is not None ) self.sliders: List[Widget] = [] - self.criteria: List[List[int]] = [] self.selected_sample: Optional[int] = None if len(criteria) != 0 and not isinstance(criteria[0], Sequence): criteria = (criteria,) criteria = cast(Sequence[Sequence[float]], criteria) + self.criteria = criteria if not isinstance(sliders, Sequence): sliders = (sliders,) @@ -297,7 +297,7 @@ def _select_sample(self, selected_sample: int) -> None: artist.set_alpha(1.0 if i == selected_sample else 0.1) for criterion, slider in zip(self.criteria, self.sliders): - val_widget = criterion.index(selected_sample) + val_widget = criterion[selected_sample] _set_val_noevents(slider, val_widget) self.selected_sample = selected_sample @@ -329,57 +329,60 @@ def add_slider( """ full_desc = "" if label is None else label + ordered_criterion_values, ordered_criterion_indexes = zip( + *sorted(zip(criterion, range(self.length_data))), + ) + widget = widget_class( ax=axes, label=full_desc, - valmin=0, - valmax=self.length_data - 1, - valinit=0, - valstep=1, + valmin=ordered_criterion_values[0], + valmax=ordered_criterion_values[-1], + valinit=ordered_criterion_values[0], + valstep=ordered_criterion_values, + valfmt="%.3g", ) self.sliders.append(widget) axes.annotate( - '0', + f"{ordered_criterion_values[0]:.3g}", xy=(0, -0.5), xycoords='axes fraction', annotation_clip=False, ) axes.annotate( - str(self.length_data - 1), + f"{ordered_criterion_values[-1]:.3g}", xy=(0.95, -0.5), xycoords='axes fraction', annotation_clip=False, ) - criterion_sample_indexes = [ - x for _, x in sorted(zip(criterion, range(self.length_data))) - ] - - self.criteria.append(criterion_sample_indexes) - on_changed_function = partial( self._value_updated, - criterion_sample_indexes=criterion_sample_indexes, + ordered_criterion_values=ordered_criterion_values, + ordered_criterion_indexes=ordered_criterion_indexes, ) widget.on_changed(on_changed_function) def _value_updated( self, - value: int, - criterion_sample_indexes: Sequence[int], + value: float, + ordered_criterion_values: Sequence[float], + ordered_criterion_indexes: Sequence[int], ) -> None: """ Update the graphs when a widget is clicked. Args: value: Current value of the widget. - criterion_sample_indexes: Sample numbers ordered using the + ordered_criterion_values: Ordered values of the criterion. + ordered_criterion_indexes: Sample numbers ordered using the criterion. 
""" - self.selected_sample = criterion_sample_indexes[value] + value_index = int(np.searchsorted(ordered_criterion_values, value)) + self.selected_sample = ordered_criterion_indexes[value_index] self._select_sample(self.selected_sample) From 8fb63cad9d566c549390ba2f372095f3dab5df2c Mon Sep 17 00:00:00 2001 From: vnmabus Date: Fri, 7 Jan 2022 16:07:11 +0100 Subject: [PATCH 33/50] Uniformize landmark locations. Closes #404. --- .../registration/_landmark_registration.py | 12 ++++-------- tests/test_registration.py | 4 ++-- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/skfda/preprocessing/registration/_landmark_registration.py b/skfda/preprocessing/registration/_landmark_registration.py index e96b5296b..cc02eec6d 100644 --- a/skfda/preprocessing/registration/_landmark_registration.py +++ b/skfda/preprocessing/registration/_landmark_registration.py @@ -42,9 +42,8 @@ def landmark_shift_deltas( passed the location will be the result of the the call, the function should be accept as an unique parameter a numpy array with the list of landmarks. - By default it will be used as location :math:`\frac{1}{2}(max( - \text{landmarks})+ min(\text{landmarks}))` wich minimizes the - max shift. + By default it will be used as location the mean of the original + locations of the landmarks. Returns: Array containing the corresponding shifts. @@ -68,7 +67,7 @@ def landmark_shift_deltas( >>> shifts = landmark_shift_deltas(fd, landmarks) >>> shifts.round(3) - array([ 0.25 , -0.25 , -0.231]) + array([ 0.327, -0.173, -0.154]) The registered samples can be obtained with a shift @@ -88,10 +87,7 @@ def landmark_shift_deltas( # Parses location if location is None: - loc_array = ( - np.max(landmarks, axis=0) - + np.min(landmarks, axis=0) - ) / 2 + loc_array = np.mean(landmarks) elif callable(location): loc_array = location(landmarks) else: diff --git a/tests/test_registration.py b/tests/test_registration.py index a361a0468..6bdbac645 100644 --- a/tests/test_registration.py +++ b/tests/test_registration.py @@ -124,7 +124,7 @@ def test_landmark_shift_deltas(self) -> None: landmarks = landmarks.squeeze() shifts = landmark_shift_deltas(fd, landmarks).round(3) - np.testing.assert_almost_equal(shifts, [0.25, -0.25, -0.231]) + np.testing.assert_almost_equal(shifts, [0.327, -0.173, -0.154]) def test_landmark_shift_registration(self) -> None: """Test landmark shift registration.""" @@ -138,7 +138,7 @@ def test_landmark_shift_registration(self) -> None: ) # Test default location fd_registered = landmark_shift_registration(fd, landmarks) - center = (landmarks.max() + landmarks.min()) / 2 + center = np.mean(landmarks) reg_modes = fd_registered(center) # Test callable location From 3e9e34913eaf70253b974abf72354301de30ce68 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 10 Jan 2022 13:31:40 +0100 Subject: [PATCH 34/50] Force eager installation of dependencies in tests. --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 73a717a71..4ba2e675b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -27,7 +27,7 @@ jobs: - name: Run tests run: | - pip3 install . + pip3 install --upgrade-strategy eager . 
coverage run --source=skfda/ setup.py test; - name: Upload coverage to Codecov From 1ab001f869f59da8b89c7924266fe6def261ca02 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 10 Jan 2022 13:45:02 +0100 Subject: [PATCH 35/50] Add verbose flag for pip --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 4ba2e675b..1506d0146 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -27,7 +27,7 @@ jobs: - name: Run tests run: | - pip3 install --upgrade-strategy eager . + pip3 install --upgrade-strategy eager -vvv . coverage run --source=skfda/ setup.py test; - name: Upload coverage to Codecov From e0ed706f29b2fd690f6fa6dbebd7f1296ab8ac21 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 10 Jan 2022 13:53:20 +0100 Subject: [PATCH 36/50] Reduce verbosity level --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1506d0146..fb232aac1 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -27,7 +27,7 @@ jobs: - name: Run tests run: | - pip3 install --upgrade-strategy eager -vvv . + pip3 install --upgrade-strategy eager -vv . coverage run --source=skfda/ setup.py test; - name: Upload coverage to Codecov From 20b866a0739dc306fc4078ac43633b1042867629 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 10 Jan 2022 14:07:03 +0100 Subject: [PATCH 37/50] Debug pip --- .github/workflows/tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index fb232aac1..2b5777fc8 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -27,6 +27,7 @@ jobs: - name: Run tests run: | + pip3 debug --verbose . pip3 install --upgrade-strategy eager -vv . coverage run --source=skfda/ setup.py test; From 19c688183850a889df8c8f0929e5f1a268286008 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Mon, 10 Jan 2022 14:31:47 +0100 Subject: [PATCH 38/50] Remove test verbosity --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 2b5777fc8..ba686d849 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -28,7 +28,7 @@ jobs: - name: Run tests run: | pip3 debug --verbose . - pip3 install --upgrade-strategy eager -vv . + pip3 install --upgrade-strategy eager -v . coverage run --source=skfda/ setup.py test; - name: Upload coverage to Codecov From b8bca3b464b6ef9fee8deb5c27fde869264a8d8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Mon, 10 Jan 2022 17:07:39 +0100 Subject: [PATCH 39/50] Install common build packages The package findiff uses these but does not declare a build dependency. --- .github/workflows/tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ba686d849..869d2dd9c 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -28,6 +28,7 @@ jobs: - name: Run tests run: | pip3 debug --verbose . + pip3 install --upgrade setuptools wheel pip3 install --upgrade-strategy eager -v . 
coverage run --source=skfda/ setup.py test; From 3f4f665d52e44de5629011cdbacce45a3e3ebaf6 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Tue, 11 Jan 2022 01:19:35 +0100 Subject: [PATCH 40/50] Fix numba version for now --- .github/workflows/tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ba686d849..3b14b1de2 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -28,6 +28,7 @@ jobs: - name: Run tests run: | pip3 debug --verbose . + pip3 install numba==0.53 pip3 install --upgrade-strategy eager -v . coverage run --source=skfda/ setup.py test; From a456679c30596e79a6b1733f15a82de351d41342 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Tue, 11 Jan 2022 18:51:18 +0100 Subject: [PATCH 41/50] Add ellipsoid to the MS plot. Closes #407. --- .../visualization/_magnitude_shape_plot.py | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/skfda/exploratory/visualization/_magnitude_shape_plot.py b/skfda/exploratory/visualization/_magnitude_shape_plot.py index 25a165928..42d3b31d6 100644 --- a/skfda/exploratory/visualization/_magnitude_shape_plot.py +++ b/skfda/exploratory/visualization/_magnitude_shape_plot.py @@ -16,6 +16,7 @@ from matplotlib.axes import Axes from matplotlib.colors import Colormap from matplotlib.figure import Figure +from matplotlib.patches import Ellipse from ... import FDataGrid from ...representation._typing import NDArrayFloat, NDArrayInt @@ -71,6 +72,7 @@ class MagnitudeShapePlot(BasePlot): If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by np.random. By default, it is 0. + ellipsoid: Whether to draw the non outlying ellipsoid. Attributes: points(numpy.ndarray): 2-dimensional matrix where each row @@ -168,6 +170,7 @@ def __init__( *, fig: Optional[Figure] = None, axes: Optional[Sequence[Axes]] = None, + ellipsoid: bool = True, **kwargs: Any, ) -> None: @@ -187,6 +190,8 @@ def __init__( outliers = (y == -1) + self.ellipsoid = ellipsoid + self._fdata = fdata self._outliers = outliers self._colormap = plt.cm.get_cmap('seismic') @@ -276,6 +281,34 @@ def _plot( colors_rgba = [tuple(i) for i in colors] + if self.ellipsoid: + center = self.outlier_detector.cov_.location_ + prec = self.outlier_detector.cov_.get_precision() + + K = ( + self.outlier_detector.cutoff_value_ + / self.outlier_detector.scaling_ + ) + + eigvals, eigvecs = np.linalg.eigh(prec) + + a, b = np.sqrt(K / eigvals) + if eigvecs[0, 1] * eigvecs[1, 0] > 0: + eigvecs[:, 0] *= -1 + + angle = np.rad2deg(np.arctan2(eigvecs[1, 0], eigvecs[0, 0])) + + ellipse = Ellipse( + xy=center, + width=2 * a, + height=2 * b, + angle=angle, + facecolor='C0', + alpha=0.1, + ) + + axes[0].add_patch(ellipse) + for i, _ in enumerate(self.points[:, 0].ravel()): self.artists[i, 0] = axes[0].scatter( self.points[:, 0].ravel()[i], From 2ce031d32bb072737312c1c8946d7fdad3e95e05 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Sun, 16 Jan 2022 20:23:06 +0100 Subject: [PATCH 42/50] Add second derivative as default operator for Tikhonov. 
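When no linear operator is passed, penalize the second derivative, which is
the most common choice since it measures the curvature of the function.
After this change the following two regularizations should be equivalent
(minimal usage sketch, not part of the diff):

    from skfda.misc.operators import LinearDifferentialOperator
    from skfda.misc.regularization import TikhonovRegularization

    # Implicit default: penalize the second derivative.
    reg_default = TikhonovRegularization()

    # Explicit equivalent, as checked by the new test below.
    reg_explicit = TikhonovRegularization(LinearDifferentialOperator(2))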
---
 skfda/misc/regularization/_regularization.py | 22 ++++++++----
 tests/test_regularization.py | 36 ++++++++++++++++++++
 2 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/skfda/misc/regularization/_regularization.py b/skfda/misc/regularization/_regularization.py
index ea920c583..1f89f6a04 100644
--- a/skfda/misc/regularization/_regularization.py
+++ b/skfda/misc/regularization/_regularization.py
@@ -4,9 +4,9 @@
 from typing import Any, Generic, Iterable, Optional, Union
 
 import numpy as np
+import scipy.linalg
 from sklearn.base import BaseEstimator
 
-import scipy.linalg
 from skfda.misc.operators import Identity, gramian_matrix
 
 from ...representation import FData
@@ -38,9 +38,11 @@ class TikhonovRegularization(
     provides several common linear operators.
 
     Parameters:
-        linear_operator: linear operator used for regularization.
-        regularization_parameter: scaling parameter (:math:`\lambda`) of the
-            penalization.
+        linear_operator: Linear operator used for regularization. By default
+            the second derivative, which is related to the function
+            curvature, is penalized.
+        regularization_parameter: Scaling parameter (:math:`\lambda`) of the
+            penalization.
 
     Examples:
         Construct a regularization that penalizes the second derivative,
@@ -88,7 +90,7 @@ class TikhonovRegularization(
 
     def __init__(
        self,
-        linear_operator: Operator[OperatorInput, Any],
+        linear_operator: Optional[Operator[OperatorInput, Any]] = None,
         *,
         regularization_parameter: float = 1,
     ) -> None:
@@ -100,8 +102,16 @@ def penalty_matrix(
         basis: OperatorInput,
     ) -> np.ndarray:
         """Return a penalty matrix for ordinary least squares."""
+        from ..operators import LinearDifferentialOperator
+
+        linear_operator = (
+            LinearDifferentialOperator(2)
+            if self.linear_operator is None
+            else self.linear_operator
+        )
+
         return self.regularization_parameter * gramian_matrix(
-            self.linear_operator,
+            linear_operator,
             basis,
         )
 
diff --git a/tests/test_regularization.py b/tests/test_regularization.py
index 41f839829..c3ccdd81e 100644
--- a/tests/test_regularization.py
+++ b/tests/test_regularization.py
@@ -224,6 +224,42 @@ def test_bspline_penalty_special_case(self) -> None:
         )
 
 
+class TestDefaultTikhonovRegularization(unittest.TestCase):
+    """Test default value of Tikhonov regularization."""
+
+    def test_basis_default(self) -> None:
+        """Test the default operator in basis smoothing."""
+        data_matrix = np.linspace([0, 1, 2, 3], [1, 2, 3, 4], 100)
+
+        fd = skfda.FDataGrid(data_matrix.T)
+
+        smoother = skfda.preprocessing.smoothing.BasisSmoother(
+            basis=skfda.representation.basis.BSpline(
+                n_basis=10,
+                domain_range=fd.domain_range,
+            ),
+            regularization=TikhonovRegularization(),
+        )
+
+        smoother2 = skfda.preprocessing.smoothing.BasisSmoother(
+            basis=skfda.representation.basis.BSpline(
+                n_basis=10,
+                domain_range=fd.domain_range,
+            ),
+            regularization=TikhonovRegularization(
+                LinearDifferentialOperator(2),
+            ),
+        )
+
+        fd_basis = smoother.fit_transform(fd)
+        fd_basis2 = smoother2.fit_transform(fd)
+
+        np.testing.assert_allclose(
+            fd_basis.data_matrix,
+            fd_basis2.data_matrix,
+        )
+
+
 class TestEndpointsDifferenceRegularization(unittest.TestCase):
     """Test regularization with a callable."""
 
From f8aad4707890c8bee5cb346776b7bc10cc7dc510 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Wed, 19 Jan 2022 20:55:03 +0100
Subject: [PATCH 43/50] Improve mRMR.

- Fix bug in the removal of selected variables.
- Improve documentation and add references.
- Allow using "difference" and "quotient" as the criterion.
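About the removal bug: ``indexes`` holds the remaining candidate variables,
while ``max_index`` is a position inside that list, so removing by value
could discard the wrong variable (or raise) once earlier candidates had been
deleted. A toy illustration, assuming ``indexes`` starts as
``list(range(n_features))``:

    indexes = [1, 2, 3]      # candidate 0 was already selected
    max_index = 2            # best remaining candidate: indexes[2] == 3
    indexes.remove(indexes[max_index])  # correct: drops candidate 3
    # The old ``indexes.remove(max_index)`` would have dropped
    # candidate 2 instead.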
--- docs/refs.bib | 20 ++++++ .../dim_reduction/variable_selection/mrmr.py | 69 +++++++++++++++---- 2 files changed, 75 insertions(+), 14 deletions(-) diff --git a/docs/refs.bib b/docs/refs.bib index 4029b7d6f..7a774f61c 100644 --- a/docs/refs.bib +++ b/docs/refs.bib @@ -1,3 +1,5 @@ + + @article{berrendero+cuevas+torrecilla_2016_hunting, author = {Berrendero, J.R. and Cuevas, Antonio and Torrecilla, José}, year = {2016}, @@ -9,6 +11,24 @@ @article{berrendero+cuevas+torrecilla_2016_hunting doi = {10.5705/ss.202014.0014} } +@article{berrendero++_2016_mrmr, + title = {The {{mRMR}} Variable Selection Method: A Comparative Study for Functional Data}, + shorttitle = {The {{mRMR}} Variable Selection Method}, + author = {Berrendero, J.R. and Cuevas, A. and Torrecilla, J.L.}, + year = {2016}, + month = mar, + journal = {Journal of Statistical Computation and Simulation}, + volume = {86}, + number = {5}, + pages = {891--907}, + publisher = {{Taylor \& Francis}}, + issn = {0094-9655}, + doi = {10.1080/00949655.2015.1042378}, + abstract = {The use of variable selection methods is particularly appealing in statistical problems with functional data. The obvious general criterion for variable selection is to choose the `most representative' or `most relevant' variables. However, it is also clear that a purely relevance-oriented criterion could lead to select many redundant variables. The minimum Redundance Maximum Relevance (mRMR) procedure, proposed by Ding and Peng [Minimum redundancy feature selection from microarray gene expression data. J Bioinform Comput Biol. 2005;3:185\textendash 205] and Peng et al. [Feature selection based on mutual information: criteria of max-dependency, max-relevance, and min-redundancy. IEEE Trans Pattern Anal Mach Intell. 2005;27:1226\textendash 1238] is an algorithm to systematically perform variable selection, achieving a reasonable trade-off between relevance and redundancy. In its original form, this procedure is based on the use of the so-called mutual information criterion to assess relevance and redundancy. Keeping the focus on functional data problems, we propose here a modified version of the mRMR method, obtained by replacing the mutual information by the new association measure (called distance correlation) suggested by Sz\'ekely et al. [Measuring and testing dependence by correlation of distances. Ann Statist. 2007;35:2769\textendash 2794]. We have also performed an extensive simulation study, including 1600 functional experiments (100 functional models \texttimes 4 sample sizes \texttimes 4 classifiers) and three real-data examples aimed at comparing the different versions of the mRMR methodology. The results are quite conclusive in favour of the new proposed alternative.}, + keywords = {distance correlation,functional data analysis,Primary: 62H30,Secondary: 62H20,supervised classification,variable selection}, + annotation = {\_eprint: https://doi.org/10.1080/00949655.2015.1042378} +} + @article{berrendero+cuevas+torrecilla_2018_hilbert, author = {José R. Berrendero and Antonio Cuevas and José L. 
Torrecilla},
 title = {On the Use of Reproducing Kernel Hilbert Spaces in Functional Classification},
diff --git a/skfda/preprocessing/dim_reduction/variable_selection/mrmr.py b/skfda/preprocessing/dim_reduction/variable_selection/mrmr.py
index d3c0f8925..22badb531 100644
--- a/skfda/preprocessing/dim_reduction/variable_selection/mrmr.py
+++ b/skfda/preprocessing/dim_reduction/variable_selection/mrmr.py
@@ -26,6 +26,10 @@
 from ....representation.grid import FDataGrid
 
 _Criterion = Callable[[NDArrayFloat, NDArrayFloat], NDArrayFloat]
+_CriterionLike = Union[
+    _Criterion,
+    Literal["difference", "quotient"],
+]
 
 
 class Method(NamedTuple):
@@ -135,7 +139,7 @@ def _mrmr(
         scores.append(coef[max_index])
         selected_relevances.append(relevances[max_index])
 
-        indexes.remove(max_index)
+        indexes.remove(indexes[max_index])
 
     return (
         np.asarray(selected_features),
@@ -148,9 +152,30 @@ class MinimumRedundancyMaximumRelevance(
     sklearn.base.BaseEstimator,  # type: ignore
     sklearn.base.TransformerMixin,  # type: ignore
 ):
-    """
+    r"""
     Minimum redundancy maximum relevance (mRMR) method.
 
+    This is a greedy version of mRMR that selects the variables iteratively.
+    This method considers the relevance of a variable as well as its redundancy
+    with respect to the already selected ones.
+
+    It uses a dependence measure between random variables to compute the
+    dependence between the candidate variable and the target (for the
+    relevance) and another to compute the dependence between two variables
+    (for the redundancy).
+    It combines both measurements using a criterion such as the difference or
+    the quotient, and then selects the variable that maximizes that quantity.
+    For example, using the quotient criterion and the same dependence function
+    :math:`D` for relevance and redundancy, the variable selected at the
+    :math:`i`-th step would be :math:`X(t_i)` with
+
+    .. math::
+        t_i = \underset {t}{\operatorname {arg\,max}} \frac{D(X(t), y)}
+        {\frac{1}{i-1}\sum_{j < i} D(X(t), X(t_j))}.
+
+    For further discussion of the applicability of this method to functional
+    data see :footcite:`berrendero++_2016_mrmr`.
+
     Parameters:
         n_features_to_select: Number of features to select.
         method: Predefined method to use (MID or MIQ).
@@ -160,8 +185,9 @@ class MinimumRedundancyMaximumRelevance(
             relevance.
         redundancy_dependence_measure: Dependence measure used to compute
             redundancy.
-        criterion: Criterion to combine relevance and redundancy. Common
-            choices include the difference and the quotient.
+        criterion: Criterion to combine relevance and redundancy. It must be
+            a Python callable with two inputs. As the difference and quotient
+            are common choices, both can also be specified as strings.
 
     Examples:
         >>> from skfda.preprocessing.dim_reduction import variable_selection
@@ -227,10 +253,24 @@ class MinimumRedundancyMaximumRelevance(
         >>> mrmr = variable_selection.MinimumRedundancyMaximumRelevance(
         ...     n_features_to_select=3,
         ...     dependence_measure=dcor.u_distance_correlation_sqr,
-        ...     criterion=operator.truediv,
+        ...     criterion="quotient",
         ... )
         >>> _ = mrmr.fit(X, y)
 
+        As a toy example illustrating the customizability of this method,
+        consider the following:
+
+        >>> mrmr = variable_selection.MinimumRedundancyMaximumRelevance(
+        ...     n_features_to_select=3,
+        ...     relevance_dependence_measure=dcor.u_distance_covariance_sqr,
+        ...     redundancy_dependence_measure=dcor.u_distance_correlation_sqr,
+        ...     criterion=lambda rel, red: 0.5 * rel / red,
+        ... )
+        >>> _ = mrmr.fit(X, y)
+
+    References:
+        .. 
footbibliography::
+
     """
 
     @overload
@@ -256,7 +296,7 @@ def __init__(
         *,
         n_features_to_select: int = 1,
         dependence_measure: _DependenceMeasure,
-        criterion: _Criterion,
+        criterion: _CriterionLike,
     ) -> None:
         pass
 
@@ -267,7 +307,7 @@ def __init__(
         n_features_to_select: int = 1,
         relevance_dependence_measure: _DependenceMeasure,
         redundancy_dependence_measure: _DependenceMeasure,
-        criterion: _Criterion,
+        criterion: _CriterionLike,
     ) -> None:
         pass
 
@@ -279,7 +319,7 @@ def __init__(
         dependence_measure: Optional[_DependenceMeasure] = None,
         relevance_dependence_measure: Optional[_DependenceMeasure] = None,
         redundancy_dependence_measure: Optional[_DependenceMeasure] = None,
-        criterion: Optional[_Criterion] = None,
+        criterion: Optional[_CriterionLike] = None,
     ) -> None:
         self.n_features_to_select = n_features_to_select
         self.method = method
@@ -324,9 +364,7 @@ def _validate_parameters(self) -> None:
             self.redundancy_dependence_measure_ = (
                 method.redundancy_dependence_measure
            )
-            self.criterion_ = (
-                method.criterion
-            )
+            self.criterion_ = method.criterion
 
         else:
             if self.criterion is None:
@@ -334,9 +372,12 @@ def _validate_parameters(self) -> None:
                     "You must specify a criterion parameter",
                 )
 
-            self.criterion_ = (
-                self.criterion
-            )
+            if self.criterion == "difference":
+                self.criterion_ = operator.sub
+            elif self.criterion == "quotient":
+                self.criterion_ = operator.truediv
+            else:
+                self.criterion_ = self.criterion
 
             if self.dependence_measure:
                 if (

From 601e8d54512676d0a7ca65163e50bab25c09c135 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Thu, 20 Jan 2022 13:21:09 +0100
Subject: [PATCH 44/50] Improve visualization docs.

---
 docs/modules/exploratory/stats.rst            |   9 ++
 docs/modules/exploratory/visualization.rst    | 135 ++++++++++++++++--
 .../exploratory/visualization/boxplot.rst     |  28 ----
 .../exploratory/visualization/clustering.rst  |  23 ---
 .../exploratory/visualization/fpca.rst        |  14 --
 .../visualization/magnitude_shape_plot.rst    |  19 ---
 .../exploratory/visualization/_outliergram.py |  11 +-
 7 files changed, 142 insertions(+), 97 deletions(-)
 delete mode 100644 docs/modules/exploratory/visualization/boxplot.rst
 delete mode 100644 docs/modules/exploratory/visualization/clustering.rst
 delete mode 100644 docs/modules/exploratory/visualization/fpca.rst
 delete mode 100644 docs/modules/exploratory/visualization/magnitude_shape_plot.rst

diff --git a/docs/modules/exploratory/stats.rst b/docs/modules/exploratory/stats.rst
index 6d9b7e8e7..bd36cfc0e 100644
--- a/docs/modules/exploratory/stats.rst
+++ b/docs/modules/exploratory/stats.rst
@@ -32,4 +32,13 @@ statistics can be used.
    skfda.exploratory.stats.cov
    skfda.exploratory.stats.var
 
+Additional statistics
+---------------------
+
+The following statistics can be used to estimate additional properties of the data.
+
+.. autosummary::
+   :toctree: autosummary
+
+   skfda.exploratory.stats.modified_epigraph_index
 
diff --git a/docs/modules/exploratory/visualization.rst b/docs/modules/exploratory/visualization.rst
index a2de8fb3a..5affcead0 100644
--- a/docs/modules/exploratory/visualization.rst
+++ b/docs/modules/exploratory/visualization.rst
@@ -1,14 +1,131 @@
 Visualization
 =============
 
-The visualization package provides tools to show different views of
-the functional data, that highlight several important aspects of it.
+Visualization methods are one of the most important tools for exploratory analysis.
+They can provide intuition about the data that would be very difficult to obtain otherwise.
+As functional data is infinite-dimensional, good visualization tools capable of
+summarizing and illustrating the main features of the data are of particular importance.
+The visualization module provides a thorough collection of these tools.
+Each of them highlights different characteristics of the data and thus they complement each other.
+
+Basic representation
+--------------------
+
+Functional data with :term:`domain` dimension of 1 or 2 can be represented directly as function
+graphs, which will be curves or surfaces, respectively. Each :term:`codomain` dimension will be plotted
+separately.
+Additionally, for discretized data, the discretization points can be plotted as a scatter plot.
+The following classes implement these plotting methods.
+
+.. autosummary::
+   :toctree: autosummary
+
+   skfda.exploratory.visualization.representation.GraphPlot
+   skfda.exploratory.visualization.representation.ScatterPlot
 
-.. toctree::
-   :maxdepth: 4
-   :caption: Modules:
+Note that the :func:`~skfda.representation.FData.plot` and
+:func:`~skfda.representation.grid.FDataGrid.plot` methods simply instantiate and plot an object
+of one of these classes.
+
+Parametric plot
+---------------
+
+Parametric plots are used to plot one function versus another when they have the same :term:`domain`.
+This is used for example in phase-plane plots, showing the relation between two derivatives
+of different order.
+It is also useful to plot observations corresponding to curves in 2D, as it shows both dimensions
+of the :term:`codomain` in the same plot.
+
+.. autosummary::
+   :toctree: autosummary
+
+   skfda.exploratory.visualization.ParametricPlot
+
+Functional Data Boxplot
+-----------------------
+
+The functional data boxplot is an extension of the univariate boxplot to the functional data domain.
+As such, it is a very useful tool to detect outliers and check the magnitude of the variation of the data.
+There are two variants of this plot, depending on the number of dimensions (1 or 2) of the :term:`domain`.
+
+If the dimension of the :term:`domain` is 1, the following class must be used.
+See the :ref:`sphx_glr_auto_examples_plot_boxplot.py` example for detailed explanation.
+
+.. autosummary::
+   :toctree: autosummary
+
+   skfda.exploratory.visualization.Boxplot
+
+If the dimension of the :term:`domain` is 2, this one. See the
+:ref:`sphx_glr_auto_examples_plot_surface_boxplot.py`
+example for detailed explanation.
+
+.. autosummary::
+   :toctree: autosummary
+
+   skfda.exploratory.visualization.SurfaceBoxplot
+
+Outliergram
+-----------
+
+The outliergram represents each functional observation as a point whose coordinates are its
+:class:`modified band depth <skfda.exploratory.depth.ModifiedBandDepth>` and its
+:func:`modified epigraph index <skfda.exploratory.stats.modified_epigraph_index>`.
+These quantities are related, and in the absence of crossings between observations the points
+should lie on a parabola.
+Thus, substantial deviations from that behavior characterize observations that are shape
+outliers.
+
+.. autosummary::
+   :toctree: autosummary
+
+   skfda.exploratory.visualization.Outliergram
+
+Magnitude-Shape Plot
+--------------------
+
+The Magnitude-Shape plot tries to summarize the shape and magnitude of an observation as real
+numbers, and plot them in a scatter plot.
+In addition, it computes an ellipse, which serves as a decision boundary for detecting outliers.
+
+This is a very useful tool to detect shape and magnitude outliers and differentiate between them.
+
+.. autosummary::
+   :toctree: autosummary
+
+   skfda.exploratory.visualization.MagnitudeShapePlot
+
+Clustering Plots
+----------------
+In order to show the results of the cluster algorithms in a visual way,
+:mod:`this module <skfda.exploratory.visualization.clustering>` is
+implemented. It contains the following classes:
+
+.. autosummary::
+   :toctree: autosummary
+
+   skfda.exploratory.visualization.clustering.ClusterPlot
+   skfda.exploratory.visualization.clustering.ClusterMembershipLinesPlot
+   skfda.exploratory.visualization.clustering.ClusterMembershipPlot
+
+In the first one, the samples of the FDataGrid are divided into clusters, which
+are assigned different colors. The last two are only valid for the
+class :class:`FuzzyKMeans`, as they show
+the results graphically in the form of a parallel coordinates plot or a barplot,
+respectively.
+
+See `Clustering Example <../auto_examples/plot_clustering.html>`_ for detailed
+explanation.
+
+Functional Principal Component Analysis plots
+---------------------------------------------
+In order to show the modes of variation that the principal components represent,
+the following class is implemented:
+
+.. autosummary::
+   :toctree: autosummary
+
+   skfda.exploratory.visualization.fpca.FPCAPlot
+
+See the example :ref:`sphx_glr_auto_examples_plot_fpca.py` for detailed
+explanation.
\ No newline at end of file
diff --git a/docs/modules/exploratory/visualization/boxplot.rst b/docs/modules/exploratory/visualization/boxplot.rst
deleted file mode 100644
index 9b7ff4a54..000000000
--- a/docs/modules/exploratory/visualization/boxplot.rst
+++ /dev/null
@@ -1,28 +0,0 @@
-Functional Data Boxplot
-=======================
-
-Classes to construct the functional data boxplot. Only supported for
-functional data with :term:`domain` dimension 1 or 2 and as many dimensions on
-the :term:`codomain` as required.
-
-If the dimension of the :term:`domain` is 1, the following class must be used.
-See the :ref:`sphx_glr_auto_examples_plot_boxplot.py` example for detailed explanation.
-
-.. autosummary::
-   :toctree: autosummary
-
-   skfda.exploratory.visualization.Boxplot
-
-If the dimension of the :term:`domain` is 2, this one. See the :ref:`sphx_glr_auto_examples_plot_surface_boxplot.py`
-example for detailed explanation.
-
-.. autosummary::
-   :toctree: autosummary
-
-   skfda.exploratory.visualization.SurfaceBoxplot
-
-
-
-
-
-
diff --git a/docs/modules/exploratory/visualization/clustering.rst b/docs/modules/exploratory/visualization/clustering.rst
deleted file mode 100644
index e8bb23d86..000000000
--- a/docs/modules/exploratory/visualization/clustering.rst
+++ /dev/null
@@ -1,23 +0,0 @@
-Clustering Plots
-================
-In order to show the results of the cluster algorithms in a visual way,
-:mod:`this module ` is
-implemented. It contains the following classes:
-
-.. autosummary::
-   :toctree: autosummary
-
-   skfda.exploratory.visualization.clustering.ClusterPlot
-   skfda.exploratory.visualization.clustering.ClusterMembershipLinesPlot
-   skfda.exploratory.visualization.clustering.ClusterMembershipPlot
-
-In the first one, the samples of the FDataGrid are divided by clusters which
-are assigned different colors. The following functions, are only valid for the
-class :class:`FuzzyKMeans ` to see
-the results graphically in the form of a parallel coordinates plot or a barplot
-respectively.
-
-See `Clustering Example <../auto_examples/plot_clustering.html>`_ for detailed
-explanation.
-
-
diff --git a/docs/modules/exploratory/visualization/fpca.rst b/docs/modules/exploratory/visualization/fpca.rst
deleted file mode 100644
index 141769ef4..000000000
--- a/docs/modules/exploratory/visualization/fpca.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-Functional Principal Component Analysis plots
-=============================================
-In order to show the modes of variation that the principal components represent,
-the following class is implemented
-
-.. autosummary::
-   :toctree: autosummary
-
-   skfda.exploratory.visualization.fpca.FPCAPlot
-
-See the example :ref:`sphx_glr_auto_examples_plot_fpca.py` for detailed
-explanation.
-
-
diff --git a/docs/modules/exploratory/visualization/magnitude_shape_plot.rst b/docs/modules/exploratory/visualization/magnitude_shape_plot.rst
deleted file mode 100644
index 17668c2ed..000000000
--- a/docs/modules/exploratory/visualization/magnitude_shape_plot.rst
+++ /dev/null
@@ -1,19 +0,0 @@
-Magnitude-Shape Plot
-====================
-
-The Magnitude-Shape Plot is implemented in the
-:class:`~skfda.exploratory.visualization.MagnitudeShapePlot` class.
-
-The :class:`~skfda.exploratory.visualization.MagnitudeShapePlot` needs both the mean
-and the variation of the directional outlyingness of the samples, which is calculated using
-:func:`~skfda.exploratory.outliers.directional_outlyingness_stats`.
-
-Once the points assigned to each of the samples are obtained from the above
-function, an outlier detection method is implemented. The results can be shown
-calling the :meth:`~skfda.magnitude_shape_plot.MagnitudeShapePlot.plot`
-method of the class.
-
-.. autosummary::
-   :toctree: autosummary
-
-   skfda.exploratory.visualization.MagnitudeShapePlot
diff --git a/skfda/exploratory/visualization/_outliergram.py b/skfda/exploratory/visualization/_outliergram.py
index d7410f41f..7b45e76ff 100644
--- a/skfda/exploratory/visualization/_outliergram.py
+++ b/skfda/exploratory/visualization/_outliergram.py
@@ -23,10 +23,13 @@ class Outliergram(BasePlot):
     """
     Outliergram method of visualization.
 
-    Plots the Modified Band Depth (MBD) on the Y axis and the Modified
-    Epigraph Index (MEI) on the X axis. This points will create the form of
-    a parabola. The shape outliers will be the points that appear far from
-    this curve.
+    Plots the :class:`Modified Band Depth
+    (MBD) <skfda.exploratory.depth.ModifiedBandDepth>` on the Y axis and the
+    :func:`Modified Epigraph Index
+    (MEI) <skfda.exploratory.stats.modified_epigraph_index>` on the X axis.
+    These points will approximately form a parabola.
+    The shape outliers will be the points that appear far from this curve.
+
     Args:
         fdata: functional data set that we want to examine.
         chart: figure over with the graphs are plotted or axis over

From 0bc020cecc1b83fa7821dd584977677db23ccd40 Mon Sep 17 00:00:00 2001
From: vnmabus
Date: Fri, 21 Jan 2022 01:32:29 +0100
Subject: [PATCH 45/50] Simplify regularization.
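
After this change the public API reduces to a single class. A brief
sketch, assuming the renames in this patch:

    from skfda.misc.operators import LinearDifferentialOperator
    from skfda.misc.regularization import L2Regularization

    # Plain L2 penalty; the identity operator is now the default:
    reg_identity = L2Regularization()

    # Penalizing curvature, as TikhonovRegularization was used before:
    reg_curvature = L2Regularization(LinearDifferentialOperator(2))

TikhonovRegularization is kept as a deprecated alias that only emits a
DeprecationWarning.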
--- docs/modules/misc/regularization.rst | 10 +-- skfda/misc/regularization/_regularization.py | 77 +++++++------------ skfda/ml/regression/_linear_regression.py | 13 ++-- .../dim_reduction/feature_extraction/_fpca.py | 7 +- skfda/preprocessing/smoothing/_basis.py | 12 +-- tests/test_fpca.py | 11 +-- tests/test_regression.py | 10 +-- tests/test_regularization.py | 20 +++-- tests/test_smoothing.py | 12 +-- 9 files changed, 74 insertions(+), 98 deletions(-) diff --git a/docs/modules/misc/regularization.rst b/docs/modules/misc/regularization.rst index e68aeb7c2..7ff55fe5f 100644 --- a/docs/modules/misc/regularization.rst +++ b/docs/modules/misc/regularization.rst @@ -17,11 +17,8 @@ When dealing with multivariate data, a common choice for the regularization is to penalize the squared Euclidean norm, or :math:`L_2` norm, of the vectors in order to obtain simpler solutions. This can be done in scikit-fda for both multivariate and functional data using the :class:`L2Regularization` -class. - -A more flexible generalization of this approach is the so called Tikhonov -regularization, available as :class:`TikhonovRegularization`, in which the -squared :math:`L_2` norm is penalized after a particular linear operator is +class. A more flexible generalization of this approach is to penalize the +squared :math:`L_2` norm after a particular linear operator is applied. This for example allows to penalize the second derivative of a curve, which is a measure of its curvature, because the differential operator is linear. As arbitrary Python callables can be used as operators (provided @@ -32,5 +29,4 @@ linear operations. .. autosummary:: :toctree: autosummary - skfda.misc.regularization.L2Regularization - skfda.misc.regularization.TikhonovRegularization \ No newline at end of file + skfda.misc.regularization.L2Regularization \ No newline at end of file diff --git a/skfda/misc/regularization/_regularization.py b/skfda/misc/regularization/_regularization.py index 1f89f6a04..9485161ea 100644 --- a/skfda/misc/regularization/_regularization.py +++ b/skfda/misc/regularization/_regularization.py @@ -1,6 +1,7 @@ from __future__ import annotations import itertools +import warnings from typing import Any, Generic, Iterable, Optional, Union import numpy as np @@ -10,17 +11,18 @@ from skfda.misc.operators import Identity, gramian_matrix from ...representation import FData +from ...representation._typing import NDArrayFloat from ...representation.basis import Basis from ..operators import Operator from ..operators._operators import OperatorInput -class TikhonovRegularization( +class L2Regularization( BaseEstimator, # type: ignore Generic[OperatorInput], ): r""" - Implements Tikhonov regularization. + Implements :math:`L_2` (Tikhonov) regularization. The penalization term in this type of regularization is the square of the :math:`L_2` (Euclidean) norm of a linear operator applied to the function @@ -48,27 +50,21 @@ class TikhonovRegularization( Construct a regularization that penalizes the second derivative, which is a measure of the curvature of the function. - >>> from skfda.misc.regularization import TikhonovRegularization + >>> from skfda.misc.regularization import L2Regularization >>> from skfda.misc.operators import LinearDifferentialOperator >>> - >>> regularization = TikhonovRegularization( - ... LinearDifferentialOperator(2)) + >>> regularization = L2Regularization( + ... LinearDifferentialOperator(2), + ... 
) - Construct a regularization that penalizes the identity operator, - that is, completely equivalent to the :math:`L_2` regularization ( - :class:`L2Regularization`). + By default the regularization penalizes the identity operator: - >>> from skfda.misc.regularization import TikhonovRegularization - >>> from skfda.misc.operators import Identity - >>> - >>> regularization = TikhonovRegularization(Identity()) + >>> regularization = L2Regularization() Construct a regularization that penalizes the difference between the points :math:`f(1)` and :math:`f(0)` of a function :math:`f`. - >>> from skfda.misc.regularization import TikhonovRegularization - >>> - >>> regularization = TikhonovRegularization(lambda x: x(1) - x(0)) + >>> regularization = L2Regularization(lambda x: x(1) - x(0)) Construct a regularization that penalizes the harmonic acceleration operator :math:`Lf = \omega^2 D f + D^3 f`, that, when the @@ -77,14 +73,13 @@ class TikhonovRegularization( :math:`\omega` is the angular frequency. This is useful for some periodic functions. - >>> from skfda.misc.regularization import TikhonovRegularization - >>> from skfda.misc.operators import LinearDifferentialOperator >>> import numpy as np >>> >>> period = 1 >>> w = 2 * np.pi / period - >>> regularization = TikhonovRegularization( - ... LinearDifferentialOperator([0, w**2, 0, 1])) + >>> regularization = L2Regularization( + ... LinearDifferentialOperator([0, w**2, 0, 1]), + ... ) """ @@ -100,12 +95,10 @@ def __init__( def penalty_matrix( self, basis: OperatorInput, - ) -> np.ndarray: + ) -> NDArrayFloat: """Return a penalty matrix for ordinary least squares.""" - from ..operators import LinearDifferentialOperator - linear_operator = ( - LinearDifferentialOperator(2) + Identity() if self.linear_operator is None else self.linear_operator ) @@ -116,43 +109,31 @@ def penalty_matrix( ) -class L2Regularization( - TikhonovRegularization[Union[np.ndarray, FData, Basis]], +class TikhonovRegularization( + L2Regularization[OperatorInput], ): - r""" - Implements :math:`L_2` regularization. - - The penalization term in this type of regularization is the square of the - :math:`L_2` (Euclidean) norm of the function or vector - - .. math:: - \lambda \| x \|_2^2 - - where :math:`\lambda` is a positive real number. - - This is equivalent to Tikhonov regularization ( - :class:`TikhonovRegularization`) using the identity operator ( - :class:`Identity`). - - Parameters: - regularization_parameter: scaling parameter (:math:`\lambda`) of the - penalization. - - """ def __init__( self, + linear_operator: Optional[Operator[OperatorInput, Any]] = None, *, regularization_parameter: float = 1, ) -> None: + + warnings.warn( + "Class TikhonovRegularization is deprecated. Use class " + "L2Regularization instead.", + DeprecationWarning, + ) + return super().__init__( - linear_operator=Identity(), + linear_operator=linear_operator, regularization_parameter=regularization_parameter, ) BasisTypes = Union[np.ndarray, FData, Basis] -Regularization = TikhonovRegularization[Any] +Regularization = L2Regularization[Any] RegularizationLike = Union[ None, Regularization, @@ -164,7 +145,7 @@ def compute_penalty_matrix( basis_iterable: Iterable[BasisTypes], regularization_parameter: Union[float, Iterable[float]], regularization: RegularizationLike, -) -> Optional[np.ndarray]: +) -> Optional[NDArrayFloat]: """ Compute the regularization matrix for a linear differential operator. 
diff --git a/skfda/ml/regression/_linear_regression.py b/skfda/ml/regression/_linear_regression.py index 5de4d4eb1..016c70015 100644 --- a/skfda/ml/regression/_linear_regression.py +++ b/skfda/ml/regression/_linear_regression.py @@ -9,23 +9,20 @@ from sklearn.utils.validation import check_is_fitted from ...misc.lstsq import solve_regularized_weighted_lstsq -from ...misc.regularization import ( - TikhonovRegularization, - compute_penalty_matrix, -) +from ...misc.regularization import L2Regularization, compute_penalty_matrix from ...representation import FData from ...representation.basis import Basis from ._coefficients import CoefficientInfo, coefficient_info_from_covariate RegularizationType = Union[ - TikhonovRegularization[Any], - Sequence[Optional[TikhonovRegularization[Any]]], + L2Regularization[Any], + Sequence[Optional[L2Regularization[Any]]], None, ] RegularizationIterableType = Union[ - TikhonovRegularization[Any], - Iterable[Optional[TikhonovRegularization[Any]]], + L2Regularization[Any], + Iterable[Optional[L2Regularization[Any]]], None, ] diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py index 8e6b9f2e4..9d81298ae 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py @@ -12,10 +12,7 @@ from ....misc import inner_product_matrix from ....misc.metrics import l2_norm -from ....misc.regularization import ( - TikhonovRegularization, - compute_penalty_matrix, -) +from ....misc.regularization import L2Regularization, compute_penalty_matrix from ....representation import FData from ....representation._typing import ArrayLike from ....representation.basis import Basis, FDataBasis @@ -100,7 +97,7 @@ def __init__( self, n_components: int = 3, centering: bool = True, - regularization: Optional[TikhonovRegularization[FData]] = None, + regularization: Optional[L2Regularization[FData]] = None, weights: Optional[Union[ArrayLike, WeightsCallable]] = None, components_basis: Optional[Basis] = None, ) -> None: diff --git a/skfda/preprocessing/smoothing/_basis.py b/skfda/preprocessing/smoothing/_basis.py index 2a128a512..21b089a9c 100644 --- a/skfda/preprocessing/smoothing/_basis.py +++ b/skfda/preprocessing/smoothing/_basis.py @@ -13,7 +13,7 @@ from ..._utils import _cartesian_product, _to_grid_points from ...misc.lstsq import LstsqMethod, solve_regularized_weighted_lstsq -from ...misc.regularization import TikhonovRegularization +from ...misc.regularization import L2Regularization from ...representation import FData, FDataBasis, FDataGrid from ...representation._typing import GridPointsLike, NDArrayFloat from ...representation.basis import Basis @@ -145,7 +145,7 @@ class BasisSmoother(_LinearSmoother): We can penalize approximations that are not smooth enough using some kind of regularization: - >>> from skfda.misc.regularization import TikhonovRegularization + >>> from skfda.misc.regularization import L2Regularization >>> from skfda.misc.operators import LinearDifferentialOperator >>> >>> fd = skfda.FDataGrid(data_matrix=x, grid_points=t) @@ -153,7 +153,7 @@ class BasisSmoother(_LinearSmoother): >>> smoother = skfda.preprocessing.smoothing.BasisSmoother( ... basis, ... method='cholesky', - ... regularization=TikhonovRegularization( + ... regularization=L2Regularization( ... LinearDifferentialOperator([0.1, 0.2]), ... ), ... 
return_basis=True, @@ -167,7 +167,7 @@ class BasisSmoother(_LinearSmoother): >>> smoother = skfda.preprocessing.smoothing.BasisSmoother( ... basis, ... method='qr', - ... regularization=TikhonovRegularization( + ... regularization=L2Regularization( ... LinearDifferentialOperator([0.1, 0.2]), ... ), ... return_basis=True, @@ -181,7 +181,7 @@ class BasisSmoother(_LinearSmoother): >>> smoother = skfda.preprocessing.smoothing.BasisSmoother( ... basis, ... method='svd', - ... regularization=TikhonovRegularization( + ... regularization=L2Regularization( ... LinearDifferentialOperator([0.1, 0.2]), ... ), ... return_basis=True, @@ -209,7 +209,7 @@ def __init__( *, smoothing_parameter: float = 1.0, weights: Optional[NDArrayFloat] = None, - regularization: Optional[TikhonovRegularization[FDataGrid]] = None, + regularization: Optional[L2Regularization[FDataGrid]] = None, output_points: Optional[GridPointsLike] = None, method: LstsqMethod = 'svd', return_basis: bool = False, diff --git a/tests/test_fpca.py b/tests/test_fpca.py index 93dce7529..f7e790f5f 100644 --- a/tests/test_fpca.py +++ b/tests/test_fpca.py @@ -2,14 +2,15 @@ import unittest import numpy as np +from sklearn.decomposition import PCA + import skfda from skfda import FDataBasis, FDataGrid from skfda.datasets import fetch_weather from skfda.misc.operators import LinearDifferentialOperator -from skfda.misc.regularization import TikhonovRegularization +from skfda.misc.regularization import L2Regularization from skfda.preprocessing.dim_reduction.feature_extraction import FPCA from skfda.representation.basis import Basis, BSpline, Fourier -from sklearn.decomposition import PCA class FPCATestCase(unittest.TestCase): @@ -65,7 +66,7 @@ def test_basis_fpca_fit_result(self) -> None: fpca = FPCA( n_components=n_components, - regularization=TikhonovRegularization( + regularization=L2Regularization( LinearDifferentialOperator(2), regularization_parameter=1e5, ), @@ -119,7 +120,7 @@ def test_basis_fpca_transform_result(self) -> None: fpca = FPCA( n_components=n_components, - regularization=TikhonovRegularization( + regularization=L2Regularization( LinearDifferentialOperator(2), regularization_parameter=1e5, ), @@ -438,7 +439,7 @@ def test_grid_fpca_regularization_fit_result(self) -> None: fpca = FPCA( n_components=n_components, weights=[1] * 365, - regularization=TikhonovRegularization( + regularization=L2Regularization( LinearDifferentialOperator(2), ), ) diff --git a/tests/test_regression.py b/tests/test_regression.py index 9b17891b8..4680b9d0b 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -6,7 +6,7 @@ from skfda.datasets import make_gaussian, make_gaussian_process from skfda.misc.covariances import Gaussian from skfda.misc.operators import LinearDifferentialOperator -from skfda.misc.regularization import TikhonovRegularization +from skfda.misc.regularization import L2Regularization from skfda.ml.regression import HistoricalLinearRegression, LinearRegression from skfda.representation.basis import BSpline, FDataBasis, Fourier, Monomial from skfda.representation.grid import FDataGrid @@ -129,8 +129,8 @@ def test_regression_mixed_regularization(self): y = 2 + y_sum + y_integral scalar = LinearRegression( - regularization=[TikhonovRegularization(lambda x: x), - TikhonovRegularization( + regularization=[L2Regularization(lambda x: x), + L2Regularization( LinearDifferentialOperator(2))]) scalar.fit(X, y) @@ -177,7 +177,7 @@ def test_regression_regularization(self): scalar = LinearRegression( coef_basis=[beta_basis], - 
regularization=TikhonovRegularization( + regularization=L2Regularization( LinearDifferentialOperator(2))) scalar.fit(x_fd, y) np.testing.assert_allclose(scalar.coef_[0].coefficients, @@ -213,7 +213,7 @@ def test_regression_regularization(self): y_reg = [5.333, 3.419, 2.697, 11.366] scalar_reg = LinearRegression( - regularization=TikhonovRegularization( + regularization=L2Regularization( LinearDifferentialOperator(2))) scalar_reg.fit(x_fd, y) np.testing.assert_allclose(scalar_reg.coef_[0].coefficients, diff --git a/tests/test_regularization.py b/tests/test_regularization.py index c3ccdd81e..13aa9dc79 100644 --- a/tests/test_regularization.py +++ b/tests/test_regularization.py @@ -11,12 +11,16 @@ from sklearn.model_selection._split import train_test_split import skfda -from skfda.misc.operators import LinearDifferentialOperator, gramian_matrix +from skfda.misc.operators import ( + Identity, + LinearDifferentialOperator, + gramian_matrix, +) from skfda.misc.operators._linear_differential_operator import ( _monomial_evaluate_constant_linear_diff_op, ) from skfda.misc.operators._operators import gramian_matrix_numerical -from skfda.misc.regularization import L2Regularization, TikhonovRegularization +from skfda.misc.regularization import L2Regularization from skfda.ml.regression import LinearRegression from skfda.representation.basis import ( Basis, @@ -224,8 +228,8 @@ def test_bspline_penalty_special_case(self) -> None: ) -class TestDefaultTikhonovRegularization(unittest.TestCase): - """Test default value of Tikhonov regularization.""" +class TestDefaultL2Regularization(unittest.TestCase): + """Test default value of L2 regularization.""" def test_basis_default(self) -> None: """Test that in basis smoothing.""" @@ -238,7 +242,7 @@ def test_basis_default(self) -> None: n_basis=10, domain_range=fd.domain_range, ), - regularization=TikhonovRegularization(), + regularization=L2Regularization(), ) smoother2 = skfda.preprocessing.smoothing.BasisSmoother( @@ -246,8 +250,8 @@ def test_basis_default(self) -> None: n_basis=10, domain_range=fd.domain_range, ), - regularization=TikhonovRegularization( - LinearDifferentialOperator(2), + regularization=L2Regularization( + Identity(), ), ) @@ -274,7 +278,7 @@ def test_basis_conversion(self) -> None: n_basis=10, domain_range=fd.domain_range, ), - regularization=TikhonovRegularization( + regularization=L2Regularization( lambda x: x(1)[:, 0] - x(0)[:, 0], ), smoothing_parameter=10000, diff --git a/tests/test_smoothing.py b/tests/test_smoothing.py index e3f9b0bb3..d7fccd137 100644 --- a/tests/test_smoothing.py +++ b/tests/test_smoothing.py @@ -9,7 +9,7 @@ import skfda.preprocessing.smoothing.validation as validation from skfda._utils import _check_estimator from skfda.misc.operators import LinearDifferentialOperator -from skfda.misc.regularization import TikhonovRegularization +from skfda.misc.regularization import L2Regularization from skfda.representation.basis import BSpline, Monomial from skfda.representation.grid import FDataGrid @@ -82,7 +82,7 @@ def test_cholesky(self): smoother = smoothing.BasisSmoother( basis=basis, smoothing_parameter=10, - regularization=TikhonovRegularization( + regularization=L2Regularization( LinearDifferentialOperator(2)), method='cholesky', return_basis=True) @@ -100,7 +100,7 @@ def test_qr(self): smoother = smoothing.BasisSmoother( basis=basis, smoothing_parameter=10, - regularization=TikhonovRegularization( + regularization=L2Regularization( LinearDifferentialOperator(2)), method='qr', return_basis=True) @@ -120,7 +120,7 @@ def 
test_monomial_smoothing(self): smoother = smoothing.BasisSmoother( basis=basis, smoothing_parameter=1, - regularization=TikhonovRegularization( + regularization=L2Regularization( LinearDifferentialOperator(2)), return_basis=True) fd_basis = smoother.fit_transform(fd) @@ -143,7 +143,7 @@ def test_vector_valued_smoothing(self) -> None: basis_smoother = smoothing.BasisSmoother( basis, - regularization=TikhonovRegularization( + regularization=L2Regularization( LinearDifferentialOperator(2)), return_basis=True, smoothing_parameter=1, @@ -151,7 +151,7 @@ def test_vector_valued_smoothing(self) -> None: basis_smoother_dim = smoothing.BasisSmoother( basis_dim, - regularization=TikhonovRegularization( + regularization=L2Regularization( LinearDifferentialOperator(2)), return_basis=True, smoothing_parameter=1, From 6a26fc6c37476ee5e616641c2a744e53220fc94c Mon Sep 17 00:00:00 2001 From: vnmabus Date: Fri, 21 Jan 2022 23:02:31 +0100 Subject: [PATCH 46/50] Fixed per class transformer tags. --- skfda/_utils/_utils.py | 12 ++-- .../_per_class_transformer.py | 56 +++++++++++++------ 2 files changed, 46 insertions(+), 22 deletions(-) diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index 6ff4e61a8..c2518206f 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -48,9 +48,9 @@ from ..representation.basis import Basis T = TypeVar("T", bound=FData) -Input = TypeVar("Input") -Output = TypeVar("Output") -Target = TypeVar("Target") +Input = TypeVar("Input", bound=Union[FData, NDArrayFloat]) +Output = TypeVar("Output", bound=Union[FData, NDArrayFloat]) +Target = TypeVar("Target", bound=NDArrayInt) def check_is_univariate(fd: FData) -> None: @@ -113,7 +113,7 @@ def _check_compatible_fdatagrid(fdata1: FDataGrid, fdata2: FDataGrid) -> None: def _to_grid( X: FData, y: FData, - eval_points: Optional[np.ndarray] = None, + eval_points: Optional[NDArrayFloat] = None, ) -> Tuple[FDataGrid, FDataGrid]: """Transform a pair of FDatas in grids to perform calculations.""" from .. 
import FDataGrid
@@ -751,8 +751,8 @@ def _classifier_fit_depth_methods(
 
 
 def _fit_feature_transformer(  # noqa: WPS320 WPS234
-    X: Union[NDArrayInt, NDArrayFloat],
-    y: Union[NDArrayInt, NDArrayFloat],
+    X: Input,
+    y: Target,
     transformer: TransformerMixin[Input, Output, Target],
 ) -> Tuple[
     Union[NDArrayInt, NDArrayFloat],
diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py
index e101fb33c..e00b887d5 100644
--- a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py
+++ b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_transformer.py
@@ -2,21 +2,24 @@
 from __future__ import annotations
 
 import warnings
-from typing import TypeVar, Union
+from typing import Any, Mapping, TypeVar, Union
 
 import numpy as np
 from pandas import DataFrame
 from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted
 
 from ...._utils import TransformerMixin, _fit_feature_transformer
-from ....representation._typing import NDArrayInt
+from ....representation import FData
+from ....representation._typing import NDArrayFloat, NDArrayInt
 from ....representation.basis import FDataBasis
 from ....representation.grid import FDataGrid
 
-Input = TypeVar("Input")
-Output = TypeVar("Output")
+Input = TypeVar("Input", bound=Union[FData, NDArrayFloat])
+Output = TypeVar("Output", bound=Union[DataFrame, NDArrayFloat])
 Target = TypeVar("Target", bound=NDArrayInt)
 
+TransformerOutput = Union[FData, NDArrayFloat]
+
 
 class PerClassTransformer(TransformerMixin[Input, Output, Target]):
     r"""Per class feature transformer for functional data.
@@ -149,13 +152,27 @@ class PerClassTransformer(TransformerMixin[Input, Output, Target]):
 
     def __init__(
        self,
-        transformer: TransformerMixin[Input, Output, Target],
+        transformer: TransformerMixin[Input, TransformerOutput, Target],
         *,
         array_output: bool = False,
     ) -> None:
         self.transformer = transformer
         self.array_output = array_output
 
+    def _more_tags(self) -> Mapping[str, Any]:
+        parent_tags = super()._more_tags()
+        transformer_tags = self.transformer._get_tags()  # noqa: WPS437
+
+        return {
+            **parent_tags,
+            'allow_nan': transformer_tags['allow_nan'],
+            'non_deterministic': transformer_tags['non_deterministic'],
+            'pairwise': transformer_tags['pairwise'],
+            'requires_positive_X': transformer_tags['requires_positive_X'],
+            'requires_y': True,
+            'X_types': transformer_tags['X_types'],
+        }
+
     def _validate_transformer(
         self,
     ) -> None:
@@ -185,18 +202,25 @@ def _validate_transformer(
 
         tags = self.transformer._get_tags()  # noqa: WPS437
 
-        if tags['stateless'] or not tags['requires_y']:
+        if tags['stateless']:
+            warnings.warn(
+                f"Parameter 'transformer' with type "
+                f"{type(self.transformer)} should use the data for "
+                f"fitting. "
+                f"It should have the 'stateless' tag set to 'False'",
+            )
+
+        if tags['requires_y']:
             warnings.warn(
-                f"Parameter ``transformer`` with type"  # noqa: WPS237
-                f" {type(self.transformer)} should use class information."
-                f" It should have the ``requires_y`` tag set to ``True`` and"
-                f" the ``stateless`` tag set to ``False``",
+                f"Parameter 'transformer' with type "  # noqa: WPS237
+                f"{type(self.transformer)} should not use the class label. "
+                f"It should have the 'requires_y' tag set to 'False'",
             )
 
-    def fit(
+    def fit(  # type: ignore[override]
         self,
         X: Input,
-        y: np.ndarray,
+        y: Target,
     ) -> PerClassTransformer[Input, Output, Target]:
         """
        Fit the model on each class.
@@ -222,7 +246,7 @@ def fit( return self - def transform(self, X: Input) -> Union[DataFrame, np.ndarray]: + def transform(self, X: Input) -> Output: """ Transform the provided data using the already fitted transformer. @@ -254,11 +278,11 @@ def transform(self, X: Input) -> Union[DataFrame, np.ndarray]: {'Transformed data': transformed_data}, ) - def fit_transform( + def fit_transform( # type: ignore[override] self, X: Input, - y: np.ndarray, - ) -> Union[DataFrame, np.ndarray]: + y: Target, + ) -> Output: """ Fits and transforms the provided data. From e011d1fcb0b50d2360342cf7da02ef4172ff657b Mon Sep 17 00:00:00 2001 From: vnmabus Date: Sat, 22 Jan 2022 00:10:56 +0100 Subject: [PATCH 47/50] Fix typing error. --- skfda/_utils/_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index c2518206f..f1fc3b8b5 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -48,9 +48,9 @@ from ..representation.basis import Basis T = TypeVar("T", bound=FData) -Input = TypeVar("Input", bound=Union[FData, NDArrayFloat]) -Output = TypeVar("Output", bound=Union[FData, NDArrayFloat]) -Target = TypeVar("Target", bound=NDArrayInt) + Input = TypeVar("Input", bound=Union[FData, NDArrayFloat]) + Output = TypeVar("Output", bound=Union[FData, NDArrayFloat]) + Target = TypeVar("Target", bound=NDArrayInt) def check_is_univariate(fd: FData) -> None: From b761989cc4771c0027ea4afe56c2076d2ece218c Mon Sep 17 00:00:00 2001 From: vnmabus Date: Tue, 25 Jan 2022 16:19:55 +0100 Subject: [PATCH 48/50] Typing improvements. --- skfda/_utils/_utils.py | 60 ++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index f1fc3b8b5..a2e89d6ca 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -35,6 +35,7 @@ DomainRangeLike, GridPoints, GridPointsLike, + NDArrayAny, NDArrayFloat, NDArrayInt, ) @@ -305,7 +306,7 @@ def _reshape_eval_points( aligned: Literal[True], n_samples: int, dim_domain: int, -) -> np.ndarray: +) -> NDArrayFloat: pass @@ -316,7 +317,7 @@ def _reshape_eval_points( aligned: Literal[True], n_samples: int, dim_domain: int, -) -> np.ndarray: +) -> NDArrayFloat: pass @@ -327,7 +328,7 @@ def _reshape_eval_points( aligned: bool, n_samples: int, dim_domain: int, -) -> np.ndarray: +) -> NDArrayFloat: pass @@ -337,7 +338,7 @@ def _reshape_eval_points( aligned: bool, n_samples: int, dim_domain: int, -) -> np.ndarray: +) -> NDArrayFloat: """Convert and reshape the eval_points to ndarray. Args: @@ -394,7 +395,7 @@ def _one_grid_to_points( axes: GridPointsLike, *, dim_domain: int, -) -> Tuple[np.ndarray, Tuple[int, ...]]: +) -> Tuple[NDArrayFloat, Tuple[int, ...]]: """ Convert a list of ndarrays, one per domain dimension, in the points. 
@@ -421,10 +422,10 @@ class EvaluateMethod(Protocol): def __call__( self, - __eval_points: np.ndarray, # noqa: WPS112 + __eval_points: NDArrayFloat, # noqa: WPS112 extrapolation: Optional[ExtrapolationLike], aligned: bool, - ) -> np.ndarray: + ) -> NDArrayFloat: """Evaluate a function.""" pass @@ -439,7 +440,7 @@ def _evaluate_grid( dim_codomain: int, extrapolation: Optional[ExtrapolationLike] = None, aligned: Literal[True] = True, -) -> np.ndarray: +) -> NDArrayFloat: pass @@ -453,7 +454,7 @@ def _evaluate_grid( dim_codomain: int, extrapolation: Optional[ExtrapolationLike] = None, aligned: Literal[False], -) -> np.ndarray: +) -> NDArrayFloat: pass @@ -466,7 +467,7 @@ def _evaluate_grid( # noqa: WPS234 dim_codomain: int, extrapolation: Optional[ExtrapolationLike] = None, aligned: bool = True, -) -> np.ndarray: +) -> NDArrayFloat: """ Evaluate the functional object in the cartesian grid. @@ -569,13 +570,13 @@ def _evaluate_grid( # noqa: WPS234 def nquad_vec( - func: Callable[[np.ndarray], np.ndarray], + func: Callable[[NDArrayFloat], NDArrayFloat], ranges: Sequence[Tuple[float, float]], -) -> np.ndarray: +) -> NDArrayFloat: """Perform multiple integration of vector valued functions.""" initial_depth = len(ranges) - 1 - def integrate(*args: Any, depth: int) -> np.ndarray: # noqa: WPS430 + def integrate(*args: Any, depth: int) -> NDArrayFloat: # noqa: WPS430 if depth == 0: f = functools.partial(func, *args) @@ -587,13 +588,16 @@ def integrate(*args: Any, depth: int) -> np.ndarray: # noqa: WPS430 return integrate(depth=initial_depth) +ArrayT = TypeVar("ArrayT", bound=NDArrayAny) + + def _map_in_batches( - function: Callable[..., np.ndarray], - arguments: Tuple[Union[FData, np.ndarray], ...], - indexes: Tuple[np.ndarray, ...], + function: Callable[..., ArrayT], + arguments: Tuple[Union[FData, NDArrayAny], ...], + indexes: Tuple[NDArrayInt, ...], memory_per_batch: Optional[int] = None, **kwargs: Any, -) -> np.ndarray: +) -> ArrayT: """ Map a function over samples of FData or ndarray tuples efficiently. 
@@ -614,7 +618,7 @@ def _map_in_batches( assert all(n_indexes == len(i) for i in indexes) - batches: List[np.ndarray] = [] + batches: List[ArrayT] = [] for pos in range(0, n_indexes, n_elements_per_batch_allowed): batch_args = tuple( @@ -628,12 +632,12 @@ def _map_in_batches( def _pairwise_symmetric( - function: Callable[..., np.ndarray], - arg1: Union[FData, np.ndarray], - arg2: Optional[Union[FData, np.ndarray]] = None, + function: Callable[..., ArrayT], + arg1: Union[FData, NDArrayAny], + arg2: Optional[Union[FData, NDArrayAny]] = None, memory_per_batch: Optional[int] = None, **kwargs: Any, -) -> np.ndarray: +) -> ArrayT: """Compute pairwise a commutative function.""" dim1 = len(arg1) if arg2 is None or arg2 is arg1: @@ -671,12 +675,12 @@ def _pairwise_symmetric( return vec.reshape((dim1, dim2)) -def _int_to_real(array: np.ndarray) -> np.ndarray: +def _int_to_real(array: Union[NDArrayInt, NDArrayFloat]) -> NDArrayFloat: """Convert integer arrays to floating point.""" return array + 0.0 -def _check_array_key(array: np.ndarray, key: Any) -> Any: +def _check_array_key(array: NDArrayAny, key: Any) -> Any: """Check a getitem key.""" key = check_array_indexer(array, key) if isinstance(key, tuple): @@ -706,7 +710,7 @@ def _check_estimator(estimator): check_set_params(name, instance) -def _classifier_get_classes(y: ndarray) -> Tuple[ndarray, ndarray]: +def _classifier_get_classes(y: ndarray) -> Tuple[ndarray, NDArrayInt]: check_classification_targets(y) @@ -773,11 +777,11 @@ def _fit_feature_transformer( # noqa: WPS320 WPS234 def _compute_dependence( - X: np.ndarray, - y: np.ndarray, + X: Union[NDArrayInt, NDArrayFloat], + y: Union[NDArrayInt, NDArrayFloat], *, dependence_measure: _DependenceMeasure, -) -> np.ndarray: +) -> NDArrayFloat: """ Compute dependence between points and target. From c2b106459fdf056f27b3a2818c7ec144c89639f7 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Tue, 25 Jan 2022 17:30:28 +0100 Subject: [PATCH 49/50] Fix recent Pandas index test. --- skfda/_utils/_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index a2e89d6ca..10a49b625 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -691,7 +691,11 @@ def _check_array_key(array: NDArrayAny, key: Any) -> Any: if isinstance(key, numbers.Integral): # To accept also numpy ints key = int(key) - key = range(len(array))[key] + if key < 0: + key = len(array) + key + + if not 0 <= key < len(array): + raise IndexError("index out of bounds") return slice(key, key + 1) From d18f353adf387fc43a58b72cb0cbd0fd9d2f69ae Mon Sep 17 00:00:00 2001 From: vnmabus Date: Tue, 25 Jan 2022 18:17:34 +0100 Subject: [PATCH 50/50] Bump version. --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index ee6cdce3c..eb49d7c7f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.6.1 +0.7