From be38696aeb489fe2ecc779c796e8a64bf5a11d29 Mon Sep 17 00:00:00 2001 From: Matthew Middlehurst Date: Fri, 13 Dec 2024 11:40:52 +0000 Subject: [PATCH] clustering for v1.0.0 (#318) --- docs/api.md | 1 + tsml_eval/experiments/__init__.py | 2 + tsml_eval/experiments/experiments.py | 43 ++++++----- tsml_eval/experiments/scalability.py | 2 +- tsml_eval/experiments/set_classifier.py | 27 +++++++ tsml_eval/experiments/set_clusterer.py | 73 +++++++++++++++++-- tsml_eval/experiments/set_regressor.py | 13 +++- .../experiments/tests/test_classification.py | 10 +-- .../experiments/tests/test_clustering.py | 35 ++++++--- .../experiments/tests/test_regression.py | 4 +- tsml_eval/testing/testing_utils.py | 12 ++- tsml_eval/utils/arguments.py | 11 ++- 12 files changed, 177 insertions(+), 56 deletions(-) diff --git a/docs/api.md b/docs/api.md index 51a69cab..68cc1e23 100644 --- a/docs/api.md +++ b/docs/api.md @@ -69,6 +69,7 @@ Functions for running experiments. experiments.get_classifier_by_name experiments.get_clusterer_by_name experiments.get_regressor_by_name + experiments.run_timing_experiment experiments.classification_cross_validation experiments.classification_cross_validation_folds experiments.regression_cross_validation diff --git a/tsml_eval/experiments/__init__.py b/tsml_eval/experiments/__init__.py index 6ace29b2..00606902 100644 --- a/tsml_eval/experiments/__init__.py +++ b/tsml_eval/experiments/__init__.py @@ -10,6 +10,7 @@ "get_classifier_by_name", "get_clusterer_by_name", "get_regressor_by_name", + "run_timing_experiment", "classification_cross_validation", "classification_cross_validation_folds", "regression_cross_validation", @@ -30,6 +31,7 @@ run_clustering_experiment, run_regression_experiment, ) +from tsml_eval.experiments.scalability import run_timing_experiment from tsml_eval.experiments.set_classifier import get_classifier_by_name from tsml_eval.experiments.set_clusterer import get_clusterer_by_name from tsml_eval.experiments.set_regressor import 
get_regressor_by_name diff --git a/tsml_eval/experiments/experiments.py b/tsml_eval/experiments/experiments.py index cf82e1f3..afa39b15 100644 --- a/tsml_eval/experiments/experiments.py +++ b/tsml_eval/experiments/experiments.py @@ -749,7 +749,11 @@ def run_clustering_experiment( If None, the clusterers default is used. If -1, the number of classes in the dataset is used. - This may not work as intended for pipelines currently. + The `n_clusters` parameter for arguments which are estimators will also be + set to this value if it exists. Please ensure that the argument input itself + has the `n_clusters` parameter and is not a default such as None. This is + likely to be the case for parameters such as `estimator` or `clusterer` in + pipelines and deep learners. clusterer_name : str or None, default=None Name of clusterer used in writing results. If None, the name is taken from the clusterer. @@ -821,22 +825,18 @@ def run_clustering_experiment( f"Encoder dictionary: {str(encoder_dict)}" ) + # set n_clusters for clusterer and any contained estimators + # NOTE: If the clusterer has an estimator parameter i.e. `estimator` or `clusterer` + # which defaults to None, we cannot set the n_clusters parameter for it here. 
if isinstance(n_clusters, int): - try: - if n_clusters == -1: - n_clusters = n_classes - - if isinstance(clusterer, SklearnToTsmlClusterer): - clusterer.set_params(clusterer__n_clusters=n_clusters) - else: - clusterer.set_params(n_clusters=n_clusters) - except ValueError: - warnings.warn( - f"{clusterer_name} does not have a n_clusters parameter, " - "so it cannot be set.", - stacklevel=1, - ) - n_clusters = None + if n_clusters == -1: + n_clusters = n_classes + + if "n_clusters" in clusterer.get_params(): + clusterer.set_params(n_clusters=n_clusters) + for att in clusterer.__dict__.values(): + if isinstance(att, BaseEstimator) and "n_clusters" in att.get_params(): + att.set_params(n_clusters=n_clusters) elif n_clusters is not None: raise ValueError("n_clusters must be an int or None.") @@ -868,7 +868,7 @@ def run_clustering_experiment( train_probs = np.zeros( ( len(train_preds), - n_clusters if n_clusters is not None else len(np.unique(train_preds)), + len(np.unique(train_preds)), ) ) train_probs[np.arange(len(train_preds)), train_preds] = 1 @@ -909,11 +909,7 @@ def run_clustering_experiment( test_probs = np.zeros( ( len(test_preds), - ( - n_clusters - if n_clusters is not None - else len(np.unique(train_preds)) - ), + len(np.unique(train_preds)), ) ) test_probs[np.arange(len(test_preds)), test_preds] = 1 @@ -989,6 +985,9 @@ def load_and_run_clustering_experiment( Number of clusters to use if the clusterer has an `n_clusters` parameter. If None, the clusterers default is used. If -1, the number of classes in the dataset is used. + + The `n_clusters` parameter for attributes which are estimators will also be + set to this value if it exists. clusterer_name : str or None, default=None Name of clusterer used in writing results. If None, the name is taken from the clusterer. 
diff --git a/tsml_eval/experiments/scalability.py b/tsml_eval/experiments/scalability.py index 46624bc7..ee8012e0 100644 --- a/tsml_eval/experiments/scalability.py +++ b/tsml_eval/experiments/scalability.py @@ -18,7 +18,7 @@ def run_timing_experiment( function="fit", random_state=None, ): - """Return the time taken to run eestimator functions for randomly generated data. + """Return the time taken to run estimator functions for randomly generated data. Will time the function for each estimator in milliseconds, gradually increasing the size of the chosen dimension. The time taken will be stored in a dictionary. diff --git a/tsml_eval/experiments/set_classifier.py b/tsml_eval/experiments/set_classifier.py index e2818875..b9c4b245 100644 --- a/tsml_eval/experiments/set_classifier.py +++ b/tsml_eval/experiments/set_classifier.py @@ -24,6 +24,8 @@ ["inceptiontimeclassifier", "inceptiontime"], ["h-inceptiontimeclassifier", "h-inceptiontime"], ["litetimeclassifier", "litetime"], + ["individualliteclassifier", "individuallite"], + ["disjointcnnclassifier", "disjointcnn"], ] dictionary_based_classifiers = [ ["bossensemble", "boss"], @@ -38,6 +40,7 @@ ["weasel_v2", "weaseldilation", "weasel-dilation", "weasel-d"], "redcomets", "redcomets-500", + ["mrseqlclassifier", "mrseql"], ["mrsqmclassifier", "mrsqm"], ] distance_based_classifiers = [ @@ -83,6 +86,7 @@ "summary-intervals", ["randomintervals-500", "catch22-intervals-500"], ["randomintervalclassifier", "randomintervals", "catch22-intervals"], + ["supervisedintervalclassifier", "supervisedintervals"], ["quantclassifier", "quant"], ] other_classifiers = [ @@ -97,6 +101,7 @@ ["randomshapeletforestclassifier", "randomshapeletforest", "rsf"], ["sastclassifier", "sast"], ["rsastclassifier", "rsast"], + ["learningshapeletclassifier", "ls"], ] vector_classifiers = [ ["rotationforestclassifier", "rotationforest", "rotf"], @@ -299,6 +304,14 @@ def _set_classifier_deep_learning( from aeon.classification.deep_learning import 
LITETimeClassifier return LITETimeClassifier(random_state=random_state, **kwargs) + elif c == "individualliteclassifier" or c == "individuallite": + from aeon.classification.deep_learning import IndividualLITEClassifier + + return IndividualLITEClassifier(random_state=random_state, **kwargs) + elif c == "disjointcnnclassifier" or c == "disjointcnn": + from aeon.classification.deep_learning import DisjointCNNClassifier + + return DisjointCNNClassifier(random_state=random_state, **kwargs) def _set_classifier_dictionary_based( @@ -391,6 +404,10 @@ def _set_classifier_dictionary_based( return REDCOMETS( n_trees=500, random_state=random_state, n_jobs=n_jobs, **kwargs ) + elif c == "mrseqlclassifier" or c == "mrseql": + from aeon.classification.dictionary_based import MrSEQLClassifier + + return MrSEQLClassifier(**kwargs) elif c == "mrsqmclassifier" or c == "mrsqm": from aeon.classification.dictionary_based import MrSQMClassifier @@ -670,6 +687,12 @@ def _set_classifier_interval_based( return RandomIntervalClassifier( random_state=random_state, n_jobs=n_jobs, **kwargs ) + elif c == "supervisedintervalclassifier" or c == "supervisedintervals": + from aeon.classification.interval_based import SupervisedIntervalClassifier + + return SupervisedIntervalClassifier( + random_state=random_state, n_jobs=n_jobs, **kwargs + ) elif c == "quantclassifier" or c == "quant": from aeon.classification.interval_based import QUANTClassifier @@ -734,6 +757,10 @@ def _set_classifier_shapelet_based( from aeon.classification.shapelet_based import RSASTClassifier return RSASTClassifier(seed=random_state, n_jobs=n_jobs, **kwargs) + elif c == "learningshapeletclassifier" or c == "ls": + from aeon.classification.shapelet_based import LearningShapeletClassifier + + return LearningShapeletClassifier(random_state=random_state, **kwargs) def _set_classifier_vector(c, random_state, n_jobs, fit_contract, checkpoint, kwargs): diff --git a/tsml_eval/experiments/set_clusterer.py 
b/tsml_eval/experiments/set_clusterer.py index 5d65e846..01972c00 100644 --- a/tsml_eval/experiments/set_clusterer.py +++ b/tsml_eval/experiments/set_clusterer.py @@ -8,6 +8,7 @@ KSpectralCentroid, TimeSeriesCLARA, TimeSeriesCLARANS, + TimeSeriesKernelKMeans, TimeSeriesKMeans, TimeSeriesKMedoids, TimeSeriesKShape, @@ -21,6 +22,10 @@ deep_learning_clusterers = [ ["aefcnclusterer", "aefcn"], ["aeresnetclusterer", "aeresnet"], + ["aeattentionbigruclusterer", "aeattentionbigru"], + ["aebigruclusterer", "aebigru"], + ["aedcnnclusterer", "aedcnn"], + ["aedrnnclusterer", "aedrnn"], ] distance_based_clusterers = [ "kmeans-euclidean", @@ -130,14 +135,13 @@ "elasticsom", "kspectralcentroid", "timeserieskshape", + "timeserieskernelkmeans", ] - feature_based_clusterers = [ ["catch22", "catch22clusterer"], ["tsfresh", "tsfreshclusterer"], ["summary", "summaryclusterer"], ] - other_clusterers = [ ["dummyclusterer", "dummy", "dummyclusterer-tsml"], "dummyclusterer-aeon", @@ -231,11 +235,51 @@ def _set_clusterer_deep_learning( if c == "aefcnclusterer" or c == "aefcn": from aeon.clustering.deep_learning import AEFCNClusterer - return AEFCNClusterer(random_state=random_state, **kwargs) + return AEFCNClusterer( + estimator=TimeSeriesKMeans(distance="euclidean", averaging_method="mean"), + random_state=random_state, + **kwargs, + ) elif c == "aeresnetclusterer" or c == "aeresnet": from aeon.clustering.deep_learning import AEResNetClusterer - return AEResNetClusterer(random_state=random_state, **kwargs) + return AEResNetClusterer( + estimator=TimeSeriesKMeans(distance="euclidean", averaging_method="mean"), + random_state=random_state, + **kwargs, + ) + elif c == "aeattentionbigruclusterer" or c == "aeattentionbigru": + from aeon.clustering.deep_learning import AEAttentionBiGRUClusterer + + return AEAttentionBiGRUClusterer( + estimator=TimeSeriesKMeans(distance="euclidean", averaging_method="mean"), + random_state=random_state, + **kwargs, + ) + elif c == "aebigruclusterer" or c == 
"aebigru": + from aeon.clustering.deep_learning import AEBiGRUClusterer + + return AEBiGRUClusterer( + estimator=TimeSeriesKMeans(distance="euclidean", averaging_method="mean"), + random_state=random_state, + **kwargs, + ) + elif c == "aedcnnclusterer" or c == "aedcnn": + from aeon.clustering.deep_learning import AEDCNNClusterer + + return AEDCNNClusterer( + estimator=TimeSeriesKMeans(distance="euclidean", averaging_method="mean"), + random_state=random_state, + **kwargs, + ) + elif c == "aedrnnclusterer" or c == "aedrnn": + from aeon.clustering.deep_learning import AEDRNNClusterer + + return AEDRNNClusterer( + estimator=TimeSeriesKMeans(distance="euclidean", averaging_method="mean"), + random_state=random_state, + **kwargs, + ) def _set_clusterer_distance_based( @@ -388,6 +432,15 @@ def _set_clusterer_distance_based( random_state=random_state, **kwargs, ) + elif c == "timeserieskernelkmeans" or c == "kernelkmeans": + return TimeSeriesKernelKMeans( + max_iter=50, + n_init=10, + tol=1e-06, + random_state=random_state, + n_jobs=n_jobs, + **kwargs, + ) def _get_distance_default_params( @@ -433,15 +486,21 @@ def _set_clusterer_feature_based( if c == "catch22" or c == "catch22clusterer": from aeon.clustering.feature_based import Catch22Clusterer - return Catch22Clusterer(random_state=random_state, n_jobs=n_jobs, **kwargs) + return Catch22Clusterer( + estimator=KMeans(), random_state=random_state, n_jobs=n_jobs, **kwargs + ) elif c == "tsfresh" or c == "tsfreshclusterer": from aeon.clustering.feature_based import TSFreshClusterer - return TSFreshClusterer(random_state=random_state, n_jobs=n_jobs, **kwargs) + return TSFreshClusterer( + estimator=KMeans(), random_state=random_state, n_jobs=n_jobs, **kwargs + ) elif c == "summary" or c == "summaryclusterer": from aeon.clustering.feature_based import SummaryClusterer - return SummaryClusterer(random_state=random_state, n_jobs=n_jobs, **kwargs) + return SummaryClusterer( + estimator=KMeans(), random_state=random_state, 
n_jobs=n_jobs, **kwargs + ) def _set_clusterer_other(c, random_state, n_jobs, fit_contract, checkpoint, kwargs): diff --git a/tsml_eval/experiments/set_regressor.py b/tsml_eval/experiments/set_regressor.py index 33d11ca0..8ebd2b8e 100644 --- a/tsml_eval/experiments/set_regressor.py +++ b/tsml_eval/experiments/set_regressor.py @@ -14,7 +14,7 @@ ["multirockethydraregressor", "multirockethydra", "multirocket-hydra"], ] deep_learning_regressors = [ - ["cnnregressor", "cnn"], + ["timecnnregressor", "timecnn", "cnnregressor", "cnn"], ["fcnregressor", "fcnn", "fcn"], ["mlpregressor", "mlp"], ["encoderregressor", "encoder"], @@ -23,7 +23,8 @@ ["inceptiontimeregressor", "inception", "inceptiontime"], ["h-inceptiontimeregressor", "h-inceptiontime"], ["litetimeregressor", "litetime"], - ["timecnnregressor", "timecnn"], + ["individualliteregressor", "individuallite"], + ["disjointcnnregressor", "disjointcnn"], ] distance_based_regressors = [ "1nn-ed", @@ -252,6 +253,14 @@ def _set_regressor_deep_learning( from aeon.regression.deep_learning import LITETimeRegressor return LITETimeRegressor(random_state=random_state, **kwargs) + elif r == "individualliteregressor" or r == "individuallite": + from aeon.regression.deep_learning import IndividualLITERegressor + + return IndividualLITERegressor(random_state=random_state, **kwargs) + elif r == "disjointcnnregressor" or r == "disjointcnn": + from aeon.regression.deep_learning import DisjointCNNRegressor + + return DisjointCNNRegressor(random_state=random_state, **kwargs) def _set_regressor_distance_based( diff --git a/tsml_eval/experiments/tests/test_classification.py b/tsml_eval/experiments/tests/test_classification.py index c9d882e9..389b07cf 100644 --- a/tsml_eval/experiments/tests/test_classification.py +++ b/tsml_eval/experiments/tests/test_classification.py @@ -212,15 +212,11 @@ def test_aeon_classifiers_available(): "ClassifierPipeline", "ClassifierEnsemble", "SklearnClassifierWrapper", - # just missing - 
"IndividualLITEClassifier", + "IntervalForestClassifier", + # ordinal "OrdinalTDE", "IndividualOrdinalTDE", - "IntervalForestClassifier", - "SupervisedIntervalClassifier", - "LearningShapeletClassifier", - "DisjointCNNClassifier", - "MrSEQLClassifier", + # just missing ] est = [e for e, _ in all_estimators(type_filter="classifier")] diff --git a/tsml_eval/experiments/tests/test_clustering.py b/tsml_eval/experiments/tests/test_clustering.py index 635ed10b..787b9c6e 100644 --- a/tsml_eval/experiments/tests/test_clustering.py +++ b/tsml_eval/experiments/tests/test_clustering.py @@ -172,22 +172,44 @@ def test_run_clustering_experiment_invalid_estimator(): def test_get_clusterer_by_name(): """Test get_clusterer_by_name method.""" clusterer_lists = [ + set_clusterer.deep_learning_clusterers, set_clusterer.distance_based_clusterers, + set_clusterer.feature_based_clusterers, set_clusterer.other_clusterers, set_clusterer.vector_clusterers, ] + clusterer_non_default_params = [ + "clusterer", + "base_clusterer", + "estimator", + "base_estimator", + ] clusterer_dict = {} all_clusterer_names = [] for clusterer_list in clusterer_lists: - _check_set_method( + estimatorrs = _check_set_method( get_clusterer_by_name, clusterer_list, clusterer_dict, all_clusterer_names, + return_estimator=True, ) + # Check that clusterers with estimator parameters which are likely to be + # a sub-estimator are not None so n_clusters can be set + for clusterer in estimatorrs: + for param_name in clusterer_non_default_params: + params = clusterer.get_params() + if param_name in params: + assert params[param_name] is not None, ( + f"Clusterers which have an estimator parameter i.e. " + f"pipelines and deep learners must not have None as the " + f"estimator. 
Found None for {param_name} in " + f"{clusterer.__class__.__name__}" + ) + _check_set_method_results( clusterer_dict, estimator_name="Clusterers", method_name="set_clusterer" ) @@ -204,15 +226,8 @@ def test_aeon_clusterers_available(): excluded = [ # composable/wrapper "ClustererPipeline", - "SklearnClustererWrapper" + "SklearnClustererWrapper", # just missing - "AEFCNClusterer", - "AEResNetClusterer", - "AEAttentionBiGRUClusterer", - "AEBiGRUClusterer", - "TimeSeriesKernelKMeans", - "AEDCNNClusterer", - "AEDRNNClusterer", ] est = [e for e, _ in all_estimators(type_filter="clusterer")] @@ -229,7 +244,7 @@ def test_aeon_clusterers_available(): @pytest.mark.parametrize("n_clusters", ["4", "-1"]) @pytest.mark.parametrize( "clusterer", - ["DBSCAN", "DummyClusterer-aeon", "DummyClusterer-sklearn"], + ["DBSCAN", "DummyClusterer-aeon", "DummyClusterer-sklearn", "Summary"], ) def test_n_clusters(n_clusters, clusterer): """Test n_clusters parameter.""" diff --git a/tsml_eval/experiments/tests/test_regression.py b/tsml_eval/experiments/tests/test_regression.py index 5e2b8fe4..1f1f0b45 100644 --- a/tsml_eval/experiments/tests/test_regression.py +++ b/tsml_eval/experiments/tests/test_regression.py @@ -209,10 +209,8 @@ def test_aeon_regressors_available(): "RegressorPipeline", "RegressorEnsemble", "SklearnRegressorWrapper", - # just missing - "IndividualLITERegressor", "IntervalForestRegressor", - "DisjointCNNRegressor", + # just missing ] est = [e for e, _ in all_estimators(type_filter="regressor")] diff --git a/tsml_eval/testing/testing_utils.py b/tsml_eval/testing/testing_utils.py index 7868467c..9dae58fb 100644 --- a/tsml_eval/testing/testing_utils.py +++ b/tsml_eval/testing/testing_utils.py @@ -24,8 +24,13 @@ def _check_set_method( - set_method, estimator_sub_list, estimator_dict, all_estimator_names + set_method, + estimator_sub_list, + estimator_dict, + all_estimator_names, + return_estimator=False, ): + estimators = [] for estimator_names in estimator_sub_list: 
estimator_names = ( [estimator_names] if isinstance(estimator_names, str) else estimator_names @@ -61,6 +66,11 @@ def _check_set_method( elif c_name not in estimator_dict: estimator_dict[c_name] = False + if return_estimator: + estimators.append(e) + if return_estimator: + return estimators + EXEMPT_ESTIMATOR_NAMES = [ "channelensembleregressor", diff --git a/tsml_eval/utils/arguments.py b/tsml_eval/utils/arguments.py index 3c0d4d31..7235d3cc 100644 --- a/tsml_eval/utils/arguments.py +++ b/tsml_eval/utils/arguments.py @@ -69,7 +69,10 @@ def parse_args(args): -nc N_CLUSTERS, --n_clusters N_CLUSTERS the number of clusters to find for clusterers which have an {n_clusters} parameter. If {-1}, use the - number of classes in the dataset (default: -1). + number of classes in the dataset. The {n_clusters} parameter + for attributes will also be set. Please ensure that + the argument input itself has the {n_clusters} parameter + and is not a default such as None. (default: -1). -ctts, --combine_test_train_split whether to use a train/test split or not. If True, the train and test sets are combined and used the fit the @@ -207,8 +210,10 @@ def parse_args(args): type=int, default=-1, help="the number of clusters to find for clusterers which have an {n_clusters} " - "parameter. If {-1}, use the number of classes in the dataset " - "(default: %(default)s).", + "parameter. If {-1}, use the number of classes in the dataset. The " + "{n_clusters} parameter for arguments will also be set. Please ensure that " + "the argument input itself has the {n_clusters} parameter and is not a default " + "such as None (default: %(default)s).", ) parser.add_argument( "-ctts",