Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Sort out clustering base class #2251

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
6 changes: 2 additions & 4 deletions aeon/clustering/_clara.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ def __init__(
self.distance_params = distance_params
self.n_samples = n_samples
self.n_sampling_iters = n_sampling_iters
self.n_clusters = n_clusters

self.cluster_centers_ = None
self.labels_ = None
Expand All @@ -148,7 +149,7 @@ def __init__(
self._random_state = None
self._kmedoids_instance = None

super().__init__(n_clusters)
super().__init__()

def _predict(self, X: np.ndarray, y=None) -> np.ndarray:
return self._kmedoids_instance.predict(X)
Expand Down Expand Up @@ -207,9 +208,6 @@ def _fit(self, X: np.ndarray, y=None):
self.n_iter_ = best_pam.n_iter_
self._kmedoids_instance = best_pam

def _score(self, X, y=None):
return -self.inertia_

@classmethod
def _get_test_params(cls, parameter_set="default"):
"""Return testing parameter settings for the estimator.
Expand Down
6 changes: 2 additions & 4 deletions aeon/clustering/_elastic_som.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ def __init__(
self.init = init
self.sigma_decay_function = sigma_decay_function
self.custom_alignment_path = custom_alignment_path
self.n_clusters = n_clusters

self._random_state = None
self._alignment_path_callable = None
Expand All @@ -191,7 +192,7 @@ def __init__(

self.labels_ = None
self.cluster_centers_ = None
super().__init__(n_clusters=n_clusters)
super().__init__()

def _fit(self, X, y=None):
self._check_params(X)
Expand Down Expand Up @@ -219,9 +220,6 @@ def _fit(self, X, y=None):
def _predict(self, X, y=None):
return self._find_bmu(X, self.cluster_centers_)

def _score(self, X, y=None):
raise NotImplementedError("TimeSeriesSOM does not support scoring")

def _find_bmu(self, x, weights):
pairwise_matrix = pairwise_distance(
x,
Expand Down
6 changes: 2 additions & 4 deletions aeon/clustering/_k_means.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ def __init__(
self.distance_params = distance_params
self.average_params = average_params
self.averaging_method = averaging_method
self.n_clusters = n_clusters

self.cluster_centers_ = None
self.labels_ = None
Expand All @@ -203,7 +204,7 @@ def __init__(
self._averaging_method = None
self._average_params = None

super().__init__(n_clusters)
super().__init__()

def _fit(self, X: np.ndarray, y=None):
self._check_params(X)
Expand Down Expand Up @@ -281,9 +282,6 @@ def _fit_one_init(self, X: np.ndarray) -> tuple:

return prev_labels, cluster_centres, prev_inertia, i + 1

def _score(self, X, y=None):
return -self.inertia_

def _predict(self, X: np.ndarray, y=None) -> np.ndarray:
if isinstance(self.distance, str):
pairwise_matrix = pairwise_distance(
Expand Down
6 changes: 2 additions & 4 deletions aeon/clustering/_k_medoids.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ def __init__(
self.random_state = random_state
self.distance_params = distance_params
self.method = method
self.n_clusters = n_clusters

self.cluster_centers_ = None
self.labels_ = None
Expand All @@ -184,7 +185,7 @@ def __init__(
self._fit_method = None

self._distance_params = {}
super().__init__(n_clusters)
super().__init__()

def _fit(self, X: np.ndarray, y=None):
self._check_params(X)
Expand All @@ -207,9 +208,6 @@ def _fit(self, X: np.ndarray, y=None):
self.cluster_centers_ = best_centers
self.n_iter_ = best_iters

def _score(self, X, y=None):
return -self.inertia_

def _predict(self, X: np.ndarray, y=None) -> np.ndarray:
if isinstance(self.distance, str):
pairwise_matrix = pairwise_distance(
Expand Down
6 changes: 2 additions & 4 deletions aeon/clustering/_k_shape.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ def __init__(
self.tol = tol
self.verbose = verbose
self.random_state = random_state
self.n_clusters = n_clusters

self.cluster_centers_ = None
self.labels_ = None
Expand All @@ -97,7 +98,7 @@ def __init__(

self._tslearn_k_shapes = None

super().__init__(n_clusters=n_clusters)
super().__init__()

def _fit(self, X, y=None):
"""Fit time series clusterer to training data.
Expand Down Expand Up @@ -179,6 +180,3 @@ def _get_test_params(cls, parameter_set="default"):
"verbose": False,
"random_state": 1,
}

def _score(self, X, y=None):
return np.abs(self.inertia_)
3 changes: 2 additions & 1 deletion aeon/clustering/_k_shapes.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def __init__(
self.tol = tol
self.verbose = verbose
self.random_state = random_state
self.n_clusters = n_clusters

self.cluster_centers_ = None
self.labels_ = None
Expand All @@ -98,7 +99,7 @@ def __init__(

self._tslearn_k_shapes = None

super().__init__(n_clusters=n_clusters)
super().__init__()

def _fit(self, X, y=None):
"""Fit time series clusterer to training data.
Expand Down
3 changes: 2 additions & 1 deletion aeon/clustering/_kernel_k_means.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def __init__(
self.verbose = verbose
self.n_jobs = n_jobs
self.random_state = random_state
self.n_clusters = n_clusters

self.cluster_centers_ = None
self.labels_ = None
Expand All @@ -116,7 +117,7 @@ def __init__(

self._tslearn_kernel_k_means = None

super().__init__(n_clusters=n_clusters)
super().__init__()

def _fit(self, X, y=None):
"""Fit time series clusterer to training data.
Expand Down
35 changes: 16 additions & 19 deletions aeon/clustering/base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
"""Base class for clustering."""

from typing import Optional

__maintainer__ = []
__all__ = ["BaseClusterer"]

Expand All @@ -28,8 +26,7 @@ class BaseClusterer(BaseCollectionEstimator):
"fit_is_empty": False,
}

def __init__(self, n_clusters: Optional[int] = None):
self.n_clusters = n_clusters
def __init__(self):
# required for compatibility with some sklearn interfaces e.g.
# CalibratedClassifierCV
self._estimator_type = "clusterer"
Expand Down Expand Up @@ -125,6 +122,7 @@ def predict_proba(self, X) -> np.ndarray:
self._check_shape(X)
return self._predict_proba(X)

@final
def fit_predict(self, X, y=None) -> np.ndarray:
"""Compute cluster centers and predict cluster index for each time series.

Expand All @@ -143,11 +141,10 @@ def fit_predict(self, X, y=None) -> np.ndarray:
np.ndarray (1d array of shape (n_cases,))
Index of the cluster each time series in X belongs to.
"""
self.fit(X)
return self.predict(X)
return self._fit_predict(X, y)

def score(self, X, y=None) -> float:
"""Score the quality of the clusterer.
def _fit_predict(self, X, y=None) -> np.ndarray:
"""Fit predict using base methods.

Parameters
----------
Expand All @@ -159,13 +156,13 @@ def score(self, X, y=None) -> float:

Returns
-------
score : float
Score of the clusterer.
np.ndarray (1d array of shape (n_cases,))
Index of the cluster each time series in X belongs to.
"""
self._check_is_fitted()
X = self._preprocess_collection(X, store_metadata=False)
self._check_shape(X)
return self._score(X, y)
self.fit(X)
if hasattr(self, "labels_"):
return self.labels_
Comment on lines +163 to +164
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally, every clusterer should have a `labels_` attribute.

return self.predict(X)

def _predict_proba(self, X) -> np.ndarray:
"""Predicts labels probabilities for sequences in X.
Expand Down Expand Up @@ -198,17 +195,17 @@ def _predict_proba(self, X) -> np.ndarray:
for i, u in enumerate(unique):
preds[preds == u] = i
n_cases = len(preds)
n_clusters = self.n_clusters
if hasattr(self, "n_clusters"):
n_clusters = self.n_clusters
else:
n_clusters = len(np.unique(preds))
Comment on lines +198 to +201
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this could be risky down the line if there are non-int methods of generating n_clusters, but there is no solution for now other than always using `np.unique(preds)`.

if n_clusters is None:
n_clusters = int(max(preds)) + 1
dists = np.zeros((X.shape[0], n_clusters))
dists = np.zeros((len(X), n_clusters))
for i in range(n_cases):
dists[i, preds[i]] = 1
return dists

@abstractmethod
def _score(self, X, y=None): ...

@abstractmethod
def _predict(self, X) -> np.ndarray:
"""Predict the closest cluster each sample in X belongs to.
Expand Down
3 changes: 0 additions & 3 deletions aeon/clustering/compose/_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,6 @@ def __init__(self, transformers, clusterer, random_state=None):
def _fit(self, X, y=None):
return super()._fit(X, y)

def _score(self, X, y=None):
raise NotImplementedError("Pipeline does not support scoring.")

@classmethod
def _get_test_params(cls, parameter_set="default"):
"""Return testing parameter settings for the estimator.
Expand Down
2 changes: 1 addition & 1 deletion aeon/clustering/deep_learning/_ae_bgru.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,9 +140,9 @@ def __init__(
self.save_last_model = save_last_model
self.best_file_name = best_file_name
self.random_state = random_state
self.n_clusters = n_clusters

super().__init__(
n_clusters=n_clusters,
clustering_algorithm=clustering_algorithm,
clustering_params=clustering_params,
estimator=estimator,
Expand Down
8 changes: 1 addition & 7 deletions aeon/clustering/deep_learning/_ae_fcn.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,10 +173,10 @@ def __init__(
self.save_last_model = save_last_model
self.best_file_name = best_file_name
self.random_state = random_state
self.n_clusters = n_clusters
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can keep this for now, but I think @hadifawaz1999 said these would be better removed (n_clusters should be passed in with the estimator).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it should not be added, as we're going to remove it in a few days.


super().__init__(
estimator=estimator,
n_clusters=n_clusters,
clustering_algorithm=clustering_algorithm,
clustering_params=clustering_params,
batch_size=batch_size,
Expand Down Expand Up @@ -336,12 +336,6 @@ def _fit(self, X):

return self

def _score(self, X, y=None):
# Transpose to conform to Keras input style.
X = X.transpose(0, 2, 1)
latent_space = self.model_.layers[1].predict(X)
return self._estimator.score(latent_space)

def _fit_multi_rec_model(
self,
autoencoder,
Expand Down
2 changes: 1 addition & 1 deletion aeon/clustering/deep_learning/_ae_resnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,12 +182,12 @@ def __init__(
self.best_file_name = best_file_name
self.last_file_name = last_file_name
self.optimizer = optimizer
self.n_clusters = n_clusters

self.history = None

super().__init__(
estimator=estimator,
n_clusters=n_clusters,
clustering_algorithm=clustering_algorithm,
clustering_params=clustering_params,
batch_size=batch_size,
Expand Down
5 changes: 1 addition & 4 deletions aeon/clustering/deep_learning/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,23 +43,21 @@ class BaseDeepClusterer(BaseClusterer):

def __init__(
self,
n_clusters=None,
estimator=None,
clustering_algorithm="deprecated",
clustering_params=None,
batch_size=32,
last_file_name="last_file",
):
self.estimator = estimator
self.n_clusters = n_clusters
self.clustering_algorithm = clustering_algorithm
self.clustering_params = clustering_params
self.batch_size = batch_size
self.last_file_name = last_file_name

self.model_ = None

super().__init__(n_clusters=n_clusters)
super().__init__()

@abstractmethod
def build_model(self, input_shape):
Expand Down Expand Up @@ -124,7 +122,6 @@ def _fit_clustering(self, X):
if (
self.clustering_algorithm != "deprecated"
or self.clustering_params is not None
or self.n_clusters is not None
):
warnings.warn(
"The 'n_clusters' 'clustering_algorithm' and "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,3 @@ def test_base_deep_clusterer(estimator):
ypred_proba = dummy_deep_clr.predict_proba(X)
assert ypred_proba is not None
assert len(ypred_proba[0]) == len(np.unique(y))
score = dummy_deep_clr.score(X)
assert isinstance(score, np.float64) or isinstance(score, np.float32)
19 changes: 2 additions & 17 deletions aeon/clustering/dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,9 @@ class DummyClusterer(BaseClusterer):
def __init__(self, strategy="random", n_clusters=3, random_state=None):
self.strategy = strategy
self.random_state = random_state
self.n_clusters = n_clusters

super().__init__(n_clusters=n_clusters)
super().__init__()

def _fit(self, X, y=None):
"""
Expand Down Expand Up @@ -122,19 +123,3 @@ def _predict(self, X, y=None) -> np.ndarray:
return np.zeros(n_samples, dtype=int)
else:
raise ValueError("Unknown strategy type")

def _score(self, X, y=None):
if self.strategy == "single_cluster":
centers = np.mean(X, axis=0).reshape(1, -1)
else:
centers = np.array(
[X[self.labels_ == i].mean(axis=0) for i in range(self.n_clusters)]
)

inertia = np.sum(
[
np.sum((X[self.labels_ == i] - centers[i]) ** 2)
for i in range(len(centers))
]
)
return inertia
Loading