Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(clustering): Add KMeans GPU clustering support #1731

Open
wants to merge 8 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions pyannote/audio/pipelines/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,79 @@ def cluster(
return clusters


class KMeansGPU(BaseClustering):
    """KMeans clustering running on GPU through RAPIDS cuML.

    When the number of clusters is not given, every candidate value in the
    requested range is tried and the one with the best silhouette score wins.
    """

    def __init__(
        self,
        metric: str = "",
        max_num_embeddings: int = np.inf,
        constrained_assignment: bool = False,
    ):
        """KMeans clustering

        Parameters
        ----------
        metric : {""}, optional
            Distance metric to use. KMeansGPU only supports the default value.
        max_num_embeddings : int, optional
            Forwarded unchanged to the base class. Defaults to no limit.
        constrained_assignment : bool, optional
            Forwarded unchanged to the base class. Defaults to False.
        """
        super().__init__(
            metric=metric,
            max_num_embeddings=max_num_embeddings,
            constrained_assignment=constrained_assignment,
        )

    def cluster(
        self, embeddings, min_clusters: int, max_clusters: int, num_clusters: int = None
    ):
        """Assign a cluster index to each embedding

        Parameters
        ----------
        embeddings : array-like
            Embeddings to cluster, one row per embedding.
        min_clusters : int
            Minimum number of clusters (must be strictly positive).
        max_clusters : int
            Maximum number of clusters (must be at least `min_clusters`).
        num_clusters : int, optional
            Exact number of clusters. When provided, the silhouette-based
            search over [min_clusters, max_clusters] is skipped.

        Returns
        -------
        clusters : np.ndarray
            Host array of integer cluster indices, one per embedding.

        Raises
        ------
        ImportError
            If cuML / CuPy are not installed.
        """
        # imported lazily so that the module stays importable without RAPIDS
        try:
            import cuml
            import cupy as cp
            from cuml.metrics.cluster import silhouette_score
        except ImportError as e:
            raise ImportError(
                "KMeansGPU requires cuML. "
                "See https://docs.rapids.ai/install for installation instructions."
            ) from e

        assert max_clusters >= min_clusters > 0

        num_embeddings = len(embeddings)

        # when a single cluster is allowed, silhouette cannot score it:
        # search from 2 clusters and fall back to 1 if that split is poor
        may_single = False
        if max_clusters > 1 and min_clusters == 1:
            min_clusters = 2
            may_single = True
        elif max_clusters == 1:
            return np.zeros((num_embeddings,), dtype=int)

        # not enough embeddings to do any actual clustering:
        # each embedding gets its own cluster
        if num_embeddings <= min_clusters or (
            num_clusters is not None and num_embeddings <= num_clusters
        ):
            return np.arange(num_embeddings)

        # move data to the device once, so that both code paths below get
        # device arrays back from cuML (its output type mirrors the input)
        embeddings = cp.asarray(embeddings)

        if num_clusters is not None:
            kmeans = cuml.cluster.KMeans(n_clusters=num_clusters)
            return cp.asnumpy(kmeans.fit_predict(embeddings))

        best_score = -np.inf
        best_clusters = None

        for k in range(min_clusters, min(max_clusters + 1, num_embeddings)):
            kmeans = cuml.cluster.KMeans(n_clusters=k)
            clusters = kmeans.fit_predict(embeddings)

            score = silhouette_score(embeddings, clusters)

            # keep a fallback in case no score ever compares greater (e.g. NaN)
            if best_clusters is None:
                best_clusters = clusters

            if score > best_score:
                best_score = score
                best_clusters = clusters

            # NOTE(review): a poor 2-way split is taken as evidence of a
            # single cluster and larger values of k are never evaluated.
            # Confirm this early exit is the intended heuristic.
            if may_single and k == 2 and best_score < 0.25:
                return np.zeros((num_embeddings,), dtype=int)

        return cp.asnumpy(best_clusters)


class OracleClustering(BaseClustering):
"""Oracle clustering"""

Expand Down Expand Up @@ -558,4 +631,5 @@ def __call__(

class Clustering(Enum):
    """Enumeration of the available clustering classes.

    Each member is named after, and holds as its value, the clustering
    class it stands for.
    """

    AgglomerativeClustering = AgglomerativeClustering
    KMeansGPU = KMeansGPU
    OracleClustering = OracleClustering
81 changes: 80 additions & 1 deletion tests/test_clustering.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import numpy as np

from pyannote.audio.pipelines.clustering import AgglomerativeClustering
from pyannote.audio.pipelines.clustering import AgglomerativeClustering, KMeansGPU


def test_agglomerative_clustering_num_cluster():
Expand All @@ -26,4 +26,83 @@ def test_agglomerative_clustering_num_cluster():
clusters = clustering.cluster(
embeddings=embeddings, min_clusters=2, max_clusters=2, num_clusters=2
)
print(clusters)
assert np.array_equal(clusters, np.array([0, 1]))


def test_kmeans_clustering_num_cluster_gpu_too_small():
    """Check KMeansGPU when there are (almost) as many clusters as embeddings."""
    # fixed seed so that a failure can be reproduced
    rng = np.random.default_rng(0)

    clustering = KMeansGPU().instantiate({})

    embeddings = np.array([[1.0, 1.0, 1.0, 1.0], [1.0, 2.0, 1.0, 2.0]])

    # request 2 clusters out of 2 embeddings: each one gets its own cluster
    clusters = clustering.cluster(
        embeddings=embeddings, min_clusters=2, max_clusters=2, num_clusters=2
    )

    assert np.array_equal(clusters, np.array([0, 1]))

    # generate a 256-dimensional random vector
    v = rng.random(256)

    # define the standard deviation of the generated cluster centers
    cluster_center_std_dev = 2.0

    # generate 8 cluster centers randomly and use them directly as embeddings
    num_centers = 8
    cluster_centers = rng.normal(
        np.mean(v), cluster_center_std_dev, size=(num_centers, 256)
    )

    a, b, c = clustering.set_num_clusters(num_centers, 10, 1, 10)
    assert a == 8

    a, b, c = clustering.set_num_clusters(num_centers, None, 1, 10)
    assert a is None and b == 1 and c == 8

    a, b, c = clustering.set_num_clusters(num_centers, None, 8, 10)
    assert a == 8

    a, b, c = clustering.set_num_clusters(num_centers, None, 7, 10)
    assert a is None and b == 7 and c == 8

    clusters = clustering.cluster(
        embeddings=cluster_centers, num_clusters=a, min_clusters=b, max_clusters=c
    )

    # one label must come back for each embedding
    assert len(clusters) == len(cluster_centers)


def test_kmeans_clustering_num_cluster_gpu_large():
    """Check that KMeansGPU recovers the number of clusters on a large input."""
    # fixed seed so that a failure can be reproduced
    rng = np.random.default_rng(0)

    clustering = KMeansGPU().instantiate({})

    # generate a 256-dimensional random vector
    v = rng.random(256)

    # define the standard deviations of cluster centers and of their samples
    cluster_center_std_dev = 2.0
    vector_std_dev = 1

    # generate 5 cluster centers randomly
    num_clusters = 5
    cluster_centers = rng.normal(
        np.mean(v), cluster_center_std_dev, size=(num_clusters, 256)
    )

    # generate 2000 * 32 = 64000 vectors in total, split evenly across clusters
    num_vectors_per_cluster = int(2000 * 32 / num_clusters)

    # draw the samples of each cluster around its center and stack them all
    all_vectors = np.vstack(
        [
            rng.normal(center, vector_std_dev, size=(num_vectors_per_cluster, 256))
            for center in cluster_centers
        ]
    )

    # shuffle rows so that clusters are not contiguous in the input
    rng.shuffle(all_vectors)

    clusters = clustering.cluster(
        embeddings=all_vectors, min_clusters=2, max_clusters=10
    )
    assert np.unique(clusters).shape[0] == num_clusters
Loading