Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature: new tests added for tsne to expand test coverage #2229

Open
wants to merge 26 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
c686edd
feature: new tests added for tsne to expand test coverage
Dec 17, 2024
f3f5223
test: additional test for gpu and golden data embedding test for tsne
Dec 18, 2024
10da764
fix: fix format by running black and isort test_tsne.py
Dec 18, 2024
2f3e9fa
fix: const test check shape instead of str output
Dec 18, 2024
739a90c
fix: test removing raise error test
Dec 18, 2024
822e614
fix: fix test based on comments
Dec 19, 2024
c6bf0bd
fix: parametize basic test, use rng for ramdom datasets for independe…
Jan 6, 2025
5d2da20
fix: additional tests for complex and sparse data, use pytest param f…
Jan 8, 2025
e95f5a3
fix: fix the logic to ensure tsne can keep close point close in embed…
Jan 8, 2025
44f3c14
fix: logic test amke group a and b more different
Jan 8, 2025
cba1ce9
fix: print and more differetn for input on group a and group b for lo…
Jan 8, 2025
ba7658e
fix: add check to check for dpctl array check and convert it to numpy…
Jan 9, 2025
dc04722
fix: format fix for tsne tests
Jan 9, 2025
9791ea4
fix: use _as_numpy to convert to numpy obj
Jan 9, 2025
11f5edc
fix: fix tsne format
Jan 9, 2025
8c1dc28
test: investigate on ci why gpu test is not getting correct result fo…
Jan 9, 2025
1fbc7f0
test-ci: don't comment other tests
Jan 9, 2025
a57cd08
fici-testsee changes with smaller preplexity
Jan 9, 2025
28f9815
fix: remove print
Jan 9, 2025
0753153
fix: fix based on comments
Jan 9, 2025
032cf6b
fix: const data can result embedding to 0 or not, removed the test
Jan 9, 2025
dbfea49
fix: removed extra comments and added new test for constant data
Jan 10, 2025
153c58c
fix: add gpu parametrizaiton to constant test
Jan 13, 2025
4ddb1a8
new test: exact value test, move import inside funtion
Jan 13, 2025
5340cd2
fix: exact value change from random to pca initaozliation for more st…
Jan 14, 2025
9a9c547
fix: removed exact value test
Jan 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
228 changes: 226 additions & 2 deletions sklearnex/manifold/tests/test_tsne.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,236 @@
# ===============================================================================

import numpy as np
import pytest
from numpy.testing import assert_allclose
from sklearn.metrics.pairwise import pairwise_distances

# Note: n_components must be 2 for now
david-cortes-intel marked this conversation as resolved.
Show resolved Hide resolved
from onedal.tests.utils._dataframes_support import (
_as_numpy,
_convert_to_dataframe,
get_dataframes_and_queues,
)

@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
def test_sklearnex_import(dataframe, queue):
    """Check that sklearnex's TSNE is the daal4py-backed estimator and runs end to end.

    The estimator is fitted on a tiny 4x3 dataset converted to each
    dataframe/queue combination, then the constructor parameters and the
    shape of the resulting embedding are verified.
    """
    from sklearnex.manifold import TSNE

    X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
    X_df = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe)

    # fit() and fit_transform() are both exercised on the converted input.
    tsne = TSNE(n_components=2, perplexity=2.0, random_state=42, init="pca").fit(X_df)
    embedding = _as_numpy(tsne.fit_transform(X_df))

    assert "daal4py" in tsne.__module__
    assert tsne.n_components == 2
    assert tsne.perplexity == 2.0
    assert tsne.random_state == 42
    assert tsne.init == "pca"

    # NOTE: per the PR review discussion, an exact embedding-value check (as
    # done for PCA in test_pca.py) was tried and removed because the values
    # differed between CI runs even with fixed init and iteration settings.
    # Only the shape of the embedding is asserted here.
    assert embedding.shape == (4, 2)


def _sparse_like(rng, shape):
    """Return uniform random values with entries zeroed wherever a second draw is <= 0.99."""
    values = rng.random(shape)
    keep = rng.random(shape) > 0.99
    return values * keep


def _sparse_like_with_constant_column(rng):
    """Return a 50x500 array: a leading column of ones followed by sparse-like data."""
    ones_column = np.ones((50, 1))  # First column is 1
    return np.hstack([ones_column, _sparse_like(rng, (50, 499))])


def _every_tenth_element_zero(rng):
    """Return 50x500 random data where every tenth flattened position is zero."""
    positions = np.arange(50 * 500).reshape(50, 500)
    return np.where(positions % 10 == 0, 0, rng.random((50, 500)))


@pytest.mark.parametrize(
    "X_generator,n_components,perplexity,expected_shape,should_raise",
    [
        pytest.param(
            lambda rng: np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]),
            2,
            2.0,
            (4, 2),
            False,
            id="Basic functionality",
        ),
        pytest.param(
            lambda rng: rng.random((100, 10)),
            2,
            30.0,
            (100, 2),
            False,
            id="Random data",
        ),
        pytest.param(
            lambda rng: np.array([[0, 0], [1, 1], [2, 2]]),
            2,
            2.0,
            (3, 2),
            False,
            id="Valid minimal data",
        ),
        pytest.param(
            lambda rng: np.empty((0, 10)),
            2,
            5.0,
            None,
            True,
            id="Empty data",
        ),
        pytest.param(
            lambda rng: np.array([[0, 0], [1, np.nan], [2, np.inf]]),
            2,
            5.0,
            None,
            True,
            id="Data with NaN/Inf",
        ),
        pytest.param(
            lambda rng: _sparse_like(rng, (50, 500)),
            2,
            30.0,
            (50, 2),
            False,
            id="Sparse-like high-dimensional data",
        ),
        pytest.param(
            _sparse_like_with_constant_column,
            2,
            30.0,
            (50, 2),
            False,
            id="Sparse-like data with constant column",
        ),
        pytest.param(
            _every_tenth_element_zero,
            2,
            30.0,
            (50, 2),
            False,
            id="Sparse-like data with every tenth element zero",
        ),
        pytest.param(
            lambda rng: rng.random((10, 5)),
            2,
            0.5,
            (10, 2),
            False,
            id="Extremely low perplexity",
        ),
    ],
)
@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_tsne_functionality_and_edge_cases(
    X_generator,
    n_components,
    perplexity,
    expected_shape,
    should_raise,
    dataframe,
    queue,
    dtype,
):
    """Run TSNE over regular and degenerate inputs for every dataframe/queue/dtype.

    Cases flagged with ``should_raise`` must fail with ``ValueError``; all
    others must yield a finite, not-all-zero embedding of ``expected_shape``.
    """
    from sklearnex.manifold import TSNE

    # A freshly seeded generator per invocation keeps each case's dataset
    # independent of the others.
    rng = np.random.default_rng(seed=42)
    data = X_generator(rng)
    if data.size > 0:  # the empty array is passed through without a dtype cast
        data = data.astype(dtype)
    data_df = _convert_to_dataframe(data, sycl_queue=queue, target_df=dataframe)

    if should_raise:
        with pytest.raises(ValueError):
            TSNE(n_components=n_components, perplexity=perplexity).fit_transform(data_df)
        return

    model = TSNE(n_components=n_components, perplexity=perplexity, random_state=42)
    result = _as_numpy(model.fit_transform(data_df))
    assert result.shape == expected_shape
    assert np.isfinite(result).all()
    assert (result != 0).any()


@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
@pytest.mark.parametrize("init", ["pca", "random"])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_tsne_constant_data(init, dataframe, queue, dtype):
    """Embed an all-ones 10x10 matrix and check the result for each init mode.

    With ``init="pca"`` the first embedding column must have (near) zero
    spread and the second must be (near) zero; with ``init="random"`` the
    embedding only has to be finite.
    """
    from sklearnex.manifold import TSNE

    ones = np.ones((10, 10), dtype=dtype)
    ones_df = _convert_to_dataframe(ones, sycl_queue=queue, target_df=dataframe)
    model = TSNE(n_components=2, init=init, perplexity=5, random_state=42)
    result = _as_numpy(model.fit_transform(ones_df))

    assert result.shape == (10, 2)
    if init == "pca":
        first_axis, second_axis = result[:, 0], result[:, 1]
        # Constant first dimension
        assert np.isclose(first_axis.std(), 0, atol=1e-6)
        # Zero second dimension
        assert np.allclose(second_axis, 0, atol=1e-6)
    elif init == "random":
        assert np.isfinite(result).all()


@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_tsne_reproducibility(dataframe, queue, dtype):
    """Two TSNE runs with the same ``random_state`` on the same data must agree."""
    from sklearnex.manifold import TSNE

    data = np.random.default_rng(seed=42).random((50, 10)).astype(dtype)
    data_df = _convert_to_dataframe(data, sycl_queue=queue, target_df=dataframe)

    # Two independent estimators, identically configured, fitted one after the other.
    raw_first, raw_second = [
        TSNE(n_components=2, random_state=42).fit_transform(data_df) for _ in range(2)
    ]

    # Outputs may be non-numpy arrays (e.g. dpctl.tensor.usm_ndarray); convert
    # them to numpy before comparing.
    first = _as_numpy(raw_first)
    second = _as_numpy(raw_second)
    assert_allclose(first, second, rtol=1e-5)


@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_tsne_complex_and_gpu_validation(dataframe, queue, dtype):
    """Embed data mixing tiny and huge magnitudes and check neighbourhood preservation.

    The embedding must have the expected shape, be finite and not all zero.
    In addition, the three near-identical rows (group A) must stay closer to
    each other in the embedding than to any of the three rows chosen as
    dissimilar to them (group B).
    """
    from sklearnex.manifold import TSNE

    X = np.array(
        [
            [1, 1, 1, 1],
            [1.1, 1.1, 1.1, 1.1],
            [0.9, 0.9, 0.9, 0.9],
            [2e9, 2e-9, -2e9, -2e-9],
            [5e-5, 5e5, -5e-5, -5e5],
            [9e-7, -9e7, 9e-7, -9e7],
            [1, -1, 1, -1],
            [-1e-9, 1e-9, -1e-9, 1e-9],
            [42, 42, 42, 42],
            [8, -8, 8e8, -8e-8],
            [1e-3, 1e3, -1e3, -1e-3],
            [0, 1e9, -1e-9, 1],
            [0, 0, 1, -1],
            [0, 0, 0, 0],
            [-1e5, 0, 1e5, -1],
            [1, 0, -1e8, 1e8],
        ]
    )
    n_components = 2
    perplexity = 3.0
    expected_shape = (16, 2)

    X = X.astype(dtype)
    X_df = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe)
    tsne = TSNE(n_components=n_components, perplexity=perplexity, random_state=42)
    embedding = tsne.fit_transform(X_df)

    # Validate results
    assert embedding.shape == expected_shape
    embedding = _as_numpy(embedding)
    assert np.all(np.isfinite(embedding))
    assert np.any(embedding != 0)

    # Ensure close points in original space remain close in embedding
    group_a_indices = [0, 1, 2]  # Hardcoded index of similar points
    group_b_indices = [3, 4, 5]  # Hardcoded index of dissimilar points from a

    # Distances are measured in the EMBEDDING ([i, j] is the distance between
    # embedded points i and j). Measuring them on the input X would make the
    # check independent of what TSNE produced.
    embedding_distances = pairwise_distances(embedding, metric="euclidean")

    # For each point in group A, every other group A point must be closer than
    # the nearest group B point.
    for i in group_a_indices:
        nearest_group_b = embedding_distances[i, group_b_indices].min()
        for j in group_a_indices:
            if i == j:
                # A point's distance to itself is zero; comparing it is vacuous.
                continue
            assert (
                embedding_distances[i, j] < nearest_group_b
            ), f"Point {i} in Group A is closer to a point in Group B than to another point in Group A."
Loading