Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature: new tests added for tsne to expand test coverage #2229

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 165 additions & 1 deletion sklearnex/manifold/tests/test_tsne.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,175 @@

import numpy as np
from numpy.testing import assert_allclose

import pytest
# Note: n_components must be 2 for now
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
#Note: n_componets must be 2 for now
#Note: n_components must be 2 for now

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi Ethan,
Thanks for the comments — this is fixed.
Best,
Yue

from onedal.tests.utils._dataframes_support import (
_as_numpy,
_convert_to_dataframe,
get_dataframes_and_queues,
)

def test_sklearnex_import():
    """Verify that sklearnex provides the daal4py-backed TSNE implementation."""
    from sklearnex.manifold import TSNE

    sample = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
    fitted = TSNE(n_components=2, perplexity=2.0).fit(sample)
    assert "daal4py" in fitted.__module__

from sklearnex.manifold import TSNE

@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
def test_sklearnex_tsne_import(dataframe, queue):
    """Test TSNE compatibility with different backends and queues, and validate sklearnex module."""
    data = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
    data_df = _convert_to_dataframe(data, sycl_queue=queue, target_df=dataframe)
    model = TSNE(n_components=2, perplexity=2.0).fit(data_df)
    assert "daal4py" in model.__module__
    assert hasattr(model, "n_components"), "TSNE missing 'n_components' attribute."
    assert model.n_components == 2, "TSNE 'n_components' attribute is incorrect."

def test_basic_tsne_functionality():
    """Test TSNE with valid data: basic functionality, random data,
    reproducibility, and edge cases.

    Uses a local ``numpy.random.Generator`` instead of the global
    ``np.random.seed`` so that parallel pytest workers cannot interfere
    with each other's random state.
    """
    rng = np.random.default_rng(seed=42)

    # Basic functionality: a tiny binary dataset embeds to (n_samples, 2).
    X_basic = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
    tsne = TSNE(n_components=2, perplexity=2.0).fit(X_basic)
    assert tsne.embedding_.shape == (4, 2)

    # Random data.
    X_random = rng.random((100, 10))
    tsne_random = TSNE(n_components=2, perplexity=30.0).fit(X_random)
    assert tsne_random.embedding_.shape == (100, 2)

    # Reproducibility: the same random_state must give the same embedding.
    X_repro = rng.random((50, 10))
    tsne_1 = TSNE(n_components=2, random_state=42).fit_transform(X_repro)
    tsne_2 = TSNE(n_components=2, random_state=42).fit_transform(X_repro)
    assert_allclose(tsne_1, tsne_2, rtol=1e-5)

    # Perplexity close to the dataset size (n_samples - 1) is still valid.
    X_perplexity = rng.random((10, 5))
    tsne_perplexity = TSNE(n_components=2, perplexity=9).fit(X_perplexity)
    assert tsne_perplexity.embedding_.shape == (10, 2)

    # Large data.
    X_large = rng.random((1000, 50))
    tsne_large = TSNE(n_components=2, perplexity=50.0).fit(X_large)
    assert tsne_large.embedding_.shape == (1000, 2)

    # Valid minimal data.
    X_valid = np.array([[0, 0], [1, 1], [2, 2]])
    tsne_valid = TSNE(n_components=2, perplexity=2).fit(X_valid)
    assert tsne_valid.embedding_.shape == (3, 2)

    # Edge case: constant data with perplexity >= n_samples must raise.
    X_constant = np.ones((10, 10))
    with pytest.raises(ValueError, match="perplexity must be less than n_samples"):
        TSNE(n_components=2, perplexity=20).fit(X_constant)

    # Edge case: empty data is rejected.
    with pytest.raises(ValueError):
        TSNE(n_components=2).fit(np.empty((0, 10)))

    # Edge case: NaN / infinite values are rejected.
    X_invalid = np.array([[0, 0], [1, np.nan], [2, np.inf]])
    with pytest.raises(ValueError):
        TSNE(n_components=2).fit(X_invalid)

    # Edge case: perplexity larger than n_samples must raise.
    X_small = rng.random((5, 2))  # 5 samples
    with pytest.raises(ValueError, match="perplexity must be less than n_samples"):
        TSNE(n_components=2, perplexity=10).fit(X_small)

    # Edge case: sparse-like high-dimensional data should fit cleanly.
    # An unexpected exception propagates and fails the test with a full
    # traceback, which is more informative than try/except + pytest.fail.
    X_sparse_like = rng.random((50, 10000)) * (rng.random((50, 10000)) > 0.99)
    TSNE(n_components=2, perplexity=30.0).fit(X_sparse_like)

    # Edge case: extremely low perplexity should fit cleanly.
    TSNE(n_components=2, perplexity=0.5).fit(rng.random((10, 5)))



@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_tsne_with_specific_complex_dataset(dataframe, queue, dtype):
    """Test TSNE with a specific, highly diverse dataset.

    The rows mix zeros, ones, tiny/huge magnitudes (1e-9 .. 2e9) and
    sign flips to stress the numerical robustness of the embedding
    across dataframe backends and floating-point precisions.
    """
    complex_array = np.array(
        [
            [0, 0, 0, 0],
            [1, 1, 1, 1],
            [-1e-9, 1e-9, -1e-9, 1e-9],
            [-1e9, 1e9, -1e9, 1e9],
            [1e-3, 1e3, -1e3, -1e-3],
            [0, 1e9, -1e-9, 1],
            [1, -1, 1, -1],
            [42, 42, 42, 42],
            [0, 0, 1, -1],
            [-1e5, 0, 1e5, -1],
            [2e9, 2e-9, -2e9, -2e-9],
            [3, -3, 3e3, -3e-3],
            [5e-5, 5e5, -5e-5, -5e5],
            [1, 0, -1e8, 1e8],
            [9e-7, -9e7, 9e-7, -9e7],
            [4e-4, 4e4, -4e-4, -4e4],
            [6e-6, -6e6, 6e6, -6e-6],
            [8, -8, 8e8, -8e-8],
        ],
        dtype=dtype,
    )

    complex_array_df = _convert_to_dataframe(
        complex_array, sycl_queue=queue, target_df=dataframe
    )

    # Let an unexpected exception propagate: pytest reports it with a full
    # traceback, which is more informative than try/except + pytest.fail.
    tsne = TSNE(n_components=2, perplexity=5.0, random_state=42)
    embedding = tsne.fit_transform(complex_array_df)
    assert embedding.shape == (
        complex_array.shape[0],
        2,
    ), "TSNE embedding shape is incorrect."


@pytest.mark.parametrize(
    "dataframe,queue", get_dataframes_and_queues(device_filter_="gpu")
)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_tsne_gpu_validation(dataframe, queue, dtype):
    """GPU validation test for TSNE with a specific complex dataset.

    Checks the shape, finiteness, and non-degeneracy of the embedding
    computed on a GPU queue, including duplicate rows and extreme
    magnitudes in the input.
    """
    gpu_validation_array = np.array(
        [
            [0, 0, 0, 0],
            [1, 1, 1, 1],
            [-1e9, 1e9, -1e9, 1e9],
            [1e-3, 1e3, -1e3, -1e-3],
            [1, -1, 1, -1],
            [0, 1e9, -1e-9, 1],
            [-7e11, 7e11, -7e-11, 7e-11],
            [4e-4, 4e4, -4e-4, -4e4],
            [6e-6, -6e6, 6e6, -6e-6],
            [0, 0, 0, 0],
            [1, 1, 1, 1],
        ],
        dtype=dtype,
    )

    expected_shape = (gpu_validation_array.shape[0], 2)
    gpu_array_df = _convert_to_dataframe(
        gpu_validation_array, sycl_queue=queue, target_df=dataframe
    )

    tsne = TSNE(n_components=2, perplexity=3.0, random_state=42)
    # Convert back to numpy before the checks: np.isfinite / np.any do not
    # accept every device dataframe type (e.g. dpctl usm_ndarray) directly.
    embedding = _as_numpy(tsne.fit_transform(gpu_array_df))
    assert (
        embedding.shape == expected_shape
    ), f"Incorrect embedding shape on GPU: {embedding.shape}."
    assert np.all(
        np.isfinite(embedding)
    ), "Embedding contains NaN or infinite values on GPU."
    assert np.any(embedding != 0), "GPU embedding contains only zeros, which is invalid."
Loading