-
Notifications
You must be signed in to change notification settings - Fork 179
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feature: new tests added for tsne to expand test coverage #2229
base: main
Are you sure you want to change the base?
Changes from 2 commits
c686edd
f3f5223
10da764
2f3e9fa
739a90c
822e614
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,11 +16,175 @@ | |
|
||
import numpy as np | ||
from numpy.testing import assert_allclose | ||
|
||
import pytest | ||
#Note: n_componets must be 2 for now | ||
from onedal.tests.utils._dataframes_support import ( | ||
_as_numpy, | ||
_convert_to_dataframe, | ||
get_dataframes_and_queues, | ||
) | ||
|
||
def test_sklearnex_import(): | ||
from sklearnex.manifold import TSNE | ||
|
||
X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]) | ||
tsne = TSNE(n_components=2, perplexity=2.0).fit(X) | ||
assert "daal4py" in tsne.__module__ | ||
|
||
from sklearnex.manifold import TSNE | ||
|
||
@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) | ||
def test_sklearnex_tsne_import(dataframe, queue): | ||
"""Test TSNE compatibility with different backends and queues, and validate sklearnex module.""" | ||
X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]) | ||
X_df = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) | ||
tsne = TSNE(n_components=2, perplexity=2.0).fit(X_df) | ||
assert "daal4py" in tsne.__module__ | ||
assert hasattr(tsne, "n_components"), "TSNE missing 'n_components' attribute." | ||
assert tsne.n_components == 2, "TSNE 'n_components' attribute is incorrect." | ||
|
||
def test_basic_tsne_functionality(): | ||
"""Test TSNE with valid data: basic functionality, random data, reproducibility, and edge cases.""" | ||
# Test basic functionality | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Any chance that all of these could be parameterized instead? Otherwise, if one of them fails, then the rest wouldn't execute. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for looking into it. But what I meant was to parameterize this whole function by turning the inputs and expectations into parameters, so that one parameterization would be "Test basic functionality", another parameterization "Test with random data", and so on. |
||
X_basic = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]) | ||
tsne = TSNE(n_components=2, perplexity=2.0).fit(X_basic) | ||
assert tsne.embedding_.shape == (4, 2) | ||
|
||
# Test with random data | ||
np.random.seed(42) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe some tests might be getting executed in parallel by There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What I meant here was this module: https://numpy.org/doc/stable/reference/random/generator.html To use it, you need to generate a rng = np.random.default_rng(seed=123) Then, you need to call methods from that generator object, like this: rng.random(size=(100,10)) |
||
X_random = np.random.rand(100, 10) | ||
tsne_random = TSNE(n_components=2, perplexity=30.0).fit(X_random) | ||
assert tsne_random.embedding_.shape == (100, 2) | ||
|
||
# Test reproducibility | ||
X_repro = np.random.rand(50, 10) | ||
tsne_1 = TSNE(n_components=2, random_state=42).fit_transform(X_repro) | ||
tsne_2 = TSNE(n_components=2, random_state=42).fit_transform(X_repro) | ||
assert_allclose(tsne_1, tsne_2, rtol=1e-5) | ||
|
||
# Test perplexity close to dataset size | ||
X_perplexity = np.random.rand(10, 5) | ||
tsne_perplexity = TSNE(n_components=2, perplexity=9).fit(X_perplexity) | ||
assert tsne_perplexity.embedding_.shape == (10, 2) | ||
|
||
# Test large data | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It feels like this one is perhaps not needed, considering that there's already a similar test earlier on with shape (100,10). |
||
X_large = np.random.rand(1000, 50) | ||
tsne_large = TSNE(n_components=2, perplexity=50.0).fit(X_large) | ||
assert tsne_large.embedding_.shape == (1000, 2) | ||
|
||
# Test valid minimal data | ||
X_valid = np.array([[0, 0], [1, 1], [2, 2]]) | ||
tsne_valid = TSNE(n_components=2, perplexity=2).fit(X_valid) | ||
assert tsne_valid.embedding_.shape == (3, 2) | ||
|
||
# Edge case: constant data | ||
X_constant = np.ones((10, 10)) | ||
with pytest.raises(ValueError) as excinfo: | ||
TSNE(n_components=2, perplexity=20).fit(X_constant) | ||
assert "perplexity must be less than n_samples" in str(excinfo.value) | ||
|
||
# Edge case: empty data | ||
X_empty = np.empty((0, 10)) | ||
with pytest.raises(ValueError): | ||
TSNE(n_components=2).fit(X_empty) | ||
|
||
# Edge case: data with NaN or infinite values | ||
X_invalid = np.array([[0, 0], [1, np.nan], [2, np.inf]]) | ||
with pytest.raises(ValueError): | ||
TSNE(n_components=2).fit(X_invalid) | ||
|
||
# Edge Case: Perplexity Larger Than n_samples | ||
X_small = np.random.rand(5, 2) # 5 samples | ||
with pytest.raises(ValueError) as excinfo: | ||
TSNE(n_components=2, perplexity=10).fit(X_small) | ||
assert "perplexity must be less than n_samples" in str(excinfo.value), \ | ||
"Large perplexity did not trigger expected ValueError." | ||
|
||
# Edge Case: Sparse-Like High-Dimensional Data | ||
np.random.seed(42) | ||
X_sparse_like = np.random.rand(50, 10000) * (np.random.rand(50, 10000) > 0.99) | ||
david-cortes-intel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
try: | ||
tsne = TSNE(n_components=2, perplexity=30.0) | ||
tsne.fit(X_sparse_like) | ||
except Exception as e: | ||
pytest.fail(f"TSNE failed on sparse-like high-dimensional data: {e}") | ||
|
||
# Edge Case: Extremely Low Perplexity | ||
X = np.random.rand(10, 5) | ||
try: | ||
tsne_low_perplexity = TSNE(n_components=2, perplexity=0.5) | ||
tsne_low_perplexity.fit(X) | ||
except Exception as e: | ||
pytest.fail(f"TSNE failed with low perplexity: {e}") | ||
|
||
|
||
|
||
@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) | ||
@pytest.mark.parametrize("dtype", [np.float32, np.float64]) | ||
def test_tsne_with_specific_complex_dataset(dataframe, queue, dtype): | ||
"""Test TSNE with a specific, highly diverse dataset.""" | ||
complex_array = np.array([ | ||
[0, 0, 0, 0], | ||
[1, 1, 1, 1], | ||
[-1e-9, 1e-9, -1e-9, 1e-9], | ||
[-1e9, 1e9, -1e9, 1e9], | ||
[1e-3, 1e3, -1e3, -1e-3], | ||
[0, 1e9, -1e-9, 1], | ||
[1, -1, 1, -1], | ||
[42, 42, 42, 42], | ||
[0, 0, 1, -1], | ||
[-1e5, 0, 1e5, -1], | ||
[2e9, 2e-9, -2e9, -2e-9], | ||
[3, -3, 3e3, -3e-3], | ||
[5e-5, 5e5, -5e-5, -5e5], | ||
[1, 0, -1e8, 1e8], | ||
[9e-7, -9e7, 9e-7, -9e7], | ||
[4e-4, 4e4, -4e-4, -4e4], | ||
[6e-6, -6e6, 6e6, -6e-6], | ||
[8, -8, 8e8, -8e-8], | ||
], dtype=dtype) | ||
|
||
complex_array_df = _convert_to_dataframe(complex_array, sycl_queue=queue, target_df=dataframe) | ||
|
||
try: | ||
tsne = TSNE(n_components=2, perplexity=5.0, random_state=42) | ||
embedding = tsne.fit_transform(complex_array_df) | ||
assert embedding.shape == (complex_array.shape[0], 2), "TSNE embedding shape is incorrect." | ||
except Exception as e: | ||
pytest.fail(f"TSNE failed on the specific complex dataset: {e}") | ||
|
||
|
||
@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues(device_filter_="gpu")) | ||
@pytest.mark.parametrize("dtype", [np.float32, np.float64]) | ||
def test_tsne_gpu_validation(dataframe, queue, dtype): | ||
""" | ||
GPU validation test for TSNE with a specific complex dataset. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could this one be merged into the one above? ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like the tests for non-missingness were lost after merging them. Could be good to have them just in case, considering that it tests GPU execution where more things can go wrong. |
||
""" | ||
# Complex dataset for testing | ||
gpu_validation_array = np.array([ | ||
[0, 0, 0, 0], | ||
[1, 1, 1, 1], | ||
[-1e9, 1e9, -1e9, 1e9], | ||
[1e-3, 1e3, -1e3, -1e-3], | ||
[1, -1, 1, -1], | ||
[0, 1e9, -1e-9, 1], | ||
[-7e11, 7e11, -7e-11, 7e-11], | ||
[4e-4, 4e4, -4e-4, -4e4], | ||
[6e-6, -6e6, 6e6, -6e-6], | ||
[0, 0, 0, 0], | ||
[1, 1, 1, 1], | ||
], dtype=dtype) | ||
|
||
expected_shape = (gpu_validation_array.shape[0], 2) | ||
gpu_array_df = _convert_to_dataframe( | ||
gpu_validation_array, sycl_queue=queue, target_df=dataframe | ||
) | ||
try: | ||
tsne = TSNE(n_components=2, perplexity=3.0, random_state=42) | ||
embedding = tsne.fit_transform(gpu_array_df) | ||
assert embedding.shape == expected_shape, f"Incorrect embedding shape on GPU: {embedding.shape}." | ||
assert np.all(np.isfinite(embedding)), "Embedding contains NaN or infinite values on GPU." | ||
assert np.any(embedding != 0), "GPU embedding contains only zeros, which is invalid." | ||
|
||
except Exception as e: | ||
pytest.fail(f"TSNE failed on GPU validation test: {e}") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi Ethan,
Thx for the comments this is fixed
Best,
Yue