Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dask: PCA #6445

Merged
merged 3 commits into from
Jun 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 34 additions & 5 deletions Orange/projection/pca.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import warnings
import numbers
import six
import numpy as np
import scipy.sparse as sp
import dask.array as da
from scipy.linalg import lu, qr, svd

from sklearn import decomposition as skl_decomposition
from sklearn.utils import check_array, check_random_state
from sklearn.utils import check_random_state
from sklearn.utils.extmath import svd_flip, safe_sparse_dot
from sklearn.utils.validation import check_is_fitted

Expand Down Expand Up @@ -261,12 +263,39 @@
self.params = vars()

def fit(self, X, Y=None):
    """Fit the wrapped decomposition on ``X`` and return the fitted estimator.

    The estimator is created by ``_initialize_wrapped`` from the original
    ``X`` and ``Y``; the object returned is whatever that estimator's own
    ``fit`` returns.
    """
    estimator = self._initialize_wrapped(X, Y)
    # Dask input is re-chunked before fitting: "auto" along axis 0 and -1
    # along axis 1 (presumably so every block spans all columns -- confirm
    # against the dask rechunk documentation).
    features = (
        X.rechunk({0: "auto", 1: -1}) if isinstance(X, da.Array) else X
    )
    return estimator.fit(features, Y)

def _initialize_wrapped(self, X=None, Y=None):
params = self.params.copy()
if params["n_components"] is not None:
params["n_components"] = min(min(X.shape), params["n_components"])
proj = self.__wraps__(**params)
proj = proj.fit(X, Y)
return PCAModel(proj, self.domain, len(proj.components_))
params["n_components"] = min(*X.shape, params["n_components"])

if isinstance(X, da.Array) or isinstance(Y, da.Array):
try:
import dask_ml.decomposition as dask_decomposition

if params["iterated_power"] == "auto":
params["iterated_power"] = 0
del params["tol"]

# use IPCA instead of PCA due to memory issues
return dask_decomposition.IncrementalPCA(**params)

except ImportError:
warnings.warn("dask_ml is not installed. Using sklearn instead.")

Check warning on line 288 in Orange/projection/pca.py

View check run for this annotation

Codecov / codecov/patch

Orange/projection/pca.py#L287-L288

Added lines #L287 - L288 were not covered by tests

return self.__wraps__(**params)

def __call__(self, data):
    """Preprocess ``data``, fit the projection and wrap it in a ``PCAModel``.

    The model is built from the fitted projector, the domain of the
    preprocessed data and the number of fitted components. That same
    domain is stored as ``pre_domain`` and the projector's ``name`` is
    copied onto the model before it is returned.
    """
    data = self.preprocess(data)
    fitted = self.fit(data.X, data.Y)
    n_components = len(fitted.components_)
    result = PCAModel(fitted, data.domain, n_components)
    result.pre_domain = data.domain
    result.name = self.name
    return result


class SparsePCA(SklProjector):
Expand Down
72 changes: 41 additions & 31 deletions Orange/tests/test_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,19 @@
from Orange.preprocess import Continuize, Normalize
from Orange.projection import pca, PCA, SparsePCA, IncrementalPCA, TruncatedSVD
from Orange.tests import test_filename
from Orange.tests.test_dasktable import with_dasktable


class TestPCA(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.ionosphere = Table(test_filename('datasets/ionosphere.tab'))
cls.iris = Table('iris')
cls.zoo = Table('zoo')

def test_pca(self):
data = self.ionosphere
self.__pca_test_helper(data, n_com=3, min_xpl_var=0.5)
def setUp(self):
self.ionosphere = Table(test_filename('datasets/ionosphere.tab'))
self.iris = Table('iris')
self.zoo = Table('zoo')

@with_dasktable
def test_pca(self, prepare_table):
data = prepare_table(self.ionosphere)
self.__pca_test_helper(data, n_com=3, min_xpl_var=0.49)
self.__pca_test_helper(data, n_com=10, min_xpl_var=0.7)
self.__pca_test_helper(data, n_com=32, min_xpl_var=1)

Expand All @@ -35,7 +36,7 @@ def __pca_test_helper(self, data, n_com, min_xpl_var):
self.assertEqual(n_com, pca_model.n_components)
self.assertEqual((n_com, data.X.shape[1]), pca_model.components_.shape)
proj = np.dot(data.X - pca_model.mean_, pca_model.components_.T)
np.testing.assert_almost_equal(pca_model(data).X, proj)
self.assertTrue(np.allclose(pca_model(data).X, proj))

def test_sparse_pca(self):
data = self.ionosphere[:100]
Expand All @@ -50,9 +51,10 @@ def __sparse_pca_test_helper(self, data, n_com, max_err):
self.assertEqual((n_com, data.X.shape[1]), pca_model.components_.shape)
self.assertLessEqual(pca_model.error_[-1], max_err)

def test_randomized_pca(self):
data = self.ionosphere
self.__rnd_pca_test_helper(data, n_com=3, min_xpl_var=0.5)
@with_dasktable
def test_randomized_pca(self, prepare_table):
data = prepare_table(self.ionosphere)
self.__rnd_pca_test_helper(data, n_com=3, min_xpl_var=0.47)
self.__rnd_pca_test_helper(data, n_com=10, min_xpl_var=0.7)
self.__rnd_pca_test_helper(data, n_com=32, min_xpl_var=0.98)

Expand All @@ -64,7 +66,7 @@ def __rnd_pca_test_helper(self, data, n_com, min_xpl_var):
self.assertEqual(n_com, pca_model.n_components)
self.assertEqual((n_com, data.X.shape[1]), pca_model.components_.shape)
proj = np.dot(data.X - pca_model.mean_, pca_model.components_.T)
np.testing.assert_almost_equal(pca_model(data).X, proj)
self.assertTrue(np.allclose(pca_model(data).X, proj))

def test_improved_randomized_pca_properly_called(self):
# It doesn't matter what we put into the matrix
Expand Down Expand Up @@ -215,17 +217,20 @@ def test_transformed_domain_does_not_pickle_data(self):
pca_iris2 = pickle.loads(pickle.dumps(pca_iris))
self.assertIsNone(pca_iris2.domain[0].compute_value.transformed)

def test_chain(self):
zoo_c = Continuize()(self.zoo)
pca = PCA(n_components=3)(zoo_c)(self.zoo)
@with_dasktable
def test_chain(self, prepare_table):
zoo = prepare_table(self.zoo)
zoo_c = Continuize()(zoo)
pca = PCA(n_components=3)(zoo_c)(zoo)
pca2 = PCA(n_components=3)(zoo_c)(zoo_c)
pp = [Continuize()]
pca3 = PCA(n_components=3, preprocessors=pp)(self.zoo)(self.zoo)
np.testing.assert_almost_equal(pca.X, pca2.X)
np.testing.assert_almost_equal(pca.X, pca3.X)
pca3 = PCA(n_components=3, preprocessors=pp)(zoo)(zoo)
self.assertTrue(np.allclose(pca.X, pca2.X))
self.assertTrue(np.allclose(pca.X, pca3.X))

def test_PCA_scorer(self):
data = self.iris
@with_dasktable
def test_PCA_scorer(self, prepare_table):
data = prepare_table(self.iris)
pca = PCA(preprocessors=[Normalize()])
pca.component = 1
scores = pca.score_data(data)
Expand All @@ -236,23 +241,28 @@ def test_PCA_scorer(self):
self.assertEqual([round(s, 4) for s in scores[0]],
[0.5224, 0.2634, 0.5813, 0.5656])

def test_PCA_scorer_component(self):
@with_dasktable
def test_PCA_scorer_component(self, prepare_table):
pca = PCA()
for i in range(1, len(self.zoo.domain.attributes) + 1):
zoo = prepare_table(self.zoo)
for i in range(1, len(zoo.domain.attributes) + 1):
pca.component = i
scores = pca.score_data(self.zoo)
scores = pca.score_data(zoo)
self.assertEqual(scores.shape,
(pca.component, len(self.zoo.domain.attributes)))
(pca.component, len(zoo.domain.attributes)))

def test_PCA_scorer_all_components(self):
n_attr = len(self.iris.domain.attributes)
@with_dasktable
def test_PCA_scorer_all_components(self, prepare_table):
iris = prepare_table(self.iris)
n_attr = len(iris.domain.attributes)
pca = PCA()
scores = pca.score_data(self.iris)
scores = pca.score_data(iris)
self.assertEqual(scores.shape, (n_attr, n_attr))

def test_max_components(self):
@with_dasktable
def test_max_components(self, prepare_table):
d = np.random.RandomState(0).rand(20, 20)
data = Table.from_numpy(None, d)
data = prepare_table(Table.from_numpy(None, d))
pca = PCA()(data)
self.assertEqual(len(pca.explained_variance_ratio_), 20)
pca = PCA(n_components=10)(data)
Expand Down
11 changes: 3 additions & 8 deletions Orange/widgets/unsupervised/owpca.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from Orange.widgets import widget, gui, settings
from Orange.widgets.utils.slidergraph import SliderGraph
from Orange.widgets.utils.widgetpreview import WidgetPreview
from Orange.widgets.utils.annotated_data import add_columns
from Orange.widgets.widget import Input, Output

# Maximum number of PCA components that we can set in the widget
Expand Down Expand Up @@ -329,14 +330,8 @@ def commit(self):
metas=metas)
components.name = 'components'

data_dom = Domain(
self.data.domain.attributes,
self.data.domain.class_vars,
self.data.domain.metas + domain.attributes)
data = Table.from_numpy(
data_dom, self.data.X, self.data.Y,
numpy.hstack((self.data.metas, transformed.X)),
ids=self.data.ids)
data = self.data.transform(add_columns(self.data.domain,
metas=domain.attributes))

self._pca_projector.component = self.ncomponents
self.Outputs.transformed_data.send(transformed)
Expand Down
73 changes: 44 additions & 29 deletions Orange/widgets/unsupervised/tests/test_owpca.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from unittest.mock import patch, Mock

import numpy as np
from sklearn.utils import check_random_state
from sklearn.utils.extmath import svd_flip

from Orange.data import Table, Domain, ContinuousVariable, TimeVariable
from Orange.preprocess import preprocess
Expand All @@ -12,22 +14,24 @@
from Orange.widgets.tests.utils import table_dense_sparse, possible_duplicate_table
from Orange.widgets.unsupervised.owpca import OWPCA
from Orange.tests import test_filename
from sklearn.utils import check_random_state
from sklearn.utils.extmath import svd_flip
from Orange.tests.test_dasktable import with_dasktable


class TestOWPCA(WidgetTest):
def setUp(self):
self.widget = self.create_widget(OWPCA) # type: OWPCA
self.iris = Table("iris") # type: Table

def test_set_variance100(self):
self.widget.set_data(self.iris)
@with_dasktable
def test_set_variance100(self, prepare_table):
data = prepare_table(self.iris)
self.widget.set_data(data)
self.widget.variance_covered = 100
self.widget._update_selection_variance_spin()

def test_constant_data(self):
data = self.iris[::5].copy()
@with_dasktable
def test_constant_data(self, prepare_table):
data = prepare_table(self.iris[::5].copy())
with data.unlocked():
data.X[:, :] = 1.0
# Ignore the warning: the test checks whether the widget shows
Expand All @@ -38,23 +42,26 @@ def test_constant_data(self):
self.assertIsNone(self.get_output(self.widget.Outputs.transformed_data))
self.assertIsNone(self.get_output(self.widget.Outputs.components))

def test_empty_data(self):
@with_dasktable
def test_empty_data(self, prepare_table):
""" Check widget for dataset with no rows and for dataset with no attributes """
self.send_signal(self.widget.Inputs.data, self.iris[:0])
data = prepare_table(self.iris)
self.send_signal(self.widget.Inputs.data, data[:0])
self.assertTrue(self.widget.Error.no_instances.is_shown())

domain = Domain([], None, self.iris.domain.variables)
new_data = Table.from_table(domain, self.iris)
domain = Domain([], None, data.domain.variables)
new_data = data.transform(domain)
self.send_signal(self.widget.Inputs.data, new_data)
self.assertTrue(self.widget.Error.no_features.is_shown())
self.assertFalse(self.widget.Error.no_instances.is_shown())

self.send_signal(self.widget.Inputs.data, None)
self.assertFalse(self.widget.Error.no_features.is_shown())

def test_limit_components(self):
@with_dasktable
def test_limit_components(self, prepare_table):
X = np.random.RandomState(0).rand(101, 101)
data = Table.from_numpy(None, X)
data = prepare_table(Table.from_numpy(None, X))
self.widget.ncomponents = 100
self.send_signal(self.widget.Inputs.data, data)
tran = self.get_output(self.widget.Outputs.transformed_data)
Expand All @@ -80,8 +87,10 @@ def test_migrate_settings_changes_variance_covered_to_int(self):
OWPCA.migrate_settings(settings, 0)
self.assertEqual(settings["variance_covered"], 100)

def test_variance_shown(self):
self.send_signal(self.widget.Inputs.data, self.iris)
@with_dasktable
def test_variance_shown(self, prepare_table):
data = prepare_table(self.iris)
self.send_signal(self.widget.Inputs.data, data)
self.widget.maxp = 2
self.widget._setup_plot()
var2 = self.widget.variance_covered
Expand All @@ -90,15 +99,18 @@ def test_variance_shown(self):
var3 = self.widget.variance_covered
self.assertGreater(var3, var2)

def test_unique_domain_components(self):
table = possible_duplicate_table('components')
@with_dasktable
def test_unique_domain_components(self, prepare_table):
table = prepare_table(possible_duplicate_table('components'))
self.send_signal(self.widget.Inputs.data, table)
out = self.get_output(self.widget.Outputs.components)
self.assertEqual(out.domain.metas[0].name, 'components (1)')

def test_variance_attr(self):
@with_dasktable
def test_variance_attr(self, prepare_table):
data = prepare_table(self.iris)
self.widget.ncomponents = 2
self.send_signal(self.widget.Inputs.data, self.iris)
self.send_signal(self.widget.Inputs.data, data)
self.wait_until_stop_blocking()
self.widget._variance_ratio = np.array([0.5, 0.25, 0.2, 0.05])
self.widget.commit.now()
Expand Down Expand Up @@ -225,12 +237,13 @@ def test_normalized_gives_correct_result(self, prepare_table):

np.testing.assert_almost_equal(widget_result.X, pca_embedding)

def test_do_not_mask_features(self):
@with_dasktable
def test_do_not_mask_features(self, prepare_table):
# the widget used to replace cached variables when creating the
# components output (until 20170726)
data = Table("iris.tab")
data = prepare_table(self.iris)
ndata = data.copy()
self.widget.set_data(data)
ndata = Table("iris.tab")
self.assertEqual(data.domain[0], ndata.domain[0])

def test_on_cut_changed(self):
Expand All @@ -248,27 +261,29 @@ def test_on_cut_changed(self):
invalidate.assert_not_called()
self.assertEqual(widget.ncomponents, 0)

def test_output_data(self):
@with_dasktable
def test_output_data(self, prepare_table):
widget = self.widget
widget.ncomponents = 2
domain = Domain(self.iris.domain.attributes[:3],
self.iris.domain.class_var,
self.iris.domain.attributes[3:])
iris = self.iris.transform(domain)
data = prepare_table(self.iris)
domain = Domain(data.domain.attributes[:3],
data.domain.class_var,
data.domain.attributes[3:])
iris = data.transform(domain)
self.send_signal(widget.Inputs.data, iris)
output = self.get_output(widget.Outputs.data)
outdom = output.domain
self.assertEqual(domain.attributes, outdom.attributes)
self.assertEqual(domain.class_var, outdom.class_var)
self.assertEqual(domain.metas, outdom.metas[:1])
self.assertEqual(len(outdom.metas), 3)
np.testing.assert_equal(iris.X, output.X)
np.testing.assert_equal(iris.Y, output.Y)
self.assertTrue(np.all(iris.X == output.X))
self.assertTrue(np.all(iris.Y == output.Y))
np.testing.assert_equal(iris.metas[:, 0], output.metas[:, 0])

trans = self.get_output(widget.Outputs.transformed_data)
self.assertEqual(trans.domain.attributes, outdom.metas[1:])
np.testing.assert_equal(trans.X, output.metas[:, 1:])
np.testing.assert_equal(trans.X, output.metas[:, 1:]) # dask

self.send_signal(widget.Inputs.data, None)
output = self.get_output(widget.Outputs.data)
Expand Down
Loading