From d9c2351f63651930229793c7ec7f19b9a538b35e Mon Sep 17 00:00:00 2001 From: noahnovsak Date: Mon, 8 May 2023 15:53:30 +0200 Subject: [PATCH 1/3] dask compatible pca --- Orange/projection/pca.py | 39 ++++++++++++++++++++++++---- Orange/widgets/unsupervised/owpca.py | 11 +++----- 2 files changed, 37 insertions(+), 13 deletions(-) diff --git a/Orange/projection/pca.py b/Orange/projection/pca.py index 98787557f32..14ac5d33333 100644 --- a/Orange/projection/pca.py +++ b/Orange/projection/pca.py @@ -1,11 +1,13 @@ +import warnings import numbers import six import numpy as np import scipy.sparse as sp +import dask.array as da from scipy.linalg import lu, qr, svd from sklearn import decomposition as skl_decomposition -from sklearn.utils import check_array, check_random_state +from sklearn.utils import check_random_state from sklearn.utils.extmath import svd_flip, safe_sparse_dot from sklearn.utils.validation import check_is_fitted @@ -261,12 +263,39 @@ def __init__(self, n_components=None, copy=True, whiten=False, self.params = vars() def fit(self, X, Y=None): + proj = self._initialize_wrapped(X, Y) + if isinstance(X, da.Array): + X = X.rechunk({0: "auto", 1: -1}) + return proj.fit(X, Y) + + def _initialize_wrapped(self, X=None, Y=None): params = self.params.copy() if params["n_components"] is not None: - params["n_components"] = min(min(X.shape), params["n_components"]) - proj = self.__wraps__(**params) - proj = proj.fit(X, Y) - return PCAModel(proj, self.domain, len(proj.components_)) + params["n_components"] = min(*X.shape, params["n_components"]) + + if isinstance(X, da.Array) or isinstance(Y, da.Array): + try: + import dask_ml.decomposition as dask_decomposition + + if params["iterated_power"] == "auto": + params["iterated_power"] = 0 + del params["tol"] + + # use IPCA instead of PCA due to memory issues + return dask_decomposition.IncrementalPCA(**params) + + except ImportError: + warnings.warn("dask_ml is not installed. Using sklearn instead.") + + return self.__wraps__(**params) + + def __call__(self, data): + data = self.preprocess(data) + proj = self.fit(data.X, data.Y) + model = PCAModel(proj, data.domain, len(proj.components_)) + model.pre_domain = data.domain + model.name = self.name + return model class SparsePCA(SklProjector): diff --git a/Orange/widgets/unsupervised/owpca.py b/Orange/widgets/unsupervised/owpca.py index 0a917fe0f11..c361444cef0 100644 --- a/Orange/widgets/unsupervised/owpca.py +++ b/Orange/widgets/unsupervised/owpca.py @@ -14,6 +14,7 @@ from Orange.widgets import widget, gui, settings from Orange.widgets.utils.slidergraph import SliderGraph from Orange.widgets.utils.widgetpreview import WidgetPreview +from Orange.widgets.utils.annotated_data import add_columns from Orange.widgets.widget import Input, Output # Maximum number of PCA components that we can set in the widget @@ -329,14 +330,8 @@ def commit(self): metas=metas) components.name = 'components' - data_dom = Domain( - self.data.domain.attributes, - self.data.domain.class_vars, - self.data.domain.metas + domain.attributes) - data = Table.from_numpy( - data_dom, self.data.X, self.data.Y, - numpy.hstack((self.data.metas, transformed.X)), - ids=self.data.ids) + data = self.data.transform(add_columns(self.data.domain, + metas=domain.attributes)) self._pca_projector.component = self.ncomponents self.Outputs.transformed_data.send(transformed) From 7809fa03ab0e107eea031dc3bc85032c7763d3c2 Mon Sep 17 00:00:00 2001 From: noahnovsak Date: Tue, 20 Jun 2023 14:15:52 +0200 Subject: [PATCH 2/3] add widget tests --- .../widgets/unsupervised/tests/test_owpca.py | 73 +++++++++++-------- 1 file changed, 44 insertions(+), 29 deletions(-) diff --git a/Orange/widgets/unsupervised/tests/test_owpca.py b/Orange/widgets/unsupervised/tests/test_owpca.py index 3ea5009afe5..ea59072e649 100644 --- a/Orange/widgets/unsupervised/tests/test_owpca.py +++ b/Orange/widgets/unsupervised/tests/test_owpca.py @@ -4,6 +4,8 @@ from unittest.mock import patch, Mock import numpy as np +from sklearn.utils import check_random_state +from sklearn.utils.extmath import svd_flip from Orange.data import Table, Domain, ContinuousVariable, TimeVariable from Orange.preprocess import preprocess @@ -12,8 +14,7 @@ from Orange.widgets.tests.utils import table_dense_sparse, possible_duplicate_table from Orange.widgets.unsupervised.owpca import OWPCA from Orange.tests import test_filename -from sklearn.utils import check_random_state -from sklearn.utils.extmath import svd_flip +from Orange.tests.test_dasktable import with_dasktable class TestOWPCA(WidgetTest): @@ -21,13 +22,16 @@ def setUp(self): self.widget = self.create_widget(OWPCA) # type: OWPCA self.iris = Table("iris") # type: Table - def test_set_variance100(self): - self.widget.set_data(self.iris) + @with_dasktable + def test_set_variance100(self, prepare_table): + data = prepare_table(self.iris) + self.widget.set_data(data) self.widget.variance_covered = 100 self.widget._update_selection_variance_spin() - def test_constant_data(self): - data = self.iris[::5].copy() + @with_dasktable + def test_constant_data(self, prepare_table): + data = prepare_table(self.iris[::5].copy()) with data.unlocked(): data.X[:, :] = 1.0 # Ignore the warning: the test checks whether the widget shows @@ -38,13 +42,15 @@ def test_constant_data(self): self.assertIsNone(self.get_output(self.widget.Outputs.transformed_data)) self.assertIsNone(self.get_output(self.widget.Outputs.components)) - def test_empty_data(self): + @with_dasktable + def test_empty_data(self, prepare_table): """ Check widget for dataset with no rows and for dataset with no attributes """ - self.send_signal(self.widget.Inputs.data, self.iris[:0]) + data = prepare_table(self.iris) + self.send_signal(self.widget.Inputs.data, data[:0]) self.assertTrue(self.widget.Error.no_instances.is_shown()) - domain = Domain([], None, self.iris.domain.variables) - new_data = Table.from_table(domain, self.iris) + domain = Domain([], None, data.domain.variables) + new_data = data.transform(domain) self.send_signal(self.widget.Inputs.data, new_data) self.assertTrue(self.widget.Error.no_features.is_shown()) self.assertFalse(self.widget.Error.no_instances.is_shown()) @@ -52,9 +58,10 @@ def test_empty_data(self): self.send_signal(self.widget.Inputs.data, None) self.assertFalse(self.widget.Error.no_features.is_shown()) - def test_limit_components(self): + @with_dasktable + def test_limit_components(self, prepare_table): X = np.random.RandomState(0).rand(101, 101) - data = Table.from_numpy(None, X) + data = prepare_table(Table.from_numpy(None, X)) self.widget.ncomponents = 100 self.send_signal(self.widget.Inputs.data, data) tran = self.get_output(self.widget.Outputs.transformed_data) @@ -80,8 +87,10 @@ def test_migrate_settings_changes_variance_covered_to_int(self): OWPCA.migrate_settings(settings, 0) self.assertEqual(settings["variance_covered"], 100) - def test_variance_shown(self): - self.send_signal(self.widget.Inputs.data, self.iris) + @with_dasktable + def test_variance_shown(self, prepare_table): + data = prepare_table(self.iris) + self.send_signal(self.widget.Inputs.data, data) self.widget.maxp = 2 self.widget._setup_plot() var2 = self.widget.variance_covered @@ -90,15 +99,18 @@ def test_variance_shown(self): var3 = self.widget.variance_covered self.assertGreater(var3, var2) - def test_unique_domain_components(self): - table = possible_duplicate_table('components') + @with_dasktable + def test_unique_domain_components(self, prepare_table): + table = prepare_table(possible_duplicate_table('components')) self.send_signal(self.widget.Inputs.data, table) out = self.get_output(self.widget.Outputs.components) self.assertEqual(out.domain.metas[0].name, 'components (1)') - def test_variance_attr(self): + @with_dasktable + def test_variance_attr(self, prepare_table): + data = prepare_table(self.iris) self.widget.ncomponents = 2 - self.send_signal(self.widget.Inputs.data, self.iris) + self.send_signal(self.widget.Inputs.data, data) self.wait_until_stop_blocking() self.widget._variance_ratio = np.array([0.5, 0.25, 0.2, 0.05]) self.widget.commit.now() @@ -225,12 +237,13 @@ def test_normalized_gives_correct_result(self, prepare_table): np.testing.assert_almost_equal(widget_result.X, pca_embedding) - def test_do_not_mask_features(self): + @with_dasktable + def test_do_not_mask_features(self, prepare_table): # the widget used to replace cached variables when creating the # components output (until 20170726) - data = Table("iris.tab") + data = prepare_table(self.iris) + ndata = data.copy() self.widget.set_data(data) - ndata = Table("iris.tab") self.assertEqual(data.domain[0], ndata.domain[0]) def test_on_cut_changed(self): @@ -248,13 +261,15 @@ def test_on_cut_changed(self): invalidate.assert_not_called() self.assertEqual(widget.ncomponents, 0) - def test_output_data(self): + @with_dasktable + def test_output_data(self, prepare_table): widget = self.widget widget.ncomponents = 2 - domain = Domain(self.iris.domain.attributes[:3], - self.iris.domain.class_var, - self.iris.domain.attributes[3:]) - iris = self.iris.transform(domain) + data = prepare_table(self.iris) + domain = Domain(data.domain.attributes[:3], + data.domain.class_var, + data.domain.attributes[3:]) + iris = data.transform(domain) self.send_signal(widget.Inputs.data, iris) output = self.get_output(widget.Outputs.data) outdom = output.domain @@ -262,13 +277,13 @@ def test_output_data(self): self.assertEqual(domain.class_var, outdom.class_var) self.assertEqual(domain.metas, outdom.metas[:1]) self.assertEqual(len(outdom.metas), 3) - np.testing.assert_equal(iris.X, output.X) - np.testing.assert_equal(iris.Y, output.Y) + self.assertTrue(np.all(iris.X == output.X)) + self.assertTrue(np.all(iris.Y == output.Y)) np.testing.assert_equal(iris.metas[:, 0], output.metas[:, 0]) trans = self.get_output(widget.Outputs.transformed_data) self.assertEqual(trans.domain.attributes, outdom.metas[1:]) - np.testing.assert_equal(trans.X, output.metas[:, 1:]) + np.testing.assert_equal(trans.X, output.metas[:, 1:]) # dask self.send_signal(widget.Inputs.data, None) output = self.get_output(widget.Outputs.data) From b176d286132f260630b8e469fce09c37d48a4d09 Mon Sep 17 00:00:00 2001 From: noahnovsak Date: Tue, 20 Jun 2023 17:15:26 +0200 Subject: [PATCH 3/3] add pca tests --- Orange/tests/test_pca.py | 72 +++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/Orange/tests/test_pca.py b/Orange/tests/test_pca.py index 2375a6239e2..807869a3c15 100644 --- a/Orange/tests/test_pca.py +++ b/Orange/tests/test_pca.py @@ -12,18 +12,19 @@ from Orange.preprocess import Continuize, Normalize from Orange.projection import pca, PCA, SparsePCA, IncrementalPCA, TruncatedSVD from Orange.tests import test_filename +from Orange.tests.test_dasktable import with_dasktable class TestPCA(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.ionosphere = Table(test_filename('datasets/ionosphere.tab')) - cls.iris = Table('iris') - cls.zoo = Table('zoo') - - def test_pca(self): - data = self.ionosphere - self.__pca_test_helper(data, n_com=3, min_xpl_var=0.5) + def setUp(self): + self.ionosphere = Table(test_filename('datasets/ionosphere.tab')) + self.iris = Table('iris') + self.zoo = Table('zoo') + + @with_dasktable + def test_pca(self, prepare_table): + data = prepare_table(self.ionosphere) + self.__pca_test_helper(data, n_com=3, min_xpl_var=0.49) self.__pca_test_helper(data, n_com=10, min_xpl_var=0.7) self.__pca_test_helper(data, n_com=32, min_xpl_var=1) @@ -35,7 +36,7 @@ def __pca_test_helper(self, data, n_com, min_xpl_var): self.assertEqual(n_com, pca_model.n_components) self.assertEqual((n_com, data.X.shape[1]), pca_model.components_.shape) proj = np.dot(data.X - pca_model.mean_, pca_model.components_.T) - np.testing.assert_almost_equal(pca_model(data).X, proj) + self.assertTrue(np.allclose(pca_model(data).X, proj)) def test_sparse_pca(self): data = self.ionosphere[:100] @@ -50,9 +51,10 @@ def __sparse_pca_test_helper(self, data, n_com, max_err): self.assertEqual((n_com, data.X.shape[1]), pca_model.components_.shape) self.assertLessEqual(pca_model.error_[-1], max_err) - def test_randomized_pca(self): - data = self.ionosphere - self.__rnd_pca_test_helper(data, n_com=3, min_xpl_var=0.5) + @with_dasktable + def test_randomized_pca(self, prepare_table): + data = prepare_table(self.ionosphere) + self.__rnd_pca_test_helper(data, n_com=3, min_xpl_var=0.47) self.__rnd_pca_test_helper(data, n_com=10, min_xpl_var=0.7) self.__rnd_pca_test_helper(data, n_com=32, min_xpl_var=0.98) @@ -64,7 +66,7 @@ def __rnd_pca_test_helper(self, data, n_com, min_xpl_var): self.assertEqual(n_com, pca_model.n_components) self.assertEqual((n_com, data.X.shape[1]), pca_model.components_.shape) proj = np.dot(data.X - pca_model.mean_, pca_model.components_.T) - np.testing.assert_almost_equal(pca_model(data).X, proj) + self.assertTrue(np.allclose(pca_model(data).X, proj)) def test_improved_randomized_pca_properly_called(self): # It doesn't matter what we put into the matrix @@ -215,17 +217,20 @@ def test_transformed_domain_does_not_pickle_data(self): pca_iris2 = pickle.loads(pickle.dumps(pca_iris)) self.assertIsNone(pca_iris2.domain[0].compute_value.transformed) - def test_chain(self): - zoo_c = Continuize()(self.zoo) - pca = PCA(n_components=3)(zoo_c)(self.zoo) + @with_dasktable + def test_chain(self, prepare_table): + zoo = prepare_table(self.zoo) + zoo_c = Continuize()(zoo) + pca = PCA(n_components=3)(zoo_c)(zoo) pca2 = PCA(n_components=3)(zoo_c)(zoo_c) pp = [Continuize()] - pca3 = PCA(n_components=3, preprocessors=pp)(self.zoo)(self.zoo) - np.testing.assert_almost_equal(pca.X, pca2.X) - np.testing.assert_almost_equal(pca.X, pca3.X) + pca3 = PCA(n_components=3, preprocessors=pp)(zoo)(zoo) + self.assertTrue(np.allclose(pca.X, pca2.X)) + self.assertTrue(np.allclose(pca.X, pca3.X)) - def test_PCA_scorer(self): - data = self.iris + @with_dasktable + def test_PCA_scorer(self, prepare_table): + data = prepare_table(self.iris) pca = PCA(preprocessors=[Normalize()]) pca.component = 1 scores = pca.score_data(data) @@ -236,23 +241,28 @@ def test_PCA_scorer(self): self.assertEqual([round(s, 4) for s in scores[0]], [0.5224, 0.2634, 0.5813, 0.5656]) - def test_PCA_scorer_component(self): + @with_dasktable + def test_PCA_scorer_component(self, prepare_table): pca = PCA() - for i in range(1, len(self.zoo.domain.attributes) + 1): + zoo = prepare_table(self.zoo) + for i in range(1, len(zoo.domain.attributes) + 1): pca.component = i - scores = pca.score_data(self.zoo) + scores = pca.score_data(zoo) self.assertEqual(scores.shape, - (pca.component, len(self.zoo.domain.attributes))) + (pca.component, len(zoo.domain.attributes))) - def test_PCA_scorer_all_components(self): - n_attr = len(self.iris.domain.attributes) + @with_dasktable + def test_PCA_scorer_all_components(self, prepare_table): + iris = prepare_table(self.iris) + n_attr = len(iris.domain.attributes) pca = PCA() - scores = pca.score_data(self.iris) + scores = pca.score_data(iris) self.assertEqual(scores.shape, (n_attr, n_attr)) - def test_max_components(self): + @with_dasktable + def test_max_components(self, prepare_table): d = np.random.RandomState(0).rand(20, 20) - data = Table.from_numpy(None, d) + data = prepare_table(Table.from_numpy(None, d)) pca = PCA()(data) self.assertEqual(len(pca.explained_variance_ratio_), 20) pca = PCA(n_components=10)(data)