From 51012c37385e3d199fe18785ef979456efed7b89 Mon Sep 17 00:00:00 2001 From: noahnovsak Date: Mon, 22 May 2023 14:47:35 +0200 Subject: [PATCH 1/3] dask: linear regression learner --- Orange/base.py | 14 +++++++-- Orange/classification/logistic_regression.py | 2 +- Orange/classification/neural_network.py | 2 +- Orange/data/dask.py | 9 ++++++ Orange/regression/linear.py | 30 ++++++++++++++++++++ Orange/regression/neural_network.py | 2 +- Orange/widgets/evaluate/owpredictions.py | 2 +- Orange/widgets/utils/owlearnerwidget.py | 3 +- 8 files changed, 56 insertions(+), 8 deletions(-) diff --git a/Orange/base.py b/Orange/base.py index 07500b96a5b..3376b1080f1 100644 --- a/Orange/base.py +++ b/Orange/base.py @@ -7,6 +7,7 @@ import numpy as np import scipy +import dask.array as da from Orange.data import Table, Storage, Instance, Value, Domain from Orange.data.filter import HasClass @@ -507,7 +508,9 @@ def __init__(self, skl_model): self.skl_model = skl_model def predict(self, X): - value = self.skl_model.predict(X) + if isinstance(X, da.Array): + X = X.rechunk({0: "auto", 1: -1}) + value = np.asarray(self.skl_model.predict(X)) # SVM has probability attribute which defines if method compute probs has_prob_attr = hasattr(self.skl_model, "probability") if (has_prob_attr and self.skl_model.probability @@ -581,13 +584,18 @@ def __call__(self, data, progress_callback=None): m.params = self.params return m - def _initialize_wrapped(self): + # pylint: disable=unused-argument + def _initialize_wrapped(self, X=None, Y=None): + # wrap sklearn/dask_ml according to type of X/Y # pylint: disable=not-callable return self.__wraps__(**self.params) def fit(self, X, Y, W=None): - clf = self._initialize_wrapped() + clf = self._initialize_wrapped(X, Y) Y = Y.reshape(-1) + if isinstance(X, da.Array) or isinstance(Y, da.Array): + X = X.rechunk({0: "auto", 1: -1}) + Y = Y.rechunk({0: X.chunksize[0]}) if W is None or not self.supports_weights: return self.__returns__(clf.fit(X, Y)) return self.__returns__(clf.fit(X, Y, sample_weight=W.reshape(-1))) diff --git a/Orange/classification/logistic_regression.py b/Orange/classification/logistic_regression.py index aeb4fbfc1cb..cd886dea7fd 100644 --- a/Orange/classification/logistic_regression.py +++ b/Orange/classification/logistic_regression.py @@ -41,7 +41,7 @@ def __init__(self, penalty="l2", dual=False, tol=0.0001, C=1.0, super().__init__(preprocessors=preprocessors) self.params = vars() - def _initialize_wrapped(self): + def _initialize_wrapped(self, X=None, Y=None): params = self.params.copy() # The default scikit-learn solver `lbfgs` (v0.22) does not support the # l1 penalty. diff --git a/Orange/classification/neural_network.py b/Orange/classification/neural_network.py index 53dff79bed4..ee29cfff330 100644 --- a/Orange/classification/neural_network.py +++ b/Orange/classification/neural_network.py @@ -26,7 +26,7 @@ class MLPClassifierWCallback(skl_nn.MLPClassifier, NIterCallbackMixin): class NNClassificationLearner(NNBase, SklLearner): __wraps__ = MLPClassifierWCallback - def _initialize_wrapped(self): + def _initialize_wrapped(self, X=None, Y=None): clf = SklLearner._initialize_wrapped(self) clf.orange_callback = getattr(self, "callback", None) return clf diff --git a/Orange/data/dask.py b/Orange/data/dask.py index a0c9e8aaef6..c9e4bb99b38 100644 --- a/Orange/data/dask.py +++ b/Orange/data/dask.py @@ -255,6 +255,15 @@ def __len__(self): self.X.compute_chunk_sizes() return self.X.shape[0] + def _filter_has_class(self, negate=False): + if self._Y.ndim == 1: + retain = np.isnan(self._Y) + else: + retain = np.any(np.isnan(self._Y), axis=1) + if not negate: + retain = np.logical_not(retain) + return self.from_table_rows(self, np.asarray(retain)) + def dask_stats(X, compute_variance=False): is_numeric = np.issubdtype(X.dtype, np.number) diff --git a/Orange/regression/linear.py b/Orange/regression/linear.py index 7342d3a3150..93653cc3981 100644 --- a/Orange/regression/linear.py +++ b/Orange/regression/linear.py @@ -1,4 +1,7 @@ +import warnings + import numpy as np +import dask.array as da import sklearn.linear_model as skl_linear_model import sklearn.preprocessing as skl_preprocessing @@ -27,12 +30,36 @@ def score(self, data): class LinearRegressionLearner(SklLearner, _FeatureScorerMixin): __wraps__ = skl_linear_model.LinearRegression + __penalty__ = None # Arguments are needed for signatures, pylint: disable=unused-argument def __init__(self, preprocessors=None, fit_intercept=True): super().__init__(preprocessors=preprocessors) self.params = vars() + def _initialize_wrapped(self, X=None, Y=None): + if isinstance(X, da.Array) or isinstance(Y, da.Array): + try: + import dask_ml.linear_model + + params = self.params.copy() + params["solver"] = "gradient_descent" + params["penalty"] = self.__penalty__ + if self.__penalty__ is not None: + params["solver"] = "admm" + params["C"] = 1 / params.pop("alpha") + params["max_iter"] = params["max_iter"] or 100 + for key in ["copy_X", "precompute", "positive"]: + params.pop(key, None) + if self.__penalty__ == "elasticnet": + from dask_glm.regularizers import ElasticNet + params["penalty"] = ElasticNet(weight=params.pop("l1_ratio")) + + return dask_ml.linear_model.LinearRegression(**params) + except ImportError: + warnings.warn("dask_ml is not installed, using sklearn instead.") + return self.__wraps__(**self.params) + def fit(self, X, Y, W=None): model = super().fit(X, Y, W) return LinearModel(model.skl_model) @@ -40,6 +67,7 @@ def fit(self, X, Y, W=None): class RidgeRegressionLearner(LinearRegressionLearner): __wraps__ = skl_linear_model.Ridge + __penalty__ = "l2" # Arguments are needed for signatures, pylint: disable=unused-argument def __init__(self, alpha=1.0, fit_intercept=True, copy_X=True, @@ -50,6 +78,7 @@ def __init__(self, alpha=1.0, fit_intercept=True, copy_X=True, class LassoRegressionLearner(LinearRegressionLearner): __wraps__ = skl_linear_model.Lasso + __penalty__ = "l1" # Arguments are needed for signatures, pylint: disable=unused-argument def __init__(self, alpha=1.0, fit_intercept=True, precompute=False, @@ -61,6 +90,7 @@ def __init__(self, alpha=1.0, fit_intercept=True, precompute=False, class ElasticNetLearner(LinearRegressionLearner): __wraps__ = skl_linear_model.ElasticNet + __penalty__ = "elasticnet" # Arguments are needed for signatures, pylint: disable=unused-argument def __init__(self, alpha=1.0, l1_ratio=0.5, fit_intercept=True, diff --git a/Orange/regression/neural_network.py b/Orange/regression/neural_network.py index 7a8b553756d..4c384411ad3 100644 --- a/Orange/regression/neural_network.py +++ b/Orange/regression/neural_network.py @@ -13,7 +13,7 @@ class MLPRegressorWCallback(skl_nn.MLPRegressor, NIterCallbackMixin): class NNRegressionLearner(NNBase, SklLearner): __wraps__ = MLPRegressorWCallback - def _initialize_wrapped(self): + def _initialize_wrapped(self, X=None, Y=None): clf = SklLearner._initialize_wrapped(self) clf.orange_callback = getattr(self, "callback", None) return clf diff --git a/Orange/widgets/evaluate/owpredictions.py b/Orange/widgets/evaluate/owpredictions.py index b9b5ec36e74..72c478a961e 100644 --- a/Orange/widgets/evaluate/owpredictions.py +++ b/Orange/widgets/evaluate/owpredictions.py @@ -397,7 +397,7 @@ def _call_predictors(self): results.domain = self.data.domain results.row_indices = numpy.arange(len(self.data)) results.folds = (Ellipsis, ) - results.actual = self.data.Y + results.actual = numpy.asarray(self.data.Y) results.unmapped_probabilities = prob results.unmapped_predicted = pred results.probabilities = results.predicted = None diff --git a/Orange/widgets/utils/owlearnerwidget.py b/Orange/widgets/utils/owlearnerwidget.py index 8ead41c2507..5d812bd7dd6 100644 --- a/Orange/widgets/utils/owlearnerwidget.py +++ b/Orange/widgets/utils/owlearnerwidget.py @@ -1,4 +1,5 @@ from copy import deepcopy +import numpy as np from AnyQt.QtCore import QTimer, Qt @@ -252,7 +253,7 @@ def check_data(self): self.Error.data_error(reason) elif not len(self.data): self.Error.data_error("Dataset is empty.") - elif len(ut.unique(self.data.Y)) < 2: + elif len(np.asarray(ut.unique(self.data.Y))) < 2: self.Error.data_error("Data contains a single target value.") elif self.data.X.size == 0: self.Error.data_error("Data has no features to learn from.") From 764d15f48a22848cebcfe6fb6517fb36ea3d7d00 Mon Sep 17 00:00:00 2001 From: noahnovsak Date: Mon, 26 Jun 2023 18:37:57 +0200 Subject: [PATCH 2/3] tests --- Orange/tests/test_linear_regression.py | 47 ++++++++++++++++++++------ 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/Orange/tests/test_linear_regression.py b/Orange/tests/test_linear_regression.py index 97c28b23fae..3a85eac4494 100644 --- a/Orange/tests/test_linear_regression.py +++ b/Orange/tests/test_linear_regression.py @@ -14,14 +14,24 @@ ElasticNetCVLearner, MeanLearner) from Orange.evaluation import CrossValidation, RMSE +from Orange.tests.test_dasktable import with_dasktable, temp_dasktable class TestLinearRegressionLearner(unittest.TestCase): + learners = [ + RidgeRegressionLearner(), + LassoRegressionLearner(), + ElasticNetLearner(), + ElasticNetCVLearner(), + MeanLearner() + ] + @classmethod def setUpClass(cls): cls.housing = Table("housing") - def test_LinearRegression(self): + @with_dasktable + def test_LinearRegression(self, prepare_table): nrows = 1000 ncols = 3 x = np.random.randint(-20, 51, (nrows, ncols)) @@ -31,23 +41,17 @@ def test_LinearRegression(self): x1, x2 = np.split(x, 2) y1, y2 = np.split(y, 2) - t = Table.from_numpy(None, x1, y1) + t = prepare_table(Table.from_numpy(None, x1, y1)) learn = LinearRegressionLearner() clf = learn(t) z = clf(x2) self.assertTrue((abs(z.reshape(-1, 1) - y2) < 2.0).all()) def test_Regression(self): - ridge = RidgeRegressionLearner() - lasso = LassoRegressionLearner() - elastic = ElasticNetLearner() - elasticCV = ElasticNetCVLearner() - mean = MeanLearner() - learners = [ridge, lasso, elastic, elasticCV, mean] cv = CrossValidation(k=2) - res = cv(self.housing, learners) + res = cv(self.housing, self.learners) rmse = RMSE(res) - for i in range(len(learners) - 1): + for i in range(len(self.learners) - 1): self.assertLess(rmse[i], rmse[-1]) def test_linear_scorer(self): @@ -110,7 +114,7 @@ def test_comparison_elastic_net(self): en = ElasticNetLearner(alpha=a, l1_ratio=1) en_model = en(self.housing) np.testing.assert_allclose( - lasso_model.coefficients, en_model.coefficients, atol=1e-07) + lasso_model.coefficients, en_model.coefficients, atol=a/10) def test_linear_regression_repr(self): learner = LinearRegressionLearner() @@ -118,3 +122,24 @@ def test_linear_regression_repr(self): learner2 = eval(repr_text) self.assertIsInstance(learner2, LinearRegressionLearner) + + +class TestLinearRegressionLearnerOnDask(TestLinearRegressionLearner): + learners = [ + RidgeRegressionLearner(), + LassoRegressionLearner(), + ElasticNetLearner(), + MeanLearner() + ] + + @classmethod + def setUpClass(cls): + cls.housing = temp_dasktable(Table("housing")) + + @unittest.skip("already tested") + def test_LinearRegression(self, _): + super().test_LinearRegression(_) + + @unittest.skip("scores differ from sklearn") + def test_comparison_with_sklearn(self): + super().test_comparison_with_sklearn() From f9547a3c71603806297f8f71b059b3e63953f356 Mon Sep 17 00:00:00 2001 From: noahnovsak Date: Mon, 24 Jul 2023 13:39:40 +0200 Subject: [PATCH 3/3] lint --- Orange/regression/linear.py | 28 ++++++++++++++++---------- Orange/tests/test_linear_regression.py | 1 + 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/Orange/regression/linear.py b/Orange/regression/linear.py index 93653cc3981..238a3a3b360 100644 --- a/Orange/regression/linear.py +++ b/Orange/regression/linear.py @@ -6,6 +6,13 @@ import sklearn.linear_model as skl_linear_model import sklearn.preprocessing as skl_preprocessing +try: + import dask_ml.linear_model as dask_linear_model + from dask_glm.regularizers import ElasticNet +except ImportError: + dask_linear_model = skl_linear_model + ElasticNet = ... + from Orange.data import Variable, ContinuousVariable from Orange.preprocess import Normalize from Orange.preprocess.score import LearnerScorer @@ -39,25 +46,24 @@ def __init__(self, preprocessors=None, fit_intercept=True): def _initialize_wrapped(self, X=None, Y=None): if isinstance(X, da.Array) or isinstance(Y, da.Array): - try: - import dask_ml.linear_model - + if dask_linear_model is skl_linear_model: + warnings.warn("dask_ml is not installed, using sklearn instead.") + else: params = self.params.copy() + penalty = self.__penalty__ params["solver"] = "gradient_descent" - params["penalty"] = self.__penalty__ - if self.__penalty__ is not None: + + if penalty is not None: + if penalty == "elasticnet": + penalty = ElasticNet(weight=params.pop("l1_ratio")) + params["penalty"] = penalty params["solver"] = "admm" params["C"] = 1 / params.pop("alpha") params["max_iter"] = params["max_iter"] or 100 for key in ["copy_X", "precompute", "positive"]: params.pop(key, None) - if self.__penalty__ == "elasticnet": - from dask_glm.regularizers import ElasticNet - params["penalty"] = ElasticNet(weight=params.pop("l1_ratio")) - return dask_ml.linear_model.LinearRegression(**params) - except ImportError: - warnings.warn("dask_ml is not installed, using sklearn instead.") + return dask_linear_model.LinearRegression(**params) return self.__wraps__(**self.params) def fit(self, X, Y, W=None): diff --git a/Orange/tests/test_linear_regression.py b/Orange/tests/test_linear_regression.py index 3a85eac4494..bf1933e44ce 100644 --- a/Orange/tests/test_linear_regression.py +++ b/Orange/tests/test_linear_regression.py @@ -124,6 +124,7 @@ def test_linear_regression_repr(self): self.assertIsInstance(learner2, LinearRegressionLearner) +# pylint: disable=invalid-name class TestLinearRegressionLearnerOnDask(TestLinearRegressionLearner): learners = [ RidgeRegressionLearner(),