Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dask: Linear Regression #6513

Merged
merged 3 commits into from
Jul 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions Orange/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import numpy as np
import scipy
import dask.array as da

from Orange.data import Table, Storage, Instance, Value, Domain
from Orange.data.filter import HasClass
Expand Down Expand Up @@ -507,7 +508,9 @@ def __init__(self, skl_model):
self.skl_model = skl_model

def predict(self, X):
value = self.skl_model.predict(X)
if isinstance(X, da.Array):
X = X.rechunk({0: "auto", 1: -1})
value = np.asarray(self.skl_model.predict(X))
# SVM has probability attribute which defines if method compute probs
has_prob_attr = hasattr(self.skl_model, "probability")
if (has_prob_attr and self.skl_model.probability
Expand Down Expand Up @@ -581,13 +584,18 @@ def __call__(self, data, progress_callback=None):
m.params = self.params
return m

def _initialize_wrapped(self):
# pylint: disable=unused-argument
def _initialize_wrapped(self, X=None, Y=None):
# wrap sklearn/dask_ml according to type of X/Y
# pylint: disable=not-callable
return self.__wraps__(**self.params)

def fit(self, X, Y, W=None):
clf = self._initialize_wrapped()
clf = self._initialize_wrapped(X, Y)
Y = Y.reshape(-1)
if isinstance(X, da.Array) or isinstance(Y, da.Array):
X = X.rechunk({0: "auto", 1: -1})
Y = Y.rechunk({0: X.chunksize[0]})
if W is None or not self.supports_weights:
return self.__returns__(clf.fit(X, Y))
return self.__returns__(clf.fit(X, Y, sample_weight=W.reshape(-1)))
Expand Down
2 changes: 1 addition & 1 deletion Orange/classification/logistic_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def __init__(self, penalty="l2", dual=False, tol=0.0001, C=1.0,
super().__init__(preprocessors=preprocessors)
self.params = vars()

def _initialize_wrapped(self):
def _initialize_wrapped(self, X=None, Y=None):
params = self.params.copy()
# The default scikit-learn solver `lbfgs` (v0.22) does not support the
# l1 penalty.
Expand Down
2 changes: 1 addition & 1 deletion Orange/classification/neural_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class MLPClassifierWCallback(skl_nn.MLPClassifier, NIterCallbackMixin):
class NNClassificationLearner(NNBase, SklLearner):
__wraps__ = MLPClassifierWCallback

def _initialize_wrapped(self):
def _initialize_wrapped(self, X=None, Y=None):
clf = SklLearner._initialize_wrapped(self)
clf.orange_callback = getattr(self, "callback", None)
return clf
9 changes: 9 additions & 0 deletions Orange/data/dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,15 @@
self.X.compute_chunk_sizes()
return self.X.shape[0]

def _filter_has_class(self, negate=False):
if self._Y.ndim == 1:
retain = np.isnan(self._Y)
else:
retain = np.any(np.isnan(self._Y), axis=1)

Check warning on line 262 in Orange/data/dask.py

View check run for this annotation

Codecov / codecov/patch

Orange/data/dask.py#L262

Added line #L262 was not covered by tests
if not negate:
retain = np.logical_not(retain)
return self.from_table_rows(self, np.asarray(retain))


def dask_stats(X, compute_variance=False):
is_numeric = np.issubdtype(X.dtype, np.number)
Expand Down
36 changes: 36 additions & 0 deletions Orange/regression/linear.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
import warnings

import numpy as np
import dask.array as da

import sklearn.linear_model as skl_linear_model
import sklearn.preprocessing as skl_preprocessing

try:
import dask_ml.linear_model as dask_linear_model
from dask_glm.regularizers import ElasticNet
except ImportError:
dask_linear_model = skl_linear_model
ElasticNet = ...

Check warning on line 14 in Orange/regression/linear.py

View check run for this annotation

Codecov / codecov/patch

Orange/regression/linear.py#L12-L14

Added lines #L12 - L14 were not covered by tests

from Orange.data import Variable, ContinuousVariable
from Orange.preprocess import Normalize
from Orange.preprocess.score import LearnerScorer
Expand All @@ -27,19 +37,43 @@

class LinearRegressionLearner(SklLearner, _FeatureScorerMixin):
__wraps__ = skl_linear_model.LinearRegression
__penalty__ = None

    # Arguments are needed for signatures, pylint: disable=unused-argument
    def __init__(self, preprocessors=None, fit_intercept=True):
        """Store constructor arguments as the wrapped estimator's params."""
        super().__init__(preprocessors=preprocessors)
        # vars() deliberately snapshots the constructor's locals (including
        # `self`/`preprocessors`); presumably the base class strips the
        # non-estimator entries before use — confirm against SklLearner.
        self.params = vars()

def _initialize_wrapped(self, X=None, Y=None):
if isinstance(X, da.Array) or isinstance(Y, da.Array):
if dask_linear_model is skl_linear_model:
warnings.warn("dask_ml is not installed, using sklearn instead.")

Check warning on line 50 in Orange/regression/linear.py

View check run for this annotation

Codecov / codecov/patch

Orange/regression/linear.py#L50

Added line #L50 was not covered by tests
else:
params = self.params.copy()
penalty = self.__penalty__
params["solver"] = "gradient_descent"

if penalty is not None:
if penalty == "elasticnet":
penalty = ElasticNet(weight=params.pop("l1_ratio"))
params["penalty"] = penalty
params["solver"] = "admm"
params["C"] = 1 / params.pop("alpha")
params["max_iter"] = params["max_iter"] or 100
for key in ["copy_X", "precompute", "positive"]:
params.pop(key, None)

return dask_linear_model.LinearRegression(**params)
return self.__wraps__(**self.params)

def fit(self, X, Y, W=None):
model = super().fit(X, Y, W)
return LinearModel(model.skl_model)


class RidgeRegressionLearner(LinearRegressionLearner):
__wraps__ = skl_linear_model.Ridge
__penalty__ = "l2"

# Arguments are needed for signatures, pylint: disable=unused-argument
def __init__(self, alpha=1.0, fit_intercept=True, copy_X=True,
Expand All @@ -50,6 +84,7 @@

class LassoRegressionLearner(LinearRegressionLearner):
__wraps__ = skl_linear_model.Lasso
__penalty__ = "l1"

# Arguments are needed for signatures, pylint: disable=unused-argument
def __init__(self, alpha=1.0, fit_intercept=True, precompute=False,
Expand All @@ -61,6 +96,7 @@

class ElasticNetLearner(LinearRegressionLearner):
__wraps__ = skl_linear_model.ElasticNet
__penalty__ = "elasticnet"

# Arguments are needed for signatures, pylint: disable=unused-argument
def __init__(self, alpha=1.0, l1_ratio=0.5, fit_intercept=True,
Expand Down
2 changes: 1 addition & 1 deletion Orange/regression/neural_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class MLPRegressorWCallback(skl_nn.MLPRegressor, NIterCallbackMixin):
class NNRegressionLearner(NNBase, SklLearner):
__wraps__ = MLPRegressorWCallback

def _initialize_wrapped(self):
def _initialize_wrapped(self, X=None, Y=None):
clf = SklLearner._initialize_wrapped(self)
clf.orange_callback = getattr(self, "callback", None)
return clf
48 changes: 37 additions & 11 deletions Orange/tests/test_linear_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,24 @@
ElasticNetCVLearner,
MeanLearner)
from Orange.evaluation import CrossValidation, RMSE
from Orange.tests.test_dasktable import with_dasktable, temp_dasktable


class TestLinearRegressionLearner(unittest.TestCase):
learners = [
RidgeRegressionLearner(),
LassoRegressionLearner(),
ElasticNetLearner(),
ElasticNetCVLearner(),
MeanLearner()
]

    @classmethod
    def setUpClass(cls):
        # Shared fixture: the "housing" regression dataset, loaded once
        # for the whole test case.
        cls.housing = Table("housing")

def test_LinearRegression(self):
@with_dasktable
def test_LinearRegression(self, prepare_table):
nrows = 1000
ncols = 3
x = np.random.randint(-20, 51, (nrows, ncols))
Expand All @@ -31,23 +41,17 @@ def test_LinearRegression(self):

x1, x2 = np.split(x, 2)
y1, y2 = np.split(y, 2)
t = Table.from_numpy(None, x1, y1)
t = prepare_table(Table.from_numpy(None, x1, y1))
learn = LinearRegressionLearner()
clf = learn(t)
z = clf(x2)
self.assertTrue((abs(z.reshape(-1, 1) - y2) < 2.0).all())

def test_Regression(self):
ridge = RidgeRegressionLearner()
lasso = LassoRegressionLearner()
elastic = ElasticNetLearner()
elasticCV = ElasticNetCVLearner()
mean = MeanLearner()
learners = [ridge, lasso, elastic, elasticCV, mean]
cv = CrossValidation(k=2)
res = cv(self.housing, learners)
res = cv(self.housing, self.learners)
rmse = RMSE(res)
for i in range(len(learners) - 1):
for i in range(len(self.learners) - 1):
self.assertLess(rmse[i], rmse[-1])

def test_linear_scorer(self):
Expand Down Expand Up @@ -110,11 +114,33 @@ def test_comparison_elastic_net(self):
en = ElasticNetLearner(alpha=a, l1_ratio=1)
en_model = en(self.housing)
np.testing.assert_allclose(
lasso_model.coefficients, en_model.coefficients, atol=1e-07)
lasso_model.coefficients, en_model.coefficients, atol=a/10)

def test_linear_regression_repr(self):
learner = LinearRegressionLearner()
repr_text = repr(learner)
learner2 = eval(repr_text)

self.assertIsInstance(learner2, LinearRegressionLearner)


# pylint: disable=invalid-name
class TestLinearRegressionLearnerOnDask(TestLinearRegressionLearner):
    """Re-runs the inherited regression tests on dask-backed tables."""

    # NOTE(review): unlike the parent class, ElasticNetCVLearner is
    # omitted — presumably unsupported on dask input; confirm.
    learners = [
        RidgeRegressionLearner(),
        LassoRegressionLearner(),
        ElasticNetLearner(),
        MeanLearner()
    ]

    @classmethod
    def setUpClass(cls):
        # Same dataset as the parent, converted to a dask-backed table.
        cls.housing = temp_dasktable(Table("housing"))

    # Parent method is already parametrized over dask via @with_dasktable,
    # so rerunning it here would duplicate coverage.
    @unittest.skip("already tested")
    def test_LinearRegression(self, _):
        super().test_LinearRegression(_)

    @unittest.skip("scores differ from sklearn")
    def test_comparison_with_sklearn(self):
        super().test_comparison_with_sklearn()
2 changes: 1 addition & 1 deletion Orange/widgets/evaluate/owpredictions.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ def _call_predictors(self):
results.domain = self.data.domain
results.row_indices = numpy.arange(len(self.data))
results.folds = (Ellipsis, )
results.actual = self.data.Y
results.actual = numpy.asarray(self.data.Y)
results.unmapped_probabilities = prob
results.unmapped_predicted = pred
results.probabilities = results.predicted = None
Expand Down
3 changes: 2 additions & 1 deletion Orange/widgets/utils/owlearnerwidget.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from copy import deepcopy
import numpy as np

from AnyQt.QtCore import QTimer, Qt

Expand Down Expand Up @@ -252,7 +253,7 @@ def check_data(self):
self.Error.data_error(reason)
elif not len(self.data):
self.Error.data_error("Dataset is empty.")
elif len(ut.unique(self.data.Y)) < 2:
elif len(np.asarray(ut.unique(self.data.Y))) < 2:
self.Error.data_error("Data contains a single target value.")
elif self.data.X.size == 0:
self.Error.data_error("Data has no features to learn from.")
Expand Down
Loading