Add "Column as Model" #6852

Draft
wants to merge 3 commits into base: master
1 change: 1 addition & 0 deletions Orange/classification/__init__.py
@@ -20,6 +20,7 @@
from .sgd import *
from .neural_network import *
from .calibration import *
from .column import *
try:
from .catgb import *
except ModuleNotFoundError:
98 changes: 98 additions & 0 deletions Orange/classification/column.py
@@ -0,0 +1,98 @@
from typing import Optional

import numpy as np

from Orange.data import Variable, DiscreteVariable, Domain, Table
from Orange.classification import Model, Learner


__all__ = ["ColumnLearner", "ColumnClassifier"]


class ColumnLearner(Learner):
def __init__(self,
class_var: DiscreteVariable,
column: Variable,
offset: Optional[float] = None,
k: Optional[float] = None):
super().__init__()
self.class_var = class_var
self.column = column
self.offset = offset
self.k = k
self.name = f"column '{column.name}'"

def fit_storage(self, _):
return ColumnClassifier(
self.class_var, self.column, self.offset, self.k)


class ColumnClassifier(Model):
def __init__(self,
class_var: DiscreteVariable,
column: Variable,
offset: Optional[float] = None,
k: Optional[float] = None):
super().__init__(Domain([column], class_var))
assert class_var.is_discrete
if column.is_continuous:
if len(class_var.values) != 2:
raise ValueError("Numeric column can only be used with "
"binary class variable")
self.value_mapping = None
else:
assert isinstance(column, DiscreteVariable)
assert offset is None and k is None
if not self.check_value_sets(class_var, column):
raise ValueError(
"Column contains values that are not in class variable")
if class_var.values[:len(column.values)] == column.values:
self.value_mapping = None
else:
self.value_mapping = np.array(
[class_var.to_val(x) for x in column.values])
self.class_var = class_var
self.column = column
self.offset = offset
self.k = k
self.name = f"column '{column.name}'"

@staticmethod
def check_prob_range(values: np.ndarray):
return np.nanmin(values) >= 0 and np.nanmax(values) <= 1

@staticmethod
def check_value_sets(class_var: DiscreteVariable,
column_var: DiscreteVariable):
return set(column_var.values) <= set(class_var.values)

def predict_storage(self, data: Table):
vals = data.get_column(self.column)
rows = np.isfinite(vals)
nclasses = len(self.class_var.values)
proba = np.full((len(data), nclasses), 1 / nclasses)
if self.column.is_discrete:
mapped = vals[rows].astype(int)
if self.value_mapping is not None:
mapped = self.value_mapping[mapped]
vals = vals.copy()
vals[rows] = mapped
proba[rows] = 0
proba[rows, mapped] = 1
else:
if self.k is None:
if not self.check_prob_range(vals):
raise ValueError("Column values must be in [0, 1] range "
"unless logistic function is applied")
proba[rows, 1] = vals[rows]
else:
proba[rows, 1] = (
1 / (1 + np.exp(-self.k * (vals[rows] - self.offset))))

proba[rows, 0] = 1 - proba[rows, 1]
vals = (proba[:, 1] > 0.5).astype(float)
vals[~rows] = np.nan
return vals, proba

def __str__(self):
return f'ColumnClassifier {self.column.name}'
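A minimal usage sketch, not part of the diff: the variable names and data below are made up for illustration, and the classes are imported through the Orange.classification namespace enabled by the __init__.py change above. A discrete column whose values form a subset of the class values is copied straight into predictions; a numeric column is either read as the probability of the second class value or, when offset and k are given, passed through a logistic function first.

import numpy as np

from Orange.classification import ColumnClassifier
from Orange.data import ContinuousVariable, DiscreteVariable, Domain, Table

# Toy data: "verdict" repeats the class values, "score" holds raw scores.
class_var = DiscreteVariable("cls", values=["no", "yes"])
verdict = DiscreteVariable("verdict", values=["no", "yes"])
score = ContinuousVariable("score")
domain = Domain([verdict, score], class_var)
data = Table.from_numpy(domain,
                        np.array([[0, -2.0], [1, 1.5], [1, 0.3]]),
                        np.array([0.0, 1.0, 1.0]))

# Discrete column: predictions copy the column, probabilities are 0/1.
model = ColumnClassifier(class_var, verdict)
values, probs = model(data, model.ValueProbs)

# Numeric column with logistic calibration:
# p(yes) = 1 / (1 + exp(-k * (score - offset)))
model = ColumnClassifier(class_var, score, offset=0.0, k=1.0)
values, probs = model(data, model.ValueProbs)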
165 changes: 165 additions & 0 deletions Orange/classification/tests/test_column.py
@@ -0,0 +1,165 @@
import unittest
from unittest.mock import patch

import numpy as np

from Orange.classification import ColumnLearner, ColumnClassifier
from Orange.data import DiscreteVariable, ContinuousVariable, Domain, Table


class ColumnTest(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.domain = Domain([DiscreteVariable("d1", values=["a", "b"]),
DiscreteVariable("d2", values=["c", "d"]),
DiscreteVariable("d3", values=["d", "c"]),
ContinuousVariable("c1"),
ContinuousVariable("c2")
],
DiscreteVariable("cls", values=["c", "d"]),
[DiscreteVariable("m1", values=["a", "b"]),
DiscreteVariable("m2", values=["d"]),
ContinuousVariable("c3")]
)
cls.data = Table.from_numpy(
cls.domain,
np.array([[0, 0, 0, 1, 0.5],
[0, 1, 1, 0.25, -3],
[1, 0, np.nan, np.nan, np.nan]]),
np.array([0, 1, 1]),
np.array([[0, 0, 2],
[1, 0, 8],
[np.nan, np.nan, 5]])
)

@patch("Orange.classification.column.ColumnClassifier")
def test_fit_storage(self, clsfr):
learner = ColumnLearner(self.domain.class_var, self.domain["d2"])
self.assertEqual(learner.name, "column 'd2'")
learner.fit_storage(self.data)
clsfr.assert_called_with(self.domain.class_var, self.domain["d2"], None, None)

learner = ColumnLearner(self.domain.class_var, self.domain["c3"])
learner.fit_storage(self.data)
clsfr.assert_called_with(self.domain.class_var, self.domain["c3"], None, None)

learner = ColumnLearner(self.domain.class_var, self.domain["c3"], 42, 3.5)
self.assertEqual(learner.name, "column 'c3'")
learner.fit_storage(self.data)
clsfr.assert_called_with(self.domain.class_var, self.domain["c3"], 42, 3.5)

def test_classifier_init_checks(self):
cls = ColumnClassifier(self.domain.class_var, self.domain["d2"])
cls.name = "column 'd2'"

cls = ColumnClassifier(self.domain.class_var, self.domain["d3"])
cls.name = "column 'd3'"

cls = ColumnClassifier(self.domain.class_var, self.domain["c3"])
cls.name = "column 'c3'"

self.assertRaises(
ValueError,
ColumnClassifier,
self.domain.class_var, self.domain["d1"])

self.assertRaises(
ValueError,
ColumnClassifier,
DiscreteVariable("x", values=("a", "b", "c")), self.domain["c3"])

def test_check_prob_range(self):
self.assertTrue(
ColumnClassifier.check_prob_range(np.array([0, 0.5, 1]))
)
self.assertTrue(
ColumnClassifier.check_prob_range(np.array([0, 0.5, np.nan]))
)
self.assertFalse(
ColumnClassifier.check_prob_range(np.array([0, 0.5, 1.5]))
)
self.assertFalse(
ColumnClassifier.check_prob_range(np.array([0, 0.5, -1]))
)

def test_check_value_sets(self):
d1, d2, d3, *_ = self.domain.attributes
c = self.domain.class_var
m2: DiscreteVariable = self.domain["m2"]
self.assertFalse(ColumnClassifier.check_value_sets(c, d1))
self.assertTrue(ColumnClassifier.check_value_sets(c, d2))
self.assertTrue(ColumnClassifier.check_value_sets(c, d3))
self.assertTrue(ColumnClassifier.check_value_sets(c, m2))
self.assertFalse(ColumnClassifier.check_value_sets(m2, c))

def test_predict_discrete(self):
# Just copy
model = ColumnClassifier(self.domain.class_var, self.domain["d2"])
self.assertEqual(model.name, "column 'd2'")
classes, probs = model(self.data, model.ValueProbs)
np.testing.assert_equal(classes, [0, 1, 0])
np.testing.assert_equal(probs, [[1, 0], [0, 1], [1, 0]])

# Values are not in the same order -> map
model = ColumnClassifier(self.domain.class_var, self.domain["d3"])
classes, probs = model(self.data, model.ValueProbs)
np.testing.assert_equal(classes, [1, 0, np.nan])
np.testing.assert_equal(probs, [[0, 1], [1, 0], [0.5, 0.5]])

# Not in the same order, and one is missing -> map
model = ColumnClassifier(self.domain.class_var, self.domain["m2"])
classes, probs = model(self.data, model.ValueProbs)
np.testing.assert_equal(classes, [1, 1, np.nan])
np.testing.assert_equal(probs, [[0, 1], [0, 1], [0.5, 0.5]])

# Non-binary class
domain = Domain(
self.domain.attributes,
DiscreteVariable("cls", values=["a", "c", "b", "d", "e"]))
data = Table.from_numpy(domain, self.data.X, self.data.Y)
model = ColumnClassifier(domain.class_var, domain["d3"])
classes, probs = model(data, model.ValueProbs)
np.testing.assert_equal(classes, [3, 1, np.nan])
np.testing.assert_almost_equal(
probs,
np.array([[0, 0, 0, 1, 0],
[0, 1, 0, 0, 0],
[0.2, 0.2, 0.2, 0.2, 0.2]]))

def test_predict_as_direct_probs(self):
model = ColumnClassifier(self.domain.class_var, self.domain["c1"])
self.assertEqual(model.name, "column 'c1'")
classes, probs = model(self.data, model.ValueProbs)
np.testing.assert_equal(classes, [1, 0, np.nan])
np.testing.assert_equal(probs, [[0, 1], [0.75, 0.25], [0.5, 0.5]])

model = ColumnClassifier(self.domain.class_var, self.domain["c2"])
self.assertRaises(ValueError, model, self.data)

model = ColumnClassifier(self.domain.class_var, self.domain["c3"])
self.assertRaises(ValueError, model, self.data)

def test_predict_with_logistic(self):
model = ColumnClassifier(
self.domain.class_var, self.domain["c1"], 0.5, 3)
classes, probs = model(self.data, model.ValueProbs)
np.testing.assert_equal(classes, [1, 0, np.nan])
np.testing.assert_almost_equal(
probs[:, 1], [1 / (1 + np.exp(-3 * (1 - 0.5))),
1 / (1 + np.exp(-3 * (0.25 - 0.5))),
0.5])
np.testing.assert_equal(probs[:, 0], 1 - probs[:, 1])

model = ColumnClassifier(
self.domain.class_var, self.domain["c2"], 0.5, 3)
classes, probs = model(self.data, model.ValueProbs)
np.testing.assert_equal(classes, [0, 0, np.nan])
np.testing.assert_almost_equal(
probs[:, 1], [1 / (1 + np.exp(-3 * (0.5 - 0.5))),
1 / (1 + np.exp(-3 * (-3 - 0.5))),
0.5])
np.testing.assert_equal(probs[:, 0], 1 - probs[:, 1])


if __name__ == "__main__":
unittest.main()
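As a sanity check on the logistic expectations above: for the first row of c1 (value 1.0) with offset 0.5 and k 3, predict_storage computes 1 / (1 + exp(-3 * (1.0 - 0.5))) = 1 / (1 + exp(-1.5)) ≈ 0.818 for the second class, so the predicted class is 1; the third row's value is missing, so its probabilities stay at the uniform 0.5 and the prediction is NaN.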
29 changes: 17 additions & 12 deletions Orange/tests/test_classification.py
@@ -22,6 +22,7 @@
SVMLearner, LinearSVMLearner, OneClassSVMLearner, TreeLearner, KNNLearner,
SimpleRandomForestLearner, EllipticEnvelopeLearner, ThresholdLearner,
CalibratedLearner)
from Orange.classification.column import ColumnLearner
from Orange.classification.rules import _RuleLearner
from Orange.data import (ContinuousVariable, DiscreteVariable,
Domain, Table)
@@ -30,6 +31,10 @@
from Orange.tests.dummy_learners import DummyLearner, DummyMulticlassLearner
from Orange.tests import test_filename

# While this could be determined automatically from __init__ signatures,
# it is better to do it explicitly
LEARNERS_WITH_ARGUMENTS = (ThresholdLearner, CalibratedLearner, ColumnLearner)


def all_learners():
classification_modules = pkgutil.walk_packages(
@@ -214,8 +219,7 @@ def test_result_shape(self):
"""
iris = Table('iris')
for learner in all_learners():
# calibration, threshold learners' __init__ requires arguments
if learner in (ThresholdLearner, CalibratedLearner):
if learner in LEARNERS_WITH_ARGUMENTS:
continue

with self.subTest(learner.__name__):
@@ -256,6 +260,8 @@ def test_result_shape_numpy(self):
args = []
if learner in (ThresholdLearner, CalibratedLearner):
args = [LogisticRegressionLearner()]
elif learner in LEARNERS_WITH_ARGUMENTS:
continue
data = iris_bin if learner is ThresholdLearner else iris
model = learner(*args)(data)
transformed_iris = model.data_to_model_domain(data)
@@ -277,6 +283,10 @@ def test_predict_proba(self):
continue
if learner in (ThresholdLearner, CalibratedLearner):
model = learner(LogisticRegressionLearner())(data)
elif learner in LEARNERS_WITH_ARGUMENTS:
# note that the two learners above also require arguments,
# but we provide them
continue
else:
model = learner()(data)
probs = model.predict_proba(data)
@@ -385,8 +395,7 @@ def test_unknown(self):
def test_missing_class(self):
table = Table(test_filename("datasets/adult_sample_missing"))
for learner in all_learners():
# calibration, threshold learners' __init__ require arguments
if learner in (ThresholdLearner, CalibratedLearner):
if learner in LEARNERS_WITH_ARGUMENTS:
continue
# Skip slow tests
if isinstance(learner, _RuleLearner):
@@ -414,8 +423,7 @@ def test_all_learners_accessible_in_Orange_classification_namespace(self):
def test_all_models_work_after_unpickling(self):
datasets = [Table('iris'), Table('titanic')]
for learner in list(all_learners()):
# calibration, threshold learners' __init__ require arguments
if learner in (ThresholdLearner, CalibratedLearner):
if learner in LEARNERS_WITH_ARGUMENTS:
continue
# Skip slow tests
if issubclass(learner, _RuleLearner):
@@ -438,8 +446,7 @@ def test_all_models_work_after_unpickling(self):
def test_all_models_work_after_unpickling_pca(self):
datasets = [Table('iris'), Table('titanic')]
for learner in list(all_learners()):
# calibration, threshold learners' __init__ require arguments
if learner in (ThresholdLearner, CalibratedLearner):
if learner in LEARNERS_WITH_ARGUMENTS:
continue
# Skip slow tests
if issubclass(learner, _RuleLearner):
@@ -462,8 +469,7 @@ def test_all_models_work_after_unpickling_pca(self):

def test_adequacy_all_learners(self):
for learner in all_learners():
# calibration, threshold learners' __init__ requires arguments
if learner in (ThresholdLearner, CalibratedLearner):
if learner in LEARNERS_WITH_ARGUMENTS:
continue
with self.subTest(learner.__name__):
learner = learner()
@@ -472,8 +478,7 @@ def test_adequacy_all_learners(self):

def test_adequacy_all_learners_multiclass(self):
for learner in all_learners():
# calibration, threshold learners' __init__ require arguments
if learner in (ThresholdLearner, CalibratedLearner):
if learner in LEARNERS_WITH_ARGUMENTS:
continue
with self.subTest(learner.__name__):
learner = learner()