Skip to content

Commit

Permalink
fix tests, replace mse with squared_error for tree based algorithms from sklearn (#595)
Browse files Browse the repository at this point in the history
  • Loading branch information
pplonski committed Dec 30, 2022
1 parent 7f51765 commit 6850fb7
Show file tree
Hide file tree
Showing 9 changed files with 38 additions and 32 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -287,18 +287,18 @@ print("Test accuracy:", accuracy_score(y_test, predictions["label"].astype(int))

## :point_right: Regression Example

Regression example on Boston house prices data. On test data it scores ~ 10.85 mean squared error (MSE).
Regression example on the `California Housing` house-price dataset.

```python
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from supervised.automl import AutoML # mljar-supervised

# Load the data
housing = load_boston()
housing = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(
pd.DataFrame(housing.data, columns=housing.feature_names),
housing.target,
Expand Down
6 changes: 3 additions & 3 deletions supervised/algorithms/decision_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def __init__(self, params):
self.library_version = sklearn.__version__
self.max_iters = additional.get("max_steps", 1)
self.model = DecisionTreeRegressor(
criterion=params.get("criterion", "mse"),
criterion=params.get("criterion", "squared_error"),
max_depth=params.get("max_depth", 3),
random_state=params.get("seed", 1),
)
Expand Down Expand Up @@ -276,7 +276,7 @@ def interpret(

dt_regression_params = {
"criterion": [
"mse",
"squared_error",
"friedman_mse",
], # remove "mae" because it slows down a lot https://github.com/scikit-learn/scikit-learn/issues/9626
"max_depth": [2, 3, 4],
Expand All @@ -288,7 +288,7 @@ def interpret(
"text_transform",
]

regression_default_params = {"criterion": "mse", "max_depth": 3}
regression_default_params = {"criterion": "squared_error", "max_depth": 3}

AlgorithmsRegistry.add(
REGRESSION,
Expand Down
6 changes: 3 additions & 3 deletions supervised/algorithms/extra_trees.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def __init__(self, params):
)
self.model = ExtraTreesRegressor(
n_estimators=self.trees_in_step,
criterion=params.get("criterion", "mse"),
criterion=params.get("criterion", "squared_error"),
max_features=params.get("max_features", 0.6),
max_depth=params.get("max_depth", 6),
min_samples_split=params.get("min_samples_split", 30),
Expand Down Expand Up @@ -137,15 +137,15 @@ def file_extension(self):

regression_et_params = {
"criterion": [
"mse"
"squared_error"
], # remove "mae" because it slows down a lot https://github.com/scikit-learn/scikit-learn/issues/9626
"max_features": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
"min_samples_split": [10, 20, 30, 40, 50],
"max_depth": [3, 4, 5, 6, 7],
}

regression_default_params = {
"criterion": "mse",
"criterion": "squared_error",
"max_features": 0.9,
"min_samples_split": 30,
"max_depth": 4,
Expand Down
6 changes: 3 additions & 3 deletions supervised/algorithms/random_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def __init__(self, params):
)
self.model = RandomForestRegressor(
n_estimators=self.trees_in_step,
criterion=params.get("criterion", "mse"),
criterion=params.get("criterion", "squared_error"),
max_features=params.get("max_features", 0.8),
max_depth=params.get("max_depth", 6),
min_samples_split=params.get("min_samples_split", 4),
Expand Down Expand Up @@ -141,15 +141,15 @@ def file_extension(self):

regression_rf_params = {
"criterion": [
"mse"
"squared_error"
], # remove "mae" because it slows down a lot https://github.com/scikit-learn/scikit-learn/issues/9626
"max_features": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
"min_samples_split": [10, 20, 30, 40, 50],
"max_depth": [3, 4, 5, 6, 7],
}

regression_default_params = {
"criterion": "mse",
"criterion": "squared_error",
"max_features": 0.9,
"min_samples_split": 30,
"max_depth": 4,
Expand Down
4 changes: 2 additions & 2 deletions supervised/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,11 +286,11 @@ def __init__(
Regression Example:
>>> import pandas as pd
>>> from sklearn.datasets import load_boston
>>> from sklearn.datasets import fetch_california_housing
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.metrics import mean_squared_error
>>> from supervised import AutoML
>>> housing = load_boston()
>>> housing = fetch_california_housing()
>>> X_train, X_test, y_train, y_test = train_test_split(
... pd.DataFrame(housing.data, columns=housing.feature_names),
... housing.target,
Expand Down
4 changes: 2 additions & 2 deletions supervised/tuner/optuna/extra_trees.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def __init__(
self.y_validation = y_validation
self.eval_metric = eval_metric
self.n_jobs = n_jobs
self.objective = "mse" if ml_task == REGRESSION else "gini"
self.objective = "squared_error" if ml_task == REGRESSION else "gini"
self.max_steps = 10 # ET is trained in steps 100 trees each
self.seed = random_state

Expand All @@ -46,7 +46,7 @@ def __call__(self, trial):
else ExtraTreesAlgorithm
)
self.objective = (
"mse"
"squared_error"
if self.ml_task == REGRESSION
else trial.suggest_categorical("criterion", ["gini", "entropy"])
)
Expand Down
4 changes: 2 additions & 2 deletions supervised/tuner/optuna/random_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def __init__(
self.y_validation = y_validation
self.eval_metric = eval_metric
self.n_jobs = n_jobs
self.objective = "mse" if ml_task == REGRESSION else "gini"
self.objective = "squared_error" if ml_task == REGRESSION else "gini"
self.max_steps = 10 # RF is trained in steps 100 trees each
self.seed = random_state

Expand All @@ -45,7 +45,7 @@ def __call__(self, trial):
else RandomForestAlgorithm
)
self.objective = (
"mse"
"squared_error"
if self.ml_task == REGRESSION
else trial.suggest_categorical("criterion", ["gini", "entropy"])
)
Expand Down
15 changes: 9 additions & 6 deletions tests/tests_automl/test_automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@
from supervised.exceptions import AutoMLException

iris = datasets.load_iris()
boston = datasets.load_boston()
housing = datasets.fetch_california_housing()
# limit data size for faster tests
housing.data = housing.data[:500]
housing.target = housing.target[:500]
breast_cancer = datasets.load_breast_cancer()


Expand Down Expand Up @@ -148,10 +151,10 @@ def test_predict_proba_in_regression(self):
model = AutoML(
explain_level=0, verbose=0, random_state=1, results_path=self.automl_dir
)
model.fit(boston.data, boston.target)
model.fit(housing.data, housing.target)
with self.assertRaises(AutoMLException) as context:
# Try to call predict_proba in regression task
model.predict_proba(boston.data)
model.predict_proba(housing.data)

def test_iris_dataset(self):
""" Tests AutoML in the iris dataset (Multiclass classification)"""
Expand All @@ -161,12 +164,12 @@ def test_iris_dataset(self):
score = model.fit(iris.data, iris.target).score(iris.data, iris.target)
self.assertGreater(score, 0.5)

def test_boston_dataset(self):
""" Tests AutoML in the boston dataset (Regression)"""
def test_housing_dataset(self):
""" Tests AutoML in the housing dataset (Regression)"""
model = AutoML(
explain_level=0, verbose=0, random_state=1, results_path=self.automl_dir
)
score = model.fit(boston.data, boston.target).score(boston.data, boston.target)
score = model.fit(housing.data, housing.target).score(housing.data, housing.target)
self.assertGreater(score, 0.5)

def test_breast_cancer_dataset(self):
Expand Down
19 changes: 11 additions & 8 deletions tests/tests_automl/test_automl_sample_weight.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@
from supervised.exceptions import AutoMLException

iris = datasets.load_iris()
boston = datasets.load_boston()
housing = datasets.fetch_california_housing()
# limit data size for faster tests
housing.data = housing.data[:500]
housing.target = housing.target[:500]
breast_cancer = datasets.load_breast_cancer()


Expand Down Expand Up @@ -45,25 +48,25 @@ def test_iris_dataset_sample_weight(self):
)
assert_almost_equal(score_1, score_2)

def test_boston_dataset(self):
"""Tests AutoML in the boston dataset (Regression)
def test_housing_dataset(self):
"""Tests AutoML in the housing dataset (Regression)
without and with sample weight"""
model = AutoML(
explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir
)
score_1 = model.fit(boston.data, boston.target).score(
boston.data, boston.target
score_1 = model.fit(housing.data, housing.target).score(
housing.data, housing.target
)
self.assertGreater(score_1, 0.5)

shutil.rmtree(self.automl_dir, ignore_errors=True)
model = AutoML(
explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir
)
sample_weight = np.ones(boston.data.shape[0])
sample_weight = np.ones(housing.data.shape[0])
score_2 = model.fit(
boston.data, boston.target, sample_weight=sample_weight
).score(boston.data, boston.target, sample_weight=sample_weight)
housing.data, housing.target, sample_weight=sample_weight
).score(housing.data, housing.target, sample_weight=sample_weight)
assert_almost_equal(score_1, score_2)

def test_breast_cancer_dataset(self):
Expand Down

0 comments on commit 6850fb7

Please sign in to comment.