fix tests, replace mse with squared_error for tree based algorithms from sklearn (#595)
mljar · Dec 30, 2022 · 6850fb7 · 6850fb7
1 parent 7f51765
commit 6850fb7
Show file tree

Hide file tree

Showing 9 changed files with 38 additions and 32 deletions.
diff --git a/README.md b/README.md
@@ -287,18 +287,18 @@ print("Test accuracy:", accuracy_score(y_test, predictions["label"].astype(int))
 
 ## :point_right: Regression Example
 
-Regression example on Boston house prices data. On test data it scores ~ 10.85 mean squared error (MSE).
+Regression example on `California Housing` house prices data.
 
 ```python
 import numpy as np
 import pandas as pd
-from sklearn.datasets import load_boston
+from sklearn.datasets import fetch_california_housing
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error
 from supervised.automl import AutoML # mljar-supervised
 
 # Load the data
-housing = load_boston()
+housing = fetch_california_housing()
 X_train, X_test, y_train, y_test = train_test_split(
     pd.DataFrame(housing.data, columns=housing.feature_names),
     housing.target,

diff --git a/supervised/algorithms/decision_tree.py b/supervised/algorithms/decision_tree.py
@@ -170,7 +170,7 @@ def __init__(self, params):
         self.library_version = sklearn.__version__
         self.max_iters = additional.get("max_steps", 1)
         self.model = DecisionTreeRegressor(
-            criterion=params.get("criterion", "mse"),
+            criterion=params.get("criterion", "squared_error"),
             max_depth=params.get("max_depth", 3),
             random_state=params.get("seed", 1),
         )
@@ -276,7 +276,7 @@ def interpret(
 
 dt_regression_params = {
     "criterion": [
-        "mse",
+        "squared_error",
         "friedman_mse",
     ],  # remove "mae" because it slows down a lot https://github.com/scikit-learn/scikit-learn/issues/9626
     "max_depth": [2, 3, 4],
@@ -288,7 +288,7 @@ def interpret(
     "text_transform",
 ]
 
-regression_default_params = {"criterion": "mse", "max_depth": 3}
+regression_default_params = {"criterion": "squared_error", "max_depth": 3}
 
 AlgorithmsRegistry.add(
     REGRESSION,

diff --git a/supervised/algorithms/extra_trees.py b/supervised/algorithms/extra_trees.py
@@ -67,7 +67,7 @@ def __init__(self, params):
         )
         self.model = ExtraTreesRegressor(
             n_estimators=self.trees_in_step,
-            criterion=params.get("criterion", "mse"),
+            criterion=params.get("criterion", "squared_error"),
             max_features=params.get("max_features", 0.6),
             max_depth=params.get("max_depth", 6),
             min_samples_split=params.get("min_samples_split", 30),
@@ -137,15 +137,15 @@ def file_extension(self):
 
 regression_et_params = {
     "criterion": [
-        "mse"
+        "squared_error"
     ],  # remove "mae" because it slows down a lot https://github.com/scikit-learn/scikit-learn/issues/9626
     "max_features": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
     "min_samples_split": [10, 20, 30, 40, 50],
     "max_depth": [3, 4, 5, 6, 7],
 }
 
 regression_default_params = {
-    "criterion": "mse",
+    "criterion": "squared_error",
     "max_features": 0.9,
     "min_samples_split": 30,
     "max_depth": 4,

diff --git a/supervised/algorithms/random_forest.py b/supervised/algorithms/random_forest.py
@@ -68,7 +68,7 @@ def __init__(self, params):
         )
         self.model = RandomForestRegressor(
             n_estimators=self.trees_in_step,
-            criterion=params.get("criterion", "mse"),
+            criterion=params.get("criterion", "squared_error"),
             max_features=params.get("max_features", 0.8),
             max_depth=params.get("max_depth", 6),
             min_samples_split=params.get("min_samples_split", 4),
@@ -141,15 +141,15 @@ def file_extension(self):
 
 regression_rf_params = {
     "criterion": [
-        "mse"
+        "squared_error"
     ],  # remove "mae" because it slows down a lot https://github.com/scikit-learn/scikit-learn/issues/9626
     "max_features": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
     "min_samples_split": [10, 20, 30, 40, 50],
     "max_depth": [3, 4, 5, 6, 7],
 }
 
 regression_default_params = {
-    "criterion": "mse",
+    "criterion": "squared_error",
     "max_features": 0.9,
     "min_samples_split": 30,
     "max_depth": 4,

diff --git a/supervised/automl.py b/supervised/automl.py
@@ -286,11 +286,11 @@ def __init__(
             Regression Example:
 
             >>> import pandas as pd
-            >>> from sklearn.datasets import load_boston
+            >>> from sklearn.datasets import fetch_california_housing
             >>> from sklearn.model_selection import train_test_split
             >>> from sklearn.metrics import mean_squared_error
             >>> from supervised import AutoML
-            >>> housing = load_boston()
+            >>> housing = fetch_california_housing()
             >>> X_train, X_test, y_train, y_test = train_test_split(
             ...       pd.DataFrame(housing.data, columns=housing.feature_names),
             ...       housing.target,

diff --git a/supervised/tuner/optuna/extra_trees.py b/supervised/tuner/optuna/extra_trees.py
@@ -34,7 +34,7 @@ def __init__(
         self.y_validation = y_validation
         self.eval_metric = eval_metric
         self.n_jobs = n_jobs
-        self.objective = "mse" if ml_task == REGRESSION else "gini"
+        self.objective = "squared_error" if ml_task == REGRESSION else "gini"
         self.max_steps = 10  # ET is trained in steps 100 trees each
         self.seed = random_state
 
@@ -46,7 +46,7 @@ def __call__(self, trial):
                 else ExtraTreesAlgorithm
             )
             self.objective = (
-                "mse"
+                "squared_error"
                 if self.ml_task == REGRESSION
                 else trial.suggest_categorical("criterion", ["gini", "entropy"])
             )

diff --git a/supervised/tuner/optuna/random_forest.py b/supervised/tuner/optuna/random_forest.py
@@ -33,7 +33,7 @@ def __init__(
         self.y_validation = y_validation
         self.eval_metric = eval_metric
         self.n_jobs = n_jobs
-        self.objective = "mse" if ml_task == REGRESSION else "gini"
+        self.objective = "squared_error" if ml_task == REGRESSION else "gini"
         self.max_steps = 10  # RF is trained in steps 100 trees each
         self.seed = random_state
 
@@ -45,7 +45,7 @@ def __call__(self, trial):
                 else RandomForestAlgorithm
             )
             self.objective = (
-                "mse"
+                "squared_error"
                 if self.ml_task == REGRESSION
                 else trial.suggest_categorical("criterion", ["gini", "entropy"])
             )

diff --git a/tests/tests_automl/test_automl.py b/tests/tests_automl/test_automl.py
@@ -14,7 +14,10 @@
 from supervised.exceptions import AutoMLException
 
 iris = datasets.load_iris()
-boston = datasets.load_boston()
+housing = datasets.fetch_california_housing()
+# limit data size for faster tests
+housing.data = housing.data[:500]
+housing.target = housing.target[:500]
 breast_cancer = datasets.load_breast_cancer()
 
 
@@ -148,10 +151,10 @@ def test_predict_proba_in_regression(self):
         model = AutoML(
             explain_level=0, verbose=0, random_state=1, results_path=self.automl_dir
         )
-        model.fit(boston.data, boston.target)
+        model.fit(housing.data, housing.target)
         with self.assertRaises(AutoMLException) as context:
             # Try to call predict_proba in regression task
-            model.predict_proba(boston.data)
+            model.predict_proba(housing.data)
 
     def test_iris_dataset(self):
         """ Tests AutoML in the iris dataset (Multiclass classification)"""
@@ -161,12 +164,12 @@ def test_iris_dataset(self):
         score = model.fit(iris.data, iris.target).score(iris.data, iris.target)
         self.assertGreater(score, 0.5)
 
-    def test_boston_dataset(self):
-        """ Tests AutoML in the boston dataset (Regression)"""
+    def test_housing_dataset(self):
+        """ Tests AutoML in the housing dataset (Regression)"""
         model = AutoML(
             explain_level=0, verbose=0, random_state=1, results_path=self.automl_dir
         )
-        score = model.fit(boston.data, boston.target).score(boston.data, boston.target)
+        score = model.fit(housing.data, housing.target).score(housing.data, housing.target)
         self.assertGreater(score, 0.5)
 
     def test_breast_cancer_dataset(self):

diff --git a/tests/tests_automl/test_automl_sample_weight.py b/tests/tests_automl/test_automl_sample_weight.py
@@ -15,7 +15,10 @@
 from supervised.exceptions import AutoMLException
 
 iris = datasets.load_iris()
-boston = datasets.load_boston()
+housing = datasets.fetch_california_housing()
+# limit data size for faster tests
+housing.data = housing.data[:500]
+housing.target = housing.target[:500]
 breast_cancer = datasets.load_breast_cancer()
 
 
@@ -45,25 +48,25 @@ def test_iris_dataset_sample_weight(self):
         )
         assert_almost_equal(score_1, score_2)
 
-    def test_boston_dataset(self):
-        """Tests AutoML in the boston dataset (Regression)
+    def test_housing_dataset(self):
+        """Tests AutoML in the housing dataset (Regression)
         without and with sample weight"""
         model = AutoML(
             explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir
         )
-        score_1 = model.fit(boston.data, boston.target).score(
-            boston.data, boston.target
+        score_1 = model.fit(housing.data, housing.target).score(
+            housing.data, housing.target
         )
         self.assertGreater(score_1, 0.5)
 
         shutil.rmtree(self.automl_dir, ignore_errors=True)
         model = AutoML(
             explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir
         )
-        sample_weight = np.ones(boston.data.shape[0])
+        sample_weight = np.ones(housing.data.shape[0])
         score_2 = model.fit(
-            boston.data, boston.target, sample_weight=sample_weight
-        ).score(boston.data, boston.target, sample_weight=sample_weight)
+            housing.data, housing.target, sample_weight=sample_weight
+        ).score(housing.data, housing.target, sample_weight=sample_weight)
         assert_almost_equal(score_1, score_2)
 
     def test_breast_cancer_dataset(self):