From 754fdd41ebb7e1a7b5a88853e859a3c7ac5d2627 Mon Sep 17 00:00:00 2001 From: Piotr Plonski Date: Sat, 13 Apr 2019 13:59:33 +0200 Subject: [PATCH] predicting labels --- supervised/automl.py | 10 ++-------- supervised/models/ensemble.py | 7 ++++++- tests/test_automl.py | 19 +++++++++---------- tests/test_automl_performance.py | 2 +- 4 files changed, 18 insertions(+), 20 deletions(-) diff --git a/supervised/automl.py b/supervised/automl.py index cbeb47e7..07f971a5 100644 --- a/supervised/automl.py +++ b/supervised/automl.py @@ -86,7 +86,7 @@ def get_additional_metrics(self): oof_predictions["target"], oof_predictions["prediction"], BINARY_CLASSIFICATION ) self._threshold = self._max_metrics["f1"]["threshold"] - print(self._metrics_details, self._max_metrics, self._confusion_matrix) + # print(self._metrics_details, self._max_metrics, self._confusion_matrix) def _get_model_params(self, model_type, X, y): model_info = ModelsRegistry.registry[BINARY_CLASSIFICATION][model_type] @@ -243,18 +243,12 @@ def fit(self, X, y): def predict(self, X): if self._best_model is not None: predictions = self._best_model.predict(X) - - print("columns", predictions.columns, predictions.head()) neg_label, pos_label = predictions.columns[0][2:], predictions.columns[1][2:] if neg_label == '0' and pos_label == '1': neg_label, pos_label = 0, 1 # assume that it is binary classification predictions['label'] = predictions.iloc[:, 1] > self._threshold - - booleanDictionary = {True: pos_label, False: neg_label} - - predictions['label'] = predictions['label'].map(booleanDictionary) - + predictions['label'] = predictions['label'].map({True: pos_label, False: neg_label}) return predictions #return pd.DataFrame( diff --git a/supervised/models/ensemble.py b/supervised/models/ensemble.py index 0809e8d5..129959fd 100644 --- a/supervised/models/ensemble.py +++ b/supervised/models/ensemble.py @@ -33,6 +33,7 @@ def __init__(self): self.selected_models = [] self.train_time = None self.total_best_sum = None # total sum of predictions, the oof of ensemble + self.target = None def get_train_time(self): return self.train_time @@ -44,7 +45,7 @@ def get_name(self): return self.algorithm_short_name def get_out_of_folds(self): - return pd.DataFrame({"prediction": self.total_best_sum}) + return pd.DataFrame({"prediction": self.total_best_sum, "target": self.target}) def _get_mean(self, X, best_sum, best_count, selected): resp = copy.deepcopy(X[selected]) @@ -58,6 +59,10 @@ def get_oof_matrix(self, models): for i, m in enumerate(models): oof = m.get_out_of_folds() oofs["model_{}".format(i)] = oof["prediction"] + if self.target is None: + self.target = oof["target"] # it will be needed for computing advance model statistics + # it can be a mess in the future when target will be transformed depending on each model + X = pd.DataFrame(oofs) self.models = models # remeber models, will be needed in predictions return X diff --git a/tests/test_automl.py b/tests/test_automl.py index 62dc664d..fefbadbd 100644 --- a/tests/test_automl.py +++ b/tests/test_automl.py @@ -26,7 +26,7 @@ def setUpClass(cls): ) cls.X = pd.DataFrame(cls.X, columns=["f0", "f1", "f2", "f3", "f4"]) #cls.y = pd.DataFrame(cls.y) - ''' + def test_fit_and_predict(self): metric = Metric({"name": "logloss"}) @@ -35,7 +35,7 @@ def test_fit_and_predict(self): hill_climbing_steps=0) automl.fit(self.X, self.y) - y_predicted = automl.predict(self.X)["prediction"] + y_predicted = automl.predict(self.X)["p_1"] self.assertTrue(y_predicted is not None) loss = metric(self.y, y_predicted) self.assertTrue(loss < 0.7) @@ -44,29 +44,28 @@ def test_fit_and_predict(self): automl2 = AutoML() automl2.from_json(params) - y_predicted2 = automl2.predict(self.X)["prediction"] + y_predicted2 = automl2.predict(self.X)["p_1"] self.assertTrue(y_predicted2 is not None) loss2 = metric(self.y, y_predicted2) self.assertTrue(loss2 < 0.7) assert_almost_equal(automl._threshold, automl2._threshold) - ''' + def test_predict_labels(self): # 3.csv') # df = pd.read_csv('tests/data/adult_missing_values_missing_target_500rows.csv') X = df[df.columns[:-1]] y = df[df.columns[-1]] - automl = AutoML(total_time_limit=1, algorithms=["Xgboost"], - start_random_models=1, + automl = AutoML(total_time_limit=15, algorithms=["Xgboost"], + start_random_models=5, hill_climbing_steps=0, - train_ensemble=False) + train_ensemble=True) automl.fit(X, y) y_predicted = automl.predict(X) - print(y_predicted) - pass - + self.assertTrue('A' in np.unique(y_predicted['label'])) + self.assertTrue('B' in np.unique(y_predicted['label'])) if __name__ == "__main__": unittest.main() diff --git a/tests/test_automl_performance.py b/tests/test_automl_performance.py index eb80684e..49604b35 100644 --- a/tests/test_automl_performance.py +++ b/tests/test_automl_performance.py @@ -40,7 +40,7 @@ def test_fit_and_predict(self): ) automl.fit(X_train, y_train) - response = automl.predict(X_test)["prediction"] + response = automl.predict(X_test)["p_1"] labels = automl.predict(X_test)["label"] # Compute the logloss on test dataset