Skip to content

Commit

Permalink
predicting labels
Browse files Browse the repository at this point in the history
  • Loading branch information
pplonski committed Apr 13, 2019
1 parent c570a50 commit 754fdd4
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 20 deletions.
10 changes: 2 additions & 8 deletions supervised/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def get_additional_metrics(self):
oof_predictions["target"], oof_predictions["prediction"], BINARY_CLASSIFICATION
)
self._threshold = self._max_metrics["f1"]["threshold"]
print(self._metrics_details, self._max_metrics, self._confusion_matrix)
# print(self._metrics_details, self._max_metrics, self._confusion_matrix)

def _get_model_params(self, model_type, X, y):
model_info = ModelsRegistry.registry[BINARY_CLASSIFICATION][model_type]
Expand Down Expand Up @@ -243,18 +243,12 @@ def fit(self, X, y):
def predict(self, X):
if self._best_model is not None:
predictions = self._best_model.predict(X)

print("columns", predictions.columns, predictions.head())
neg_label, pos_label = predictions.columns[0][2:], predictions.columns[1][2:]
if neg_label == '0' and pos_label == '1':
neg_label, pos_label = 0, 1
# assume that it is binary classification
predictions['label'] = predictions.iloc[:, 1] > self._threshold

booleanDictionary = {True: pos_label, False: neg_label}

predictions['label'] = predictions['label'].map(booleanDictionary)

predictions['label'] = predictions['label'].map({True: pos_label, False: neg_label})

return predictions
#return pd.DataFrame(
Expand Down
7 changes: 6 additions & 1 deletion supervised/models/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def __init__(self):
self.selected_models = []
self.train_time = None
self.total_best_sum = None # total sum of predictions, the oof of ensemble
self.target = None

def get_train_time(self):
return self.train_time
Expand All @@ -44,7 +45,7 @@ def get_name(self):
return self.algorithm_short_name

def get_out_of_folds(self):
return pd.DataFrame({"prediction": self.total_best_sum})
return pd.DataFrame({"prediction": self.total_best_sum, "target": self.target})

def _get_mean(self, X, best_sum, best_count, selected):
resp = copy.deepcopy(X[selected])
Expand All @@ -58,6 +59,10 @@ def get_oof_matrix(self, models):
for i, m in enumerate(models):
oof = m.get_out_of_folds()
oofs["model_{}".format(i)] = oof["prediction"]
if self.target is None:
self.target = oof["target"] # needed for computing advanced model statistics
# NOTE: this may get messy in the future if the target is transformed differently for each model

X = pd.DataFrame(oofs)
self.models = models # remember the models; they will be needed for predictions
return X
Expand Down
19 changes: 9 additions & 10 deletions tests/test_automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def setUpClass(cls):
)
cls.X = pd.DataFrame(cls.X, columns=["f0", "f1", "f2", "f3", "f4"])
#cls.y = pd.DataFrame(cls.y)
'''

def test_fit_and_predict(self):
metric = Metric({"name": "logloss"})

Expand All @@ -35,7 +35,7 @@ def test_fit_and_predict(self):
hill_climbing_steps=0)
automl.fit(self.X, self.y)

y_predicted = automl.predict(self.X)["prediction"]
y_predicted = automl.predict(self.X)["p_1"]
self.assertTrue(y_predicted is not None)
loss = metric(self.y, y_predicted)
self.assertTrue(loss < 0.7)
Expand All @@ -44,29 +44,28 @@ def test_fit_and_predict(self):
automl2 = AutoML()
automl2.from_json(params)

y_predicted2 = automl2.predict(self.X)["prediction"]
y_predicted2 = automl2.predict(self.X)["p_1"]
self.assertTrue(y_predicted2 is not None)
loss2 = metric(self.y, y_predicted2)
self.assertTrue(loss2 < 0.7)

assert_almost_equal(automl._threshold, automl2._threshold)
'''


def test_predict_labels(self):
# 3.csv') #
df = pd.read_csv('tests/data/adult_missing_values_missing_target_500rows.csv')
X = df[df.columns[:-1]]
y = df[df.columns[-1]]
automl = AutoML(total_time_limit=1, algorithms=["Xgboost"],
start_random_models=1,
automl = AutoML(total_time_limit=15, algorithms=["Xgboost"],
start_random_models=5,
hill_climbing_steps=0,
train_ensemble=False)
train_ensemble=True)
automl.fit(X, y)

y_predicted = automl.predict(X)
print(y_predicted)
pass

self.assertTrue('A' in np.unique(y_predicted['label']))
self.assertTrue('B' in np.unique(y_predicted['label']))

if __name__ == "__main__":
unittest.main()
2 changes: 1 addition & 1 deletion tests/test_automl_performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def test_fit_and_predict(self):
)
automl.fit(X_train, y_train)

response = automl.predict(X_test)["prediction"]
response = automl.predict(X_test)["p_1"]
labels = automl.predict(X_test)["label"]

# Compute the logloss on test dataset
Expand Down

0 comments on commit 754fdd4

Please sign in to comment.