Skip to content

Commit

Permalink
set metric to be optimized (#17)
Browse files Browse the repository at this point in the history
  • Loading branch information
pplonski committed Apr 23, 2019
1 parent d4384fd commit 7ffa7f1
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 25 deletions.
8 changes: 5 additions & 3 deletions supervised/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def __init__(
top_models_to_improve=5,
train_ensemble=True,
verbose=True,
optimize_metric="logloss",
seed=1,
):
self._total_time_limit = total_time_limit
Expand Down Expand Up @@ -81,13 +82,14 @@ def __init__(
None,
)
self._seed = seed
self._optimize_metric = optimize_metric

def get_leaderboard(self):
ldb = {"uid":[], "model_type":[], "metric_type":[], "metric_value": [], "train_time": []}
for m in self._models:
ldb["uid"] += [m.uid]
ldb["model_type"] += [m.get_name()]
ldb["metric_type"] += ["logloss"]
ldb["metric_type"] += [self._optimize_metric]
ldb["metric_value"] += [m.get_final_loss()]
ldb["train_time"] += [m.get_train_time()]
return pd.DataFrame(ldb)
Expand Down Expand Up @@ -141,7 +143,7 @@ def keep_model(self, model):

def train_model(self, params, X, y):
metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
early_stop = EarlyStopping({"metric": {"name": "logloss"}})
early_stop = EarlyStopping({"metric": {"name": self._optimize_metric}})
time_constraint = TimeConstraint({"train_seconds_time_limit": self._time_limit})
il = IterativeLearner(
params, callbacks=[early_stop, time_constraint, metric_logger]
Expand Down Expand Up @@ -220,7 +222,7 @@ def hill_climbing_step(self, X, y):

def ensemble_step(self, y):
if self._train_ensemble:
self.ensemble = Ensemble()
self.ensemble = Ensemble(self._optimize_metric)
X_oof = self.ensemble.get_oof_matrix(self._models)
self.ensemble.fit(X_oof, y)
self.keep_model(self.ensemble)
Expand Down
6 changes: 6 additions & 0 deletions supervised/metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,12 @@ def improvement(self, previous, current):
return current < previous
return current > previous

def get_maximum(self):
if self.minimize_direction:
return 10e12
else:
return -10e12

def worst_value(self):
if self.minimize_direction:
return np.Inf
Expand Down
24 changes: 14 additions & 10 deletions supervised/models/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from supervised.tuner.registry import ModelsRegistry
from supervised.tuner.registry import BINARY_CLASSIFICATION
from supervised.models.learner_factory import LearnerFactory
from supervised.iterative_learner_framework import IterativeLearner
import operator

log = logging.getLogger(__name__)
Expand All @@ -23,14 +24,13 @@ class Ensemble:
algorithm_name = "Greedy Ensemble"
algorithm_short_name = "Ensemble"

def __init__(self):
def __init__(self, optimize_metric="logloss"):
self.library_version = "0.1"
self.uid = str(uuid.uuid4())
self.model_file = self.uid + ".ensemble.model"
self.model_file_path = os.path.join(storage_path, self.model_file)
# right now only logloss can be optimized by ensemble
self.metric = Metric({"name": "logloss"})
self.best_loss = 10e12 # the best loss obtained by ensemble
self.metric = Metric({"name": optimize_metric})
self.best_loss = self.metric.get_maximum() # the best loss obtained by ensemble
self.models = None
self.selected_models = []
self.train_time = None
Expand Down Expand Up @@ -78,18 +78,19 @@ def fit(self, X, y):

best_sum = None # sum of best algorihtms
for j in range(X.shape[1]): # iterate over all solutions
min_score = 10e12
min_score = self.metric.get_maximum()
best_index = -1
# try to add some algorithm to the best_sum to minimize metric
for i in range(X.shape[1]):
y_ens = self._get_mean(X, best_sum, j + 1, "model_{}".format(i))
score = self.metric(y, y_ens)
if score < min_score:

if self.metric.improvement(previous=min_score, current=score):
min_score = score
best_index = i

# there is improvement, save it
if min_score + 10e-6 < self.best_loss:
if self.metric.improvement(previous=self.best_loss, current=min_score):
self.best_loss = min_score
selected_algs_cnt = j

Expand Down Expand Up @@ -131,7 +132,7 @@ def to_json(self):
for selected in self.selected_models:
model = selected["model"]
repeat = selected["repeat"]
models_json += [{"model": model.save(), "repeat": repeat}]
models_json += [{"model": model.to_json(), "repeat": repeat}]

json_desc = {
"library_version": self.library_version,
Expand All @@ -143,7 +144,6 @@ def to_json(self):
return json_desc

def from_json(self, json_desc):

self.library_version = json_desc.get("library_version", self.library_version)
self.algorithm_name = json_desc.get("algorithm_name", self.algorithm_name)
self.algorithm_short_name = json_desc.get(
Expand All @@ -155,6 +155,10 @@ def from_json(self, json_desc):
for selected in models_json:
model = selected["model"]
repeat = selected["repeat"]

il = IterativeLearner(model.get("params"))
il.from_json(model)
self.selected_models += [
{"model": LearnerFactory.load(model), "repeat": repeat}
#{"model": LearnerFactory.load(model), "repeat": repeat}
{"model": il, "repeat": repeat}
]
24 changes: 20 additions & 4 deletions tests/test_automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def setUpClass(cls):
)
cls.X = pd.DataFrame(cls.X, columns=["f0", "f1", "f2", "f3", "f4"])
# cls.y = pd.DataFrame(cls.y)
'''

def test_reproduce_fit(self):
metric = Metric({"name": "logloss"})
losses = []
Expand All @@ -44,7 +44,7 @@ def test_reproduce_fit(self):
y_predicted = automl.predict(self.X)["p_1"]
loss = metric(self.y, y_predicted)
losses += [loss]
assert_almost_equal(losses[0], losses[1], decimal=6)
assert_almost_equal(losses[0], losses[1], decimal=4)

def test_fit_and_predict(self):
metric = Metric({"name": "logloss"})
Expand All @@ -54,9 +54,10 @@ def test_fit_and_predict(self):
algorithms=["Xgboost"],
start_random_models=5,
hill_climbing_steps=0,
seed=13
)
automl.fit(self.X, self.y)
y_predicted = automl.predict(self.X)["p_1"]
self.assertTrue(y_predicted is not None)
loss = metric(self.y, y_predicted)
Expand All @@ -73,6 +74,20 @@ def test_fit_and_predict(self):

assert_almost_equal(automl._threshold, automl2._threshold)

def test_fit_optimize_auc(self):
automl = AutoML(
total_time_limit=5,
algorithms=["Xgboost"],
start_random_models=2,
hill_climbing_steps=0,
optimize_metric="auc",
seed=16
)
automl.fit(self.X, self.y)
ldb = automl.get_leaderboard()
self.assertEqual(ldb["metric_type"][0], "auc")
self.assertEqual(np.sum(ldb["metric_value"] > 0.5), ldb.shape[0]) # all better than 0.5 AUC

def test_predict_labels(self):
df = pd.read_csv("tests/data/adult_missing_values_missing_target_500rows.csv")
X = df[df.columns[:-1]]
Expand All @@ -83,13 +98,13 @@ def test_predict_labels(self):
start_random_models=5,
hill_climbing_steps=0,
train_ensemble=True,
seed=14
)
automl.fit(X, y)

y_predicted = automl.predict(X)
self.assertTrue("A" in np.unique(y_predicted["label"]))
self.assertTrue("B" in np.unique(y_predicted["label"]))
'''

def test_predict_labels(self):
automl = AutoML(
Expand All @@ -98,6 +113,7 @@ def test_predict_labels(self):
start_random_models=5,
hill_climbing_steps=0,
train_ensemble=True,
seed=15
)
automl.fit(self.X, self.y)
ldb = automl.get_leaderboard()
Expand Down
22 changes: 14 additions & 8 deletions tests/tests_models/test_ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,21 @@
from supervised.models.learner_factory import LearnerFactory


class SimpleModel:
class SimpleFramework:
def __init__(self, params):
pass

def predict(self, X):
return np.array([0.1, 0.2, 0.8, 0.9])

def save(self):
return {"params": {"model_type": "simple"}}
def to_json(self):
return {"params": {"model_type": "simple",
"learner": {"model_type": "simple"},
'validation': {'validation_type': 'kfold', 'k_folds': 5, 'shuffle': True}
}}

def from_json(self, json_desc):
pass

def load(self, json_desc):
pass
Expand All @@ -44,7 +50,7 @@ def setUpClass(cls):

def test_fit_predict(self):
ensemble = Ensemble()
ensemble.models = [SimpleModel({})] * 5
ensemble.models = [SimpleFramework({})] * 5
ensemble.fit(self.X, self.y)
self.assertEqual(1, ensemble.selected_models[1]["repeat"])
self.assertEqual(1, ensemble.selected_models[1]["repeat"])
Expand All @@ -55,11 +61,11 @@ def test_fit_predict(self):
assert_almost_equal(y[2], 0.8)
assert_almost_equal(y[3], 0.9)

'''
def test_save_load(self):
LearnerFactory.learners["simple"] = SimpleModel

ensemble = Ensemble()
ensemble.models = [SimpleModel({})] * 5
ensemble.models = [SimpleFramework({})] * 5
ensemble.fit(self.X, self.y)
y = ensemble.predict(self.X)
assert_almost_equal(y[0], 0.1)
Expand All @@ -68,7 +74,7 @@ def test_save_load(self):
ensemble2.from_json(ensemble_json)
y2 = ensemble2.predict(self.X)
assert_almost_equal(y2[0], 0.1)

'''

if __name__ == "__main__":
unittest.main()

0 comments on commit 7ffa7f1

Please sign in to comment.