Skip to content

Commit

Permalink
minor fixes and set version 0.8.4
Browse files Browse the repository at this point in the history
  • Loading branch information
pplonski committed Jan 29, 2021
1 parent a63e946 commit 914b372
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 42 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

setup(
name="mljar-supervised",
version="0.8.3",
version="0.8.4",
description="Automates Machine Learning Pipeline with Feature Engineering and Hyper-Parameters Tuning",
long_description=long_description,
long_description_content_type="text/markdown",
Expand Down
2 changes: 1 addition & 1 deletion supervised/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
__version__ = "0.8.3"
__version__ = "0.8.4"

from supervised.automl import AutoML
65 changes: 41 additions & 24 deletions supervised/algorithms/catboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ class CatBoostAlgorithm(BaseAlgorithm):

algorithm_name = "CatBoost"
algorithm_short_name = "CatBoost"
warmup_iterations = 20

def __init__(self, params):
super(CatBoostAlgorithm, self).__init__(params)
Expand Down Expand Up @@ -62,9 +63,10 @@ def __init__(self, params):
rsm=self.learner_params["rsm"],
loss_function=self.learner_params["loss_function"],
verbose=False,
allow_writing_files=False,
allow_writing_files=False
)
self.cat_features = None
self.best_ntree_limit = 0

logger.debug("CatBoostAlgorithm.__init__")

Expand All @@ -73,7 +75,7 @@ def _assess_iterations(self, X, y, eval_set, max_time = None):
max_time = 3600
try:
model = copy.deepcopy(self.model)
model.set_params(iterations=1)
model.set_params(iterations=self.warmup_iterations)
start_time = time.time()
model.fit(
X,
Expand All @@ -82,14 +84,16 @@ def _assess_iterations(self, X, y, eval_set, max_time = None):
init_model=None if self.model.tree_count_ is None else self.model,
eval_set=eval_set,
early_stopping_rounds=self.early_stopping_rounds,
verbose_eval=False,
verbose_eval=False
)
elapsed_time = np.round(time.time() - start_time, 2)
elapsed_time = (time.time() - start_time) / float(self.warmup_iterations)
#print(max_time, elapsed_time, max_time / elapsed_time, np.round(time.time() - start_time, 2))
new_rounds = int(min(10000, max_time / elapsed_time))
new_rounds = max(max_rounds, 10)
return new_rounds
new_rounds = max(new_rounds, 10)
return model, new_rounds
except Exception as e:
return 1000
#print(str(e))
return None, 1000

def fit(
self,
Expand Down Expand Up @@ -122,41 +126,53 @@ def fit(
)

# disable for now ...
new_iterations = self._assess_iterations(X, y, eval_set, max_time = None)
model_init, new_iterations = self._assess_iterations(X, y, eval_set, max_time)
self.model.set_params(iterations=new_iterations)
#self.model.set_params(iterations=self.rounds)


self.model.fit(
X,
y,
sample_weight=sample_weight,
cat_features=self.cat_features,
init_model=None if self.model.tree_count_ is None else self.model,
init_model=model_init,
eval_set=eval_set,
early_stopping_rounds=self.early_stopping_rounds,
verbose_eval=False,
)
verbose_eval=False
)
if self.model.best_iteration_ is not None:
self.best_ntree_limit = self.model.best_iteration_+self.warmup_iterations+1
else:
# just take all the trees;
# the warm-up trees are already included,
# so there is no need to add +1
self.best_ntree_limit = self.model.tree_count_

if log_to_file is not None:

metric_name = list(self.model.evals_result_["learn"].keys())[0]
train_scores = self.model.evals_result_["learn"][metric_name]
validation_scores = self.model.evals_result_["validation"][metric_name]
if model_init is not None:
train_scores = model_init.evals_result_["learn"][metric_name] + train_scores
validation_scores = model_init.evals_result_["validation"][metric_name] + validation_scores

result = pd.DataFrame(
{
"iteration": range(
len(self.model.evals_result_["learn"][metric_name])
),
"train": self.model.evals_result_["learn"][metric_name],
"validation": self.model.evals_result_["validation"][metric_name],
"iteration": range(len(train_scores)),
"train": train_scores,
"validation": validation_scores,
}
)
result.to_csv(log_to_file, index=False, header=False)

def predict(self, X):
self.reload()
if self.params["ml_task"] == BINARY_CLASSIFICATION:
return self.model.predict_proba(X)[:, 1]
return self.model.predict_proba(X, ntree_end=self.best_ntree_limit)[:, 1]
elif self.params["ml_task"] == MULTICLASS_CLASSIFICATION:
return self.model.predict_proba(X)
return self.model.predict(X)
return self.model.predict_proba(X, ntree_end=self.best_ntree_limit)

return self.model.predict(X, ntree_end=self.best_ntree_limit)

def copy(self):
return copy.deepcopy(self)
Expand Down Expand Up @@ -184,6 +200,7 @@ def get_params(self):
"algorithm_short_name": self.algorithm_short_name,
"uid": self.uid,
"params": self.params,
"best_ntree_limit": self.best_ntree_limit
}

def set_params(self, json_desc):
Expand All @@ -194,6 +211,7 @@ def set_params(self, json_desc):
)
self.uid = json_desc.get("uid", self.uid)
self.params = json_desc.get("params", self.params)
self.best_ntree_limit = json_desc.get("best_ntree_limit", self.best_ntree_limit)

def file_extension(self):
return "catboost"
Expand All @@ -212,7 +230,7 @@ def get_metric_name(self):


classification_params = {
"learning_rate": [0.05, 0.1, 0.2],
"learning_rate": [0.025, 0.05, 0.1, 0.2],
"depth": [4, 5, 6, 7, 8, 9],
"rsm": [0.7, 0.8, 0.9, 1], # random subspace method
"loss_function": ["Logloss"],
Expand Down Expand Up @@ -270,7 +288,6 @@ def get_metric_name(self):

regression_params = copy.deepcopy(classification_params)
regression_params["loss_function"] = ["RMSE", "MAE"]
regression_params["learning_rate"] = [0.1, 0.15, 0.2]

regression_required_preprocessing = [
"missing_values_inputation",
Expand All @@ -281,7 +298,7 @@ def get_metric_name(self):


regression_default_params = {
"learning_rate": 0.15,
"learning_rate": 0.1,
"depth": 6,
"rsm": 1,
"loss_function": "RMSE",
Expand Down
18 changes: 9 additions & 9 deletions supervised/base_automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,6 @@ def train_model(self, params):
):
logger.info(f"Cannot train {params['name']} because of the time constraint")
return False

# let's create directory to log all training artifacts
model_path = os.path.join(self._results_path, params["name"])
self.create_dir(model_path)
Expand All @@ -276,15 +275,15 @@ def train_model(self, params):
)

# disable for now
#max_time_for_learner = 3600
#if self._total_time_limit is not None:
# k_folds = self._validation_strategy.get("k_folds", 1.0)
# at_least_algorithms = 10.0
#
# max_time_for_learner = self._total_time_limit / k_folds / at_least_algorithms
# max_time_for_learner += 60.0
max_time_for_learner = 3600
if self._total_time_limit is not None:
k_folds = self._validation_strategy.get("k_folds", 1.0)
at_least_algorithms = 10.0

max_time_for_learner = max(self._total_time_limit / k_folds / at_least_algorithms, 60)

#print("max_time_for_learner --->", max_time_for_learner)
#params["max_time_for_learner"] = max_time_for_learner
params["max_time_for_learner"] = max_time_for_learner


total_time_constraint = TotalTimeConstraint(
Expand Down Expand Up @@ -953,6 +952,7 @@ def _fit(self, X, y, sample_weight=None):
generated_params[0]["learner"]["model_type"], self._fit_level
):
self.verbose_print(f"Skip {step} because of the time limit.")
continue
else:
model_str = "models" if len(generated_params) > 1 else "model"
self.verbose_print(
Expand Down
24 changes: 20 additions & 4 deletions supervised/tuner/mljar_tuner.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,13 +255,18 @@ def get_params_stack_models(self, stacked_models):
X_train_stacked_path = ""
added_columns = []

generated_params = []
model_types = ["Xgboost", "LightGBM", "CatBoost"]
generated_params = {m: [] for m in model_types}
types_score_order = []
# reuse old params
for m in stacked_models:
# use only Xgboost, LightGBM and CatBoost as stacked models
if m.get_type() not in ["Xgboost", "LightGBM", "CatBoost"]:
if m.get_type() not in model_types:
continue

if m.get_type() not in types_score_order:
types_score_order += [m.get_type()]

if m.params.get("injected_sample_weight", False):
# don't use boost_on_errors models for stacking;
# there will be an additional boost_on_errors step
Expand Down Expand Up @@ -296,8 +301,19 @@ def get_params_stack_models(self, stacked_models):
for col in added_columns:
params["preprocessing"]["columns_preprocessing"][col] = [scale]

generated_params += [params]
return generated_params
generated_params[m.get_type()] += [params]

return_params = []
for i in range(100):
total = 0
for m in types_score_order:
if generated_params[m]:
return_params += [generated_params[m].pop(0)]
total += len(generated_params[m])
if total == 0:
break

return return_params

def adjust_validation_params(self, models_cnt):
generated_params = []
Expand Down
6 changes: 3 additions & 3 deletions supervised/tuner/time_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,15 +61,15 @@ def time_should_use(self, fit_level):

ratios = {
"default_algorithms": 0.3,
"not_so_random": 0.3,
"not_so_random": 0.35,
"mix_encoding": 0.05,
"golden_features": 0.05,
"kmeans_features": 0.05,
"insert_random_feature": 0.05,
"features_selection": 0.05,
"hill_climbing_1": 0.2, # enough to have only first step from hill climbing
"boost_on_errors": 0.05,
"stack": 0.15,
"stack": 0.2,
}

if (
Expand Down Expand Up @@ -137,7 +137,7 @@ def enough_time_for_step(self, fit_level):

total_time_spend = time.time() - self._start_time
compound = self.compound_time_should_use(fit_level)
#print(fit_level, total_time_spend, compound, self._total_time_limit)
#print("Enough time for step", fit_level, np.round(total_time_spend,2), np.round(compound,2))
if total_time_spend > compound:
# don't train more
return False
Expand Down

0 comments on commit 914b372

Please sign in to comment.