diff --git a/setup.py b/setup.py
index 05a10f87..52f84e50 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
     name="mljar-supervised",
-    version="0.8.3",
+    version="0.8.4",
     description="Automates Machine Learning Pipeline with Feature Engineering and Hyper-Parameters Tuning",
     long_description=long_description,
     long_description_content_type="text/markdown",
diff --git a/supervised/__init__.py b/supervised/__init__.py
index f3773e88..10626612 100644
--- a/supervised/__init__.py
+++ b/supervised/__init__.py
@@ -1,3 +1,3 @@
-__version__ = "0.8.3"
+__version__ = "0.8.4"
 
 from supervised.automl import AutoML
diff --git a/supervised/algorithms/catboost.py b/supervised/algorithms/catboost.py
index 331be4cc..c97c52fd 100644
--- a/supervised/algorithms/catboost.py
+++ b/supervised/algorithms/catboost.py
@@ -26,6 +26,7 @@ class CatBoostAlgorithm(BaseAlgorithm):
 
     algorithm_name = "CatBoost"
     algorithm_short_name = "CatBoost"
+    warmup_iterations = 20
 
     def __init__(self, params):
         super(CatBoostAlgorithm, self).__init__(params)
@@ -62,9 +63,10 @@ def __init__(self, params):
             rsm=self.learner_params["rsm"],
             loss_function=self.learner_params["loss_function"],
             verbose=False,
-            allow_writing_files=False,
+            allow_writing_files=False
         )
         self.cat_features = None
+        self.best_ntree_limit = 0
 
         logger.debug("CatBoostAlgorithm.__init__")
 
@@ -73,7 +75,7 @@ def _assess_iterations(self, X, y, eval_set, max_time = None):
             max_time = 3600
         try:
             model = copy.deepcopy(self.model)
-            model.set_params(iterations=1)
+            model.set_params(iterations=self.warmup_iterations)
             start_time = time.time()
             model.fit(
                 X,
@@ -82,14 +84,16 @@
                 init_model=None if self.model.tree_count_ is None else self.model,
                 eval_set=eval_set,
                 early_stopping_rounds=self.early_stopping_rounds,
-                verbose_eval=False,
+                verbose_eval=False
            )
-            elapsed_time = np.round(time.time() - start_time, 2)
+            elapsed_time = (time.time() - start_time) / float(self.warmup_iterations)
+            #print(max_time, elapsed_time, max_time / elapsed_time, np.round(time.time() - start_time, 2))
             new_rounds = int(min(10000, max_time / elapsed_time))
-            new_rounds = max(max_rounds, 10)
-            return new_rounds
+            new_rounds = max(new_rounds, 10)
+            return model, new_rounds
         except Exception as e:
-            return 1000
+            #print(str(e))
+            return None, 1000
 
     def fit(
         self,
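The `_assess_iterations` rewrite above replaces the old single-iteration probe (whose `new_rounds = max(max_rounds, 10)` line referenced an apparently undefined `max_rounds`) with a timed 20-tree warm-up fit that is returned and later reused via `init_model`. A minimal sketch of the timing idea, assuming hypothetical `train_fn` and `budget_seconds` stand-ins (neither is library API):

```python
# Sketch only: estimate how many boosting rounds fit in a time budget by
# timing a short warm-up fit. `train_fn` is a hypothetical callable that
# trains `iterations` trees; it is not part of mljar-supervised or CatBoost.
import time

def estimate_rounds(train_fn, warmup_iterations=20, budget_seconds=3600.0):
    start = time.time()
    train_fn(iterations=warmup_iterations)  # short probe fit (20 trees here)
    per_iteration = (time.time() - start) / float(warmup_iterations)
    per_iteration = max(per_iteration, 1e-9)  # guard against a zero-cost probe
    # convert the budget into rounds: cap at 10000, floor at 10, as in the diff
    return max(min(10000, int(budget_seconds / per_iteration)), 10)
```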
@@ -122,30 +126,41 @@
         )
         # disable for now ...
-        new_iterations = self._assess_iterations(X, y, eval_set, max_time = None)
+        model_init, new_iterations = self._assess_iterations(X, y, eval_set, max_time)
         self.model.set_params(iterations=new_iterations)
-        #self.model.set_params(iterations=self.rounds)
-
+
         self.model.fit(
             X,
             y,
             sample_weight=sample_weight,
             cat_features=self.cat_features,
-            init_model=None if self.model.tree_count_ is None else self.model,
+            init_model=model_init,
             eval_set=eval_set,
             early_stopping_rounds=self.early_stopping_rounds,
-            verbose_eval=False,
-        )
+            verbose_eval=False
+        )
+        if self.model.best_iteration_ is not None:
+            self.best_ntree_limit = self.model.best_iteration_ + self.warmup_iterations + 1
+        else:
+            # just take all the trees
+            # the warm-up trees are already included,
+            # so there is no need to add +1
+            self.best_ntree_limit = self.model.tree_count_
+
         if log_to_file is not None:
             metric_name = list(self.model.evals_result_["learn"].keys())[0]
+            train_scores = self.model.evals_result_["learn"][metric_name]
+            validation_scores = self.model.evals_result_["validation"][metric_name]
+            if model_init is not None:
+                train_scores = model_init.evals_result_["learn"][metric_name] + train_scores
+                validation_scores = model_init.evals_result_["validation"][metric_name] + validation_scores
+
             result = pd.DataFrame(
                 {
-                    "iteration": range(
-                        len(self.model.evals_result_["learn"][metric_name])
-                    ),
-                    "train": self.model.evals_result_["learn"][metric_name],
-                    "validation": self.model.evals_result_["validation"][metric_name],
+                    "iteration": range(len(train_scores)),
+                    "train": train_scores,
+                    "validation": validation_scores,
                 }
             )
             result.to_csv(log_to_file, index=False, header=False)
@@ -153,10 +168,11 @@ def fit(
     def predict(self, X):
         self.reload()
         if self.params["ml_task"] == BINARY_CLASSIFICATION:
-            return self.model.predict_proba(X)[:, 1]
+            return self.model.predict_proba(X, ntree_end=self.best_ntree_limit)[:, 1]
         elif self.params["ml_task"] == MULTICLASS_CLASSIFICATION:
-            return self.model.predict_proba(X)
-        return self.model.predict(X)
+            return self.model.predict_proba(X, ntree_end=self.best_ntree_limit)
+
+        return self.model.predict(X, ntree_end=self.best_ntree_limit)
 
     def copy(self):
         return copy.deepcopy(self)
@@ -184,6 +200,7 @@
             "algorithm_short_name": self.algorithm_short_name,
             "uid": self.uid,
             "params": self.params,
+            "best_ntree_limit": self.best_ntree_limit,
         }
 
     def set_params(self, json_desc):
@@ -194,6 +211,7 @@
         )
         self.uid = json_desc.get("uid", self.uid)
         self.params = json_desc.get("params", self.params)
+        self.best_ntree_limit = json_desc.get("best_ntree_limit", self.best_ntree_limit)
 
     def file_extension(self):
         return "catboost"
@@ -212,7 +230,7 @@ def get_metric_name(self):
 
 classification_params = {
-    "learning_rate": [0.05, 0.1, 0.2],
+    "learning_rate": [0.025, 0.05, 0.1, 0.2],
     "depth": [4, 5, 6, 7, 8, 9],
     "rsm": [0.7, 0.8, 0.9, 1],  # random subspace method
     "loss_function": ["Logloss"],
@@ -270,7 +288,6 @@ def get_metric_name(self):
 
 regression_params = copy.deepcopy(classification_params)
 regression_params["loss_function"] = ["RMSE", "MAE"]
-regression_params["learning_rate"] = [0.1, 0.15, 0.2]
 
 regression_required_preprocessing = [
     "missing_values_inputation",
@@ -281,7 +298,7 @@ def get_metric_name(self):
 
 regression_default_params = {
-    "learning_rate": 0.15,
+    "learning_rate": 0.1,
     "depth": 6,
     "rsm": 1,
     "loss_function": "RMSE",
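Because the final `fit` now continues from the warm-up model through `init_model`, `best_iteration_` is counted relative to the continued fit, hence the `+ warmup_iterations + 1` bookkeeping before the value is passed as CatBoost's `ntree_end` at prediction time. A self-contained sketch of that accounting on synthetic data (an illustration, not the library's code path; the 20-tree warm-up mirrors `warmup_iterations`):

```python
# Synthetic-data illustration of the best_ntree_limit bookkeeping above.
import numpy as np
from catboost import CatBoostClassifier

WARMUP_ITERATIONS = 20  # mirrors CatBoostAlgorithm.warmup_iterations

X = np.random.rand(200, 5)
y = (X[:, 0] > 0.5).astype(int)
eval_set = (X[150:], y[150:])

warmup = CatBoostClassifier(iterations=WARMUP_ITERATIONS, verbose=False)
warmup.fit(X[:150], y[:150], eval_set=eval_set)

# continue training from the warm-up trees, as the patched fit() does
model = CatBoostClassifier(iterations=100, verbose=False)
model.fit(X[:150], y[:150], eval_set=eval_set,
          init_model=warmup, early_stopping_rounds=50)

if model.best_iteration_ is not None:
    # per the diff's accounting: best_iteration_ is 0-based and counts
    # only the continued trees, so add the warm-up trees plus one
    best_ntree_limit = model.best_iteration_ + WARMUP_ITERATIONS + 1
else:
    best_ntree_limit = model.tree_count_  # warm-up trees already included

proba = model.predict_proba(X, ntree_end=best_ntree_limit)[:, 1]
print(best_ntree_limit, proba.shape)
```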
diff --git a/supervised/base_automl.py b/supervised/base_automl.py
index 753662f8..9f731e3d 100644
--- a/supervised/base_automl.py
+++ b/supervised/base_automl.py
@@ -265,7 +265,6 @@ def train_model(self, params):
         ):
             logger.info(f"Cannot train {params['name']} because of the time constraint")
             return False
-
         # let's create directory to log all training artifacts
         model_path = os.path.join(self._results_path, params["name"])
         self.create_dir(model_path)
@@ -276,15 +275,15 @@
         )
 
         # disable for now
-        #max_time_for_learner = 3600
-        #if self._total_time_limit is not None:
-        #    k_folds = self._validation_strategy.get("k_folds", 1.0)
-        #    at_least_algorithms = 10.0
-        #
-        #    max_time_for_learner = self._total_time_limit / k_folds / at_least_algorithms
-        #    max_time_for_learner += 60.0
+        max_time_for_learner = 3600
+        if self._total_time_limit is not None:
+            k_folds = self._validation_strategy.get("k_folds", 1.0)
+            at_least_algorithms = 10.0
+
+            max_time_for_learner = max(self._total_time_limit / k_folds / at_least_algorithms, 60)
+            #print("max_time_for_learner --->", max_time_for_learner)
 
-        #params["max_time_for_learner"] = max_time_for_learner
+        params["max_time_for_learner"] = max_time_for_learner
 
         total_time_constraint = TotalTimeConstraint(
@@ -953,6 +952,7 @@
                 generated_params[0]["learner"]["model_type"], self._fit_level
             ):
                 self.verbose_print(f"Skip {step} because of the time limit.")
+                continue
             else:
                 model_str = "models" if len(generated_params) > 1 else "model"
                 self.verbose_print(
diff --git a/supervised/tuner/mljar_tuner.py b/supervised/tuner/mljar_tuner.py
index fdad8faa..d5978158 100644
--- a/supervised/tuner/mljar_tuner.py
+++ b/supervised/tuner/mljar_tuner.py
@@ -255,13 +255,18 @@
         X_train_stacked_path = ""
         added_columns = []
 
-        generated_params = []
+        model_types = ["Xgboost", "LightGBM", "CatBoost"]
+        generated_params = {m: [] for m in model_types}
+        types_score_order = []
         # resue old params
         for m in stacked_models:
             # use only Xgboost, LightGBM and CatBoost as stacked models
-            if m.get_type() not in ["Xgboost", "LightGBM", "CatBoost"]:
+            if m.get_type() not in model_types:
                 continue
+            if m.get_type() not in types_score_order:
+                types_score_order += [m.get_type()]
+
             if m.params.get("injected_sample_weight", False):
                 # dont use boost_on_errors model for stacking
                 # there will be additional boost_on_errors step
@@ -296,8 +301,19 @@
                 for col in added_columns:
                     params["preprocessing"]["columns_preprocessing"][col] = [scale]
 
-            generated_params += [params]
-        return generated_params
+            generated_params[m.get_type()] += [params]
+
+        return_params = []
+        for i in range(100):
+            total = 0
+            for m in types_score_order:
+                if generated_params[m]:
+                    return_params += [generated_params[m].pop(0)]
+                    total += len(generated_params[m])
+            if total == 0:
+                break
+
+        return return_params
 
     def adjust_validation_params(self, models_cnt):
         generated_params = []
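The `get_params_stack_models` change above stops emitting every parameter set of one model type before the next; the per-type queues are drained round-robin, in the order the types first appear among the stacked models, so early training slots mix all three algorithms. A minimal standalone sketch of that interleaving (queue contents are illustrative placeholders, not real parameter dicts):

```python
def interleave(queues, order):
    # pop one parameter set per non-empty queue per pass, preserving the
    # order in which the model types were first seen
    result = []
    while any(queues[m] for m in order):
        for m in order:
            if queues[m]:
                result.append(queues[m].pop(0))
    return result

queues = {"Xgboost": ["x1", "x2"], "LightGBM": ["l1"], "CatBoost": ["c1", "c2"]}
print(interleave(queues, ["Xgboost", "LightGBM", "CatBoost"]))
# -> ['x1', 'l1', 'c1', 'x2', 'c2']
```

Unlike the bounded `for i in range(100)` loop in the patch, this sketch runs until the queues are empty; the cap of 100 passes serves the same purpose for realistic queue sizes.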
diff --git a/supervised/tuner/time_controller.py b/supervised/tuner/time_controller.py
index 6ed92b8a..7bdf0fc6 100644
--- a/supervised/tuner/time_controller.py
+++ b/supervised/tuner/time_controller.py
@@ -61,7 +61,7 @@ def time_should_use(self, fit_level):
 
         ratios = {
             "default_algorithms": 0.3,
-            "not_so_random": 0.3,
+            "not_so_random": 0.35,
             "mix_encoding": 0.05,
             "golden_features": 0.05,
             "kmeans_features": 0.05,
@@ -69,7 +69,7 @@
             "features_selection": 0.05,
             "hill_climbing_1": 0.2,  # enough to have only first step from hill climbing
             "boost_on_errors": 0.05,
-            "stack": 0.15,
+            "stack": 0.2,
         }
 
         if (
@@ -137,7 +137,7 @@ def enough_time_for_step(self, fit_level):
 
         total_time_spend = time.time() - self._start_time
         compound = self.compound_time_should_use(fit_level)
-        #print(fit_level, total_time_spend, compound, self._total_time_limit)
+        #print("Enough time for step", fit_level, np.round(total_time_spend,2), np.round(compound,2))
         if total_time_spend > compound:
             # dont train more
             return False
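For context on the two ratio bumps: `enough_time_for_step` (last hunk) refuses to start a step once total elapsed time exceeds the compound budget for the current fit level, so raising `not_so_random` to 0.35 and `stack` to 0.2 gives those steps more headroom before they are skipped. An illustrative sketch, assuming the compound budget is the running sum of per-step ratios times the total limit (the dict is abridged to the keys the hunks show, and `compound_budget`/`steps_order` are hypothetical names):

```python
# Ratio values mirror the diff; entries the hunks do not show are omitted.
RATIOS = {
    "default_algorithms": 0.3,
    "not_so_random": 0.35,
    "mix_encoding": 0.05,
    "golden_features": 0.05,
    "kmeans_features": 0.05,
    "features_selection": 0.05,
    "hill_climbing_1": 0.2,
    "boost_on_errors": 0.05,
    "stack": 0.2,
}

def compound_budget(fit_level, total_time_limit, steps_order):
    # sum the ratios of every step up to and including fit_level
    used = 0.0
    for step in steps_order:
        used += RATIOS.get(step, 0.0)
        if step == fit_level:
            break
    return used * total_time_limit

steps_order = list(RATIOS)
# with a 3600 s limit, "stack" may still start while elapsed < ~4680 s;
# the ratios sum past 1.0, so late steps can reclaim time earlier steps left unused
print(compound_budget("stack", 3600, steps_order))  # ~4680.0
```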