disable features selection when not enough time (#284)
pplonski committed Jan 10, 2021
1 parent bbdcf3e commit 3df2cc1
Showing 2 changed files with 97 additions and 82 deletions.
6 changes: 5 additions & 1 deletion supervised/base_automl.py
@@ -874,7 +874,11 @@ def _fit(self, X, y):
                 generated_params = self._all_params[step]
             else:
                 generated_params = tuner.generate_params(
-                    step, self._models, self._results_path, self._stacked_models
+                    step,
+                    self._models,
+                    self._results_path,
+                    self._stacked_models,
+                    self._total_time_limit,
                 )
 
             if generated_params is None or not generated_params:
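With this change a tuner step can report "not enough time" by returning None, and the loop in _fit already treats None or an empty list as "skip this step". A runnable toy sketch of that contract (the ToyTuner class and the 120-second threshold are invented for illustration; only the None-means-skip behavior mirrors the source):

# Toy sketch -- invented names and thresholds, not the real MljarTuner.
class ToyTuner:
    def generate_params(
        self, step, models, results_path, stacked_models, total_time_limit
    ):
        # Pretend features selection needs more time than the budget allows.
        if step == "insert_random_feature" and total_time_limit < 120:
            return None  # not enough time -> the step is skipped
        return [{"step": step}]

tuner = ToyTuner()
for step in ["not_so_random", "insert_random_feature"]:
    generated_params = tuner.generate_params(step, [], "results", None, 60)
    if generated_params is None or not generated_params:
        continue  # same guard as in AutoML._fit
    print("running step:", step)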
173 changes: 92 additions & 81 deletions supervised/tuner/mljar_tuner.py
@@ -120,7 +120,9 @@ def get_model_name(self, model_type, models_cnt, special=""):
     def filter_random_feature_model(self, models):
         return [m for m in models if "RandomFeature" not in m.get_name()]
 
-    def generate_params(self, step, models, results_path, stacked_models):
+    def generate_params(
+        self, step, models, results_path, stacked_models, total_time_limit
+    ):
 
         models_cnt = len(models)
         if step == "adjust_validation":
@@ -132,13 +134,13 @@ def generate_params(self, step, models, results_path, stacked_models):
         elif step == "not_so_random":
             return self.get_not_so_random_params(models_cnt)
         elif step == "all_ints_encoding":
-            return self.get_all_int_categorical_strategy(models)
+            return self.get_all_int_categorical_strategy(models, total_time_limit)
         elif step == "loo_encoding":
-            return self.get_loo_categorical_strategy(models)
+            return self.get_loo_categorical_strategy(models, total_time_limit)
         elif step == "golden_features":
-            return self.get_golden_features_params(models, results_path)
+            return self.get_golden_features_params(models, results_path, total_time_limit)
         elif step == "insert_random_feature":
-            return self.get_params_to_insert_random_feature(models)
+            return self.get_params_to_insert_random_feature(models, total_time_limit)
         elif step == "features_selection":
             return self.get_features_selection_params(
                 self.filter_random_feature_model(models), results_path
@@ -421,32 +423,16 @@ def get_not_so_random_params(self, models_cnt):
         return return_params
 
     def get_hill_climbing_params(self, current_models):
-
-        # second, hill climbing
-        # for _ in range(self._hill_climbing_steps):
-        # just do one step
-        # get models orderer by loss
-        # TODO: refactor this callbacks.callbacks[0]
-        scores = [m.get_final_loss() for m in current_models]
-        model_types = [m.get_type() for m in current_models]
-        df_models = pd.DataFrame(
-            {"model": current_models, "score": scores, "model_type": model_types}
-        )
-        # do group by for debug reason
-        df_models = df_models.groupby("model_type").apply(
-            lambda x: x.sort_values("score")
-        )
-        unique_model_types = np.unique(df_models.model_type)
-
+        df_models, algorithms = self.df_models_algorithms(current_models)
         generated_params = []
-        for m_type in unique_model_types:
+        for m_type in algorithms:
             if m_type in ["Baseline", "Decision Tree", "Linear", "Nearest Neighbors"]:
-                # dont tune Baseline and Decision Tree
+                # dont tune ...
                 continue
             models = df_models[df_models.model_type == m_type]["model"]
 
             for i in range(min(self._top_models_to_improve, len(models))):
-                m = models[i]
+                m = models.iloc[i]
 
                 for p in HillClimbing.get(
                     m.params.get("learner"),
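A recurring fix in this commit is models[i] -> models.iloc[i]. Once the models are sorted by score, the Series index is no longer 0..n-1, so label-based models[i] can return the wrong model or raise a KeyError; .iloc[i] is purely positional. A small demonstration with dummy values:

import pandas as pd

# The index order imitates a Series after sort_values: labels are shuffled.
models = pd.Series(["best", "middle", "worst"], index=[2, 0, 1])
print(models.iloc[0])  # "best"   -- positional: first row after sorting
print(models[0])       # "middle" -- label lookup: row whose label is 0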
@@ -486,33 +472,21 @@ def get_hill_climbing_params(self, current_models):
             generated_params += [all_params]
         return generated_params
 
-    def get_all_int_categorical_strategy(self, current_models):
+    def get_all_int_categorical_strategy(self, current_models, total_time_limit):
         return self.get_categorical_strategy(
-            current_models, PreprocessingTuner.CATEGORICALS_ALL_INT
+            current_models, PreprocessingTuner.CATEGORICALS_ALL_INT, total_time_limit
         )
 
-    def get_loo_categorical_strategy(self, current_models):
+    def get_loo_categorical_strategy(self, current_models, total_time_limit):
         return self.get_categorical_strategy(
-            current_models, PreprocessingTuner.CATEGORICALS_LOO
+            current_models, PreprocessingTuner.CATEGORICALS_LOO, total_time_limit
         )
 
-    def get_categorical_strategy(self, current_models, strategy):
-
-        # get models orderer by loss
-        # TODO: refactor this callbacks.callbacks[0]
-        scores = [m.get_final_loss() for m in current_models]
-        model_types = [m.get_type() for m in current_models]
-        df_models = pd.DataFrame(
-            {"model": current_models, "score": scores, "model_type": model_types}
-        )
-        # do group by for debug reason
-        df_models = df_models.groupby("model_type").apply(
-            lambda x: x.sort_values("score")
-        )
-        unique_model_types = np.unique(df_models.model_type)
+    def get_categorical_strategy(self, current_models, strategy, total_time_limit):
 
+        df_models, algorithms = self.df_models_algorithms(current_models, time_limit=0.03*total_time_limit)
         generated_params = []
-        for m_type in unique_model_types:
+        for m_type in algorithms:
             # try to add categorical strategy only for below algorithms
             if m_type not in [
                 "Xgboost",
@@ -525,7 +499,7 @@ def get_categorical_strategy(self, current_models, strategy):
             models = df_models[df_models.model_type == m_type]["model"]
 
             for i in range(min(1, len(models))):
-                m = models[i]
+                m = models.iloc[i]
 
                 params = copy.deepcopy(m.params)
                 cols_preprocessing = params["preprocessing"]["columns_preprocessing"]
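Both categorical-strategy steps now call df_models_algorithms with time_limit=0.03*total_time_limit, so only models that trained in under 3% of the overall budget are re-tried with a different encoding. Quick arithmetic with illustrative numbers:

# Illustrative numbers only; the 3% cap comes from the diff above.
total_time_limit = 3600.0           # 1 hour AutoML budget
step_cap = 0.03 * total_time_limit  # 108 s: max train time to qualify
for train_time in (45.0, 300.0):
    print(f"train_time={train_time:.0f}s qualifies={train_time < step_cap}")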
@@ -569,30 +543,43 @@ def get_categorical_strategy(self, current_models, strategy):
             generated_params += [params]
         return generated_params
 
-    def get_golden_features_params(self, current_models, results_path):
-        # get models orderer by loss
-        # TODO: refactor this callbacks.callbacks[0]
+    def df_models_algorithms(self, current_models, time_limit=None):
         scores = [m.get_final_loss() for m in current_models]
         model_types = [m.get_type() for m in current_models]
+        names = [m.get_name() for m in current_models]
+        train_times = [m.get_train_time() for m in current_models]
+
         df_models = pd.DataFrame(
-            {"model": current_models, "score": scores, "model_type": model_types}
-        )
-        # do group by for debug reason
-        df_models = df_models.groupby("model_type").apply(
-            lambda x: x.sort_values("score")
+            {"model": current_models, "score": scores, "model_type": model_types, "name": names, "train_time": train_times}
         )
-        unique_model_types = np.unique(df_models.model_type)
+        if time_limit is not None:
+            df_models = df_models[df_models.train_time < time_limit]
+            df_models.reset_index(drop=True, inplace=True)
+
+        df_models.sort_values(by="score", ascending=True, inplace=True)
+        model_types = list(df_models.model_type)
+        u, idx = np.unique(model_types, return_index=True)
+        algorithms = u[np.argsort(idx)]
+
+        #print(df_models)
+        #print(algorithms)
+        return df_models, algorithms
+
+
+    def get_golden_features_params(self, current_models, results_path, total_time_limit):
+
+        df_models, algorithms = self.df_models_algorithms(current_models, time_limit=0.03*total_time_limit)
 
         generated_params = []
-        for m_type in unique_model_types:
+        for m_type in algorithms:
             # try to add golden features only for below algorithms
             if m_type not in ["Xgboost", "LightGBM", "CatBoost"]:
                 continue
             models = df_models[df_models.model_type == m_type]["model"]
 
             for i in range(min(1, len(models))):
-                m = models[i]
-
+                m = models.iloc[i]
                 params = copy.deepcopy(m.params)
                 params["preprocessing"]["golden_features"] = {
                     "results_path": results_path,
@@ -611,15 +598,49 @@ def get_golden_features_params(self, current_models, results_path):
             generated_params += [params]
         return generated_params
 
-    def get_params_to_insert_random_feature(self, current_models):
-        # get models orderer by loss
-        # TODO: refactor this callbacks.callbacks[0]
-        scores = [m.get_final_loss() for m in current_models]
-        model_types = [m.get_type() for m in current_models]
-        df_models = pd.DataFrame(
-            {"model": current_models, "score": scores, "model_type": model_types}
-        )
-        df_models.sort_values(by="score", ascending=True, inplace=True)
+    def time_features_selection(self, current_models):
+
+        df_models, algorithms = self.df_models_algorithms(current_models)
+
+        time_needed = 0
+        for m_type in algorithms:
+
+            if m_type not in [
+                "Xgboost",
+                "LightGBM",
+                "CatBoost",
+                "Neural Network",
+                "Random Forest",
+                "Extra Trees",
+            ]:
+                continue
+            models = df_models[df_models.model_type == m_type]["model"]
+
+            for i in range(min(1, len(models))):
+                m = models.iloc[i]
+                if time_needed == 0:
+                    # best model will be used two times
+                    # one for insert random feature
+                    # one for selected features
+                    time_needed += 2.0 * m.get_train_time()
+                else:
+                    time_needed += m.get_train_time()
+                print(m.get_name(), m.get_final_loss(), time_needed)
+        print("Time needed for features selection ~", np.round(time_needed), "seconds")
+        return time_needed
+
+    def get_params_to_insert_random_feature(self, current_models, total_time_limit):
+
+        time_needed = self.time_features_selection(current_models)
+
+        if time_needed > 0.1 * total_time_limit:
+            print("Not enough time to perform features selection. Skip")
+            print(
+                f"Please increase total_time_limit to at least ({int(np.round(10.0*time_needed))+60} seconds) to have features selection"
+            )
+            return None
+
+        df_models, algorithms = self.df_models_algorithms(current_models)
 
         m = df_models.iloc[0]["model"]
 
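The gate itself: features selection runs only when the estimated cost fits in 10% of the total budget, and the skip message suggests a limit of roughly ten times the estimate plus a minute of slack. A worked toy example (train times invented):

import numpy as np

total_time_limit = 600.0          # AutoML budget in seconds
best_train_times = [30.0, 45.0]   # best model per eligible algorithm

# The overall best model is trained twice: once to insert the random
# feature, once to refit on the selected features.
time_needed = 2.0 * best_train_times[0] + sum(best_train_times[1:])

if time_needed > 0.1 * total_time_limit:  # 105 s > 60 s -> skip
    suggested = int(np.round(10.0 * time_needed)) + 60  # 1110 s
    print(f"Skip features selection; raise total_time_limit to ~{suggested} s")
else:
    print("Features selection fits:", np.round(time_needed), "seconds needed")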
@@ -653,22 +674,12 @@ def get_features_selection_params(self, current_models, results_path):
         # skip this step
         if len(drop_features) <= 1:
             return None
-        # get models orderer by loss
-        # TODO: refactor this callbacks.callbacks[0]
-        scores = [m.get_final_loss() for m in current_models]
-        model_types = [m.get_type() for m in current_models]
-        df_models = pd.DataFrame(
-            {"model": current_models, "score": scores, "model_type": model_types}
-        )
-        # do group by for debug reason
-        df_models = df_models.groupby("model_type").apply(
-            lambda x: x.sort_values("score")
-        )
-        unique_model_types = np.unique(df_models.model_type)
+
+        df_models, algorithms = self.df_models_algorithms(current_models)
 
         generated_params = []
-        for m_type in unique_model_types:
-            # try to add golden features only for below algorithms
+        for m_type in algorithms:
+            # try to do features selection only for below algorithms
             if m_type not in [
                 "Xgboost",
                 "LightGBM",
@@ -681,7 +692,7 @@ def get_features_selection_params(self, current_models, results_path):
             models = df_models[df_models.model_type == m_type]["model"]
 
             for i in range(min(1, len(models))):
-                m = models[i]
+                m = models.iloc[i]
 
                 params = copy.deepcopy(m.params)
                 params["preprocessing"]["drop_features"] = drop_features
