avoid_generating_more_column_than_data (#332)

microsoft · Sep 25, 2024 · cc0a86d · cc0a86d
1 parent 6d5efa8
commit cc0a86d
Show file tree

Hide file tree

Showing 2 changed files with 9 additions and 0 deletions.
diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py b/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py
@@ -394,6 +394,13 @@ def evaluate(
         if version == 1:
             feedback_str, _ = FactorSingleColumnEvaluator(self.scen).evaluate(implementation, gt_implementation)
             conclusions.append(feedback_str)
+        elif version == 2:
+            input_shape = self.scen.input_shape
+            _, gen_df = self._get_df(gt_implementation, implementation)
+            if gen_df.shape[-1] > input_shape[-1]:
+                conclusions.append(
+                    "Output dataframe has more columns than input feature which is not acceptable in feature processing tasks. Please check the implementation to avoid generating too many columns. Consider this implementation as a failure."
+                )
 
         # Check if the index of the dataframe is ("datetime", "instrument")
         feedback_str, _ = FactorOutputFormatEvaluator(self.scen).evaluate(implementation, gt_implementation)

diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py
@@ -128,6 +128,8 @@ def source_data(self) -> str:
         pickle.dump(X_test, open(data_folder / "X_test.pkl", "wb"))
         pickle.dump(others, open(data_folder / "others.pkl", "wb"))
 
+        self.input_shape = X_train.shape
+
         buffer = io.StringIO()
         X_valid.info(verbose=True, buf=buffer, show_counts=True)
         data_info = buffer.getvalue()