Commit af9808f

fix: partial bug in bench (#368)
* Add more log

* Fix eval

* update

* keep ratio <= 1

* feat: cache exception (#369)

* 0 instead of NaN

* remove unused evaluators

* save gen_factor_l_all_rounds

* black reformat

* cache exception

---------

Co-authored-by: Tim <[email protected]>
you-n-g and qew21 authored Sep 29, 2024
1 parent c2168f8 commit af9808f
Showing 6 changed files with 82 additions and 55 deletions.
rdagent/app/benchmark/factor/analysis.py (64 changes: 37 additions & 27 deletions)

@@ -4,9 +4,9 @@
 
 import fire
 import matplotlib.pyplot as plt
+import numpy as np
 import pandas as pd
 import seaborn as sns
-from scipy.stats import gmean
 
 from rdagent.components.benchmark.conf import BenchmarkSettings
 from rdagent.components.benchmark.eval_method import FactorImplementEval
@@ -44,7 +44,24 @@ def process_results(self, results):
             final_res[experiment] = processed_data.iloc[-1, :]
         return final_res
 
-    def reformat_succ_rate(self, display_df):
+    def reformat_index(self, display_df):
+        """
+        reformat the results from
+
+        .. code-block:: python
+
+                              success rate
+            High_Beta_Factor  0.2
+
+        to
+
+        .. code-block:: python
+
+                                                  success rate
+            Category Difficulty Factor
+            量价     Hard       High_Beta_Factor  0.2
+
+        """
         new_idx = []
         display_df = display_df[display_df.index.isin(self.index_map.keys())]
         for idx in display_df.index:
@@ -78,11 +95,9 @@ def result_all_key_order(self, x):
     def analyze_data(self, sum_df):
         index = [
             "FactorSingleColumnEvaluator",
-            "FactorOutputFormatEvaluator",
             "FactorRowCountEvaluator",
             "FactorIndexEvaluator",
-            "FactorMissingValuesEvaluator",
-            "FactorEqualValueCountEvaluator",
+            "FactorEqualValueRatioEvaluator",
             "FactorCorrelationEvaluator",
             "run factor error",
         ]
@@ -93,40 +108,35 @@ def analyze_data(self, sum_df):
         succ_rate = ~run_error
         succ_rate = succ_rate.mean(axis=0).to_frame("success rate")
 
-        succ_rate_f = self.reformat_succ_rate(succ_rate)
-        succ_rate_f
-
-        sum_df_clean["FactorRowCountEvaluator"]
+        succ_rate_f = self.reformat_index(succ_rate)
 
-        format_issue = sum_df_clean["FactorRowCountEvaluator"].astype(bool) & sum_df_clean[
-            "FactorIndexEvaluator"
-        ].astype(bool)
+        # if it raises an Error when running the evaluator, we will get NaN;
+        # running failures are regarded as zero score
         format_issue = sum_df_clean[["FactorRowCountEvaluator", "FactorIndexEvaluator"]].apply(
-            lambda x: gmean(x), axis=1
+            lambda x: np.mean(x.fillna(0.0)), axis=1
         )
+        format_succ_rate = format_issue.unstack().T.mean(axis=0).to_frame("success rate")
+        format_succ_rate_f = self.reformat_index(format_succ_rate)
 
-        eval_series = format_issue.unstack()
-        succ_rate = eval_series.T.fillna(False)
-
-        format_succ_rate = succ_rate.mean(axis=0).to_frame("success rate")
-        format_succ_rate_f = self.reformat_succ_rate(format_succ_rate)
-
-        corr = sum_df_clean["FactorCorrelationEvaluator"] * format_issue
+        corr = sum_df_clean["FactorCorrelationEvaluator"].fillna(0.0)
         corr = corr.unstack().T.mean(axis=0).to_frame("corr(only success)")
-        corr_res = self.reformat_succ_rate(corr)
-        corr_max = sum_df_clean["FactorCorrelationEvaluator"] * format_issue
+        corr_res = self.reformat_index(corr)
+        corr_max = sum_df_clean["FactorCorrelationEvaluator"]
 
         corr_max = corr_max.unstack().T.max(axis=0).to_frame("corr(only success)")
-        corr_max_res = self.reformat_succ_rate(corr_max)
+        corr_max_res = self.reformat_index(corr_max)
 
-        value_max = sum_df_clean["FactorMissingValuesEvaluator"] * format_issue
+        value_max = sum_df_clean["FactorEqualValueRatioEvaluator"]
         value_max = value_max.unstack().T.max(axis=0).to_frame("max_value")
-        value_max_res = self.reformat_succ_rate(value_max)
+        value_max_res = self.reformat_index(value_max)
 
         value_avg = (
-            (sum_df_clean["FactorMissingValuesEvaluator"] * format_issue).unstack().T.mean(axis=0).to_frame("avg_value")
+            (sum_df_clean["FactorEqualValueRatioEvaluator"] * format_issue)
+            .unstack()
+            .T.mean(axis=0)
+            .to_frame("avg_value")
         )
-        value_avg_res = self.reformat_succ_rate(value_avg)
+        value_avg_res = self.reformat_index(value_avg)
 
         result_all = pd.concat(
             {
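
A note on the hunk above: swapping scipy's gmean for np.mean(x.fillna(0.0)) is the "0 instead of NaN" item from the commit message. An evaluator that crashed leaves NaN in its column, which gmean propagates into the aggregate; the new form scores the failure as zero instead. A minimal sketch with hypothetical scores (not part of the commit):

import numpy as np
import pandas as pd
from scipy.stats import gmean

# one row per factor; the NaN stands for an evaluator that raised
scores = pd.DataFrame(
    {"FactorRowCountEvaluator": [1.0, np.nan], "FactorIndexEvaluator": [0.8, 0.9]}
)
print(scores.apply(lambda x: gmean(x), axis=1))                # old: 0.894, NaN
print(scores.apply(lambda x: np.mean(x.fillna(0.0)), axis=1))  # new: 0.90, 0.45
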
rdagent/components/benchmark/eval_method.py (20 changes: 10 additions & 10 deletions)

@@ -8,11 +8,9 @@
 from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS
 from rdagent.components.coder.factor_coder.CoSTEER.evaluators import (
     FactorCorrelationEvaluator,
-    FactorEqualValueCountEvaluator,
+    FactorEqualValueRatioEvaluator,
     FactorEvaluator,
     FactorIndexEvaluator,
-    FactorMissingValuesEvaluator,
-    FactorOutputFormatEvaluator,
     FactorRowCountEvaluator,
     FactorSingleColumnEvaluator,
 )
@@ -151,20 +149,16 @@ def __init__(
     ):
         online_evaluator_l = [
             FactorSingleColumnEvaluator(scen),
-            FactorOutputFormatEvaluator(scen),
             FactorRowCountEvaluator(scen),
             FactorIndexEvaluator(scen),
-            FactorMissingValuesEvaluator(scen),
-            FactorEqualValueCountEvaluator(scen),
+            FactorEqualValueRatioEvaluator(scen),
             FactorCorrelationEvaluator(hard_check=False, scen=scen),
         ]
         super().__init__(online_evaluator_l, test_cases, method, *args, **kwargs)
         self.test_round = test_round
 
-    def eval(self):
+    def develop(self):
         gen_factor_l_all_rounds = []
-        test_cases_all_rounds = []
-        res = defaultdict(list)
         for _ in tqdm(range(self.test_round), desc="Rounds of Eval"):
             print("\n========================================================")
             print(f"Eval {_}-th times...")
@@ -181,8 +175,14 @@ def eval(self):
                 "The number of cases to eval should be equal to the number of test cases.",
             )
             gen_factor_l_all_rounds.extend(gen_factor_l.sub_workspace_list)
-            test_cases_all_rounds.extend(self.test_cases.ground_truth)
 
+        return gen_factor_l_all_rounds
+
+    def eval(self, gen_factor_l_all_rounds):
+        test_cases_all_rounds = []
+        res = defaultdict(list)
+        for _ in range(self.test_round):
+            test_cases_all_rounds.extend(self.test_cases.ground_truth)
         eval_res_list = multiprocessing_wrapper(
             [
                 (self.eval_case, (gt_case, gen_factor))
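
The net effect of the two hunks above is that the old eval() is split: develop() runs the coder for test_round rounds and collects the generated sub-workspaces, while eval() scores them against the ground truth. A rough usage sketch (eval_method is a hypothetical FactorImplementEval instance; eval() is assumed to return the results dict built in res):

gen_factor_l_all_rounds = eval_method.develop()  # generation only, round by round
res = eval_method.eval(gen_factor_l_all_rounds)  # scoring against the ground truth
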
rdagent/components/coder/factor_coder/CoSTEER/evaluators.py (26 changes: 13 additions & 13 deletions)

@@ -250,7 +250,7 @@ def evaluate(
                 "The source dataframe is None. Please check the implementation.",
                 False,
             )
-        ratio = len(gen_df) / len(gt_df)
+        ratio = min(len(gen_df), len(gt_df)) / max(len(gen_df), len(gt_df))
         return (
             f"The ratio of rows count in the source dataframe to the ground truth dataframe is {ratio:.2f}. "
             + "Please verify the implementation. "
@@ -304,7 +304,7 @@
         )
 
 
-class FactorEqualValueCountEvaluator(FactorEvaluator):
+class FactorEqualValueRatioEvaluator(FactorEvaluator):
     def evaluate(
         self,
         implementation: Workspace,
@@ -392,6 +392,7 @@ def evaluate(
         output_format_result = None
         equal_value_ratio_result = 0
         high_correlation_result = False
+        row_result = None
 
         # Check if both dataframes have only one column; muted since a factor task might generate more than one column now
         if version == 1:
@@ -429,7 +430,7 @@ def evaluate(
             )
             conclusions.append(feedback_str)
 
-        feedback_str, equal_value_ratio_result = FactorEqualValueCountEvaluator(self.scen).evaluate(
+        feedback_str, equal_value_ratio_result = FactorEqualValueRatioEvaluator(self.scen).evaluate(
             implementation, gt_implementation
         )
         conclusions.append(feedback_str)
@@ -448,16 +449,15 @@ def evaluate(
 
         if gt_implementation is not None and (equal_value_ratio_result > 0.99) or high_correlation_result:
             decision_from_value_check = True
-        if version == 1:
-            if row_result <= 0.99 or output_format_result is False or daily_check_result is False:
-                decision_from_value_check = False
-            else:
-                decision_from_value_check = None
-        elif version == 2:
-            if output_format_result is False or daily_check_result is False:
-                decision_from_value_check = False
-            else:
-                decision_from_value_check = None
+        elif (
+            row_result is not None
+            and row_result <= 0.99
+            or output_format_result is False
+            or daily_check_result is False
+        ):
+            decision_from_value_check = False
+        else:
+            decision_from_value_check = None
         return conclusion_str, decision_from_value_check
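
The min/max change in the first hunk is the "keep ratio <= 1" item from the commit message: the old len(gen_df) / len(gt_df) exceeds 1.0 whenever the generated frame has too many rows, so the downstream row_result <= 0.99 check could not flag over-long outputs. A small sketch of the symmetric ratio:

def row_count_ratio(n_gen: int, n_gt: int) -> float:
    # penalizes too many rows the same as too few
    return min(n_gen, n_gt) / max(n_gen, n_gt)

assert row_count_ratio(50, 100) == 0.5   # too few rows
assert row_count_ratio(200, 100) == 0.5  # too many rows (old formula gave 2.0)
assert row_count_ratio(100, 100) == 1.0
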
rdagent/components/coder/factor_coder/factor.py (20 changes: 15 additions & 5 deletions)

Expand Up @@ -117,11 +117,14 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
target_file_name = md5_hash(data_type + self.code_dict["factor.py"])
cache_file_path = Path(FACTOR_IMPLEMENT_SETTINGS.cache_location) / f"{target_file_name}.pkl"
Path(FACTOR_IMPLEMENT_SETTINGS.cache_location).mkdir(exist_ok=True, parents=True)
if cache_file_path.exists() and not self.raise_exception:
if cache_file_path.exists():
cached_res = pickle.load(open(cache_file_path, "rb"))
if store_result and cached_res[1] is not None:
self.executed_factor_value_dataframe = cached_res[1]
return cached_res
if not self.raise_exception or len(cached_res) == 3:
if cached_res[2]:
raise cached_res[2]
if store_result and cached_res[1] is not None:
self.executed_factor_value_dataframe = cached_res[1]
return cached_res[:1]

if self.executed_factor_value_dataframe is not None:
return self.FB_FROM_CACHE, self.executed_factor_value_dataframe
@@ -147,6 +150,7 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
 
         execution_feedback = self.FB_EXECUTION_SUCCEEDED
         execution_success = False
+        execution_error = None
 
         if self.target_task.version == 1:
             execution_code_path = code_path
@@ -177,10 +181,14 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
                 )
                 if self.raise_exception:
                     raise CustomRuntimeError(execution_feedback)
+                else:
+                    execution_error = CustomRuntimeError(execution_feedback)
         except subprocess.TimeoutExpired:
             execution_feedback += f"Execution timeout error and the timeout is set to {FACTOR_IMPLEMENT_SETTINGS.file_based_execution_timeout} seconds."
             if self.raise_exception:
                 raise CustomRuntimeError(execution_feedback)
+            else:
+                execution_error = CustomRuntimeError(execution_feedback)
 
         workspace_output_file_path = self.workspace_path / "result.h5"
         if workspace_output_file_path.exists() and execution_success:
@@ -195,13 +203,15 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
             executed_factor_value_dataframe = None
             if self.raise_exception:
                 raise NoOutputError(execution_feedback)
+            else:
+                execution_error = NoOutputError(execution_feedback)
 
         if store_result and executed_factor_value_dataframe is not None:
             self.executed_factor_value_dataframe = executed_factor_value_dataframe
 
         if FACTOR_IMPLEMENT_SETTINGS.enable_execution_cache:
             pickle.dump(
-                (execution_feedback, executed_factor_value_dataframe),
+                (execution_feedback, executed_factor_value_dataframe, execution_error),
                 open(cache_file_path, "wb"),
             )
         return execution_feedback, executed_factor_value_dataframe
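
These hunks implement the "cache exception" change (#369): the pickled cache entry grows a third slot holding the error, so a later cache hit can replay the original failure instead of re-running the factor or silently returning nothing. A simplified sketch of the pattern (names are hypothetical, not the module's API):

import pickle

def cached_execute(cache_path, run):
    try:
        feedback, value, error = pickle.load(open(cache_path, "rb"))
    except FileNotFoundError:
        feedback, value, error = run()  # run() returns (feedback, df, exception-or-None)
        pickle.dump((feedback, value, error), open(cache_path, "wb"))
    if error is not None:
        raise error  # replay the cached failure
    return feedback, value
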
rdagent/core/evolving_agent.py (2 changes: 2 additions & 0 deletions)

@@ -78,6 +78,8 @@ def multistep_evolve(
             )
             # TODO: Due to design issues, we have chosen to ignore this mypy error.
             logger.log_object(evo.sub_workspace_list, tag="evolving code")  # type: ignore[attr-defined]
+            for sw in evo.sub_workspace_list:  # type: ignore[attr-defined]
+                logger.info(f"evolving code workspace: {sw}")
 
             # 4. Pack evolve results
             es = EvoStep(evo, queried_knowledge)
rdagent/core/experiment.py (5 changes: 5 additions & 0 deletions)

@@ -179,6 +179,11 @@ def execute(self) -> object | None:
             self.inject_code(**self.code_dict)
         return None
 
+    def __str__(self) -> str:
+        return f"Workspace[{self.workspace_path=}" + (
+            "]" if self.target_task is None else f",{self.target_task.name=}]"
+        )
+
 
 ASpecificWSForExperiment = TypeVar("ASpecificWSForExperiment", bound=Workspace)
 ASpecificWSForSubTasks = TypeVar("ASpecificWSForSubTasks", bound=Workspace)
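
The new __str__ leans on the f-string "=" specifier, which renders the expression text together with the repr of its value. A quick standalone check (the path is made up):

from pathlib import Path

workspace_path = Path("/tmp/ws")
print(f"Workspace[{workspace_path=}]")
# on POSIX: Workspace[workspace_path=PosixPath('/tmp/ws')]
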
