Commit af9808f

fix: partial bug in bench (#368)
* Add more log

* Fix eval

* update

* keep ratio <= 1

* feat: cache exception (#369)

* 0 instead of NaN

* remove unused evaluators

* save gen_factor_l_all_rounds

* black reformat

* cache exception

---------

Co-authored-by: Tim <[email protected]>
you-n-g and qew21 authored Sep 29, 2024
1 parent c2168f8 commit af9808f
Showing 6 changed files with 82 additions and 55 deletions.
rdagent/app/benchmark/factor/analysis.py (64 changes: 37 additions & 27 deletions)

@@ -4,9 +4,9 @@
 
 import fire
 import matplotlib.pyplot as plt
+import numpy as np
 import pandas as pd
 import seaborn as sns
-from scipy.stats import gmean
 
 from rdagent.components.benchmark.conf import BenchmarkSettings
 from rdagent.components.benchmark.eval_method import FactorImplementEval
@@ -44,7 +44,24 @@ def process_results(self, results):
             final_res[experiment] = processed_data.iloc[-1, :]
         return final_res
 
-    def reformat_succ_rate(self, display_df):
+    def reformat_index(self, display_df):
+        """
+        reformat the results from
+
+        .. code-block:: python
+
+                              success rate
+            High_Beta_Factor  0.2
+
+        to
+
+        .. code-block:: python
+
+                                                  success rate
+            Category Difficulty Factor
+            量价     Hard       High_Beta_Factor  0.2
+
+        """
         new_idx = []
         display_df = display_df[display_df.index.isin(self.index_map.keys())]
         for idx in display_df.index:
@@ -78,11 +95,9 @@ def result_all_key_order(self, x):
     def analyze_data(self, sum_df):
         index = [
             "FactorSingleColumnEvaluator",
-            "FactorOutputFormatEvaluator",
             "FactorRowCountEvaluator",
             "FactorIndexEvaluator",
-            "FactorMissingValuesEvaluator",
-            "FactorEqualValueCountEvaluator",
+            "FactorEqualValueRatioEvaluator",
             "FactorCorrelationEvaluator",
             "run factor error",
         ]
@@ -93,40 +108,35 @@ def analyze_data(self, sum_df):
         succ_rate = ~run_error
         succ_rate = succ_rate.mean(axis=0).to_frame("success rate")
 
-        succ_rate_f = self.reformat_succ_rate(succ_rate)
-        succ_rate_f
-
-        sum_df_clean["FactorRowCountEvaluator"]
+        succ_rate_f = self.reformat_index(succ_rate)
 
-        format_issue = sum_df_clean["FactorRowCountEvaluator"].astype(bool) & sum_df_clean[
-            "FactorIndexEvaluator"
-        ].astype(bool)
+        # if it raises an Error when running the evaluator, we will get NaN;
+        # running failures are regarded as zero score
         format_issue = sum_df_clean[["FactorRowCountEvaluator", "FactorIndexEvaluator"]].apply(
-            lambda x: gmean(x), axis=1
+            lambda x: np.mean(x.fillna(0.0)), axis=1
         )
+        format_succ_rate = format_issue.unstack().T.mean(axis=0).to_frame("success rate")
+        format_succ_rate_f = self.reformat_index(format_succ_rate)
 
-        eval_series = format_issue.unstack()
-        succ_rate = eval_series.T.fillna(False)
-
-        format_succ_rate = succ_rate.mean(axis=0).to_frame("success rate")
-        format_succ_rate_f = self.reformat_succ_rate(format_succ_rate)
-
-        corr = sum_df_clean["FactorCorrelationEvaluator"] * format_issue
+        corr = sum_df_clean["FactorCorrelationEvaluator"].fillna(0.0)
         corr = corr.unstack().T.mean(axis=0).to_frame("corr(only success)")
-        corr_res = self.reformat_succ_rate(corr)
-        corr_max = sum_df_clean["FactorCorrelationEvaluator"] * format_issue
+        corr_res = self.reformat_index(corr)
+        corr_max = sum_df_clean["FactorCorrelationEvaluator"]
 
         corr_max = corr_max.unstack().T.max(axis=0).to_frame("corr(only success)")
-        corr_max_res = self.reformat_succ_rate(corr_max)
+        corr_max_res = self.reformat_index(corr_max)
 
-        value_max = sum_df_clean["FactorMissingValuesEvaluator"] * format_issue
+        value_max = sum_df_clean["FactorEqualValueRatioEvaluator"]
         value_max = value_max.unstack().T.max(axis=0).to_frame("max_value")
-        value_max_res = self.reformat_succ_rate(value_max)
+        value_max_res = self.reformat_index(value_max)
 
         value_avg = (
-            (sum_df_clean["FactorMissingValuesEvaluator"] * format_issue).unstack().T.mean(axis=0).to_frame("avg_value")
+            (sum_df_clean["FactorEqualValueRatioEvaluator"] * format_issue)
+            .unstack()
+            .T.mean(axis=0)
+            .to_frame("avg_value")
         )
-        value_avg_res = self.reformat_succ_rate(value_avg)
+        value_avg_res = self.reformat_index(value_avg)
 
         result_all = pd.concat(
             {
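
A note on the hunk above: swapping scipy's gmean for np.mean(x.fillna(0.0)) is the "0 instead of NaN" item from the commit message. An evaluator that crashed leaves NaN in its column, which gmean propagates into the aggregate; the new form scores the failure as zero instead. A minimal sketch with hypothetical scores (not part of the commit):

import numpy as np
import pandas as pd
from scipy.stats import gmean

# one row per factor; the NaN stands for an evaluator that raised
scores = pd.DataFrame(
    {"FactorRowCountEvaluator": [1.0, np.nan], "FactorIndexEvaluator": [0.8, 0.9]}
)
print(scores.apply(lambda x: gmean(x), axis=1))                # old: 0.894, NaN
print(scores.apply(lambda x: np.mean(x.fillna(0.0)), axis=1))  # new: 0.90, 0.45
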
rdagent/components/benchmark/eval_method.py (20 changes: 10 additions & 10 deletions)

@@ -8,11 +8,9 @@
 from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS
 from rdagent.components.coder.factor_coder.CoSTEER.evaluators import (
     FactorCorrelationEvaluator,
-    FactorEqualValueCountEvaluator,
+    FactorEqualValueRatioEvaluator,
     FactorEvaluator,
     FactorIndexEvaluator,
-    FactorMissingValuesEvaluator,
-    FactorOutputFormatEvaluator,
     FactorRowCountEvaluator,
     FactorSingleColumnEvaluator,
 )
@@ -151,20 +149,16 @@ def __init__(
     ):
         online_evaluator_l = [
             FactorSingleColumnEvaluator(scen),
-            FactorOutputFormatEvaluator(scen),
             FactorRowCountEvaluator(scen),
             FactorIndexEvaluator(scen),
-            FactorMissingValuesEvaluator(scen),
-            FactorEqualValueCountEvaluator(scen),
+            FactorEqualValueRatioEvaluator(scen),
             FactorCorrelationEvaluator(hard_check=False, scen=scen),
         ]
         super().__init__(online_evaluator_l, test_cases, method, *args, **kwargs)
         self.test_round = test_round
 
-    def eval(self):
+    def develop(self):
         gen_factor_l_all_rounds = []
-        test_cases_all_rounds = []
-        res = defaultdict(list)
         for _ in tqdm(range(self.test_round), desc="Rounds of Eval"):
             print("\n========================================================")
             print(f"Eval {_}-th times...")
@@ -181,8 +175,14 @@ def eval(self):
                 "The number of cases to eval should be equal to the number of test cases.",
             )
             gen_factor_l_all_rounds.extend(gen_factor_l.sub_workspace_list)
-            test_cases_all_rounds.extend(self.test_cases.ground_truth)
 
+        return gen_factor_l_all_rounds
+
+    def eval(self, gen_factor_l_all_rounds):
+        test_cases_all_rounds = []
+        res = defaultdict(list)
+        for _ in range(self.test_round):
+            test_cases_all_rounds.extend(self.test_cases.ground_truth)
         eval_res_list = multiprocessing_wrapper(
             [
                 (self.eval_case, (gt_case, gen_factor))
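
The net effect of the two hunks above is that the old eval() is split: develop() runs the coder for test_round rounds and collects the generated sub-workspaces, while eval() scores them against the ground truth. A rough usage sketch (eval_method is a hypothetical FactorImplementEval instance; eval() is assumed to return the results dict built in res):

gen_factor_l_all_rounds = eval_method.develop()  # generation only, round by round
res = eval_method.eval(gen_factor_l_all_rounds)  # scoring against the ground truth
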
rdagent/components/coder/factor_coder/CoSTEER/evaluators.py (26 changes: 13 additions & 13 deletions)

@@ -250,7 +250,7 @@ def evaluate(
                 "The source dataframe is None. Please check the implementation.",
                 False,
             )
-        ratio = len(gen_df) / len(gt_df)
+        ratio = min(len(gen_df), len(gt_df)) / max(len(gen_df), len(gt_df))
         return (
             f"The ratio of rows count in the source dataframe to the ground truth dataframe is {ratio:.2f}. "
             + "Please verify the implementation. "
@@ -304,7 +304,7 @@
         )
 
 
-class FactorEqualValueCountEvaluator(FactorEvaluator):
+class FactorEqualValueRatioEvaluator(FactorEvaluator):
     def evaluate(
         self,
         implementation: Workspace,
@@ -392,6 +392,7 @@ def evaluate(
         output_format_result = None
         equal_value_ratio_result = 0
         high_correlation_result = False
+        row_result = None
 
         # Check if both dataframes have only one column; muted since a factor task might generate more than one column now
         if version == 1:
@@ -429,7 +430,7 @@ def evaluate(
             )
             conclusions.append(feedback_str)
 
-        feedback_str, equal_value_ratio_result = FactorEqualValueCountEvaluator(self.scen).evaluate(
+        feedback_str, equal_value_ratio_result = FactorEqualValueRatioEvaluator(self.scen).evaluate(
             implementation, gt_implementation
         )
         conclusions.append(feedback_str)
@@ -448,16 +449,15 @@ def evaluate(
 
         if gt_implementation is not None and (equal_value_ratio_result > 0.99) or high_correlation_result:
             decision_from_value_check = True
-        if version == 1:
-            if row_result <= 0.99 or output_format_result is False or daily_check_result is False:
-                decision_from_value_check = False
-            else:
-                decision_from_value_check = None
-        elif version == 2:
-            if output_format_result is False or daily_check_result is False:
-                decision_from_value_check = False
-            else:
-                decision_from_value_check = None
+        elif (
+            row_result is not None
+            and row_result <= 0.99
+            or output_format_result is False
+            or daily_check_result is False
+        ):
+            decision_from_value_check = False
+        else:
+            decision_from_value_check = None
         return conclusion_str, decision_from_value_check
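
The min/max change in the first hunk is the "keep ratio <= 1" item from the commit message: the old len(gen_df) / len(gt_df) exceeds 1.0 whenever the generated frame has too many rows, so the downstream row_result <= 0.99 check could not flag over-long outputs. A small sketch of the symmetric ratio:

def row_count_ratio(n_gen: int, n_gt: int) -> float:
    # penalizes too many rows the same as too few
    return min(n_gen, n_gt) / max(n_gen, n_gt)

assert row_count_ratio(50, 100) == 0.5   # too few rows
assert row_count_ratio(200, 100) == 0.5  # too many rows (old formula gave 2.0)
assert row_count_ratio(100, 100) == 1.0
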
rdagent/components/coder/factor_coder/factor.py (20 changes: 15 additions & 5 deletions)

Expand Up @@ -117,11 +117,14 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
target_file_name = md5_hash(data_type + self.code_dict["factor.py"])
cache_file_path = Path(FACTOR_IMPLEMENT_SETTINGS.cache_location) / f"{target_file_name}.pkl"
Path(FACTOR_IMPLEMENT_SETTINGS.cache_location).mkdir(exist_ok=True, parents=True)
if cache_file_path.exists() and not self.raise_exception:
if cache_file_path.exists():
cached_res = pickle.load(open(cache_file_path, "rb"))
if store_result and cached_res[1] is not None:
self.executed_factor_value_dataframe = cached_res[1]
return cached_res
if not self.raise_exception or len(cached_res) == 3:
if cached_res[2]:
raise cached_res[2]
if store_result and cached_res[1] is not None:
self.executed_factor_value_dataframe = cached_res[1]
return cached_res[:1]

if self.executed_factor_value_dataframe is not None:
return self.FB_FROM_CACHE, self.executed_factor_value_dataframe
@@ -147,6 +150,7 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
 
         execution_feedback = self.FB_EXECUTION_SUCCEEDED
         execution_success = False
+        execution_error = None
 
         if self.target_task.version == 1:
             execution_code_path = code_path
@@ -177,10 +181,14 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
                 )
                 if self.raise_exception:
                     raise CustomRuntimeError(execution_feedback)
+                else:
+                    execution_error = CustomRuntimeError(execution_feedback)
         except subprocess.TimeoutExpired:
             execution_feedback += f"Execution timeout error and the timeout is set to {FACTOR_IMPLEMENT_SETTINGS.file_based_execution_timeout} seconds."
             if self.raise_exception:
                 raise CustomRuntimeError(execution_feedback)
+            else:
+                execution_error = CustomRuntimeError(execution_feedback)
 
         workspace_output_file_path = self.workspace_path / "result.h5"
         if workspace_output_file_path.exists() and execution_success:
@@ -195,13 +203,15 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
             executed_factor_value_dataframe = None
             if self.raise_exception:
                 raise NoOutputError(execution_feedback)
+            else:
+                execution_error = NoOutputError(execution_feedback)
 
         if store_result and executed_factor_value_dataframe is not None:
             self.executed_factor_value_dataframe = executed_factor_value_dataframe
 
         if FACTOR_IMPLEMENT_SETTINGS.enable_execution_cache:
             pickle.dump(
-                (execution_feedback, executed_factor_value_dataframe),
+                (execution_feedback, executed_factor_value_dataframe, execution_error),
                 open(cache_file_path, "wb"),
             )
         return execution_feedback, executed_factor_value_dataframe
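
These hunks implement the "cache exception" change (#369): the pickled cache entry grows a third slot holding the error, so a later cache hit can replay the original failure instead of re-running the factor or silently returning nothing. A simplified sketch of the pattern (names are hypothetical, not the module's API):

import pickle

def cached_execute(cache_path, run):
    try:
        feedback, value, error = pickle.load(open(cache_path, "rb"))
    except FileNotFoundError:
        feedback, value, error = run()  # run() returns (feedback, df, exception-or-None)
        pickle.dump((feedback, value, error), open(cache_path, "wb"))
    if error is not None:
        raise error  # replay the cached failure
    return feedback, value
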
rdagent/core/evolving_agent.py (2 changes: 2 additions & 0 deletions)

@@ -78,6 +78,8 @@ def multistep_evolve(
             )
             # TODO: Due to design issues, we have chosen to ignore this mypy error.
             logger.log_object(evo.sub_workspace_list, tag="evolving code")  # type: ignore[attr-defined]
+            for sw in evo.sub_workspace_list:  # type: ignore[attr-defined]
+                logger.info(f"evolving code workspace: {sw}")
 
             # 4. Pack evolve results
             es = EvoStep(evo, queried_knowledge)
rdagent/core/experiment.py (5 changes: 5 additions & 0 deletions)

@@ -179,6 +179,11 @@ def execute(self) -> object | None:
             self.inject_code(**self.code_dict)
         return None
 
+    def __str__(self) -> str:
+        return f"Workspace[{self.workspace_path=}" + (
+            "]" if self.target_task is None else f",{self.target_task.name=}]"
+        )
+
 
 ASpecificWSForExperiment = TypeVar("ASpecificWSForExperiment", bound=Workspace)
 ASpecificWSForSubTasks = TypeVar("ASpecificWSForSubTasks", bound=Workspace)
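
The new __str__ leans on the f-string "=" specifier, which renders the expression text together with the repr of its value. A quick standalone check (the path is made up):

from pathlib import Path

workspace_path = Path("/tmp/ws")
print(f"Workspace[{workspace_path=}]")
# on POSIX: Workspace[workspace_path=PosixPath('/tmp/ws')]
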
