From 6a30909ff70c99d639a2b4d41bc9c3588679f501 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Thu, 15 Aug 2024 18:52:37 +0800 Subject: [PATCH] add relevance check to quant factors (#210) --- rdagent/core/conf.py | 2 +- .../factor_experiment_loader/pdf_loader.py | 45 ++++++++++++++++++- .../factor_experiment_loader/prompts.yaml | 40 +++++++++++++++++ 3 files changed, 85 insertions(+), 2 deletions(-) diff --git a/rdagent/core/conf.py b/rdagent/core/conf.py index 2e5e8613..a20828f0 100644 --- a/rdagent/core/conf.py +++ b/rdagent/core/conf.py @@ -96,7 +96,7 @@ class RDAgentSettings(BaseSettings): gcr_endpoint_max_token: int = 100 # factor extraction conf - max_input_duplicate_factor_group: int = 600 + max_input_duplicate_factor_group: int = 300 max_output_duplicate_factor_group: int = 20 # workspace conf diff --git a/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py b/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py index be3e54f8..97d16365 100644 --- a/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py +++ b/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py @@ -274,6 +274,49 @@ def merge_file_to_factor_dict_to_factor_dict( return factor_dict_simple_deduplication +def __check_factor_dict_relevance( + factor_df_string: str, +) -> dict[str, dict[str, str]]: + extract_result_resp = APIBackend().build_messages_and_create_chat_completion( + system_prompt=document_process_prompts["factor_relevance_system"], + user_prompt=factor_df_string, + json_mode=True, + ) + return json.loads(extract_result_resp) + + +def check_factor_relevance( + factor_dict: dict[str, dict[str, str]], +) -> tuple[dict[str, dict[str, str]], dict[str, dict[str, str]]]: + factor_relevance_dict = {} + + factor_df = pd.DataFrame(factor_dict).T + factor_df.index.names = ["factor_name"] + + while factor_df.shape[0] > 0: + result_list = multiprocessing_wrapper( + [ + (__check_factor_dict_relevance, (factor_df.iloc[i : i + 50, :].to_string(),)) + for i in range(0, 
factor_df.shape[0], 50) + ], + n=RD_AGENT_SETTINGS.multi_proc_n, + ) + + for result in result_list: + for factor_name, relevance in result.items(): + factor_relevance_dict[factor_name] = relevance + + factor_df = factor_df[~factor_df.index.isin(factor_relevance_dict)] + + filtered_factor_dict = { + factor_name: factor_dict[factor_name] + for factor_name in factor_dict + if factor_relevance_dict[factor_name]["relevance"] + } + + return factor_relevance_dict, filtered_factor_dict + + def __check_factor_dict_viability_simulate_json_mode( factor_df_string: str, ) -> dict[str, dict[str, str]]: @@ -425,7 +468,7 @@ def __deduplicate_factor_dict(factor_dict: dict[str, dict[str, str]]) -> list[li else: for k in range( len(full_str_list) // RD_AGENT_SETTINGS.max_input_duplicate_factor_group, - 30, + 40, ): kmeans_index_group = __kmeans_embeddings(embeddings=embeddings, k=k) if len(kmeans_index_group[0]) < RD_AGENT_SETTINGS.max_input_duplicate_factor_group: diff --git a/rdagent/scenarios/qlib/factor_experiment_loader/prompts.yaml b/rdagent/scenarios/qlib/factor_experiment_loader/prompts.yaml index 9221fb98..2c49a14b 100644 --- a/rdagent/scenarios/qlib/factor_experiment_loader/prompts.yaml +++ b/rdagent/scenarios/qlib/factor_experiment_loader/prompts.yaml @@ -143,6 +143,46 @@ factor_viability_system: |- } } +factor_relevance_system: |- + User has designed several factors in quant investment. Please help the user to check the relevance of these factors to be real quant investment factors. + These factors are used to build a daily frequency strategy in China A-share market. + + User will provide a pandas dataframe like table containing following information: + 1. The name of the factor; + 2. The simple description of the factor; + 3. The formulation of the factor in latex format; + 4. The description to the variables and functions in the formulation of the factor. + + A relevant factor should satisfy the following conditions: + 1. 
The factor should be able to be calculated in daily frequency; + 2. The factor should be able to be calculated based on each stock; + 3. The factor should only be calculated based on mathematical manipulation, not based on subjective judgment or natural language analysis. + + You should give a decision for each factor provided by the user. You should reject a factor only based on a very solid reason. + Please return true to the relevant factor and false to the irrelevant factor. + + Notice, you can just return part of the factors due to token limit. Your factor name should be the same as the user's factor name. + + Please respond with your decision in JSON format. Just respond the output json string without any interaction and explanation. + The JSON schema should include: + { + "Name to factor 1": + { + "relevance": true, + "reason": "The reason to the relevance of this factor" + }, + "Name to factor 2": + { + "relevance": false, + "reason": "The reason to the non-relevance of this factor" + }, + "Name to factor 3": + { + "relevance": true, + "reason": "The reason to the relevance of this factor" + } + } + factor_duplicate_system: |- User has designed several factors in quant investment. Please help the user to duplicate these factors.