From 6a30909ff70c99d639a2b4d41bc9c3588679f501 Mon Sep 17 00:00:00 2001
From: Xu Yang <peteryang@vip.qq.com>
Date: Thu, 15 Aug 2024 18:52:37 +0800
Subject: [PATCH] add relevance check to quant factors (#210)

---
 rdagent/core/conf.py                          |  2 +-
 .../factor_experiment_loader/pdf_loader.py    | 45 ++++++++++++++++++-
 .../factor_experiment_loader/prompts.yaml     | 40 +++++++++++++++++
 3 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/rdagent/core/conf.py b/rdagent/core/conf.py
index 2e5e8613..a20828f0 100644
--- a/rdagent/core/conf.py
+++ b/rdagent/core/conf.py
@@ -96,7 +96,7 @@ class RDAgentSettings(BaseSettings):
     gcr_endpoint_max_token: int = 100
 
     # factor extraction conf
-    max_input_duplicate_factor_group: int = 600
+    max_input_duplicate_factor_group: int = 300
     max_output_duplicate_factor_group: int = 20
 
     # workspace conf
diff --git a/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py b/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py
index be3e54f8..97d16365 100644
--- a/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py
+++ b/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py
@@ -274,6 +274,49 @@ def merge_file_to_factor_dict_to_factor_dict(
     return factor_dict_simple_deduplication
 
 
+def __check_factor_dict_relevance(  # one relevance query to the LLM for a single batch of factors
+    factor_df_string: str,  # factor table slice rendered as text; sent verbatim as the user prompt
+) -> dict[str, dict[str, str]]:  # NOTE(review): the prompt requests a boolean "relevance", so values are not all str
+    extract_result_resp = APIBackend().build_messages_and_create_chat_completion(
+        system_prompt=document_process_prompts["factor_relevance_system"],
+        user_prompt=factor_df_string,
+        json_mode=True,
+    )
+    return json.loads(extract_result_resp)  # raises json.JSONDecodeError if the reply is not valid JSON
+
+
+def check_factor_relevance(  # collect a relevance verdict per factor and keep only the relevant ones
+    factor_dict: dict[str, dict[str, str]],  # factor name -> attribute dict
+) -> tuple[dict[str, dict[str, str]], dict[str, dict[str, str]]]:  # (all verdicts, relevant factors only)
+    factor_relevance_dict = {}  # factor name -> verdict dict parsed from the LLM replies

+    factor_df = pd.DataFrame(factor_dict).T  # transpose so that each factor becomes one row
+    factor_df.index.names = ["factor_name"]

+    while factor_df.shape[0] > 0:  # NOTE(review): no retry cap; never ends if some name is never returned
+        result_list = multiprocessing_wrapper(  # presumably runs each (func, args) task, n at a time - confirm
+            [
+                (__check_factor_dict_relevance, (factor_df.iloc[i : i + 50, :].to_string(),))
+                for i in range(0, factor_df.shape[0], 50)  # one task per batch of up to 50 rows
+            ],
+            n=RD_AGENT_SETTINGS.multi_proc_n,
+        )

+        for result in result_list:
+            for factor_name, relevance in result.items():
+                factor_relevance_dict[factor_name] = relevance  # a later verdict overwrites an earlier one

+        factor_df = factor_df[~factor_df.index.isin(factor_relevance_dict)]  # keep only factors still unanswered

+    filtered_factor_dict = {
+        factor_name: factor_dict[factor_name]
+        for factor_name in factor_dict
+        if factor_relevance_dict[factor_name]["relevance"]  # truthy "relevance" keeps the factor
+    }

+    return factor_relevance_dict, filtered_factor_dict
+
+
 def __check_factor_dict_viability_simulate_json_mode(
     factor_df_string: str,
 ) -> dict[str, dict[str, str]]:
@@ -425,7 +468,7 @@ def __deduplicate_factor_dict(factor_dict: dict[str, dict[str, str]]) -> list[li
     else:
         for k in range(
             len(full_str_list) // RD_AGENT_SETTINGS.max_input_duplicate_factor_group,
-            30,
+            40,
         ):
             kmeans_index_group = __kmeans_embeddings(embeddings=embeddings, k=k)
             if len(kmeans_index_group[0]) < RD_AGENT_SETTINGS.max_input_duplicate_factor_group:
diff --git a/rdagent/scenarios/qlib/factor_experiment_loader/prompts.yaml b/rdagent/scenarios/qlib/factor_experiment_loader/prompts.yaml
index 9221fb98..2c49a14b 100644
--- a/rdagent/scenarios/qlib/factor_experiment_loader/prompts.yaml
+++ b/rdagent/scenarios/qlib/factor_experiment_loader/prompts.yaml
@@ -143,6 +143,46 @@ factor_viability_system: |-
         }
     }
 
+factor_relevance_system: |-
+    User has designed several factors in quant investment. Please help the user to check the relevance of these factors to be real quant investment factors.
+    These factors are used to build a daily frequency strategy in China A-share market.
+
+    User will provide a table formatted like a pandas DataFrame, containing the following information:
+    1. The name of the factor;
+    2. The simple description of the factor;
+    3. The formulation of the factor in latex format;
+    4. The description to the variables and functions in the formulation of the factor.
+
+    A relevant factor should satisfy the following conditions:
+    1. The factor should be able to be calculated in daily frequency;
+    2. The factor should be able to be calculated based on each stock;
+    3. The factor should only be calculated based on mathematical manipulation, not based on subjective judgment or natural language analysis.
+
+    You should give a decision for each factor provided by the user. Reject a factor only when there is a very solid reason.
+    Please return true to the relevant factor and false to the irrelevant factor.
+
+    Note that you may return only part of the factors because of the token limit. Your factor name should be the same as the user's factor name.
+
+    Please respond with your decision in JSON format. Just respond the output json string without any interaction and explanation.
+    The JSON schema should include:
+    {
+        "Name to factor 1":
+        {
+            "relevance": true,
+            "reason": "The reason to the relevance of this factor"
+        },
+        "Name to factor 2":
+        {
+            "relevance": false,
+            "reason": "The reason to the non-relevance of this factor"
+        },
+        "Name to factor 3":
+        {
+            "relevance": true,
+            "reason": "The reason to the relevance of this factor"
+        }
+    }
+
 
 factor_duplicate_system: |-
     User has designed several factors in quant investment. Please help the user to duplicate these factors.