Skip to content

Commit

Permalink
add relevance check to quant factors (#210)
Browse files Browse the repository at this point in the history
  • Loading branch information
peteryang1 authored Aug 15, 2024
1 parent 2942d33 commit 6a30909
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 2 deletions.
2 changes: 1 addition & 1 deletion rdagent/core/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ class RDAgentSettings(BaseSettings):
gcr_endpoint_max_token: int = 100

# factor extraction conf
max_input_duplicate_factor_group: int = 600
max_input_duplicate_factor_group: int = 300
max_output_duplicate_factor_group: int = 20

# workspace conf
Expand Down
45 changes: 44 additions & 1 deletion rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,49 @@ def merge_file_to_factor_dict_to_factor_dict(
return factor_dict_simple_deduplication


def __check_factor_dict_relevance(
    factor_df_string: str,
) -> dict[str, dict[str, str]]:
    """Ask the chat backend for relevance decisions on one batch of factors.

    Parameters
    ----------
    factor_df_string : str
        Text rendering of a factor table; it is sent verbatim as the user
        prompt.

    Returns
    -------
    dict[str, dict[str, str]]
        The backend reply parsed with ``json.loads``. The
        ``factor_relevance_system`` prompt asks for a mapping from factor
        name to an object holding ``"relevance"`` and ``"reason"``; the
        reply is not validated here.
    """
    backend = APIBackend()
    relevance_prompt = document_process_prompts["factor_relevance_system"]
    # json_mode=True requests a JSON reply so it can be parsed directly below.
    raw_reply = backend.build_messages_and_create_chat_completion(
        system_prompt=relevance_prompt,
        user_prompt=factor_df_string,
        json_mode=True,
    )
    parsed_reply = json.loads(raw_reply)
    return parsed_reply


def check_factor_relevance(
    factor_dict: dict[str, dict[str, str]],
    *,
    batch_size: int = 50,
    max_stalled_rounds: int = 5,
) -> tuple[dict[str, dict[str, str]], dict[str, dict[str, str]]]:
    """Collect a relevance decision for every factor and keep the relevant ones.

    The factors are rendered as a table, split into batches of ``batch_size``
    rows and sent to ``__check_factor_dict_relevance`` through
    ``multiprocessing_wrapper``. Factors for which no usable decision came
    back are re-submitted in the next round until every factor is decided.

    Parameters
    ----------
    factor_dict : dict[str, dict[str, str]]
        Mapping from factor name to that factor's attributes.
    batch_size : int, optional
        Number of factors placed in a single request (default 50).
    max_stalled_rounds : int, optional
        Number of consecutive rounds that may pass without a single new
        decision before giving up (default 5).

    Returns
    -------
    tuple[dict, dict]
        ``(factor_relevance_dict, filtered_factor_dict)``: every decision
        collected, keyed by factor name, and the subset of ``factor_dict``
        whose decision has a truthy ``"relevance"`` value.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    RuntimeError
        If ``max_stalled_rounds`` consecutive rounds yield no new decision,
        instead of re-submitting the same factors forever.
    """
    if batch_size <= 0:
        # A zero step makes range() fail; a negative one yields no batches,
        # which would spin the loop below without ever issuing a request.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    factor_relevance_dict = {}

    factor_df = pd.DataFrame(factor_dict).T
    factor_df.index.names = ["factor_name"]

    stalled_rounds = 0
    while factor_df.shape[0] > 0:
        result_list = multiprocessing_wrapper(
            [
                (__check_factor_dict_relevance, (factor_df.iloc[i : i + batch_size, :].to_string(),))
                for i in range(0, factor_df.shape[0], batch_size)
            ],
            n=RD_AGENT_SETTINGS.multi_proc_n,
        )

        for result in result_list:
            if not isinstance(result, dict):
                # Unusable reply for this batch; its factors stay pending.
                continue
            for factor_name, relevance in result.items():
                # Only accept well-formed decisions. A malformed entry is left
                # pending (and retried) rather than failing with KeyError in
                # the final filter.
                if isinstance(relevance, dict) and "relevance" in relevance:
                    factor_relevance_dict[factor_name] = relevance

        remaining_df = factor_df[~factor_df.index.isin(factor_relevance_dict)]
        if remaining_df.shape[0] == factor_df.shape[0]:
            # No pending factor was decided in this round.
            stalled_rounds += 1
            if stalled_rounds >= max_stalled_rounds:
                raise RuntimeError(
                    "No relevance decision could be obtained for factors: "
                    f"{list(remaining_df.index)}"
                )
        else:
            stalled_rounds = 0
        factor_df = remaining_df

    # The loop only ends once every factor name has a decision, so the lookup
    # below cannot miss.
    filtered_factor_dict = {
        factor_name: factor_dict[factor_name]
        for factor_name in factor_dict
        if factor_relevance_dict[factor_name]["relevance"]
    }

    return factor_relevance_dict, filtered_factor_dict


def __check_factor_dict_viability_simulate_json_mode(
factor_df_string: str,
) -> dict[str, dict[str, str]]:
Expand Down Expand Up @@ -425,7 +468,7 @@ def __deduplicate_factor_dict(factor_dict: dict[str, dict[str, str]]) -> list[li
else:
for k in range(
len(full_str_list) // RD_AGENT_SETTINGS.max_input_duplicate_factor_group,
30,
40,
):
kmeans_index_group = __kmeans_embeddings(embeddings=embeddings, k=k)
if len(kmeans_index_group[0]) < RD_AGENT_SETTINGS.max_input_duplicate_factor_group:
Expand Down
40 changes: 40 additions & 0 deletions rdagent/scenarios/qlib/factor_experiment_loader/prompts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,46 @@ factor_viability_system: |-
}
}
factor_relevance_system: |-
User has designed several factors in quant investment. Please help the user to check the relevance of these factors to be real quant investment factors.
These factors are used to build a daily frequency strategy in China A-share market.
User will provide a pandas dataframe like table containing following information:
1. The name of the factor;
2. The simple description of the factor;
3. The formulation of the factor in latex format;
4. The description to the variables and functions in the formulation of the factor.
A relevant factor should satisfy the following conditions:
1. The factor should be able to be calculated in daily frequency;
2. The factor should be able to be calculated based on each stock;
3. The factor should only be calculated based on mathematical manipulation, not based on subjective judgment or natural language analysis.
You should give decision to each factor provided by the user. You should reject the factor based on very solid reason.
Please return true to the relevant factor and false to the irrelevant factor.
Notice, you can just return part of the factors due to token limit. Your factor name should be the same as the user's factor name.
Please respond with your decision in JSON format. Just respond the output json string without any interaction and explanation.
The JSON schema should include:
{
"Name to factor 1":
{
"relevance": true,
"reason": "The reason to the relevance of this factor"
},
"Name to factor 2":
{
"relevance": false,
"reason": "The reason to the non-relevance of this factor"
},
"Name to factor 3":
{
"relevance": true,
"reason": "The reason to the relevance of this factor"
}
}
factor_duplicate_system: |-
User has designed several factors in quant investment. Please help the user to duplicate these factors.
Expand Down

0 comments on commit 6a30909

Please sign in to comment.