feat: add qlib_factor_strategy (#307)

* add qlib_factor_strategy * refine the code of action choosing * fix a bug * feat: template for kaggle (#308) * init for s3e26 * ci issue * fix a small bug in model runner which might cause error when model is the first try (#309) * update --------- Co-authored-by: Haoran Pan <[email protected]> Co-authored-by: Xu Yang <[email protected]>
microsoft · Sep 24, 2024 · f8f59ff · f8f59ff
1 parent 1f910a5
commit f8f59ff
Show file tree

Hide file tree

Showing 9 changed files with 49 additions and 5 deletions.
diff --git a/rdagent/app/kaggle/conf.py b/rdagent/app/kaggle/conf.py
@@ -53,5 +53,7 @@ class Config:
 
     rag_path: str = "git_ignore_folder/rag"
 
+    if_action_choosing_based_on_UCB: bool = False
+
 
 KAGGLE_IMPLEMENT_SETTING = KaggleBasePropSetting()
diff --git a/rdagent/scenarios/kaggle/developer/feedback.py b/rdagent/scenarios/kaggle/developer/feedback.py
@@ -157,7 +157,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
             "reason": reason,
         }
 
-        self.scen.vector_base.add_experience_to_vector_base(experiment_feedback)
+        # self.scen.vector_base.add_experience_to_vector_base(experiment_feedback)
 
         return HypothesisFeedback(
             observations=observations,

diff --git a/...arios/kaggle/experiment/forest-cover-type-prediction_template/model/model_randomforest.py b/...arios/kaggle/experiment/forest-cover-type-prediction_template/model/model_randomforest.py
@@ -50,4 +50,4 @@ def predict(model, X):
     # Predict using the trained model
     y_pred = model.predict(X_selected)
 
-    return y_pred
+    return y_pred.reshape(-1, 1)
diff --git a/.../scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_xgboost.py b/.../scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_xgboost.py
@@ -38,4 +38,4 @@ def predict(model, X):
     X = select(X)
     dtest = xgb.DMatrix(X)
     y_pred = model.predict(dtest)
-    return y_pred.astype(int)
+    return y_pred.astype(int).reshape(-1, 1)
diff --git a/rdagent/scenarios/kaggle/experiment/prompts.yaml b/rdagent/scenarios/kaggle/experiment/prompts.yaml
@@ -106,6 +106,7 @@ kg_feature_interface: |-
   ```
 
   To Note:
+  Top 0. Label encoding has already been completed, so please avoid one-hot encoding or similar operations from now on. Focus instead on targeted and efficient feature engineering techniques, such as normalizing float-type features, filtering based on specific categories, or other concise transformations that can be quickly implemented and tested without unnecessary complexity. Also, ensure that the index of the output DataFrame matches the original DataFrame's index, and that the number of columns remains consistent across train, validation, and test sets.
   1. Ensure that your code meets these requirements and produces a feature-engineered DataFrame that contains only the newly engineered columns, aligning with the user's data and objectives.
   2. Ensure that the index of the output DataFrame matches the index of the original DataFrame. For example:
     Incorrect: `normalized_df = pd.DataFrame(normalized_features, columns=X.columns)`

diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py
@@ -35,7 +35,7 @@ def __init__(self, competition: str) -> None:
         self.target_description = None
         self.competition_features = None
         self._analysis_competition_description()
-        self.if_action_choosing_based_on_UCB = False
+        self.if_action_choosing_based_on_UCB = KAGGLE_IMPLEMENT_SETTING.if_action_choosing_based_on_UCB
 
         self._background = self.background
 

diff --git a/rdagent/scenarios/kaggle/proposal/proposal.py b/rdagent/scenarios/kaggle/proposal/proposal.py
@@ -92,7 +92,7 @@ def __init__(self, scen: Scenario) -> Tuple[dict, bool]:
             "Feature engineering": 0.0,
             "Feature processing": 0.0,
             "Model feature selection": 0.0,
-            "Model tuning": 0.0,
+            "Model tuning": 0.5,
         }
         self.confidence_parameter = 1.0
         self.initial_performance = 0.0

diff --git a/rdagent/scenarios/qlib/experiment/factor_experiment.py b/rdagent/scenarios/qlib/experiment/factor_experiment.py
@@ -27,6 +27,7 @@ def __init__(self) -> None:
         self._source_data = deepcopy(get_data_folder_intro())
         self._output_format = deepcopy(prompt_dict["qlib_factor_output_format"])
         self._interface = deepcopy(prompt_dict["qlib_factor_interface"])
+        self._strategy = deepcopy(prompt_dict["qlib_factor_strategy"])
         self._simulator = deepcopy(prompt_dict["qlib_factor_simulator"])
         self._rich_style_description = deepcopy(prompt_dict["qlib_factor_rich_style_description"])
         self._experiment_setting = deepcopy(prompt_dict["qlib_factor_experiment_setting"])

diff --git a/rdagent/scenarios/qlib/experiment/prompts.yaml b/rdagent/scenarios/qlib/experiment/prompts.yaml
@@ -15,6 +15,46 @@ qlib_factor_interface: |-
   Your python code should contain the following parts: the import part, the function part, and the main part. You should write a main function named "calculate_{function_name}" and call this function in the `if __name__ == "__main__"` part. Don't write any try-except block in your python code. The user will catch the exception message and provide the feedback to you.
   The user will write your python code into a python file and execute the file directly with "python {your_file_name}.py". You should calculate the factor values and save the result into a HDF5(H5) file named "result.h5" in the same directory as your python file. The result file is a HDF5(H5) file containing a pandas dataframe. The index of the dataframe is the "datetime" and "instrument", and the single column name is the factor name, and the value is the factor value. The result file should be saved in the same directory as your python file.
 
+qlib_factor_strategy: |-
+  Ensure that for every step of data processing, the data format (including indexes) is clearly explained through comments.
+  Each transformation or calculation should be accompanied by a detailed description of how the data is structured, especially focusing on key aspects like whether the data has multi-level indexing, how to access specific columns or index levels, and any operations that affect the data shape (e.g., `reset_index()`, `groupby()`, `merge()`).
+  This step-by-step explanation will ensure clarity and accuracy in data handling. For example:
+  1. **Start with multi-level index**:  
+    ```python
+    # The initial DataFrame has a multi-level index with 'datetime' and 'instrument'.
+    # To access the 'datetime' index, use df.index.get_level_values('datetime').
+    datetime_values = df.index.get_level_values('datetime')
+    ```
+ 
+  2. **Reset the index if necessary**:  
+    ```python
+    # Resetting the index to move 'datetime' and 'instrument' from the index to columns.
+    # This operation flattens the multi-index structure.
+    df = df.reset_index()
+    ```
+ 
+  3. **Perform groupby operations**:  
+    ```python
+    # Grouping by 'datetime' and 'instrument' to aggregate the data.
+    # After groupby, the result will maintain 'datetime' and 'instrument' as a multi-level index.
+    df_grouped = df.groupby(['datetime', 'instrument']).sum()
+    ```
+ 
+  4. **Ensure consistent datetime formats**:  
+    ```python
+    # Before merging, ensure that the 'datetime' column in both DataFrames is of the same format.
+    # Convert to datetime format if necessary.
+    df['datetime'] = pd.to_datetime(df['datetime'])
+    other_df['datetime'] = pd.to_datetime(other_df['datetime'])
+    ```
+ 
+  5. **Merge operations**:  
+    ```python
+    # When merging DataFrames, ensure you are merging on both 'datetime' and 'instrument'.
+    # If these are part of the index, reset the index before merging.
+    merged_df = pd.merge(df, other_df, on=['datetime', 'instrument'], how='inner')
+    ```
+
 qlib_factor_output_format: |-
   Your output should be a pandas dataframe similar to the following example information:
   <class 'pandas.core.frame.DataFrame'>