From 3d36c452ff0983800e5343834cc69f24a508ea70 Mon Sep 17 00:00:00 2001
From: Way2Learn <118058822+Xisen-Wang@users.noreply.github.com>
Date: Wed, 25 Sep 2024 16:45:08 +0800
Subject: [PATCH] feat: support Multi output (#330)

* Key changes
* Revised to support submission specifications
* Revised to support submission specifications
* revise CI
* CI-Fix
* fixing-CI
* Support COSTEER Multi-Dimension for output & bug-fix
* Revised to support submission specifications
* revise CI
* CI-Fix
* fixing-CI
* Support COSTEER Multi-Dimension for output & bug-fix
* Linting
---
 .../scenarios/kaggle/experiment/prompts.yaml |  6 ++-
 .../scenarios/kaggle/experiment/scenario.py  | 43 +++++++++++++------
 2 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/rdagent/scenarios/kaggle/experiment/prompts.yaml b/rdagent/scenarios/kaggle/experiment/prompts.yaml
index f90669a8..eb09fe7b 100644
--- a/rdagent/scenarios/kaggle/experiment/prompts.yaml
+++ b/rdagent/scenarios/kaggle/experiment/prompts.yaml
@@ -314,7 +314,11 @@ kg_feature_simulator: |-
 
 kg_model_output_format: |-
   For feature related tasks, the output should be a pandas DataFrame with the new features. The columns should be the new features, and the rows should correspond to the number of samples in the input DataFrame.
-  For model related tasks, the output should be an np.ndarray with the appropriate number of predictions, each prediction being a single value. The output should be a 2D array with dimensions corresponding to the number of predictions and 1 column (e.g., (8, 1) if there are 8 predictions).
+  For model related tasks:
+    1. the output should be an np.ndarray with the appropriate number of predictions and the appropriate values within each prediction.
+    2. the output should be a 2D array with dimensions corresponding to the number of predictions and the number of values per prediction. E.g., if there are 4 predictions and each prediction contains 3 probabilities, the shape is (4, 3); if there are 8 predictions with a single value each, the shape is (8, 1).
+    3. please reference the competition's submission requirements and align with them.
+    Submission Requirements here:\n: {{submission_specifications}}
 
 kg_model_simulator: |-
   The models will be trained on the competition dataset and evaluated on their ability to predict the target. Metrics like accuracy and AUC-ROC is used to evaluate the model performance.
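As a quick illustration of the output contract the revised `kg_model_output_format` prompt describes (this sketch is not part of the patch; the arrays and their values are placeholders), a model task is expected to return a 2D `np.ndarray` whose second dimension matches the competition's submission format:

```python
import numpy as np

# Single-value predictions: 8 rows, one value each -> shape (8, 1)
single_output = np.zeros((8, 1))

# Multi-output predictions: 4 rows, 3 class probabilities each -> shape (4, 3)
multi_output = np.full((4, 3), 1 / 3)

assert single_output.ndim == 2 and single_output.shape == (8, 1)
assert multi_output.shape == (4, 3)
```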
diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py
index e4536ab7..914f2b61 100644
--- a/rdagent/scenarios/kaggle/experiment/scenario.py
+++ b/rdagent/scenarios/kaggle/experiment/scenario.py
@@ -25,9 +25,6 @@ def __init__(self, competition: str) -> None:
         self.competition = competition
         self.competition_descriptions = crawl_descriptions(competition)
         self._source_data = self.source_data
-        self._output_format = self.output_format
-        self._interface = self.interface
-        self._simulator = self.simulator
 
         self.competition_type = None
         self.competition_description = None
@@ -35,10 +32,15 @@ def __init__(self, competition: str) -> None:
         self.competition_features = None
         self.submission_specifications = None
         self._analysis_competition_description()
-        self.if_action_choosing_based_on_UCB = KAGGLE_IMPLEMENT_SETTING.if_action_choosing_based_on_UCB
+        # Move these assignments after _analysis_competition_description
+        self._output_format = self.output_format
+        self._interface = self.interface
+        self._simulator = self.simulator
 
         self._background = self.background
 
+        self.if_action_choosing_based_on_UCB = KAGGLE_IMPLEMENT_SETTING.if_action_choosing_based_on_UCB
+
     def _analysis_competition_description(self):
         sys_prompt = (
             Environment(undefined=StrictUndefined)
@@ -61,14 +63,25 @@ def _analysis_competition_description(self):
             json_mode=True,
         )
 
-        response_json_analysis = json.loads(response_analysis)
-        self.competition_type = response_json_analysis.get("Competition Type", "No type provided")
-        self.competition_description = response_json_analysis.get("Competition Description", "No description provided")
-        self.target_description = response_json_analysis.get("Target Description", "No target provided")
-        self.competition_features = response_json_analysis.get("Competition Features", "No features provided")
-        self.submission_specifications = response_json_analysis.get(
-            "Submission Specifications", "No submission requirements provided"
-        )
+        try:
+            response_json_analysis = json.loads(response_analysis)
+            self.competition_type = response_json_analysis.get("Competition Type", "No type provided")
+            self.competition_description = response_json_analysis.get(
+                "Competition Description", "No description provided"
+            )
+            self.target_description = response_json_analysis.get("Target Description", "No target provided")
+            self.competition_features = response_json_analysis.get("Competition Features", "No features provided")
+            self.submission_specifications = response_json_analysis.get(
+                "Submission Specifications", "No submission requirements provided"
+            )
+        except json.JSONDecodeError:
+            print(f"Failed to parse JSON response: {response_analysis}")
+            # Set default values if JSON parsing fails
+            self.competition_type = "Unknown"
+            self.competition_description = "No description available"
+            self.target_description = "No target available"
+            self.competition_features = "No features available"
+            self.submission_specifications = "No submission requirements available"
 
     def get_competition_full_desc(self) -> str:
         return f"""Competition Type: {self.competition_type}
@@ -137,7 +150,11 @@ def source_data(self) -> str:
 
     @property
     def output_format(self) -> str:
-        return prompt_dict["kg_model_output_format"]
+        return (
+            Environment(undefined=StrictUndefined)
+            .from_string(prompt_dict["kg_model_output_format"])
+            .render(submission_specifications=self.submission_specifications)
+        )
 
     @property
     def interface(self) -> str:
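For reference, a minimal sketch of the template-rendering pattern the reworked `output_format` property now relies on (assuming `jinja2` is installed; the inline template string and the submission-specification value below are made-up stand-ins, not taken from the repository or any competition):

```python
from jinja2 import Environment, StrictUndefined

# Stand-in for prompt_dict["kg_model_output_format"]; the real template
# lives in prompts.yaml and is rendered the same way.
template = "Submission Requirements here: {{ submission_specifications }}"

rendered = (
    Environment(undefined=StrictUndefined)
    .from_string(template)
    .render(submission_specifications="id, class_0_prob, class_1_prob, class_2_prob")
)
print(rendered)
```

`StrictUndefined` makes the render fail loudly if `submission_specifications` is missing, matching how the scenario's other prompts (e.g., the system prompt in `_analysis_competition_description`) are rendered.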