From da05927d630f1f03b505bdf05888d2e9582a53d2 Mon Sep 17 00:00:00 2001
From: you-n-g
Date: Thu, 27 Jun 2024 18:15:31 +0800
Subject: [PATCH] Fix model bug and push (#35)

---
 .../model_implementation/benchmark/eval.py    | 11 ++++++-
 .../model_implementation/one_shot/__init__.py |  4 ++-
 rdagent/model_implementation/task.py          | 33 ++++++++++++++-----
 3 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/rdagent/model_implementation/benchmark/eval.py b/rdagent/model_implementation/benchmark/eval.py
index 394b6f7e..7d9289be 100644
--- a/rdagent/model_implementation/benchmark/eval.py
+++ b/rdagent/model_implementation/benchmark/eval.py
@@ -19,7 +19,16 @@ class ModelImpValEval:
 
     Assumption:
     - If the model structure is similar, the output will change in similar way when we change the input.
-    - we try to initialize the model param in similar value. So only the model structure is different.
+
+    Challenge:
+    - The key difference from implementing factors is that model layers have parameters (factor operators typically have no parameters, or their parameters are given).
+    - We try to initialize the model parameters with similar values, so that only the model structure differs.
+
+    We compare the correlations of the following sequences:
+    - modelA[init1](input1).hidden_out1, modelA[init1](input2).hidden_out1, ...
+    - modelB[init1](input1).hidden_out1, modelB[init1](input2).hidden_out1, ...
+
+    For each hidden output we can calculate a correlation; the average correlation over all hidden outputs is the metric.
     """
 
     def evaluate(self, gt: ModelTaskImpl, gen: ModelTaskImpl):
diff --git a/rdagent/model_implementation/one_shot/__init__.py b/rdagent/model_implementation/one_shot/__init__.py
index 357f8c03..2e02fdb9 100644
--- a/rdagent/model_implementation/one_shot/__init__.py
+++ b/rdagent/model_implementation/one_shot/__init__.py
@@ -1,3 +1,4 @@
+import re
 from typing import Sequence
 
 from rdagent.oai.llm_utils import APIBackend
@@ -36,7 +37,8 @@ def generate(self, task_l: Sequence[ModelImplTask]) -> Sequence[ModelTaskImpl]:
             )
 
             # Extract the code part from the response
-            code = resp.split("```python")[1].split("```")[0]
+            match = re.search(r".*```[Pp]ython\n(.*)\n```.*", resp, re.DOTALL)
+            code = match.group(1) if match is not None else resp  # fall back to the raw response when no fenced code block is found
             mti.inject_code(**{"model.py": code})
             mti_l.append(mti)
         return mti_l
diff --git a/rdagent/model_implementation/task.py b/rdagent/model_implementation/task.py
index 13efba40..0a69aee4 100644
--- a/rdagent/model_implementation/task.py
+++ b/rdagent/model_implementation/task.py
@@ -41,18 +41,35 @@ def __init__(self, json_uri: str) -> None:
 
     def load(self, *argT, **kwargs) -> Sequence[ModelImplTask]:
         # TODO: we should load the tasks from json;
+        # NOTE: this version does not align with the ground-truth answer
+        # formula_info = {
+        #     "name": "Anti-Symmetric Deep Graph Network (A-DGN)",
+        #     "description": "A framework for stable and non-dissipative DGN design. It ensures long-range information preservation between nodes and prevents gradient vanishing or explosion during training.",
+        #     "formulation": "x_u^{(l)} = x_u^{(l-1)} + \\epsilon \\sigma \\left( W^T x_u^{(l-1)} + \\Phi(X^{(l-1)}, N_u) + b \\right)",
+        #     "variables": {
+        #         "x_u^{(l)}": "The state of node u at layer l",
+        #         "\\epsilon": "The step size in the Euler discretization",
+        #         "\\sigma": "A monotonically non-decreasing activation function",
+        #         "W": "An anti-symmetric weight matrix",
+        #         "X^{(l-1)}": "The node feature matrix at layer l-1",
+        #         "N_u": "The set of neighbors of node u",
+        #         "b": "A bias vector",
+        #     },
+        #     "key": "A-DGN",
+        # }
         formula_info = {
             "name": "Anti-Symmetric Deep Graph Network (A-DGN)",
             "description": "A framework for stable and non-dissipative DGN design. It ensures long-range information preservation between nodes and prevents gradient vanishing or explosion during training.",
-            "formulation": "x_u^{(l)} = x_u^{(l-1)} + \\epsilon \\sigma \\left( W^T x_u^{(l-1)} + \\Phi(X^{(l-1)}, N_u) + b \\right)",
+            "formulation": r"\mathbf{x}^{\prime}_i = \mathbf{x}_i + \epsilon \cdot \sigma \left( (\mathbf{W}-\mathbf{W}^T-\gamma \mathbf{I}) \mathbf{x}_i + \Phi(\mathbf{X}, \mathcal{N}_i) + \mathbf{b}\right),",
             "variables": {
-                "x_u^{(l)}": "The state of node u at layer l",
-                "\\epsilon": "The step size in the Euler discretization",
-                "\\sigma": "A monotonically non-decreasing activation function",
-                "W": "An anti-symmetric weight matrix",
-                "X^{(l-1)}": "The node feature matrix at layer l-1",
-                "N_u": "The set of neighbors of node u",
-                "b": "A bias vector",
+                r"\mathbf{x}_i": "The state of node i at the previous layer",
+                r"\epsilon": "The step size in the Euler discretization",
+                r"\sigma": "A monotonically non-decreasing activation function",
+                r"\Phi": "A graph convolutional operator",
+                r"\mathbf{W}": "A weight matrix (the update uses its anti-symmetric part W - W^T)",
+                r"\mathbf{x}^{\prime}_i": "The updated state of node i at the current layer",
+                r"\mathcal{N}_i": "The set of neighbors of node i",
+                r"\mathbf{b}": "A bias vector",
             },
             "key": "A-DGN",
         }
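
Note on the evaluation metric: the eval.py docstring above describes comparing, per hidden output, the correlation between the ground-truth model and the generated model over a set of shared random inputs. A minimal sketch of that computation follows; it assumes both models return a (prediction, hidden_output) pair and share a similar parameter initialization, which is an illustrative simplification rather than the actual ModelTaskImpl interface:

import torch

def hidden_output_correlation(model_a, model_b, n_inputs=16, in_dim=8):
    """Average per-dimension correlation between the hidden outputs of two models."""
    torch.manual_seed(0)
    hs_a, hs_b = [], []
    for _ in range(n_inputs):
        x = torch.randn(4, in_dim)       # the same random input is fed to both models
        _, h_a = model_a(x)              # hidden output of the ground-truth model
        _, h_b = model_b(x)              # hidden output of the generated model
        hs_a.append(h_a.flatten())
        hs_b.append(h_b.flatten())
    seq_a = torch.stack(hs_a)            # (n_inputs, hidden_dim): one sequence per hidden dimension
    seq_b = torch.stack(hs_b)
    corrs = []
    for d in range(seq_a.shape[1]):      # one correlation per hidden dimension
        c = torch.corrcoef(torch.stack([seq_a[:, d], seq_b[:, d]]))[0, 1]
        if not torch.isnan(c):           # skip dimensions that are constant across inputs
            corrs.append(c)
    return torch.stack(corrs).mean().item()  # the average correlation is the metric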
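
Note on the corrected formulation: the new "formulation" string is the A-DGN layer update, x'_i = x_i + eps * sigma((W - W^T - gamma*I) x_i + Phi(X, N_i) + b). A minimal sketch of one such update step in plain PyTorch; using tanh for sigma and mean aggregation over neighbors for Phi are illustrative assumptions, since A-DGN only requires a monotonically non-decreasing activation and some graph convolutional operator:

import torch

def adgn_update(x, adj, W, b, epsilon=0.1, gamma=0.1):
    """One A-DGN (Euler) step. x: (n, d) node states, adj: (n, n) float adjacency,
    W: (d, d) weight matrix, b: (d,) bias."""
    d = W.shape[0]
    anti = W - W.T - gamma * torch.eye(d)                  # the term (W - W^T - gamma*I)
    deg = adj.sum(dim=1, keepdim=True).clamp(min=1.0)      # node degrees (avoid division by zero)
    phi = (adj @ x) / deg                                  # Phi: mean aggregation over the neighbors N_i
    return x + epsilon * torch.tanh(x @ anti.T + phi + b)  # Euler discretization with step size epsilon

For example, adgn_update(torch.randn(5, 4), torch.ones(5, 5), torch.randn(4, 4), torch.zeros(4)) applies one step on a fully connected 5-node graph with 4-dimensional node states.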