
Commit

Merge branch 'main' into kaggle-running
WinstonLiyt committed Sep 30, 2024
2 parents 1f9d595 + 94c47d6 commit b022731
Showing 5 changed files with 23 additions and 22 deletions.
10 changes: 6 additions & 4 deletions rdagent/core/knowledge_base.py
@@ -13,13 +13,15 @@ def __init__(self, path: str | Path | None = None) -> None:
     def load(self) -> None:
         if self.path is not None and self.path.exists():
             with self.path.open("rb") as f:
-                self.__dict__.update(
-                    pickle.load(f).__dict__,
-                )  # TODO: because we need to align with init function, we need a less hacky way to do this
+                loaded = pickle.load(f)
+                if isinstance(loaded, dict):
+                    self.__dict__.update(loaded)
+                else:
+                    self.__dict__.update(loaded.__dict__)

     def dump(self) -> None:
         if self.path is not None:
             self.path.parent.mkdir(parents=True, exist_ok=True)
-            pickle.dump(self, self.path.open("wb"))
+            pickle.dump(self.__dict__, self.path.open("wb"))
         else:
             logger.warning("KnowledgeBase path is not set, dump failed.")
3 changes: 3 additions & 0 deletions rdagent/scenarios/kaggle/developer/feedback.py
@@ -177,6 +177,9 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
         elif self.scen.if_using_graph_rag:
             trace.knowledge_base.add_document(experiment_feedback, self.scen)

+        if self.scen.if_action_choosing_based_on_UCB:
+            self.scen.action_counts[hypothesis.action] += 1
+
         return HypothesisFeedback(
             observations=observations,
             hypothesis_evaluation=hypothesis_evaluation,
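Together with the proposal.py change below, this moves the UCB visit-count increment from action selection to feedback generation, so `action_counts` tracks experiments that actually completed. A small sketch of the bookkeeping (names are illustrative stand-ins, not the rdagent API):

```python
from collections import defaultdict

# Illustrative stand-ins for the scenario state.
if_action_choosing_based_on_UCB = True
action_counts: defaultdict[str, int] = defaultdict(int)


def on_feedback(action: str) -> None:
    # Count an action only once its experiment has produced feedback,
    # mirroring the increment added to generate_feedback above.
    if if_action_choosing_based_on_UCB:
        action_counts[action] += 1


on_feedback("Feature engineering")
on_feedback("Model tuning")
on_feedback("Feature engineering")
print(dict(action_counts))  # {'Feature engineering': 2, 'Model tuning': 1}
```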
7 changes: 4 additions & 3 deletions rdagent/scenarios/kaggle/experiment/prompts.yaml
@@ -11,8 +11,7 @@ kg_description_template:
       "Competition Features": "Two-line description of the overall features involved within the competition as background."
       "Submission Specifications": "The submission specification & sample submission csv descriptions for the model to output."
       "Submission channel number to each sample": "The number of channels in the output for each sample, e.g., 1 for regression, N for N class classification with probabilities, etc. A Integer. If not specified, it is 1."
-      "Evaluation Description": "A brief description for what metrics are used in evaluation. An explanation of whether a higher score is better or lower is better in terms of performance."
-      "Evaluation Boolean": "True" or "False" (True means the higher score the better (like accuracy); False means the lower value the better (like loss).)
+      "Evaluation Description": "A brief description of the metrics used in the evaluation. Please note that if `evaluation_metric_direction` is True, it indicates that higher values are better; if False, lower values are preferred."
       }
       Since these might be very similar column names in data like one_hot_encoded columns, you can use some regex to group them together.

@@ -22,10 +21,12 @@ kg_description_template:
       {{ competition_descriptions }}
       The raw data information:
       {{ raw_data_information }}
+      Evaluation_metric_direction:
+      {{ evaluation_metric_direction }}

 kg_background: |-
     You are solving a data science tasks and the type of the competition is {{ competition_type }}.
-    The competition description is:{{competition_description}}.
+    The competition description is: {{competition_description}}.
     We provide an overall script in file: train.py. The user will run the train.py script along with several feature and model scripts to train several model to get a good performance on this task.
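The template now receives `evaluation_metric_direction` as a render variable rather than asking the model to produce an "Evaluation Boolean". A minimal sketch of how such a template is rendered with Jinja2 (trimmed template text; the real one lives in prompts.yaml):

```python
from jinja2 import Environment, StrictUndefined

# Trimmed stand-in for the kg_description_template user prompt.
template_text = (
    "The raw data information:\n"
    "{{ raw_data_information }}\n"
    "Evaluation_metric_direction:\n"
    "{{ evaluation_metric_direction }}"
)

prompt = (
    Environment(undefined=StrictUndefined)
    .from_string(template_text)
    .render(
        raw_data_information="train.csv: 5000 rows, 12 columns",
        evaluation_metric_direction=True,  # True: higher scores are better
    )
)
print(prompt)
```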
9 changes: 3 additions & 6 deletions rdagent/scenarios/kaggle/experiment/scenario.py
@@ -50,7 +50,8 @@ def __init__(self, competition: str) -> None:
         self.submission_specifications = None
         self.model_output_channel = None
         self.evaluation_desc = None
-        self.evaluation_metric_direction = None
+        self.leaderboard = leaderboard_scores(competition)
+        self.evaluation_metric_direction = float(self.leaderboard[0]) > float(self.leaderboard[-1])
         self.vector_base = None
         self.mini_case = KAGGLE_IMPLEMENT_SETTING.mini_case
         self._analysis_competition_description()

@@ -75,8 +76,6 @@ def __init__(self, competition: str) -> None:
         self.confidence_parameter = 1.0
         self.initial_performance = 0.0

-        self.leaderboard = leaderboard_scores(competition)
-
     def _analysis_competition_description(self):
         sys_prompt = (
             Environment(undefined=StrictUndefined)

@@ -90,6 +89,7 @@ def _analysis_competition_description(self):
             .render(
                 competition_descriptions=self.competition_descriptions,
                 raw_data_information=self._source_data,
+                evaluation_metric_direction=self.evaluation_metric_direction,
             )
         )

@@ -111,9 +111,6 @@ def _analysis_competition_description(self):
         self.evaluation_desc = response_json_analysis.get(
             "Evaluation Description", "No evaluation specification provided."
         )
-        self.evaluation_metric_direction = response_json_analysis.get(
-            "Evaluation Boolean", "No evaluation specification provided."
-        )

     def get_competition_full_desc(self) -> str:
         evaluation_direction = "higher the better" if self.evaluation_metric_direction else "lower the better"
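The metric direction is now derived from the leaderboard rather than from the LLM's "Evaluation Boolean" answer: if the top score is numerically larger than the bottom score, higher is better. A minimal sketch, assuming `leaderboard_scores` returns scores ordered best-first:

```python
def metric_direction(leaderboard: list[float]) -> bool:
    """True if higher scores are better, assuming best-first ordering."""
    # Accuracy-like metrics rank high values first; loss-like metrics rank low values first.
    return float(leaderboard[0]) > float(leaderboard[-1])


print(metric_direction([0.99, 0.95, 0.80]))  # True  (accuracy-like)
print(metric_direction([0.12, 0.35, 0.90]))  # False (loss-like)
```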
16 changes: 7 additions & 9 deletions rdagent/scenarios/kaggle/proposal/proposal.py
@@ -130,13 +130,13 @@ def generate_RAG_content(self, trace: Trace, hypothesis_and_feedback: str, targe

         found_hypothesis_nodes = []
         for similar_node in similar_nodes:
-            for hypothesis_type in KG_ACTION_LIST:
-                hypothesis_nodes = trace.knowledge_base.get_nodes_within_steps(
-                    start_node=similar_node,
-                    steps=3,
-                    constraint_labels=[hypothesis_type],
-                )
-                found_hypothesis_nodes.extend(hypothesis_nodes[:2])
+            # for hypothesis_type in KG_ACTION_LIST:
+            hypothesis_nodes = trace.knowledge_base.get_nodes_within_steps(
+                start_node=similar_node,
+                steps=3,
+                constraint_labels=[target],
+            )
+            found_hypothesis_nodes.extend(hypothesis_nodes[:2])

         found_hypothesis_nodes = sorted(list(set(found_hypothesis_nodes)), key=lambda x: len(x.content))
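The retrieval above previously fanned out over every action type in KG_ACTION_LIST; it is now constrained to the current `target`, so each similar node triggers one on-topic graph query instead of one per action type. A sketch of the pattern with a stub in place of the real graph (hypothetical stand-in, not the rdagent knowledge base):

```python
def get_nodes_within_steps(start_node: str, steps: int, constraint_labels: list[str]) -> list[str]:
    # Stub: the real method walks the knowledge graph; here we fake two hits per query.
    return [f"{constraint_labels[0]} hypothesis {i} near {start_node}" for i in range(2)]


similar_nodes = ["node_a", "node_b"]
target = "Feature engineering"  # the single action type being proposed

found_hypothesis_nodes = []
for similar_node in similar_nodes:
    hypothesis_nodes = get_nodes_within_steps(
        start_node=similar_node,
        steps=3,
        constraint_labels=[target],  # constrained to the target action only
    )
    found_hypothesis_nodes.extend(hypothesis_nodes[:2])

# Deduplicate and prefer shorter content, as the original does.
found_hypothesis_nodes = sorted(set(found_hypothesis_nodes), key=len)
print(found_hypothesis_nodes)
```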

@@ -204,7 +204,6 @@ def execute_next_action(self, trace: Trace) -> str:
         for action in actions:
             if self.scen.action_counts[action] == 0:
                 selected_action = action
-                self.scen.action_counts[selected_action] += 1
                 return selected_action

         c = self.scen.confidence_parameter

@@ -216,7 +215,6 @@ def execute_next_action(self, trace: Trace) -> str:
             ucb_values[action] = ucb
         # Select action with highest UCB
         selected_action = max(ucb_values, key=ucb_values.get)
-        self.scen.action_counts[selected_action] += 1

         return selected_action
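With the increments removed, `execute_next_action` is now a read-only UCB selection: untried actions are returned first, then the action maximizing estimated reward plus the exploration bonus c * sqrt(ln(total) / count). A self-contained sketch under those assumptions (the exact reward term in rdagent may differ):

```python
import math


def select_action_ucb(
    actions: list[str],
    action_counts: dict[str, int],
    reward_estimates: dict[str, float],
    c: float = 1.0,
) -> str:
    # Bootstrap: try every action at least once before trusting the estimates.
    for action in actions:
        if action_counts[action] == 0:
            return action

    total = sum(action_counts.values())
    ucb_values = {
        action: reward_estimates[action]
        + c * math.sqrt(math.log(total) / action_counts[action])
        for action in actions
    }
    # Pure selection: the count is incremented later, when feedback arrives.
    return max(ucb_values, key=ucb_values.get)


counts = {"Feature engineering": 3, "Model tuning": 1}
rewards = {"Feature engineering": 0.62, "Model tuning": 0.55}
print(select_action_ucb(list(counts), counts, rewards))  # Model tuning
```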
