Commit

add ranking in kaggle scenario
WinstonLiyt committed Sep 29, 2024
1 parent 9a8960c commit c15cf0d
Showing 7 changed files with 45 additions and 36 deletions.
11 changes: 11 additions & 0 deletions rdagent/scenarios/kaggle/developer/feedback.py
@@ -148,6 +148,16 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace):
        new_hypothesis = response_json.get("New Hypothesis", "No new hypothesis provided")
        reason = response_json.get("Reasoning", "No reasoning provided")
        decision = convert2bool(response_json.get("Replace Best Result", "no"))
        leaderboard = self.scen.leaderboard
        current_score = current_result.iloc[0]
        sorted_scores = sorted(leaderboard, reverse=True)
        import bisect

        if self.scen.evaluation_metric_direction:
            insert_position = bisect.bisect_right([-score for score in sorted_scores], -current_score)
        else:
            insert_position = bisect.bisect_left(sorted_scores, current_score, lo=0, hi=len(sorted_scores))
        percentile_ranking = (insert_position) / (len(sorted_scores)) * 100

        experiment_feedback = {
            "current_competition": self.scen.get_competition_full_desc(),
@@ -158,6 +168,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace):
"observations": observations,
"hypothesis_evaluation": hypothesis_evaluation,
"reason": reason,
"percentile_ranking": percentile_ranking,
}

if self.scen.if_using_vector_rag:
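The new feedback field boils down to finding where the current score would be inserted into the sorted leaderboard. A minimal standalone sketch of that logic follows; the function name, signature, and example scores are illustrative rather than part of the commit, and the lower-is-better branch is simplified relative to the diff:

```python
import bisect

def percentile_ranking(leaderboard: list[float], current_score: float, higher_is_better: bool = True) -> float:
    """Where the current score would land on the leaderboard, as a percentage (illustrative sketch)."""
    sorted_scores = sorted(leaderboard, reverse=True)  # best-to-worst for higher-is-better metrics
    if higher_is_better:
        # bisect expects an ascending sequence, so bisect the negated scores
        insert_position = bisect.bisect_right([-s for s in sorted_scores], -current_score)
    else:
        # for lower-is-better metrics an ascending sort can be bisected directly
        insert_position = bisect.bisect_left(sorted(leaderboard), current_score)
    return insert_position / len(sorted_scores) * 100

# Example: two of the four entries beat a score of 0.82, so the ranking is 50.0
print(percentile_ranking([0.90, 0.85, 0.80, 0.75], 0.82))
```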
@@ -15,7 +15,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
Define and train the Random Forest model. Merge feature selection into the pipeline.
"""
# Initialize the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=32, n_jobs=-1)
model = RandomForestClassifier(n_estimators=200, random_state=32, n_jobs=-1)

# Fit the model
model.fit(X_train, y_train)
@@ -16,7 +16,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
"tree_method": "gpu_hist",
"device": "cuda",
}
num_round = 180
num_round = 200

evallist = [(dtrain, "train"), (dvalid, "eval")]
bst = xgb.train(params, dtrain, num_round, evallist)
@@ -88,7 +88,7 @@ def import_module_from_path(module_name, module_path):
metrics_all.append(metrics)

# 5) Save the validation accuracy
min_index = np.argmin(metrics_all)
min_index = np.argmax(metrics_all)
pd.Series(data=[metrics_all[min_index]], index=["MCC"]).to_csv("submission_score.csv")

# 6) Make predictions on the test set and save them
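With np.argmin switched to np.argmax, the saved validation score is now the best (highest) MCC across the trained models rather than the worst, since MCC is a higher-is-better metric. A tiny illustration with made-up metric values:

```python
import numpy as np

metrics_all = [0.41, 0.57, 0.49]          # made-up MCC of each candidate model
best_index = int(np.argmax(metrics_all))  # 1 -> the model with the highest MCC
print(best_index, metrics_all[best_index])
```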
7 changes: 6 additions & 1 deletion rdagent/scenarios/kaggle/experiment/scenario.py
@@ -12,7 +12,10 @@
from rdagent.core.scenario import Scenario
from rdagent.oai.llm_utils import APIBackend
from rdagent.scenarios.kaggle.experiment.kaggle_experiment import KGFactorExperiment
from rdagent.scenarios.kaggle.kaggle_crawler import crawl_descriptions
from rdagent.scenarios.kaggle.kaggle_crawler import (
crawl_descriptions,
leaderboard_scores,
)
from rdagent.scenarios.kaggle.knowledge_management.vector_base import (
KaggleExperienceBase,
)
@@ -71,6 +74,8 @@ def __init__(self, competition: str) -> None:
self.confidence_parameter = 1.0
self.initial_performance = 0.0

self.leaderboard = leaderboard_scores(competition)

def _analysis_competition_description(self):
sys_prompt = (
Environment(undefined=StrictUndefined)
34 changes: 23 additions & 11 deletions rdagent/scenarios/kaggle/kaggle_crawler.py
@@ -87,6 +87,15 @@ def download_data(competition: str, local_path: str = "/data/userdata/share/kaggle"):
zip_ref.extractall(data_path)


def leaderboard_scores(competition: str) -> list[float]:
    from kaggle.api.kaggle_api_extended import KaggleApi

    api = KaggleApi()
    api.authenticate()
    ll = api.competition_leaderboard_view(competition)
    return [float(x.score) for x in ll]


def download_notebooks(
competition: str, local_path: str = "/data/userdata/share/kaggle/notebooks", num: int = 15
) -> None:
@@ -254,15 +263,18 @@ def collect_knowledge_texts(local_path: str = "/data/userdata/share/kaggle") ->
"facebook-v-predicting-check-ins",
]

all_cs = mini_case_cs + other_cs
for c in all_cs:
convert_notebooks_to_text(c)
exit()
from kaggle.api.kaggle_api_extended import KaggleApi
# all_cs = mini_case_cs + other_cs
# for c in all_cs:
# convert_notebooks_to_text(c)
# exit()
# from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()
cs = api.competitions_list()
for c in cs:
name = c.ref.split("/")[-1]
crawl_descriptions(name)
# api = KaggleApi()
# api.authenticate()
# cs = api.competitions_list()
# for c in cs:
# name = c.ref.split("/")[-1]
# crawl_descriptions(name)
res = leaderboard_scores(competition="playground-series-s4e8")

# %%
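The new leaderboard_scores helper requires the kaggle package and valid API credentials (typically ~/.kaggle/kaggle.json). A hedged usage sketch; the competition slug is the same example used in the diff above:

```python
# Illustrative only: assumes the `kaggle` package is installed and
# Kaggle API credentials are configured locally.
from rdagent.scenarios.kaggle.kaggle_crawler import leaderboard_scores

scores = leaderboard_scores("playground-series-s4e8")  # one float per returned leaderboard entry
print(len(scores), max(scores), min(scores))
```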
23 changes: 2 additions & 21 deletions rdagent/scenarios/kaggle/proposal/proposal.py
@@ -15,11 +15,9 @@
ModelHypothesisGen,
)
from rdagent.core.exception import ModelEmptyError
from rdagent.core.exception import ModelEmptyError
from rdagent.core.prompts import Prompts
from rdagent.core.proposal import Hypothesis, Scenario, Trace
from rdagent.scenarios.kaggle.experiment.kaggle_experiment import (
KG_SELECT_MAPPING,
KG_SELECT_MAPPING,
KGFactorExperiment,
KGModelExperiment,
@@ -88,10 +86,8 @@ def generate_RAG_content(self, trace: Trace, hypothesis_and_feedback: str, target
if self.scen.if_using_vector_rag:
if self.scen.mini_case:
rag_results, _ = self.scen.vector_base.search_experience(target, hypothesis_and_feedback, topk_k=1)
rag_results, _ = self.scen.vector_base.search_experience(target, hypothesis_and_feedback, topk_k=1)
else:
rag_results, _ = self.scen.vector_base.search_experience(target, hypothesis_and_feedback, topk_k=5)
rag_results, _ = self.scen.vector_base.search_experience(target, hypothesis_and_feedback, topk_k=5)
return "\n".join([doc.content for doc in rag_results])
if self.scen.if_using_graph_rag is False or trace.knowledge_base is None:
return None
@@ -196,42 +192,32 @@ def update_reward_estimates(self, trace: Trace) -> None:
n_o = self.scen.action_counts[last_action]
mu_o = self.scen.reward_estimates[last_action]
self.scen.scen.reward_estimates[last_action] += (reward - mu_o) / n_o
n_o = self.scen.action_counts[last_action]
mu_o = self.scen.reward_estimates[last_action]
self.scen.scen.reward_estimates[last_action] += (reward - mu_o) / n_o
else:
# First iteration, nothing to update
pass

def execute_next_action(self, trace: Trace) -> str:
actions = list(self.scen.action_counts.keys())
t = sum(self.scen.action_counts.values()) + 1
actions = list(self.scen.action_counts.keys())
t = sum(self.scen.action_counts.values()) + 1

# If any action has not been tried yet, select it
for action in actions:
if self.scen.action_counts[action] == 0:
if self.scen.action_counts[action] == 0:
selected_action = action
self.scen.action_counts[selected_action] += 1
self.scen.action_counts[selected_action] += 1
return selected_action

c = self.scen.confidence_parameter
c = self.scen.confidence_parameter
ucb_values = {}
for action in actions:
mu_o = self.scen.reward_estimates[action]
n_o = self.scen.action_counts[action]
mu_o = self.scen.reward_estimates[action]
n_o = self.scen.action_counts[action]
ucb = mu_o + c * math.sqrt(math.log(t) / n_o)
ucb_values[action] = ucb
# Select action with highest UCB
selected_action = max(ucb_values, key=ucb_values.get)
self.scen.action_counts[selected_action] += 1
self.scen.action_counts[selected_action] += 1

return selected_action

def prepare_context(self, trace: Trace) -> Tuple[dict, bool]:
@@ -307,8 +293,6 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, bool]:
for experiment in experiment_list:
for sub_task in experiment.sub_tasks:
model_list.extend(sub_task.get_task_information())
for sub_task in experiment.sub_tasks:
model_list.extend(sub_task.get_task_information())

return {
"target_hypothesis": str(hypothesis),
Expand Down Expand Up @@ -355,10 +339,7 @@ def convert_model_experiment(self, response: str, trace: Trace) -> KGModelExperi
f"Invalid model type '{model_type}'. Allowed model types are: {', '.join(KG_SELECT_MAPPING)}."
)
model_type = response_dict.get("model_type", "Model type not provided")
if model_type not in KG_SELECT_MAPPING:
raise ModelEmptyError(
f"Invalid model type '{model_type}'. Allowed model types are: {', '.join(KG_SELECT_MAPPING)}."
)

tasks.append(
ModelTask(
name=response_dict.get("model_name", "Model name not provided"),
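The de-duplicated execute_next_action follows a standard UCB1 policy: try every action once, then pick the action with the highest upper confidence bound. A self-contained sketch under that reading; the action names and reward numbers are invented, and the count update performed by the original method is omitted:

```python
import math

def select_action(action_counts: dict[str, int], reward_estimates: dict[str, float],
                  confidence_parameter: float = 1.0) -> str:
    """UCB1-style selection over the scenario's actions (illustrative sketch)."""
    t = sum(action_counts.values()) + 1
    # Any untried action is selected first
    for action, count in action_counts.items():
        if count == 0:
            return action
    # Otherwise pick the action with the highest upper confidence bound
    ucb_values = {
        action: reward_estimates[action]
        + confidence_parameter * math.sqrt(math.log(t) / action_counts[action])
        for action in action_counts
    }
    return max(ucb_values, key=ucb_values.get)

# Invented numbers: the less-explored action wins on its confidence bonus
print(select_action({"feature": 3, "model": 1}, {"feature": 0.2, "model": 0.5}))  # "model"
```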
