Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: supporting various Kaggle competitions & scenarios for RD-Agent #409

Open
wants to merge 18 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 14 additions & 13 deletions rdagent/app/kaggle/loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,21 +103,22 @@ def running(self, prev_out: dict[str, Any]):
if KAGGLE_IMPLEMENT_SETTING.auto_submit:
csv_path = exp.experiment_workspace.workspace_path / "submission.csv"
try:
subprocess.run(
[
"kaggle",
"competitions",
"submit",
"-f",
str(csv_path.absolute()),
"-m",
str(csv_path.parent.absolute()),
KAGGLE_IMPLEMENT_SETTING.competition,
],
check=True,
)
command = [
"kaggle",
"competitions",
"submit",
"-f",
str(csv_path.absolute()),
"-m",
str(csv_path.parent.absolute()),
KAGGLE_IMPLEMENT_SETTING.competition,
]
logger.info(f"Executing Kaggle API command: {' '.join(command)}")
result = subprocess.run(command, check=True, capture_output=True, text=True)
logger.info(f"Kaggle API output: {result.stdout}")
except subprocess.CalledProcessError as e:
logger.error(f"Auto submission failed: \n{e}")
logger.error(f"Kaggle API error output: {e.stderr}")
except Exception as e:
logger.error(f"Other exception when use kaggle api:\n{e}")

Expand Down
31 changes: 31 additions & 0 deletions rdagent/log/ui/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@
if "lround" not in state:
state.lround = 0 # RD Loop Round

if "times" not in state:
state.times = defaultdict(lambda: defaultdict(list))

if "erounds" not in state:
state.erounds = defaultdict(int) # Evolving Rounds in each RD Loop

Expand Down Expand Up @@ -186,6 +189,17 @@ def get_msgs_until(end_func: Callable[[Message], bool] = lambda _: True):
)

state.msgs[state.lround][msg.tag].append(msg)

# Update Times
if "init" in tags:
state.times[state.lround]["init"].append(msg.timestamp)
if "r" in tags:
state.times[state.lround]["r"].append(msg.timestamp)
if "d" in tags:
state.times[state.lround]["d"].append(msg.timestamp)
if "ef" in tags:
state.times[state.lround]["ef"].append(msg.timestamp)

# Stop Getting Logs
if end_func(msg):
break
Expand Down Expand Up @@ -224,6 +238,7 @@ def refresh(same_trace: bool = False):
state.last_msg = None
state.current_tags = []
state.alpha158_metrics = None
state.times = defaultdict(lambda: defaultdict(list))


def evolving_feedback_window(wsf: FactorSingleFeedback | ModelCoderFeedback):
Expand Down Expand Up @@ -741,6 +756,18 @@ def evolving_window():
st.markdown(state.scenario.rich_style_description + css, unsafe_allow_html=True)


def show_times(round: int):
    """
    Render the elapsed time of every recorded stage of one loop round.

    For each stage key stored in ``state.times[round]`` the elapsed time is the
    difference between the last and the first recorded timestamp. It is shown
    through ``st.markdown`` as whole minutes plus the remaining seconds.

    :param round: loop round whose stage timings are displayed.
    """
    for stage, stamps in state.times[round].items():
        if not stamps:
            # No timestamp recorded for this stage, so there is nothing to show.
            continue
        # With a single timestamp first == last, which yields a zero duration.
        elapsed = stamps[-1] - stamps[0]
        # ``timedelta.seconds`` drops whole days; ``total_seconds()`` keeps them,
        # so stages longer than 24 hours are reported correctly.
        minutes, seconds = divmod(int(elapsed.total_seconds()), 60)
        st.markdown(f"**:blue[{stage}]**: :red[**{minutes}**] minutes :orange[**{seconds}**] seconds")


if state.scenario is not None:
summary_window()

Expand All @@ -754,8 +781,12 @@ def evolving_window():
round = st.radio("**Loops**", horizontal=True, options=r_options, index=state.lround - 1)
else:
round = 1

show_times(round)
rf_c, d_c = st.columns([2, 2])
elif isinstance(state.scenario, GeneralModelScenario):
show_times(round)

rf_c = st.container()
d_c = st.container()
round = 1
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Pass every feature through, flattening multi-level column labels.

    A frame with single-level columns is returned untouched. Otherwise the
    column labels are replaced in place by the "_"-joined string form of each
    label tuple, and the same frame is returned.
    """
    # No real feature selection yet: every column is kept.
    if X.columns.nlevels != 1:
        flattened = []
        for label in X.columns.values:
            flattened.append("_".join(map(str, label)).strip())
        X.columns = flattened
    return X
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Pass every feature through, flattening multi-level column labels.

    A frame with single-level columns is returned untouched. Otherwise the
    column labels are replaced in place by the "_"-joined string form of each
    label tuple, and the same frame is returned.
    """
    # No real feature selection yet: every column is kept.
    if X.columns.nlevels != 1:
        flattened = []
        for label in X.columns.values:
            flattened.append("_".join(map(str, label)).strip())
        X.columns = flattened
    return X
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,14 @@ def import_module_from_path(module_name, module_path):
metrics_all.append(accuracy)

# 5) Save the validation accuracy
min_index = np.argmax(metrics_all)
pd.Series(data=[metrics_all[min_index]], index=["multi-class accuracy"]).to_csv("submission_score.csv")
max_index = np.argmax(metrics_all)
pd.Series(data=[metrics_all[max_index]], index=["multi-class accuracy"]).to_csv("submission_score.csv")

# 6) Submit predictions for the test
ids = range(1, len(X_test) + 1)

# TODO: fix selection
print(X_valid_selected.columns)
y_test_pred = model_l[min_index][1](model_l[min_index][0], model_l[min_index][2].select(X_test)).flatten()
y_test_pred = model_l[max_index][1](model_l[max_index][0], model_l[max_index][2].select(X_test)).flatten()
submission_result = pd.DataFrame({"ImageId": ids, "Label": y_test_pred})
submission_result.to_csv("submission.csv", index=False)
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Pass every feature through, flattening multi-level column labels.

    A frame with single-level columns is returned untouched. Otherwise the
    column labels are replaced in place by the "_"-joined string form of each
    label tuple, and the same frame is returned.
    """
    # No real feature selection yet: every column is kept.
    if X.columns.nlevels != 1:
        flattened = []
        for label in X.columns.values:
            flattened.append("_".join(map(str, label)).strip())
        X.columns = flattened
    return X
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,12 @@ def import_module_from_path(module_name, module_path):
metrics_all.append(accuracy)

# 5) Save the validation accuracy
min_index = np.argmax(metrics_all)
pd.Series(data=[metrics_all[min_index]], index=["multi-class accuracy"]).to_csv("submission_score.csv")
max_index = np.argmax(metrics_all)
pd.Series(data=[metrics_all[max_index]], index=["multi-class accuracy"]).to_csv("submission_score.csv")

# 6) Make predictions on the test set and save them
X_test_selected = model_l[min_index][2].select(X_test.copy())
y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test_selected).flatten() + 1
X_test_selected = model_l[max_index][2].select(X_test.copy())
y_test_pred = model_l[max_index][1](model_l[max_index][0], X_test_selected).flatten() + 1


# 7) Submit predictions for the test set
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Pass every feature through, flattening multi-level column labels.

    A frame with single-level columns is returned untouched. Otherwise the
    column labels are replaced in place by the "_"-joined string form of each
    label tuple, and the same frame is returned.
    """
    # No real feature selection yet: every column is kept.
    if X.columns.nlevels != 1:
        flattened = []
        for label in X.columns.values:
            flattened.append("_".join(map(str, label)).strip())
        X.columns = flattened
    return X
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Pass every feature through, flattening multi-level column labels.

    A frame with single-level columns is returned untouched. Otherwise the
    column labels are replaced in place by the "_"-joined string form of each
    label tuple, and the same frame is returned.
    """
    # No real feature selection yet: every column is kept.
    if X.columns.nlevels != 1:
        flattened = []
        for label in X.columns.values:
            flattened.append("_".join(map(str, label)).strip())
        X.columns = flattened
    return X
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Pass every feature through, flattening multi-level column labels.

    A frame with single-level columns is returned untouched. Otherwise the
    column labels are replaced in place by the "_"-joined string form of each
    label tuple, and the same frame is returned.
    """
    # No real feature selection yet: every column is kept.
    if X.columns.nlevels != 1:
        flattened = []
        for label in X.columns.values:
            flattened.append("_".join(map(str, label)).strip())
        X.columns = flattened
    return X
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Pass every feature through, flattening multi-level column labels.

    A frame with single-level columns is returned untouched. Otherwise the
    column labels are replaced in place by the "_"-joined string form of each
    label tuple, and the same frame is returned.
    """
    # No real feature selection yet: every column is kept.
    if X.columns.nlevels != 1:
        flattened = []
        for label in X.columns.values:
            flattened.append("_".join(map(str, label)).strip())
        X.columns = flattened
    return X
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split


def preprocess_script():
    """
    Load the competition data and return the train/validation/test pieces.

    If /kaggle/input/X_train.pkl exists, the pickled splits are loaded and
    returned, followed by the unpacked contents of others.pkl. Otherwise
    train.csv is split 80/20 into training and validation sets (target column
    "yield", "id" dropped), and test.csv supplies the test features and ids.
    In both paths the training and validation targets get a fresh 0..n-1 index.
    """
    if not os.path.exists("/kaggle/input/X_train.pkl"):
        # No cached splits: build them from the raw CSV files.
        train_df = pd.read_csv("/kaggle/input/train.csv")
        features = train_df.drop(["yield", "id"], axis=1)
        target = train_df["yield"]
        X_train, X_valid, y_train, y_valid = train_test_split(
            features, target, test_size=0.2, random_state=2023
        )
        y_train = pd.Series(y_train).reset_index(drop=True)
        y_valid = pd.Series(y_valid).reset_index(drop=True)

        test_df = pd.read_csv("/kaggle/input/test.csv")
        ids = test_df["id"]
        X_test = test_df.drop(["id"], axis=1)
        return X_train, X_valid, y_train, y_valid, X_test, ids

    # Cached splits are available: load them as they were stored.
    X_train = pd.read_pickle("/kaggle/input/X_train.pkl")
    X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl")
    y_train = pd.read_pickle("/kaggle/input/y_train.pkl")
    y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl")
    X_test = pd.read_pickle("/kaggle/input/X_test.pkl")
    others = pd.read_pickle("/kaggle/input/others.pkl")

    y_train = pd.Series(y_train).reset_index(drop=True)
    y_valid = pd.Series(y_valid).reset_index(drop=True)
    return X_train, X_valid, y_train, y_valid, X_test, *others
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pandas as pd

"""
Here is the feature engineering code for each task, with a class that has a fit and transform method.
Remember
"""


class IdentityFeature:
    """Feature-engineering step that leaves the data unchanged."""

    def fit(self, train_df: pd.DataFrame):
        """
        Fit the feature engineering model to the training data.

        Nothing is learned here, so the call does nothing and returns None.
        """
        return None

    def transform(self, X: pd.DataFrame):
        """
        Transform the input data by handing back the very same object.
        """
        unchanged = X
        return unchanged


# Name under which this module exposes its feature-engineering class.
feature_engineering_cls = IdentityFeature
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import pandas as pd
from sklearn.ensemble import RandomForestRegressor


def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
    """
    Build a RandomForestRegressor with fixed hyper-parameters and train it.

    Only X_train and y_train are used for training; X_valid and y_valid are
    accepted but not read here. Returns the fitted model.
    """
    regressor = RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features="sqrt",
        random_state=2023,
        n_jobs=-1,
        verbose=1,
    )
    regressor.fit(X_train, y_train)
    return regressor


def predict(model, X_test):
    """
    Run the model on X_test and return its predictions as a single column.

    The output of ``model.predict`` is reshaped to (-1, 1).
    """
    return model.predict(X_test).reshape(-1, 1)
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""
motivation of the model
"""

import pandas as pd
import xgboost as xgb


def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
    """
    Build an XGBRegressor with fixed hyper-parameters and train it.

    Only X_train and y_train are used for training; X_valid and y_valid are
    accepted but not read here. Returns the fitted model.
    """
    regressor = xgb.XGBRegressor(
        n_estimators=280,
        learning_rate=0.05,
        max_depth=10,
        subsample=1.0,
        colsample_bytree=1.0,
        tree_method="hist",
        enable_categorical=True,
        verbosity=1,
        min_child_weight=3,
        base_score=4.6,
        random_state=2023,
    )
    regressor.fit(X_train, y_train)
    return regressor


def predict(model, X_test):
    """
    Run the model on X_test and return its predictions as a single column.

    The output of ``model.predict`` is reshaped to (-1, 1).
    """
    return model.predict(X_test).reshape(-1, 1)
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Pass every feature through, flattening multi-level column labels.

    A frame with single-level columns is returned untouched. Otherwise the
    column labels are replaced in place by the "_"-joined string form of each
    label tuple, and the same frame is returned.
    """
    # No real feature selection yet: every column is kept.
    if X.columns.nlevels != 1:
        flattened = []
        for label in X.columns.values:
            flattened.append("_".join(map(str, label)).strip())
        X.columns = flattened
    return X
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Pass every feature through, flattening multi-level column labels.

    A frame with single-level columns is returned untouched. Otherwise the
    column labels are replaced in place by the "_"-joined string form of each
    label tuple, and the same frame is returned.
    """
    # No real feature selection yet: every column is kept.
    if X.columns.nlevels != 1:
        flattened = []
        for label in X.columns.values:
            flattened.append("_".join(map(str, label)).strip())
        X.columns = flattened
    return X
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Pass every feature through, flattening multi-level column labels.

    A frame with single-level columns is returned untouched. Otherwise the
    column labels are replaced in place by the "_"-joined string form of each
    label tuple, and the same frame is returned.
    """
    # No real feature selection yet: every column is kept.
    if X.columns.nlevels != 1:
        flattened = []
        for label in X.columns.values:
            flattened.append("_".join(map(str, label)).strip())
        X.columns = flattened
    return X
Loading