diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py index 2f1111ff..03764219 100644 --- a/rdagent/app/kaggle/loop.py +++ b/rdagent/app/kaggle/loop.py @@ -103,21 +103,22 @@ def running(self, prev_out: dict[str, Any]): if KAGGLE_IMPLEMENT_SETTING.auto_submit: csv_path = exp.experiment_workspace.workspace_path / "submission.csv" try: - subprocess.run( - [ - "kaggle", - "competitions", - "submit", - "-f", - str(csv_path.absolute()), - "-m", - str(csv_path.parent.absolute()), - KAGGLE_IMPLEMENT_SETTING.competition, - ], - check=True, - ) + command = [ + "kaggle", + "competitions", + "submit", + "-f", + str(csv_path.absolute()), + "-m", + str(csv_path.parent.absolute()), + KAGGLE_IMPLEMENT_SETTING.competition, + ] + logger.info(f"Executing Kaggle API command: {' '.join(command)}") + result = subprocess.run(command, check=True, capture_output=True, text=True) + logger.info(f"Kaggle API output: {result.stdout}") except subprocess.CalledProcessError as e: logger.error(f"Auto submission failed: \n{e}") + logger.error(f"Kaggle API error output: {e.stderr}") except Exception as e: logger.error(f"Other exception when use kaggle api:\n{e}") diff --git a/rdagent/log/ui/app.py b/rdagent/log/ui/app.py index 8020e30c..3f6dd5d0 100644 --- a/rdagent/log/ui/app.py +++ b/rdagent/log/ui/app.py @@ -88,6 +88,9 @@ if "lround" not in state: state.lround = 0 # RD Loop Round +if "times" not in state: + state.times = defaultdict(lambda: defaultdict(list)) + if "erounds" not in state: state.erounds = defaultdict(int) # Evolving Rounds in each RD Loop @@ -186,6 +189,17 @@ def get_msgs_until(end_func: Callable[[Message], bool] = lambda _: True): ) state.msgs[state.lround][msg.tag].append(msg) + + # Update Times + if "init" in tags: + state.times[state.lround]["init"].append(msg.timestamp) + if "r" in tags: + state.times[state.lround]["r"].append(msg.timestamp) + if "d" in tags: + state.times[state.lround]["d"].append(msg.timestamp) + if "ef" in tags: + state.times[state.lround]["ef"].append(msg.timestamp) + # Stop Getting Logs if end_func(msg): break @@ -224,6 +238,7 @@ def refresh(same_trace: bool = False): state.last_msg = None state.current_tags = [] state.alpha158_metrics = None + state.times = defaultdict(lambda: defaultdict(list)) def evolving_feedback_window(wsf: FactorSingleFeedback | ModelCoderFeedback): @@ -741,6 +756,18 @@ def evolving_window(): st.markdown(state.scenario.rich_style_description + css, unsafe_allow_html=True) +def show_times(round: int): + for k, v in state.times[round].items(): + if len(v) > 1: + diff = v[-1] - v[0] + else: + diff = v[0] - v[0] + total_seconds = diff.seconds + seconds = total_seconds % 60 + minutes = total_seconds // 60 + st.markdown(f"**:blue[{k}]**: :red[**{minutes}**] minutes :orange[**{seconds}**] seconds") + + if state.scenario is not None: summary_window() @@ -754,8 +781,12 @@ def evolving_window(): round = st.radio("**Loops**", horizontal=True, options=r_options, index=state.lround - 1) else: round = 1 + + show_times(round) rf_c, d_c = st.columns([2, 2]) elif isinstance(state.scenario, GeneralModelScenario): + show_times(round) + rf_c = st.container() d_c = st.container() round = 1 diff --git a/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd 
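# Note: each template's train.py pairs a model/model_<name>.py with a sibling
# model/select_<name>.py (resolved by replacing "model" with "select" in the
# file stem) and routes features through its select(X) before fit/predict.
# This and the other select_*.py files added in this change are the default
# pass-through implementation of that hook: keep all features, only flatten
# multi-level column names.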
+ + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/model/select_randomforest.py b/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/model/select_randomforest.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/model/select_randomforest.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/train.py b/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/train.py index c7572f09..c57cbe4a 100644 --- a/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/train.py +++ b/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/train.py @@ -75,14 +75,14 @@ def import_module_from_path(module_name, module_path): metrics_all.append(accuracy) # 5) Save the validation accuracy -min_index = np.argmax(metrics_all) -pd.Series(data=[metrics_all[min_index]], index=["multi-class accuracy"]).to_csv("submission_score.csv") +max_index = np.argmax(metrics_all) +pd.Series(data=[metrics_all[max_index]], index=["multi-class accuracy"]).to_csv("submission_score.csv") # 6) Submit predictions for the test ids = range(1, len(X_test) + 1) # TODO: fix selection print(X_valid_selected.columns) -y_test_pred = model_l[min_index][1](model_l[min_index][0], model_l[min_index][2].select(X_test)).flatten() +y_test_pred = model_l[max_index][1](model_l[max_index][0], model_l[max_index][2].select(X_test)).flatten() submission_result = pd.DataFrame({"ImageId": ids, "Label": y_test_pred}) submission_result.to_csv("submission.csv", index=False) diff --git a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. 
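    # Multi-level columns only arise when train.py stacks several feature
    # blocks with pd.concat(..., axis=1, keys=["feature_0", "feature_1", ...]);
    # the tuple column names that produces are awkward for the downstream
    # model code, so they are joined into flat "feature_0_<col>" strings below.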
+ if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py index 1bacd3d0..619cb2e2 100644 --- a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py +++ b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py @@ -76,12 +76,12 @@ def import_module_from_path(module_name, module_path): metrics_all.append(accuracy) # 5) Save the validation accuracy -min_index = np.argmax(metrics_all) -pd.Series(data=[metrics_all[min_index]], index=["multi-class accuracy"]).to_csv("submission_score.csv") +max_index = np.argmax(metrics_all) +pd.Series(data=[metrics_all[max_index]], index=["multi-class accuracy"]).to_csv("submission_score.csv") # 6) Make predictions on the test set and save them -X_test_selected = model_l[min_index][2].select(X_test.copy()) -y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test_selected).flatten() + 1 +X_test_selected = model_l[max_index][2].select(X_test.copy()) +y_test_pred = model_l[max_index][1](model_l[max_index][0], X_test_selected).flatten() + 1 # 7) Submit predictions for the test set diff --git a/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/model/select_nn.py b/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/model/select_nn.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/model/select_nn.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. 
This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/model/select_nn.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/model/select_nn.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/model/select_nn.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/fea_share_preprocess.py new file mode 100644 index 00000000..6b615386 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/fea_share_preprocess.py @@ -0,0 +1,38 @@ +import os + +import numpy as np # linear algebra +import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) +from sklearn.model_selection import train_test_split + + +def preprocess_script(): + """ + This method applies the preprocessing steps to the training, validation, and test datasets. + """ + if os.path.exists("/kaggle/input/X_train.pkl"): + X_train = pd.read_pickle("/kaggle/input/X_train.pkl") + X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl") + y_train = pd.read_pickle("/kaggle/input/y_train.pkl") + y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl") + X_test = pd.read_pickle("/kaggle/input/X_test.pkl") + others = pd.read_pickle("/kaggle/input/others.pkl") + y_train = pd.Series(y_train).reset_index(drop=True) + y_valid = pd.Series(y_valid).reset_index(drop=True) + + return X_train, X_valid, y_train, y_valid, X_test, *others + + # train + train = pd.read_csv("/kaggle/input/train.csv") + X_train, X_valid, y_train, y_valid = train_test_split( + train.drop(["yield", "id"], axis=1), train["yield"], test_size=0.2, random_state=2023 + ) + y_train = pd.Series(y_train).reset_index(drop=True) + y_valid = pd.Series(y_valid).reset_index(drop=True) + + # test + test = pd.read_csv("/kaggle/input/test.csv") + + ids = test["id"] + X_test = test.drop(["id"], axis=1) + + return X_train, X_valid, y_train, y_valid, X_test, ids diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/feature/feature.py new file mode 100644 index 00000000..8ae043ac --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/feature/feature.py @@ -0,0 +1,23 @@ +import pandas as pd + +""" +Here is the feature engineering code for each task, with a class that has a fit and transform method. +Remember +""" + + +class IdentityFeature: + def fit(self, train_df: pd.DataFrame): + """ + Fit the feature engineering model to the training data. + """ + pass + + def transform(self, X: pd.DataFrame): + """ + Transform the input data. 
+ """ + return X + + +feature_engineering_cls = IdentityFeature diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/model_randomforest.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/model_randomforest.py new file mode 100644 index 00000000..82b6712a --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/model_randomforest.py @@ -0,0 +1,27 @@ +import pandas as pd +from sklearn.ensemble import RandomForestRegressor + + +def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame): + """Define and train the Random Forest model. Merge feature_select""" + rf_params = { + "n_estimators": 100, + "max_depth": 10, + "min_samples_split": 2, + "min_samples_leaf": 1, + "max_features": "sqrt", + "random_state": 2023, + "n_jobs": -1, + "verbose": 1, + } + model = RandomForestRegressor(**rf_params) + model.fit(X_train, y_train) + return model + + +def predict(model, X_test): + """ + Keep feature select's consistency. + """ + y_pred = model.predict(X_test) + return y_pred.reshape(-1, 1) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/model_xgboost.py new file mode 100644 index 00000000..16cb7c34 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/model_xgboost.py @@ -0,0 +1,34 @@ +""" +motivation of the model +""" + +import pandas as pd +import xgboost as xgb + + +def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame): + """Define and train the model. Merge feature_select""" + xgb_params = { + "n_estimators": 280, + "learning_rate": 0.05, + "max_depth": 10, + "subsample": 1.0, + "colsample_bytree": 1.0, + "tree_method": "hist", + "enable_categorical": True, + "verbosity": 1, + "min_child_weight": 3, + "base_score": 4.6, + "random_state": 2023, + } + model = xgb.XGBRegressor(**xgb_params) + model.fit(X_train, y_train) + return model + + +def predict(model, X_test): + """ + Keep feature select's consistency. + """ + y_pred = model.predict(X_test) + return y_pred.reshape(-1, 1) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_nn.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_nn.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_nn.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. 
+ """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_randomforest.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_randomforest.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_randomforest.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_xgboost.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_xgboost.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_xgboost.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/train.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/train.py new file mode 100644 index 00000000..802f2cc2 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/train.py @@ -0,0 +1,76 @@ +import importlib.util +from pathlib import Path + +import numpy as np +import pandas as pd +from fea_share_preprocess import preprocess_script +from sklearn.metrics import mean_absolute_error + +DIRNAME = Path(__file__).absolute().resolve().parent + + +def import_module_from_path(module_name, module_path): + spec = importlib.util.spec_from_file_location(module_name, module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +# 1) Preprocess the data +X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script() + +# 2) Auto feature engineering +X_train_l, X_valid_l = [], [] +X_test_l = [] + +for f in DIRNAME.glob("feature/feat*.py"): + cls = import_module_from_path(f.stem, f).feature_engineering_cls() + cls.fit(X_train) + X_train_f = cls.transform(X_train) + X_valid_f = cls.transform(X_valid) + X_test_f = cls.transform(X_test) + + if X_train_f.shape[-1] == X_valid_f.shape[-1] and X_train_f.shape[-1] == X_test_f.shape[-1]: + X_train_l.append(X_train_f) + X_valid_l.append(X_valid_f) + X_test_l.append(X_test_f) + +if len(X_train_l) > 1: + X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))]) + X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))]) + X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))]) + + +# 3) Train the model +model_l = [] # 
list[tuple[model, predict_func]] +for f in DIRNAME.glob("model/model*.py"): + select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix) + select_m = import_module_from_path(select_python_path.stem, select_python_path) + X_train_selected = select_m.select(X_train.copy()) + X_valid_selected = select_m.select(X_valid.copy()) + + m = import_module_from_path(f.stem, f) + model_l.append((m.fit(X_train_selected, y_train, X_valid_selected, y_valid), m.predict, select_m)) + +# 4) Evaluate the model on the validation set +metrics_all = [] +for model, predict_func, select_m in model_l: + X_valid_selected = select_m.select(X_valid.copy()) + y_valid_pred = predict_func(model, X_valid_selected) + metrics = mean_absolute_error(y_valid, y_valid_pred) + print(f"MAE on valid set: {metrics}") + metrics_all.append(metrics) + +# 5) Save the validation accuracy +min_index = np.argmin(metrics_all) +pd.Series(data=[metrics_all[min_index]], index=["MAE"]).to_csv("submission_score.csv") + +# 6) Make predictions on the test set and save them +X_test_selected = model_l[min_index][2].select(X_test.copy()) +y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test_selected) + +# 7) Submit predictions for the test set +submission_result = pd.DataFrame(y_test_pred, columns=["yield"]) +submission_result.insert(0, "id", ids) + +submission_result.to_csv("submission.csv", index=False) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/fea_share_preprocess.py new file mode 100644 index 00000000..90e3b8ed --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/fea_share_preprocess.py @@ -0,0 +1,45 @@ +import os + +import numpy as np # linear algebra +import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import LabelEncoder + + +def preprocess_script(): + """ + This method applies the preprocessing steps to the training, validation, and test datasets. 
+ """ + if os.path.exists("/kaggle/input/X_train.pkl"): + X_train = pd.read_pickle("/kaggle/input/X_train.pkl") + X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl") + y_train = pd.read_pickle("/kaggle/input/y_train.pkl") + y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl") + X_test = pd.read_pickle("/kaggle/input/X_test.pkl") + others = pd.read_pickle("/kaggle/input/others.pkl") + y_train = pd.Series(y_train).reset_index(drop=True) + y_valid = pd.Series(y_valid).reset_index(drop=True) + + return X_train, X_valid, y_train, y_valid, X_test, *others + + # train + train = pd.read_csv("/kaggle/input/train.csv") + + le = LabelEncoder() + train["Sex"] = le.fit_transform(train["Sex"]) + + X_train, X_valid, y_train, y_valid = train_test_split( + train.drop(["Age", "id"], axis=1), train["Age"], test_size=0.2, random_state=2023 + ) + y_train = pd.Series(y_train).reset_index(drop=True) + y_valid = pd.Series(y_valid).reset_index(drop=True) + + # test + test = pd.read_csv("/kaggle/input/test.csv") + + test["Sex"] = le.transform(test["Sex"]) + ids = test["id"] + + X_test = test.drop(["id"], axis=1) + + return X_train, X_valid, y_train, y_valid, X_test, ids diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/feature/feature.py new file mode 100644 index 00000000..8ae043ac --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/feature/feature.py @@ -0,0 +1,23 @@ +import pandas as pd + +""" +Here is the feature engineering code for each task, with a class that has a fit and transform method. +Remember +""" + + +class IdentityFeature: + def fit(self, train_df: pd.DataFrame): + """ + Fit the feature engineering model to the training data. + """ + pass + + def transform(self, X: pd.DataFrame): + """ + Transform the input data. + """ + return X + + +feature_engineering_cls = IdentityFeature diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/model_randomforest.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/model_randomforest.py new file mode 100644 index 00000000..82b6712a --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/model_randomforest.py @@ -0,0 +1,27 @@ +import pandas as pd +from sklearn.ensemble import RandomForestRegressor + + +def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame): + """Define and train the Random Forest model. Merge feature_select""" + rf_params = { + "n_estimators": 100, + "max_depth": 10, + "min_samples_split": 2, + "min_samples_leaf": 1, + "max_features": "sqrt", + "random_state": 2023, + "n_jobs": -1, + "verbose": 1, + } + model = RandomForestRegressor(**rf_params) + model.fit(X_train, y_train) + return model + + +def predict(model, X_test): + """ + Keep feature select's consistency. 
+ """ + y_pred = model.predict(X_test) + return y_pred.reshape(-1, 1) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/model_xgboost.py new file mode 100644 index 00000000..16cb7c34 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/model_xgboost.py @@ -0,0 +1,34 @@ +""" +motivation of the model +""" + +import pandas as pd +import xgboost as xgb + + +def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame): + """Define and train the model. Merge feature_select""" + xgb_params = { + "n_estimators": 280, + "learning_rate": 0.05, + "max_depth": 10, + "subsample": 1.0, + "colsample_bytree": 1.0, + "tree_method": "hist", + "enable_categorical": True, + "verbosity": 1, + "min_child_weight": 3, + "base_score": 4.6, + "random_state": 2023, + } + model = xgb.XGBRegressor(**xgb_params) + model.fit(X_train, y_train) + return model + + +def predict(model, X_test): + """ + Keep feature select's consistency. + """ + y_pred = model.predict(X_test) + return y_pred.reshape(-1, 1) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_nn.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_nn.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_nn.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_randomforest.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_randomforest.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_randomforest.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. 
+ if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_xgboost.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_xgboost.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_xgboost.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/train.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/train.py new file mode 100644 index 00000000..e04091ee --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/train.py @@ -0,0 +1,76 @@ +import importlib.util +from pathlib import Path + +import numpy as np +import pandas as pd +from fea_share_preprocess import preprocess_script +from sklearn.metrics import mean_absolute_error + +DIRNAME = Path(__file__).absolute().resolve().parent + + +def import_module_from_path(module_name, module_path): + spec = importlib.util.spec_from_file_location(module_name, module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +# 1) Preprocess the data +X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script() + +# 2) Auto feature engineering +X_train_l, X_valid_l = [], [] +X_test_l = [] + +for f in DIRNAME.glob("feature/feat*.py"): + cls = import_module_from_path(f.stem, f).feature_engineering_cls() + cls.fit(X_train) + X_train_f = cls.transform(X_train) + X_valid_f = cls.transform(X_valid) + X_test_f = cls.transform(X_test) + + if X_train_f.shape[-1] == X_valid_f.shape[-1] and X_train_f.shape[-1] == X_test_f.shape[-1]: + X_train_l.append(X_train_f) + X_valid_l.append(X_valid_f) + X_test_l.append(X_test_f) + +if len(X_train_l) > 1: + X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))]) + X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))]) + X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))]) + + +# 3) Train the model +model_l = [] # list[tuple[model, predict_func]] +for f in DIRNAME.glob("model/model*.py"): + select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix) + select_m = import_module_from_path(select_python_path.stem, select_python_path) + X_train_selected = select_m.select(X_train.copy()) + X_valid_selected = select_m.select(X_valid.copy()) + + m = import_module_from_path(f.stem, f) + model_l.append((m.fit(X_train_selected, y_train, X_valid_selected, y_valid), m.predict, select_m)) + +# 4) Evaluate the model on the validation set +metrics_all = [] +for model, predict_func, select_m in model_l: + X_valid_selected = select_m.select(X_valid.copy()) + y_valid_pred = predict_func(model, X_valid_selected) + metrics = mean_absolute_error(y_valid, y_valid_pred) + print(f"MAE on valid set: {metrics}") + metrics_all.append(metrics) + +# 5) Save the validation accuracy +min_index = 
np.argmin(metrics_all) +pd.Series(data=[metrics_all[min_index]], index=["MAE"]).to_csv("submission_score.csv") + +# 6) Make predictions on the test set and save them +X_test_selected = model_l[min_index][2].select(X_test.copy()) +y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test_selected) + +# 7) Submit predictions for the test set +submission_result = pd.DataFrame(np.round(y_test_pred).astype(int), columns=["Age"]) +submission_result.insert(0, "id", ids) + +submission_result.to_csv("submission.csv", index=False) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e26_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e26_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e26_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e26_template/model/select_nn.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e26_template/model/select_nn.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e26_template/model/select_nn.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/fea_share_preprocess.py new file mode 100644 index 00000000..33bc18a1 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/fea_share_preprocess.py @@ -0,0 +1,99 @@ +import os + +import numpy as np +import pandas as pd +import scipy.sparse +from sklearn.compose import ColumnTransformer +from sklearn.impute import SimpleImputer +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + + +def prepreprocess(): + """ + Load the data, preprocess it, and split into train and validation sets. 
+ """ + # Load the data + data_df = pd.read_csv("/kaggle/input/train.csv") + + # Check if 'id' is actually the index + if "id" not in data_df.columns and data_df.index.name == "id": + data_df = data_df.reset_index() + + # Now we can safely drop the 'id' column + data_df = data_df.drop(["id"], axis=1) + + # Separate features and targets + target_columns = ["Pastry", "Z_Scratch", "K_Scatch", "Stains", "Dirtiness", "Bumps", "Other_Faults"] + X = data_df.drop(target_columns, axis=1) + y = data_df[target_columns] + + # Split the data + X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42) + + return X_train, X_valid, y_train, y_valid + + +def preprocess_fit(X_train: pd.DataFrame): + """ + Fit the preprocessor on the training data. + """ + numerical_cols = X_train.columns # All columns are numerical in this dataset + + numerical_transformer = Pipeline( + steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())] + ) + + preprocessor = ColumnTransformer(transformers=[("num", numerical_transformer, numerical_cols)]) + + preprocessor.fit(X_train) + + return preprocessor, numerical_cols + + +def preprocess_transform(X: pd.DataFrame, preprocessor, numerical_cols): + X_transformed = preprocessor.transform(X) + + # If X_transformed is a sparse matrix, convert it to a dense array + if scipy.sparse.issparse(X_transformed): + X_transformed = X_transformed.toarray() + + # Get feature names from the preprocessor + feature_names = preprocessor.get_feature_names_out() + + # Convert arrays back to DataFrames + X_transformed = pd.DataFrame(X_transformed, columns=feature_names, index=X.index) + + return X_transformed + + +def preprocess_script(): + """ + This method applies the preprocessing steps to the training, validation, and test datasets. + """ + if os.path.exists("/kaggle/input/X_train.pkl"): + X_train = pd.read_pickle("/kaggle/input/X_train.pkl") + X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl") + y_train = pd.read_pickle("/kaggle/input/y_train.pkl") + y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl") + X_test = pd.read_pickle("/kaggle/input/X_test.pkl") + others = pd.read_pickle("/kaggle/input/others.pkl") + return X_train, X_valid, y_train, y_valid, X_test, *others + + X_train, X_valid, y_train, y_valid = prepreprocess() + + # Fit the preprocessor on the training data + preprocessor, numerical_cols = preprocess_fit(X_train) + + # Preprocess the train, validation, and test data + X_train = preprocess_transform(X_train, preprocessor, numerical_cols) + X_valid = preprocess_transform(X_valid, preprocessor, numerical_cols) + + # Load and preprocess the test data + submission_df = pd.read_csv("/kaggle/input/test.csv") + ids = submission_df["id"] + submission_df = submission_df.drop(["id"], axis=1) + X_test = preprocess_transform(submission_df, preprocessor, numerical_cols) + + return X_train, X_valid, y_train, y_valid, X_test, ids diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/feature/feature.py new file mode 100644 index 00000000..8ae043ac --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/feature/feature.py @@ -0,0 +1,23 @@ +import pandas as pd + +""" +Here is the feature engineering code for each task, with a class that has a fit and transform method. 
+Remember +""" + + +class IdentityFeature: + def fit(self, train_df: pd.DataFrame): + """ + Fit the feature engineering model to the training data. + """ + pass + + def transform(self, X: pd.DataFrame): + """ + Transform the input data. + """ + return X + + +feature_engineering_cls = IdentityFeature diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/model/model_randomforest.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/model/model_randomforest.py new file mode 100644 index 00000000..deef63da --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/model/model_randomforest.py @@ -0,0 +1,32 @@ +""" +Random Forest model for academic success classification. +""" + +import pandas as pd +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import accuracy_score + + +def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series): + """ + Train the Random Forest model. + """ + model = RandomForestClassifier( + n_estimators=200, max_depth=15, min_samples_split=5, min_samples_leaf=2, random_state=42, n_jobs=-1 + ) + + model.fit(X_train, y_train) + + # Validate the model + y_valid_pred = model.predict(X_valid) + accuracy = accuracy_score(y_valid, y_valid_pred) + print(f"Validation Accuracy: {accuracy:.4f}") + + return model + + +def predict(model, X): + """ + Make predictions using the trained model. + """ + return model.predict_proba(X) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/model/select_nn.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/model/select_nn.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/model/select_nn.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/model/select_randomforest.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/model/select_randomforest.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/model/select_randomforest.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. 
+ """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/model/select_xgboost.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/model/select_xgboost.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/model/select_xgboost.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/train.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/train.py new file mode 100644 index 00000000..ffc3c40f --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e3_template/train.py @@ -0,0 +1,110 @@ +import importlib.util +import random +from pathlib import Path + +import numpy as np +import pandas as pd +from fea_share_preprocess import preprocess_script +from sklearn.metrics import roc_auc_score +from sklearn.model_selection import StratifiedKFold + +# Set random seed for reproducibility +SEED = 42 +random.seed(SEED) +np.random.seed(SEED) +DIRNAME = Path(__file__).absolute().resolve().parent + + +def compute_metrics_for_classification(y_true, y_pred): + """Compute average ROC AUC for multi-label classification.""" + auc_scores = [] + for i in range(y_true.shape[1]): + # Convert y_pred to numpy array if it's a list + if isinstance(y_pred, list): + y_pred_i = np.array(y_pred[i])[:, 1] # Get probabilities for positive class + else: + y_pred_i = y_pred[:, i] + auc_scores.append(roc_auc_score(y_true.iloc[:, i], y_pred_i)) + return np.mean(auc_scores) + + +def import_module_from_path(module_name, module_path): + spec = importlib.util.spec_from_file_location(module_name, module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def get_positive_proba(y_pred): + """Extract positive class probabilities from prediction.""" + if isinstance(y_pred, list): + return np.column_stack([proba[:, 1] for proba in y_pred]) + return y_pred + + +# 1) Preprocess the data +X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script() + +# Print X_train head and shape +print("X_train shape:", X_train.shape) +print("\nX_train head:") +print(X_train.head()) + +# Print y_train head and shape +print("\ny_train shape:", y_train.shape) +print("\ny_train head:") +print(y_train.head()) + +# Replace the existing train-validation split with StratifiedKFold +skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED) + +# 3) Train the model +model_l = [] +for train_index, val_index in skf.split(X_train, y_train.iloc[:, 0]): # Using the first target for stratification + X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index] + y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index] + + for f in DIRNAME.glob("model/model*.py"): + select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix) 
+ select_m = import_module_from_path(select_python_path.stem, select_python_path) + X_train_selected = select_m.select(X_train_fold.copy()) + X_val_selected = select_m.select(X_val_fold.copy()) + + m = import_module_from_path(f.stem, f) + model_l.append((m.fit(X_train_selected, y_train_fold, X_val_selected, y_val_fold), m.predict, select_m)) + +# 4) Evaluate the model on the validation set +metrics_all = [] +for model, predict_func, select_m in model_l: + X_valid_selected = select_m.select(X_valid.copy()) + y_valid_pred = predict_func(model, X_valid_selected) + + # Debug print + print("y_valid_pred type:", type(y_valid_pred)) + if isinstance(y_valid_pred, list): + print("y_valid_pred length:", len(y_valid_pred)) + print("First element shape:", y_valid_pred[0].shape) + else: + print("y_valid_pred shape:", y_valid_pred.shape) + + y_valid_pred_positive = get_positive_proba(y_valid_pred) + metrics = compute_metrics_for_classification(y_valid, y_valid_pred_positive) + print("Average ROC AUC on validation set: ", metrics) + metrics_all.append(metrics) + +# 5) Save the validation ROC AUC +max_index = np.argmax(metrics_all) +pd.Series(data=[metrics_all[max_index]], index=["ROC AUC"]).to_csv("submission_score.csv") + +# 6) Make predictions on the test set and save them +X_test_selected = model_l[max_index][2].select(X_test.copy()) +y_test_pred = model_l[max_index][1](model_l[max_index][0], X_test_selected) +y_test_pred_positive = get_positive_proba(y_test_pred) + +# 7) Submit predictions for the test set +submission_result = pd.DataFrame({"id": ids}) +target_columns = ["Pastry", "Z_Scratch", "K_Scatch", "Stains", "Dirtiness", "Bumps", "Other_Faults"] +for i, col in enumerate(target_columns): + submission_result[col] = y_test_pred_positive[:, i] + +submission_result.to_csv("submission.csv", index=False) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/fea_share_preprocess.py new file mode 100644 index 00000000..8987d00e --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/fea_share_preprocess.py @@ -0,0 +1,70 @@ +import os + +import pandas as pd +from sklearn.compose import ColumnTransformer +from sklearn.impute import SimpleImputer +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline + + +def prepreprocess(): + data_df = pd.read_csv("/kaggle/input/train.csv") + data_df = data_df.drop(["id"], axis=1) + + X = data_df.drop(["FloodProbability"], axis=1) + y = data_df["FloodProbability"] + + X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=42) + + return X_train, X_valid, y_train, y_valid + + +def preprocess_fit(X_train: pd.DataFrame): + numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ["int64", "float64"]] + + numerical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))]) + + preprocessor = ColumnTransformer( + transformers=[ + ("num", numerical_transformer, numerical_cols), + ] + ) + + preprocessor.fit(X_train) + + return preprocessor, numerical_cols + + +def preprocess_transform(X: pd.DataFrame, preprocessor, numerical_cols): + X_transformed = preprocessor.transform(X) + + # Convert arrays back to DataFrames + X_transformed = pd.DataFrame(X_transformed, columns=numerical_cols, index=X.index) + + return X_transformed + + +def preprocess_script(): + if os.path.exists("/kaggle/input/X_train.pkl"): + X_train = 
pd.read_pickle("/kaggle/input/X_train.pkl") + X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl") + y_train = pd.read_pickle("/kaggle/input/y_train.pkl") + y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl") + X_test = pd.read_pickle("/kaggle/input/X_test.pkl") + others = pd.read_pickle("/kaggle/input/others.pkl") + + return X_train, X_valid, y_train, y_valid, X_test, *others + + X_train, X_valid, y_train, y_valid = prepreprocess() + + preprocessor, numerical_cols = preprocess_fit(X_train) + + X_train = preprocess_transform(X_train, preprocessor, numerical_cols) + X_valid = preprocess_transform(X_valid, preprocessor, numerical_cols) + + submission_df = pd.read_csv("/kaggle/input/test.csv") + ids = submission_df["id"] + submission_df = submission_df.drop(["id"], axis=1) + X_test = preprocess_transform(submission_df, preprocessor, numerical_cols) + + return X_train, X_valid, y_train, y_valid, X_test, ids diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/feature/feature.py new file mode 100644 index 00000000..8ae043ac --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/feature/feature.py @@ -0,0 +1,23 @@ +import pandas as pd + +""" +Here is the feature engineering code for each task, with a class that has a fit and transform method. +Remember +""" + + +class IdentityFeature: + def fit(self, train_df: pd.DataFrame): + """ + Fit the feature engineering model to the training data. + """ + pass + + def transform(self, X: pd.DataFrame): + """ + Transform the input data. + """ + return X + + +feature_engineering_cls = IdentityFeature diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/model_randomforest.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/model_randomforest.py new file mode 100644 index 00000000..bf1b273d --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/model_randomforest.py @@ -0,0 +1,33 @@ +import numpy as np +import pandas as pd +from sklearn.ensemble import RandomForestRegressor +from sklearn.metrics import mean_squared_error + + +def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series): + """ + Define and train the Random Forest model. Merge feature selection into the pipeline. + """ + # Initialize the Random Forest model + model = RandomForestRegressor(n_estimators=100, random_state=32, n_jobs=-1) + + # Fit the model + model.fit(X_train, y_train) + + # Validate the model + y_valid_pred = model.predict(X_valid) + mse = mean_squared_error(y_valid, y_valid_pred) + rmse = np.sqrt(mse) + print(f"Validation RMSE: {rmse:.4f}") + + return model + + +def predict(model, X): + """ + Keep feature selection's consistency and make predictions. + """ + # Predict using the trained model + y_pred = model.predict(X) + + return y_pred.reshape(-1, 1) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/model_xgboost.py new file mode 100644 index 00000000..7e517fb0 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/model_xgboost.py @@ -0,0 +1,34 @@ +import pandas as pd +import xgboost as xgb + + +def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame): + """Define and train the model. 
Merge feature_select""" + dtrain = xgb.DMatrix(X_train, label=y_train) + dvalid = xgb.DMatrix(X_valid, label=y_valid) + + # Parameters for regression + params = { + "objective": "reg:squarederror", # Use squared error for regression + "nthread": -1, + "n_estimators": 8000, + "tree_method": "gpu_hist", + "device": "cuda", + "max_depth": 10, + "learning_rate": 0.01, + } + num_round = 5000 + + evallist = [(dtrain, "train"), (dvalid, "eval")] + bst = xgb.train(params, dtrain, num_round, evallist) + + return bst + + +def predict(model, X): + """ + Keep feature select's consistency. + """ + dtest = xgb.DMatrix(X) + y_pred = model.predict(dtest) + return y_pred.reshape(-1, 1) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_nn.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_nn.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_nn.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_randomforest.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_randomforest.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_randomforest.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_xgboost.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_xgboost.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_xgboost.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. 
This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/train.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/train.py new file mode 100644 index 00000000..2d9a3a95 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/train.py @@ -0,0 +1,99 @@ +import importlib.util +import random +from pathlib import Path + +import numpy as np +import pandas as pd +from fea_share_preprocess import preprocess_script +from sklearn.metrics import r2_score + +# Set random seed for reproducibility +SEED = 42 +random.seed(SEED) +np.random.seed(SEED) +DIRNAME = Path(__file__).absolute().resolve().parent + + +def compute_r2(y_true, y_pred): + """Compute R² score for regression.""" + return r2_score(y_true, y_pred) + + +def import_module_from_path(module_name, module_path): + spec = importlib.util.spec_from_file_location(module_name, module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +# 1) Preprocess the data +X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script() + +# 2) Auto feature engineering +X_train_l, X_valid_l = [], [] +X_test_l = [] + +for f in DIRNAME.glob("feature/feat*.py"): + cls = import_module_from_path(f.stem, f).feature_engineering_cls() + cls.fit(X_train) + X_train_f = cls.transform(X_train) + X_valid_f = cls.transform(X_valid) + X_test_f = cls.transform(X_test) + + if X_train_f.shape[-1] == X_valid_f.shape[-1] and X_train_f.shape[-1] == X_test_f.shape[-1]: + X_train_l.append(X_train_f) + X_valid_l.append(X_valid_f) + X_test_l.append(X_test_f) + +X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))]) +X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))]) +X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))]) + +print(X_train.shape, X_valid.shape, X_test.shape) + +# Handle inf and -inf values +X_train.replace([np.inf, -np.inf], np.nan, inplace=True) +X_valid.replace([np.inf, -np.inf], np.nan, inplace=True) +X_test.replace([np.inf, -np.inf], np.nan, inplace=True) + +from sklearn.impute import SimpleImputer + +imputer = SimpleImputer(strategy="mean") + +X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns) +X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns) +X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns) + + +# 3) Train the model +model_l = [] # list[tuple[model, predict_func,]] +for f in DIRNAME.glob("model/model*.py"): + select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix) + select_m = import_module_from_path(select_python_path.stem, select_python_path) + X_train_selected = select_m.select(X_train.copy()) + X_valid_selected = select_m.select(X_valid.copy()) + + m = import_module_from_path(f.stem, f) + model_name = f.stem + model_l.append((m.fit(X_train_selected, y_train, X_valid_selected, y_valid), m.predict, select_m, model_name)) + +# 4) Evaluate the model on the validation set +metrics_all = [] +for model, predict_func, select_m, model_name in model_l: + X_valid_selected = select_m.select(X_valid.copy()) + y_valid_pred = predict_func(model, X_valid_selected) + r2 = compute_r2(y_valid, y_valid_pred) + print(f"R2 on valid set for {model_name}: 
{r2}") + metrics_all.append(r2) + +# 5) Save the validation accuracy +max_index = np.argmax(metrics_all) +pd.Series(data=[metrics_all[max_index]], index=["R2"]).to_csv("submission_score.csv") + +# 6) Make predictions on the test set and save them +X_test_selected = model_l[max_index][2].select(X_test.copy()) +y_test_pred = model_l[max_index][1](model_l[max_index][0], X_test_selected).ravel() + +# 7) Submit predictions for the test set +submission_result = pd.DataFrame({"id": ids, "FloodProbability": y_test_pred}) +submission_result.to_csv("submission.csv", index=False) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/fea_share_preprocess.py new file mode 100644 index 00000000..d99c8e47 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/fea_share_preprocess.py @@ -0,0 +1,117 @@ +import os + +import numpy as np +import pandas as pd +import scipy.sparse +from sklearn.compose import ColumnTransformer +from sklearn.impute import SimpleImputer +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler + + +def prepreprocess(): + """ + Load the data, preprocess it, and split into train and validation sets. + """ + # Load the data + data_df = pd.read_csv("/kaggle/input/train.csv") + + # Check if 'id' is actually the index + if "id" not in data_df.columns and data_df.index.name == "id": + data_df = data_df.reset_index() + + # Now we can safely drop the 'id' column + data_df = data_df.drop(["id"], axis=1) + + print(data_df.head(20)) + + X = data_df.drop(["Target"], axis=1) + y = data_df["Target"] + + # Encode target variable + label_encoder = LabelEncoder() + y = label_encoder.fit_transform(y) + + # Split the data + X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y) + + return X_train, X_valid, y_train, y_valid + + +def preprocess_fit(X_train: pd.DataFrame): + """ + Fit the preprocessor on the training data. 
+ """ + numerical_cols = X_train.select_dtypes(include=["int64", "float64"]).columns + categorical_cols = X_train.select_dtypes(include=["object"]).columns + + numerical_transformer = Pipeline( + steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())] + ) + + preprocessor = ColumnTransformer(transformers=[("num", numerical_transformer, numerical_cols)]) + + if len(categorical_cols) > 0: + categorical_transformer = Pipeline( + steps=[ + ("imputer", SimpleImputer(strategy="constant", fill_value="missing")), + ("onehot", OneHotEncoder(handle_unknown="ignore")), + ] + ) + preprocessor.transformers.append(("cat", categorical_transformer, categorical_cols)) + + preprocessor.fit(X_train) + + return preprocessor, numerical_cols, categorical_cols + + +def preprocess_transform(X: pd.DataFrame, preprocessor, numerical_cols, categorical_cols): + X_transformed = preprocessor.transform(X) + + # If X_transformed is a sparse matrix, convert it to a dense array + if scipy.sparse.issparse(X_transformed): + X_transformed = X_transformed.toarray() + + # Get feature names from the preprocessor + feature_names = preprocessor.get_feature_names_out() + + # Convert arrays back to DataFrames + X_transformed = pd.DataFrame(X_transformed, columns=feature_names, index=X.index) + + return X_transformed + + +def preprocess_script(): + """ + This method applies the preprocessing steps to the training, validation, and test datasets. + """ + if os.path.exists("/kaggle/input/X_train.pkl"): + X_train = pd.read_pickle("/kaggle/input/X_train.pkl") + X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl") + y_train = pd.read_pickle("/kaggle/input/y_train.pkl") + y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl") + X_test = pd.read_pickle("/kaggle/input/X_test.pkl") + others = pd.read_pickle("/kaggle/input/others.pkl") + y_train = pd.Series(y_train).reset_index(drop=True) + y_valid = pd.Series(y_valid).reset_index(drop=True) + + return X_train, X_valid, y_train, y_valid, X_test, *others + X_train, X_valid, y_train, y_valid = prepreprocess() + + # Fit the preprocessor on the training data + preprocessor, numerical_cols, categorical_cols = preprocess_fit(X_train) + y_train = pd.Series(y_train).reset_index(drop=True) + y_valid = pd.Series(y_valid).reset_index(drop=True) + + # Preprocess the train, validation, and test data + X_train = preprocess_transform(X_train, preprocessor, numerical_cols, categorical_cols) + X_valid = preprocess_transform(X_valid, preprocessor, numerical_cols, categorical_cols) + + # Load and preprocess the test data + submission_df = pd.read_csv("/kaggle/input/test.csv") + ids = submission_df["id"] + submission_df = submission_df.drop(["id"], axis=1) + X_test = preprocess_transform(submission_df, preprocessor, numerical_cols, categorical_cols) + + return X_train, X_valid, y_train, y_valid, X_test, ids diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/feature/feature.py new file mode 100644 index 00000000..8ae043ac --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/feature/feature.py @@ -0,0 +1,23 @@ +import pandas as pd + +""" +Here is the feature engineering code for each task, with a class that has a fit and transform method. +Remember +""" + + +class IdentityFeature: + def fit(self, train_df: pd.DataFrame): + """ + Fit the feature engineering model to the training data. 
+ """ + pass + + def transform(self, X: pd.DataFrame): + """ + Transform the input data. + """ + return X + + +feature_engineering_cls = IdentityFeature diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/model/model_randomforest.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/model/model_randomforest.py new file mode 100644 index 00000000..84633948 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/model/model_randomforest.py @@ -0,0 +1,34 @@ +""" +Random Forest model for academic success classification. +""" + +import numpy as np +import pandas as pd +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import accuracy_score + + +def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series): + """ + Train the Random Forest model. + """ + model = RandomForestClassifier( + n_estimators=200, max_depth=15, min_samples_split=5, min_samples_leaf=2, random_state=42, n_jobs=-1 + ) + + model.fit(X_train, y_train) + + # Validate the model + y_valid_pred = model.predict(X_valid) + accuracy = accuracy_score(y_valid, y_valid_pred) + print(f"Validation Accuracy: {accuracy:.4f}") + + return model + + +def predict(model, X): + """ + Make predictions using the trained model. + """ + probas = model.predict_proba(X) + return probas diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/model/select_nn.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/model/select_nn.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/model/select_nn.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/model/select_randomforest.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/model/select_randomforest.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/model/select_randomforest.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. 
+ if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/model/select_xgboost.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/model/select_xgboost.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/model/select_xgboost.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/train.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/train.py new file mode 100644 index 00000000..68632c8e --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e6_template/train.py @@ -0,0 +1,80 @@ +import importlib.util +import random +from pathlib import Path + +import numpy as np +import pandas as pd +from fea_share_preprocess import preprocess_script +from sklearn.metrics import accuracy_score, matthews_corrcoef +from sklearn.model_selection import StratifiedKFold +from sklearn.preprocessing import LabelEncoder # Used to map numeric predictions back to class labels + +# Set random seed for reproducibility +SEED = 42 +random.seed(SEED) +np.random.seed(SEED) +DIRNAME = Path(__file__).absolute().resolve().parent + + +# Support various methods for metrics calculation +def compute_metrics_for_classification(y_true, y_pred): + """Compute MCC for classification.""" + mcc = matthews_corrcoef(y_true, y_pred) + return mcc + + +def import_module_from_path(module_name, module_path): + spec = importlib.util.spec_from_file_location(module_name, module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +# 1) Preprocess the data +X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script() + +# Use a StratifiedKFold split instead of a single train-validation split +skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED) + +# 3) Train the model +model_l = [] +for train_index, val_index in skf.split(X_train, y_train): + X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index] + y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index] + + for f in DIRNAME.glob("model/model*.py"): + select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix) + select_m = import_module_from_path(select_python_path.stem, select_python_path) + X_train_selected = select_m.select(X_train_fold.copy()) + X_val_selected = select_m.select(X_val_fold.copy()) + + m = import_module_from_path(f.stem, f) + model_l.append((m.fit(X_train_selected, y_train_fold, X_val_selected, y_val_fold), m.predict, select_m)) + +# 4) Evaluate the model on the validation set +metrics_all = [] +for model, predict_func, select_m in model_l: + X_valid_selected = select_m.select(X_valid.copy()) + y_valid_pred = predict_func(model, X_valid_selected) + y_valid_pred = np.argmax(y_valid_pred, axis=1) + metrics = accuracy_score(y_valid, y_valid_pred) + print("Accuracy on validation set: ", metrics) + metrics_all.append(metrics) + +# 5) Save the validation accuracy
+max_index = np.argmax(metrics_all) +pd.Series(data=[metrics_all[max_index]], index=["Accuracy"]).to_csv("submission_score.csv") + +# 6) Make predictions on the test set and save them +X_test_selected = model_l[max_index][2].select(X_test.copy()) +y_test_pred = model_l[max_index][1](model_l[max_index][0], X_test_selected) +y_test_pred = np.argmax(y_test_pred, axis=1) + +# Convert numeric predictions back to original labels +label_encoder = LabelEncoder() +label_encoder.fit(["Graduate", "Dropout", "Enrolled"]) +y_test_pred_labels = label_encoder.inverse_transform(y_test_pred) + +# 7) Submit predictions for the test set +submission_result = pd.DataFrame({"id": ids, "Target": y_test_pred_labels}) +submission_result.to_csv("submission.csv", index=False) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/select_nn.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/select_nn.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/select_nn.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. 
+ if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py index 10c98a31..42d24dc2 100644 --- a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py @@ -88,12 +88,12 @@ def import_module_from_path(module_name, module_path): metrics_all.append(metrics) # 5) Save the validation accuracy -min_index = np.argmax(metrics_all) -pd.Series(data=[metrics_all[min_index]], index=["MCC"]).to_csv("submission_score.csv") +max_index = np.argmax(metrics_all) +pd.Series(data=[metrics_all[max_index]], index=["MCC"]).to_csv("submission_score.csv") # 6) Make predictions on the test set and save them -X_test_selected = model_l[min_index][2].select(X_test.copy()) -y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test_selected) +X_test_selected = model_l[max_index][2].select(X_test.copy()) +y_test_pred = model_l[max_index][1](model_l[max_index][0], X_test_selected) y_test_pred = (y_test_pred > 0.5).astype(int) y_test_pred_labels = np.where(y_test_pred == 1, "p", "e") # Convert the integers back to 'e' or 'p' diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/select_nn.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/select_nn.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/select_nn.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/sf-crime_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/sf-crime_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/sf-crime_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic.
+ if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/sf-crime_template/model/select_nn.py b/rdagent/scenarios/kaggle/experiment/sf-crime_template/model/select_nn.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/sf-crime_template/model/select_nn.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/train.py b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/train.py index fe8e3ad5..c94403b5 100644 --- a/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/train.py +++ b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/train.py @@ -95,12 +95,12 @@ def import_module_from_path(module_name, module_path): # 5) Save the validation accuracy -min_index = np.argmax(metrics_all) -pd.Series(data=[metrics_all[min_index]], index=["MCC"]).to_csv("submission_score.csv") +max_index = np.argmax(metrics_all) +pd.Series(data=[metrics_all[max_index]], index=["MCC"]).to_csv("submission_score.csv") # 6) Make predictions on the test set and save them -X_test_selected = model_l[min_index][2].select(X_test.copy()) -y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test_selected) +X_test_selected = model_l[max_index][2].select(X_test.copy()) +y_test_pred = model_l[max_index][1](model_l[max_index][0], X_test_selected) y_test_pred = (y_test_pred > 0.5).astype(bool) y_test_pred = y_test_pred.ravel() diff --git a/rdagent/scenarios/kaggle/experiment/statoil-iceberg-classifier-challenge_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/statoil-iceberg-classifier-challenge_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/statoil-iceberg-classifier-challenge_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. 
+ if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py index 9146570a..a12d16dc 100644 --- a/rdagent/scenarios/kaggle/kaggle_crawler.py +++ b/rdagent/scenarios/kaggle/kaggle_crawler.py @@ -1,4 +1,5 @@ # %% +import bisect import json import subprocess import time @@ -96,6 +97,26 @@ def leaderboard_scores(competition: str) -> list[float]: return [float(x.score) for x in ll] +def score_rank(competition: str, score: float) -> tuple[int, float]: + """ + Return + ------ + rank: int + rank_percent: float + """ + scores = leaderboard_scores(competition) + if scores[0] < scores[-1]: # Ascending order + rank = bisect.bisect_right(scores, score) + else: # Descending order + scores = scores[::-1] # Reverse the list to use bisect + rank = len(scores) - bisect.bisect_right(scores, score) + + rank = rank + 1 + rank_percent = rank / len(scores) * 100 + + return rank, rank_percent + + def download_notebooks( competition: str, local_path: str = "/data/userdata/share/kaggle/notebooks", num: int = 15 ) -> None: @@ -276,5 +297,6 @@ def collect_knowledge_texts(local_path: str = "/data/userdata/share/kaggle") -> # name = c.ref.split("/")[-1] # crawl_descriptions(name) res = leaderboard_scores(competition="playground-series-s4e8") - + rank, rank_percent = score_rank(competition="playground-series-s4e8", score=0.9832) + print(rank, rank_percent) # %% diff --git a/rdagent/scenarios/kaggle/prompts.yaml b/rdagent/scenarios/kaggle/prompts.yaml index 29659b01..e7452713 100644 --- a/rdagent/scenarios/kaggle/prompts.yaml +++ b/rdagent/scenarios/kaggle/prompts.yaml @@ -50,41 +50,55 @@ hypothesis_output_format: |- hypothesis_specification: Feature engineering: |- Action: Feature engineering - description: We engineer the features for the sake of best model performance on the basis of engineering the most influential features. - type_of_feature_and_data_characteristics: - - Clearly define the feature type being introduced. - - Highlight the specific data patterns or characteristics the feature captures. - - Keep it focused—omit unnecessary details. - start_with_simple_features: - - Begin with straightforward and impactful features. - - Briefly explain why these features are expected to work. - - Avoid combining complex features at the outset. - increase_complexity_gradually: - - Add more complex features only after gathering experimental results. - - Discuss potential advantages and the trade-offs involved. - - Combine features only after simpler ones are tested and validated. - new_directions_and_optimizations: - - Justify any new direction based on data analysis or domain knowledge. - - Focus on one new direction at a time for clarity. - - If a hypothesis shows optimization potential (even without surpassing previous best results), explain why and proceed. - feature_library_and_task_management: - - Include features that improve performance in the feature library. - - Each generation should focus on 1-3 feature tasks, balancing simplicity with complexity. + + Description: We engineer the features for the sake of best model performance on the basis of engineering the most influential features. + + 1. Type of Feature and Data Characteristics: + - Clearly define the type of feature being introduced. + - Explain what data characteristics or patterns this feature captures. + - Keep descriptions focused, avoiding redundant details to ensure clarity. 
+ + 2. Simple and Effective Features First: + - Start by introducing features that are simple yet likely to be effective. + - Provide a concise explanation of why these features are expected to perform well. + - Avoid complex or combined features during the initial stages. + + 3. Gradual Complexity Increase: + - After initial feature testing, introduce more complex features. + - Discuss both the potential benefits and any additional complexities of these features. + - Begin combining features only after simpler ones have been tested and validated. + + 4. New Directions and Optimizations: + - If results suggest a need for a new approach, explain why, using data analysis, domain knowledge, or observed patterns. + - Propose one new direction per iteration for clarity and focus. + - If a previous hypothesis did not surpass the previous best but shows promise, continue in the same direction with optimizations. + - Emphasize that features that outperform previous best results are added to the feature library, avoiding redundant work. + + 5. 1-3 Feature Tasks per Generation: + - Each generation should produce 1-3 feature tasks. + - Maintain a balance between simplicity and complexity to develop a diverse and robust feature library. Feature processing: |- Action: Feature processing - Define_the_processing_method: - - Clearly state the type of feature processing. - - Explain how this processing captures data patterns or improves feature usefulness. - - Avoid redundant details. - Begin_with_simple_processing: - - Start with simple, effective processing methods. - - Concisely explain why these methods should improve model performance. - - Introduce complex processing only after gathering experimental results. - Introduce_complexity_gradually: - - Add more sophisticated processing methods step-by-step, after validation. - - Discuss the advantages, challenges, and trade-offs of advanced processing. - - Validate simpler methods before combining them with complex ones. + + 1. Feature Transformation and Normalization: + - Clearly define any transformations applied to features (e.g., scaling, normalization, log transforms). + - Explain how these transformations improve the data's suitability for the model. + - Ensure transformations do not introduce unnecessary complexity early on. + + 2. Handling Missing Values and Outliers: + - Define any imputation methods used for missing data (e.g., mean, median, or more complex methods). + - Explain how outliers are handled (e.g., clipping, removal, or transformation). + - Ensure these processes are straightforward, enhancing data quality without overcomplicating early feature processing. + + 3. Feature Interactions and Combinations: + - After testing individual features, introduce combinations or interactions. + - Discuss the potential advantages of feature interaction terms (e.g., polynomial or multiplicative features). + - Ensure interactions are only applied after simpler, individual features have been processed. + + 4. 1-3 Feature Tasks per Generation: + - Each generation should produce 1-3 feature tasks. + - Maintain a balance between simplicity and complexity to develop a diverse and robust feature library. Model feature selection: |- Selection_based_on_model_type: @@ -95,33 +109,37 @@ hypothesis_specification: - Clarify how the selected features complement the model's strengths and handle its potential weaknesses. Model tuning: |- - Explain the hypothesis clearly with valuable information. What kind of model are you building/tuning? What do you think is true? 
How you are revising and why? What are some innvations? - Focus_on_architecture_or_hyper_parameter_tuning_or_both: - - Focus on designing new model architectures one at a time OR hyper-parameter tuning OR both. - - Each hypothesis should introduce a novel architecture or a significant modification to an existing one, while leveraging previous experiences and the hypothesis history. - - Optimize one model at a time, iterating until its potential is fully explored. Switch to a new model only when you believe the current model’s potential has been exhausted. - Specific_to_model_type: - - Note that any types of tuning or model design must be specific to the model types available in our workspace. - - Clearly define the model type (e.g., Neural Network Models (eg, MLP, CNN, RNN, LSTM, GRU etc.), XGBoost, RandomForest, LightGBM) and the architecture/tuning being introduced. - - Ensure the architecture or tuning aligns with the data characteristics and the strengths or limitations of the specific model. - Rationale_behind_architecture_and_tuning: - - Explain the innovation or reasoning behind the architectural design or tuning approach. - - Justify how the new structure or parameter change captures data patterns more effectively, improves learning efficiency, or enhances predictive power. - Start_simple_innovate_gradually: - - Start with innovative yet simple changes to ensure each iteration is well-tested and the results are well-understood. - - Gradually introduce more complex architectural changes or hyper-parameter adjustments based on gathered results and insights. - Introduce_one_innovation_at_a_time: - - Focus on testing one key innovation at a time to isolate its impact on performance. - - Avoid combining multiple innovations in a single iteration to maintain clarity in performance results. - Balance_innovation_with_performance: - - Strive for a balance between creative design and practical, effective performance. - - If a design or tuning shows strong performance, document it in a "library" for future iterations. - Iterative_testing_and_refinement: - - After each test, evaluate and refine the model architecture or tuning based on observed performance and data patterns. - - If a hypothesis shows potential but doesn't surpass previous results, continue optimizing in that direction. - Hypothesis_statement: - - For each hypothesis, specify the exact innovation or tuning approach and explain why it's expected to enhance performance for the chosen model type. - + Explain the hypothesis clearly with valuable information. What kind of model are you building/tuning? What do you think is true? How are you revising and why? What are some innovations? Base your hypothesis on the previous history and your understanding of the model code. "Tune" means changing the model architecture or hyperparameters. + Focus_on_architecture_or_hyper_parameter_tuning_or_both: + - Focus on designing new model architectures one at a time OR hyper-parameter tuning OR both. + - Each hypothesis should introduce a novel architecture or a significant modification to an existing one, while leveraging previous experiences and the hypothesis history. + - Optimize one model at a time, iterating until its potential is fully explored. Switch to a new model only when you believe the current model’s potential has been exhausted. + Specific_to_model_type: + - Note that any types of tuning or model design must be specific to the model types available in our workspace. 
+ - Clearly define the model type (e.g., Neural Network Models (MLP, CNN, RNN, LSTM, GRU, etc.), XGBoost, RandomForest, LightGBM) and the architecture/tuning being introduced. + - Ensure the architecture or tuning aligns with the data characteristics and the strengths or limitations of the specific model. + Rationale_behind_architecture_and_tuning: + - Explain the innovation or reasoning behind the architectural design or tuning approach. + - Justify how the new structure or parameter change captures data patterns more effectively, improves learning efficiency, or enhances predictive power. + Start_simple_innovate_gradually: + - Start with innovative yet simple changes to ensure each iteration is well-tested and the results are well-understood. + - Gradually introduce more complex architectural changes or hyper-parameter adjustments based on gathered results and insights. + Introduce_one_innovation_at_a_time: + - Focus on testing one key innovation at a time to isolate its impact on performance. + - Avoid combining multiple innovations in a single iteration to maintain clarity in performance results. + Hypothesis_statement: + - For each hypothesis, specify the exact innovation or tuning approach and explain why it's expected to enhance performance for the chosen model type. E.g., instead of a general "Adjusting", specify the direction and extent. + Hypothesis_examples: (Please note that they are only examples) + 1. "Increasing the dropout rate in an MLP from 0.2 to 0.5 will help reduce overfitting and improve model generalization on validation data." + 2. "Adding a skip connection to the CNN architecture will allow deeper layers to receive gradients more effectively, preventing vanishing gradients and improving feature learning." + 3. "Doubling the GRU hidden units from 128 to 256 will allow the model to capture more complex temporal dependencies in time-series data, potentially improving accuracy." + 4. "Switching the LSTM optimizer from Adam to SGD with momentum will slow down convergence, leading to more stable and refined learning over time in sparse data environments." + 5. "Reducing the learning rate in a LightGBM model from 0.05 to 0.01 will slow down the learning process, allowing for better generalization on larger datasets." + 6. "Incorporating a self-attention layer into the RNN model will enhance its ability to focus on important parts of the input sequence, improving sequence-to-sequence translation accuracy." + 7. "Increasing the maximum depth of trees in a RandomForest model from 10 to 20 will allow the model to capture more complex patterns in high-dimensional data, improving performance." + 8. "Replacing the ReLU activation with Leaky ReLU in a CNN will prevent the dying ReLU problem and improve the model’s ability to learn from negative values." + 9. "Introducing early stopping in XGBoost with a patience of 10 rounds will prevent overfitting by halting training when the validation error no longer improves." + 10. "Expanding the CNN kernel size from 3x3 to 5x5 in early layers will help capture larger spatial dependencies in image data, enhancing performance on image classification tasks." feature_experiment_output_format: |- According to the hypothesis, please help user design one or more feature engineering tasks.
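Note on the repeated select_*.py helpers added throughout this patch: train.py concatenates the per-module feature blocks with pd.concat(..., axis=1, keys=[f"feature_{i}" ...]), which gives the resulting frame a two-level column index; each select() flattens that index into plain underscore-joined column names before the data reaches the model fit()/predict() functions. A minimal, self-contained sketch of that behaviour (the toy column names below are illustrative only, not taken from the templates):

import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    # Same flattening logic as the select_*.py files in this patch.
    if X.columns.nlevels == 1:
        return X
    X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
    return X


# Two feature blocks, standing in for the outputs of separate feature/feat*.py modules.
block_a = pd.DataFrame({"x1": [1, 2], "x2": [3, 4]})
block_b = pd.DataFrame({"x1": [5, 6]})
X = pd.concat([block_a, block_b], axis=1, keys=["feature_0", "feature_1"])

print(X.columns.nlevels)  # 2: the concat keys add an outer column level
print(list(select(X).columns))  # ['feature_0_x1', 'feature_0_x2', 'feature_1_x1']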
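Similarly, the score_rank() helper added to kaggle_crawler.py infers the leaderboard direction from the ordering of the scores and uses bisect to place a submission. A small worked example, assuming a made-up leaderboard passed in directly (the real helper fetches the scores via leaderboard_scores(competition)):

import bisect


def score_rank(scores: list[float], score: float) -> tuple[int, float]:
    # Same ranking logic as the new helper, but taking the leaderboard scores
    # as an argument instead of looking them up through the Kaggle API.
    if scores[0] < scores[-1]:  # Ascending order: presumably a lower-is-better metric
        rank = bisect.bisect_right(scores, score)
    else:  # Descending order: presumably a higher-is-better metric
        scores = scores[::-1]  # Reverse the list because bisect expects ascending input
        rank = len(scores) - bisect.bisect_right(scores, score)
    rank = rank + 1
    rank_percent = rank / len(scores) * 100
    return rank, rank_percent


print(score_rank([0.99, 0.98, 0.95, 0.90], 0.96))  # (3, 75.0): two entries beat 0.96
print(score_rank([0.10, 0.20, 0.30, 0.40], 0.35))  # (4, 100.0): three entries are below 0.35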