Split script into execution & submission part

eyra · Mar 3, 2024 · d3470da · d3470da
1 parent de7cd18
commit d3470da
Show file tree

Hide file tree

Showing 3 changed files with 82 additions and 39 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -6,8 +6,8 @@ RUN conda env create -f /environment.yml
 RUN mkdir /app
 
 COPY data /data
-COPY script.py /
+COPY *.py /
 COPY models /models
 
-ENTRYPOINT ["conda", "run", "-n", "eyra-rank", "python", "/script.py"]
+ENTRYPOINT ["conda", "run", "-n", "eyra-rank", "python", "/run.py"]
 CMD ["predict", "/data/fake_data.csv"]
diff --git a/script.py → run.py b/script.py → run.py
@@ -19,11 +19,10 @@
 python script.py data/test_data_liss_2_subjects.csv
 """
 
-import os
 import sys
 import argparse
 import pandas as pd
-from joblib import load
+import submission
 
 parser = argparse.ArgumentParser(description="Process and score data.")
 subparsers = parser.add_subparsers(dest="command")
@@ -46,47 +45,14 @@
 args = parser.parse_args()
 
 
-def predict_outcomes(df):
-    """Process the input data and write the predictions."""
-
-    # The predict_outcomes function accepts a Pandas DataFrame as an argument
-    # and returns a new DataFrame with two columns: nomem_encr and
-    # prediction. The nomem_encr column in the new DataFrame replicates the
-    # corresponding column from the input DataFrame. The prediction
-    # column contains predictions for each corresponding nomem_encr. Each
-    # prediction is represented as a binary value: '0' indicates that the
-    # individual did not have a child during 2020-2022, while '1' implies that
-    # they did.
-
-    # Keep
-    keepcols = [
-        "burgstat2019",
-        "leeftijd2019",
-        "woonvorm2019",
-        "oplmet2019",
-        "aantalki2019",
-    ]
-    nomem_encr = df["nomem_encr"]
-
-    df = df.loc[:, keepcols]
-
-    # Load your trained model from the models directory
-    model_path = os.path.join(os.path.dirname(__file__), "models", "model.joblib")
-    model = load(model_path)
-
-    # Use your trained model for prediction
-    predictions = model.predict(df)
-    # Return the result as a Pandas DataFrame with the columns "nomem_encr" and "prediction"
-    return pd.concat([nomem_encr, pd.Series(predictions, name="prediction")], axis=1)
-
-
 def predict(input_path, output):
     if output is None:
         output = sys.stdout
     df = pd.read_csv(
         input_path, encoding="latin-1", encoding_errors="replace", low_memory=False
     )
-    predictions = predict_outcomes(df)
+    df = submission.clean_df(df)
+    predictions = submission.predict_outcomes(df)
     assert (
         predictions.shape[1] == 2
     ), "Predictions must have two columns: nomem_encr and prediction"

diff --git a/submission.py b/submission.py
@@ -0,0 +1,77 @@
+"""
+This is an example script to generate the outcome variable given the input dataset.
+
+This script should be modified to prepare your own submission that predicts the
+outcome for the benchmark challenge by changing clean_df and predict_outcomes.
+
+The predict_outcomes function takes a Pandas data frame. The return value must
+be a data frame with two columns: nomem_encr and prediction. The nomem_encr
+column should contain the nomem_encr column from the input data frame. The
+prediction column should contain the predicted outcome for each nomem_encr.
+Each prediction should be 0 (no child) or 1 (having a child).
+
+The script can be run from the command line using the following command:
+
+python run.py input_path 
+
+An example for the provided test is:
+
+python run.py data/test_data_liss_2_subjects.csv
+"""
+
+import os
+import sys
+import argparse
+import pandas as pd
+from joblib import load
+
+
+def clean_df(df):
+    """Prepare the raw input data for predict_outcomes.
+
+    The template does no cleaning and hands df back untouched; keep it that
+    way when all preprocessing already lives in your model pipeline.
+    """
+    # Example: restrict df to the variables your model uses. Retain
+    # nomem_encr too, since predict_outcomes reads it from the cleaned frame.
+    # keepcols = [
+    #     "nomem_encr",
+    #     "burgstat2019", "leeftijd2019", "woonvorm2019",
+    #     "oplmet2019", "aantalki2019",
+    # ]
+    # df = df.loc[:, keepcols]
+    return df
+
+
+def predict_outcomes(df):
+    """Return one model prediction per row of df, keyed by nomem_encr.
+
+    Args:
+        df: Pandas DataFrame holding a nomem_encr column plus the feature
+            columns listed in keepcols below.
+
+    Returns:
+        DataFrame with two columns, nomem_encr and prediction, sharing the
+        index of df. Each prediction is binary: 0 means the individual did
+        not have a child during 2020-2022, 1 means they did.
+    """
+    # Columns passed to the model.
+    keepcols = [
+        "burgstat2019",
+        "leeftijd2019",
+        "woonvorm2019",
+        "oplmet2019",
+        "aantalki2019",
+    ]
+    nomem_encr = df["nomem_encr"]
+    features = df.loc[:, keepcols]
+
+    # Load your trained model from the models directory
+    model_path = os.path.join(os.path.dirname(__file__), "models", "model.joblib")
+    model = load(model_path)
+
+    # Give the predictions the input's index: pd.concat aligns on index, so a
+    # fresh 0..n-1 index would misalign rows (NaN) once df has been filtered.
+    predictions = pd.Series(model.predict(features), name="prediction")
+    predictions.index = nomem_encr.index
+    return pd.concat([nomem_encr, predictions], axis=1)