diff --git a/Dockerfile b/Dockerfile index de22b58..1e22357 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,8 +6,8 @@ RUN conda env create -f /environment.yml RUN mkdir /app COPY data /data -COPY script.py / +COPY *.py / COPY models /models -ENTRYPOINT ["conda", "run", "-n", "eyra-rank", "python", "/script.py"] +ENTRYPOINT ["conda", "run", "-n", "eyra-rank", "python", "/run.py"] CMD ["predict", "/data/fake_data.csv"] \ No newline at end of file diff --git a/script.py b/run.py similarity index 77% rename from script.py rename to run.py index 3e94886..021b510 100644 --- a/script.py +++ b/run.py @@ -19,11 +19,10 @@ python script.py data/test_data_liss_2_subjects.csv """ -import os import sys import argparse import pandas as pd -from joblib import load +import submission parser = argparse.ArgumentParser(description="Process and score data.") subparsers = parser.add_subparsers(dest="command") @@ -46,47 +45,14 @@ args = parser.parse_args() -def predict_outcomes(df): - """Process the input data and write the predictions.""" - - # The predict_outcomes function accepts a Pandas DataFrame as an argument - # and returns a new DataFrame with two columns: nomem_encr and - # prediction. The nomem_encr column in the new DataFrame replicates the - # corresponding column from the input DataFrame. The prediction - # column contains predictions for each corresponding nomem_encr. Each - # prediction is represented as a binary value: '0' indicates that the - # individual did not have a child during 2020-2022, while '1' implies that - # they did. 
- - # Keep - keepcols = [ - "burgstat2019", - "leeftijd2019", - "woonvorm2019", - "oplmet2019", - "aantalki2019", - ] - nomem_encr = df["nomem_encr"] - - df = df.loc[:, keepcols] - - # Load your trained model from the models directory - model_path = os.path.join(os.path.dirname(__file__), "models", "model.joblib") - model = load(model_path) - - # Use your trained model for prediction - predictions = model.predict(df) - # Return the result as a Pandas DataFrame with the columns "nomem_encr" and "prediction" - return pd.concat([nomem_encr, pd.Series(predictions, name="prediction")], axis=1) - - def predict(input_path, output): if output is None: output = sys.stdout df = pd.read_csv( input_path, encoding="latin-1", encoding_errors="replace", low_memory=False ) - predictions = predict_outcomes(df) + df = submission.clean_df(df) + predictions = submission.predict_outcomes(df) assert ( predictions.shape[1] == 2 ), "Predictions must have two columns: nomem_encr and prediction" diff --git a/submission.py b/submission.py new file mode 100644 index 0000000..806abeb --- /dev/null +++ b/submission.py @@ -0,0 +1,77 @@ +""" +This is an example script to generate the outcome variable given the input dataset. + +This script should be modified to prepare your own submission that predicts +the outcome for the benchmark challenge by changing the predict_outcomes function. + +The predict_outcomes function takes a Pandas data frame. The return value must +be a data frame with two columns: nomem_encr and prediction. The nomem_encr column +should contain the nomem_encr column from the input data frame. The prediction +column should contain the predicted outcome for each nomem_encr. The prediction +should be 0 (no child) or 1 (having a child). 
+
+The script can be run from the command line using the following command:
+
+python run.py input_path
+
+An example for the provided test is:
+
+python run.py data/test_data_liss_2_subjects.csv
+"""
+
+import os
+import sys
+import argparse
+import pandas as pd
+from joblib import load
+
+
+def clean_df(df):
+    """Prepare the raw input data before it is passed to predict_outcomes.
+
+    The default implementation is a pass-through. Replace the body with your
+    own cleaning steps; if all cleaning happens inside a model pipeline,
+    leave only the "return df" statement.
+
+    Args:
+        df: Pandas DataFrame holding the raw input data.
+
+    Returns:
+        The DataFrame that will be handed to predict_outcomes.
+    """
+    # e.g. keep some variables (the ones you used in your model).
+    # NOTE: "nomem_encr" must survive the cleaning step, because
+    # predict_outcomes reads it to label each prediction.
+    # keepcols = [
+    #     "nomem_encr",
+    #     "burgstat2019",
+    #     "leeftijd2019",
+    #     "woonvorm2019",
+    #     "oplmet2019",
+    #     "aantalki2019",
+    # ]
+    # df = df.loc[:, keepcols]
+
+    return df
+
+
+def predict_outcomes(df):
+    """Predict the outcome for every row of the (cleaned) input data.
+
+    Each prediction is a binary value: 0 indicates that the individual did
+    not have a child during 2020-2022, while 1 implies that they did.
+
+    Args:
+        df: Pandas DataFrame containing the "nomem_encr" column and every
+            column listed in keepcols below.
+
+    Returns:
+        A DataFrame with two columns, "nomem_encr" and "prediction", with one
+        row per input row and the same index as df.
+
+    Raises:
+        KeyError: if "nomem_encr" or one of the model columns is missing.
+    """
+    # Columns the example model uses as features.
+    keepcols = [
+        "burgstat2019",
+        "leeftijd2019",
+        "woonvorm2019",
+        "oplmet2019",
+        "aantalki2019",
+    ]
+    nomem_encr = df["nomem_encr"]
+    features = df.loc[:, keepcols]
+
+    # Load your trained model from the models directory
+    model_path = os.path.join(os.path.dirname(__file__), "models", "model.joblib")
+    model = load(model_path)
+
+    # Use your trained model for prediction
+    predictions = model.predict(features)
+
+    # Predictions come back in row order. Give them the input's index so the
+    # column-wise concat pairs them by position even when clean_df dropped or
+    # reordered rows; a fresh 0..n-1 index would be aligned by label instead
+    # and produce mismatched or NaN-padded rows.
+    prediction = pd.Series(predictions, name="prediction")
+    prediction.index = nomem_encr.index
+
+    # Return the result as a Pandas DataFrame with the columns "nomem_encr" and "prediction"
+    return pd.concat([nomem_encr, prediction], axis=1)