From bab167311db87b5ad2b1e20b5d5bfd8b2998e61c Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Wed, 4 Sep 2024 14:24:28 +0100
Subject: [PATCH 1/2] init: m5 forecasting FE benchmark

---
 m5-forecasting-feature-engineering/README.md | 27 ++++++
 .../duckdb_queries.py | 1 +
 .../pandas_queries.py | 73 +++++++++++++++
 .../polars_queries.py | 88 +++++++++++++++++++
 4 files changed, 189 insertions(+)
 create mode 100644 m5-forecasting-feature-engineering/README.md
 create mode 100644 m5-forecasting-feature-engineering/duckdb_queries.py
 create mode 100644 m5-forecasting-feature-engineering/pandas_queries.py
 create mode 100644 m5-forecasting-feature-engineering/polars_queries.py

diff --git a/m5-forecasting-feature-engineering/README.md b/m5-forecasting-feature-engineering/README.md
new file mode 100644
index 0000000..81dd863
--- /dev/null
+++ b/m5-forecasting-feature-engineering/README.md
@@ -0,0 +1,27 @@
+# M5 Forecasting Feature Engineering
+
+The [M5 Forecasting Competition](https://www.sciencedirect.com/science/article/pii/S0169207021001874) was held on Kaggle in 2020,
+and top solutions generally featured a lot of heavy feature engineering.
+
+Participants typically used pandas (Polars was only just getting started at the time), so here we benchmark how long it would have
+taken to do the same feature engineering with Polars (and, coming soon, DuckDB).
+
+We believe this to be a useful task to benchmark, because:
+
+- the competition was run on real-world Walmart data
+- the operations we're benchmarking are from the winning solution, so evidently they were doing something right
+
+The original code can be found here: https://github.com/Mcompetitions/M5-methods. We run part of the preprocessing
+functions from the top solution "A1". The code is generally kept as-is, with some minor modifications to deal
+with pandas syntax updates which happened in the last 2 years.
+ +## Running the benchmark + +**Data**: download the output files from https://www.kaggle.com/code/marcogorelli/winning-solution-preprocessing. +Place them in a `data` folder here. + +**Run**: + +- `python pandas_queries.py` +- `python polars_queries.py` + diff --git a/m5-forecasting-feature-engineering/duckdb_queries.py b/m5-forecasting-feature-engineering/duckdb_queries.py new file mode 100644 index 0000000..05292fa --- /dev/null +++ b/m5-forecasting-feature-engineering/duckdb_queries.py @@ -0,0 +1 @@ +# coming soon ;) diff --git a/m5-forecasting-feature-engineering/pandas_queries.py b/m5-forecasting-feature-engineering/pandas_queries.py new file mode 100644 index 0000000..53599b0 --- /dev/null +++ b/m5-forecasting-feature-engineering/pandas_queries.py @@ -0,0 +1,73 @@ +import os +import time + +import numpy as np +import pandas as pd +import pyarrow + +print("pandas version", pd.__version__) +print("numpy version", np.__version__) +print("pyarrow version", pyarrow.__version__) + +pd.options.mode.copy_on_write = True +pd.options.future.infer_string = True + +PROCESSED_DATA_DIR = "data" + +TARGET = "sales" +SHIFT_DAY = 28 + +# Set this to True if you just want to test that everything runs +SMALL = True +if SMALL: + PATH = os.path.join(PROCESSED_DATA_DIR, "grid_part_1_small.parquet") +else: + PATH = os.path.join(PROCESSED_DATA_DIR, "grid_part_1.parquet") + +LAG_DAYS = [col for col in range(SHIFT_DAY, SHIFT_DAY + 15)] + + +def q1_pandas(df): + return df.assign( + **{ + f"{TARGET}_lag_{l}": df.groupby(["id"], observed=True)[TARGET].transform( + lambda x: x.shift(l) + ) + for l in LAG_DAYS + } + ) + + +def q2_pandas(df): + for i in [7, 14, 30, 60, 180]: + df["rolling_mean_" + str(i)] = df.groupby(["id"], observed=True)[ + TARGET + ].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).mean()) + for i in [7, 14, 30, 60, 180]: + df["rolling_std_" + str(i)] = df.groupby(["id"], observed=True)[ + TARGET + ].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).std()) + return 
df
+
+
+def q3_pandas(df):
+    for d_shift in [1, 7, 14]:
+        for d_window in [7, 14, 30, 60]:
+            col_name = "rolling_mean_" + str(d_shift) + "_" + str(d_window)
+            df[col_name] = df.groupby(["id"], observed=True)[TARGET].transform(
+                lambda x: x.shift(d_shift).rolling(d_window).mean()
+            )
+    return df
+
+
+start_time = time.perf_counter()
+q1_pandas(pd.read_parquet(PATH, engine="pyarrow"))
+print(f"q1 took: {time.perf_counter() - start_time}")
+
+start_time = time.perf_counter()
+q2_pandas(pd.read_parquet(PATH, engine="pyarrow"))
+print(f"q2 took: {time.perf_counter() - start_time}")
+
+start_time = time.perf_counter()
+q3_pandas(pd.read_parquet(PATH, engine="pyarrow"))
+print(f"q3 took: {time.perf_counter() - start_time}")
diff --git a/m5-forecasting-feature-engineering/polars_queries.py b/m5-forecasting-feature-engineering/polars_queries.py
new file mode 100644
index 0000000..e7a9afa
--- /dev/null
+++ b/m5-forecasting-feature-engineering/polars_queries.py
@@ -0,0 +1,88 @@
+import os
+import time
+
+import polars as pl
+
+print("polars version", pl.__version__)
+
+PROCESSED_DATA_DIR = "data"
+
+TARGET = "sales"
+SHIFT_DAY = 28
+
+# Set this to True if you just want to test that everything runs
+SMALL = True
+if SMALL:
+    PATH = os.path.join(PROCESSED_DATA_DIR, "grid_part_1_small.parquet")
+else:
+    PATH = os.path.join(PROCESSED_DATA_DIR, "grid_part_1.parquet")
+
+LAG_DAYS = [col for col in range(SHIFT_DAY, SHIFT_DAY + 15)]
+
+
+def q1_polars(df):
+    return df.with_columns(
+        pl.col(TARGET).shift(l).over("id").alias(f"{TARGET}_lag_{l}") for l in LAG_DAYS
+    )
+
+
+def q2_polars(df):
+    return df.with_columns(
+        *[
+            pl.col(TARGET)
+            .shift(SHIFT_DAY)
+            .rolling_mean(window_size=i)
+            .over("id")
+            .alias(f"rolling_mean_{i}")
+            for i in [7, 14, 30, 60, 180]
+        ],
+        *[
+            pl.col(TARGET)
+            .shift(SHIFT_DAY)
+            .rolling_std(window_size=i)
+            .over("id")
+            .alias(f"rolling_std_{i}")
+            for i in [7, 14, 30, 60, 180]
+        ],
+    )
+
+
+def q3_polars(df):
+    return df.with_columns(
pl.col(TARGET)
+        .shift(d_shift)
+        .rolling_mean(window_size=d_window)
+        .over("id")
+        .alias(f"rolling_mean_{d_shift}_{d_window}")
+        for d_shift in [1, 7, 14]
+        for d_window in [7, 14, 30, 60]
+    )
+
+
+print("*** polars lazy ***")
+
+start_time = time.perf_counter()
+q1_polars(pl.scan_parquet(PATH)).collect()
+print(f"q1 took: {time.perf_counter() - start_time}")
+
+start_time = time.perf_counter()
+q2_polars(pl.scan_parquet(PATH)).collect()
+print(f"q2 took: {time.perf_counter() - start_time}")
+
+start_time = time.perf_counter()
+q3_polars(pl.scan_parquet(PATH)).collect()
+print(f"q3 took: {time.perf_counter() - start_time}")
+
+print("*** polars eager ***")
+
+start_time = time.perf_counter()
+q1_polars(pl.read_parquet(PATH))
+print(f"q1 took: {time.perf_counter() - start_time}")
+
+start_time = time.perf_counter()
+q2_polars(pl.read_parquet(PATH))
+print(f"q2 took: {time.perf_counter() - start_time}")
+
+start_time = time.perf_counter()
+q3_polars(pl.read_parquet(PATH))
+print(f"q3 took: {time.perf_counter() - start_time}")
From df2ccb59862850d0f65ee449d9c8c060b47027cb Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Wed, 4 Sep 2024 14:35:57 +0100
Subject: [PATCH 2/2] lint

---
 .../pandas_queries.py | 20 +++++++++----------
 .../polars_queries.py | 12 ++++++-----
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/m5-forecasting-feature-engineering/pandas_queries.py b/m5-forecasting-feature-engineering/pandas_queries.py
index 53599b0..401014b 100644
--- a/m5-forecasting-feature-engineering/pandas_queries.py
+++ b/m5-forecasting-feature-engineering/pandas_queries.py
@@ -1,5 +1,5 @@
-import os
 import time
+from pathlib import Path
 
 import numpy as np
 import pandas as pd
 import pyarrow
@@ -20,20 +20,20 @@
 # Set this to True if you just want to test that everything runs
 SMALL = True
 if SMALL:
-    PATH = os.path.join(PROCESSED_DATA_DIR, "grid_part_1_small.parquet")
+    PATH = Path(PROCESSED_DATA_DIR) / 
"grid_part_1_small.parquet" else: - PATH = os.path.join(PROCESSED_DATA_DIR, "grid_part_1.parquet") + PATH = Path(PROCESSED_DATA_DIR) / "grid_part_1.parquet" -LAG_DAYS = [col for col in range(SHIFT_DAY, SHIFT_DAY + 15)] +LAG_DAYS = list(range(SHIFT_DAY, SHIFT_DAY + 15)) def q1_pandas(df): return df.assign( **{ - f"{TARGET}_lag_{l}": df.groupby(["id"], observed=True)[TARGET].transform( - lambda x: x.shift(l) + f"{TARGET}_lag_{lag}": df.groupby(["id"], observed=True)[TARGET].transform( + lambda x: x.shift(lag) # noqa: B023 ) - for l in LAG_DAYS + for lag in LAG_DAYS } ) @@ -42,11 +42,11 @@ def q2_pandas(df): for i in [7, 14, 30, 60, 180]: df["rolling_mean_" + str(i)] = df.groupby(["id"], observed=True)[ TARGET - ].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).mean()) + ].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).mean()) # noqa: B023 for i in [7, 14, 30, 60, 180]: df["rolling_std_" + str(i)] = df.groupby(["id"], observed=True)[ TARGET - ].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).std()) + ].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).std()) # noqa: B023 return df @@ -55,7 +55,7 @@ def q3_pandas(df): for d_window in [7, 14, 30, 60]: col_name = "rolling_mean_" + str(d_shift) + "_" + str(d_window) df[col_name] = df.groupby(["id"], observed=True)[TARGET].transform( - lambda x: x.shift(d_shift).rolling(d_window).mean() + lambda x: x.shift(d_shift).rolling(d_window).mean() # noqa: B023 ) return df diff --git a/m5-forecasting-feature-engineering/polars_queries.py b/m5-forecasting-feature-engineering/polars_queries.py index e7a9afa..557228e 100644 --- a/m5-forecasting-feature-engineering/polars_queries.py +++ b/m5-forecasting-feature-engineering/polars_queries.py @@ -1,5 +1,5 @@ -import os import time +from pathlib import Path import polars as pl @@ -10,19 +10,21 @@ TARGET = "sales" SHIFT_DAY = 28 + # Set this to True if you just want to test that everything runs SMALL = True if SMALL: - PATH = os.path.join(PROCESSED_DATA_DIR, 
"grid_part_1_small.parquet") + PATH = Path(PROCESSED_DATA_DIR) / "grid_part_1_small.parquet" else: - PATH = os.path.join(PROCESSED_DATA_DIR, "grid_part_1.parquet") + PATH = Path(PROCESSED_DATA_DIR) / "grid_part_1.parquet" -LAG_DAYS = [col for col in range(SHIFT_DAY, SHIFT_DAY + 15)] +LAG_DAYS = list(range(SHIFT_DAY, SHIFT_DAY + 15)) def q1_polars(df): return df.with_columns( - pl.col(TARGET).shift(l).over("id").alias(f"{TARGET}_lag_{l}") for l in LAG_DAYS + pl.col(TARGET).shift(lag).over("id").alias(f"{TARGET}_lag_{lag}") + for lag in LAG_DAYS )