From bab167311db87b5ad2b1e20b5d5bfd8b2998e61c Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Wed, 4 Sep 2024 14:24:28 +0100
Subject: [PATCH 1/2] init: m5 forecasting FE benchmark

---
 m5-forecasting-feature-engineering/README.md | 27 ++++++
 .../duckdb_queries.py | 1 +
 .../pandas_queries.py | 73 +++++++++++++++
 .../polars_queries.py | 88 +++++++++++++++++++
 4 files changed, 189 insertions(+)
 create mode 100644 m5-forecasting-feature-engineering/README.md
 create mode 100644 m5-forecasting-feature-engineering/duckdb_queries.py
 create mode 100644 m5-forecasting-feature-engineering/pandas_queries.py
 create mode 100644 m5-forecasting-feature-engineering/polars_queries.py

diff --git a/m5-forecasting-feature-engineering/README.md b/m5-forecasting-feature-engineering/README.md
new file mode 100644
index 0000000..81dd863
--- /dev/null
+++ b/m5-forecasting-feature-engineering/README.md
@@ -0,0 +1,27 @@
+# M5 Forecasting Feature Engineering
+
+The [M5 Forecasting Competition](https://www.sciencedirect.com/science/article/pii/S0169207021001874) was held on Kaggle in 2020,
+and top solutions generally featured a lot of heavy feature engineering.
+
+Participants typically used pandas (Polars was only just getting started at the time), so here we benchmark how long it would have
+taken to do the same feature engineering with Polars (and, coming soon, DuckDB).
+
+We believe this to be a useful task to benchmark, because:
+
+- the competition was run on real-world Walmart data
+- the operations we're benchmarking are from the winning solution, so evidently they were doing something right
+
+The original code can be found here: https://github.com/Mcompetitions/M5-methods. We run part of the preprocessing
+functions from the top solution "A1". The code is generally kept as-is, with some minor modifications to deal
+with pandas syntax updates which happened in the last 2 years.
+ +## Running the benchmark + +**Data**: download the output files from https://www.kaggle.com/code/marcogorelli/winning-solution-preprocessing. +Place them in a `data` folder here. + +**Run**: + +- `python pandas_queries.py` +- `python polars_queries.py` + diff --git a/m5-forecasting-feature-engineering/duckdb_queries.py b/m5-forecasting-feature-engineering/duckdb_queries.py new file mode 100644 index 0000000..05292fa --- /dev/null +++ b/m5-forecasting-feature-engineering/duckdb_queries.py @@ -0,0 +1 @@ +# coming soon ;) diff --git a/m5-forecasting-feature-engineering/pandas_queries.py b/m5-forecasting-feature-engineering/pandas_queries.py new file mode 100644 index 0000000..53599b0 --- /dev/null +++ b/m5-forecasting-feature-engineering/pandas_queries.py @@ -0,0 +1,73 @@ +import os +import time + +import numpy as np +import pandas as pd +import pyarrow + +print("pandas version", pd.__version__) +print("numpy version", np.__version__) +print("pyarrow version", pyarrow.__version__) + +pd.options.mode.copy_on_write = True +pd.options.future.infer_string = True + +PROCESSED_DATA_DIR = "data" + +TARGET = "sales" +SHIFT_DAY = 28 + +# Set this to True if you just want to test that everything runs +SMALL = True +if SMALL: + PATH = os.path.join(PROCESSED_DATA_DIR, "grid_part_1_small.parquet") +else: + PATH = os.path.join(PROCESSED_DATA_DIR, "grid_part_1.parquet") + +LAG_DAYS = [col for col in range(SHIFT_DAY, SHIFT_DAY + 15)] + + +def q1_pandas(df): + return df.assign( + **{ + f"{TARGET}_lag_{l}": df.groupby(["id"], observed=True)[TARGET].transform( + lambda x: x.shift(l) + ) + for l in LAG_DAYS + } + ) + + +def q2_pandas(df): + for i in [7, 14, 30, 60, 180]: + df["rolling_mean_" + str(i)] = df.groupby(["id"], observed=True)[ + TARGET + ].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).mean()) + for i in [7, 14, 30, 60, 180]: + df["rolling_std_" + str(i)] = df.groupby(["id"], observed=True)[ + TARGET + ].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).std()) + return 
df
+
+
+def q3_pandas(df):
+    for d_shift in [1, 7, 14]:
+        for d_window in [7, 14, 30, 60]:
+            col_name = "rolling_mean_" + str(d_shift) + "_" + str(d_window)
+            df[col_name] = df.groupby(["id"], observed=True)[TARGET].transform(
+                lambda x: x.shift(d_shift).rolling(d_window).mean()
+            )
+    return df
+
+
+start_time = time.perf_counter()
+q1_pandas(pd.read_parquet(PATH, engine="pyarrow"))
+print(f"q1 took: {time.perf_counter() - start_time}")
+
+start_time = time.perf_counter()
+q2_pandas(pd.read_parquet(PATH, engine="pyarrow"))
+print(f"q2 took: {time.perf_counter() - start_time}")
+
+start_time = time.perf_counter()
+q3_pandas(pd.read_parquet(PATH, engine="pyarrow"))
+print(f"q3 took: {time.perf_counter() - start_time}")
diff --git a/m5-forecasting-feature-engineering/polars_queries.py b/m5-forecasting-feature-engineering/polars_queries.py
new file mode 100644
index 0000000..e7a9afa
--- /dev/null
+++ b/m5-forecasting-feature-engineering/polars_queries.py
@@ -0,0 +1,88 @@
+import os
+import time
+
+import polars as pl
+
+print("polars version", pl.__version__)
+
+PROCESSED_DATA_DIR = "data"
+
+TARGET = "sales"
+SHIFT_DAY = 28
+
+# Set this to True if you just want to test that everything runs
+SMALL = True
+if SMALL:
+    PATH = os.path.join(PROCESSED_DATA_DIR, "grid_part_1_small.parquet")
+else:
+    PATH = os.path.join(PROCESSED_DATA_DIR, "grid_part_1.parquet")
+
+LAG_DAYS = [col for col in range(SHIFT_DAY, SHIFT_DAY + 15)]
+
+
+def q1_polars(df):
+    return df.with_columns(
+        pl.col(TARGET).shift(l).over("id").alias(f"{TARGET}_lag_{l}") for l in LAG_DAYS
+    )
+
+
+def q2_polars(df):
+    return df.with_columns(
+        *[
+            pl.col(TARGET)
+            .shift(SHIFT_DAY)
+            .rolling_mean(window_size=i)
+            .over("id")
+            .alias(f"rolling_mean_{i}")
+            for i in [7, 14, 30, 60, 180]
+        ],
+        *[
+            pl.col(TARGET)
+            .shift(SHIFT_DAY)
+            .rolling_std(window_size=i)
+            .over("id")
+            .alias(f"rolling_std_{i}")
+            for i in [7, 14, 30, 60, 180]
+        ],
+    )
+
+
+def q3_polars(df):
+    return df.with_columns(
pl.col(TARGET)
+        .shift(d_shift)
+        .rolling_mean(window_size=d_window)
+        .over("id")
+        .alias(f"rolling_mean_{d_shift}_{d_window}")
+        for d_shift in [1, 7, 14]
+        for d_window in [7, 14, 30, 60]
+    )
+
+
+print("*** polars lazy ***")
+
+start_time = time.perf_counter()
+q1_polars(pl.scan_parquet(PATH)).collect()
+print(f"q1 took: {time.perf_counter() - start_time}")
+
+start_time = time.perf_counter()
+q2_polars(pl.scan_parquet(PATH)).collect()
+print(f"q2 took: {time.perf_counter() - start_time}")
+
+start_time = time.perf_counter()
+q3_polars(pl.scan_parquet(PATH)).collect()
+print(f"q3 took: {time.perf_counter() - start_time}")
+
+print("*** polars eager ***")
+
+start_time = time.perf_counter()
+q1_polars(pl.read_parquet(PATH))
+print(f"q1 took: {time.perf_counter() - start_time}")
+
+start_time = time.perf_counter()
+q2_polars(pl.read_parquet(PATH))
+print(f"q2 took: {time.perf_counter() - start_time}")
+
+start_time = time.perf_counter()
+q3_polars(pl.read_parquet(PATH))
+print(f"q3 took: {time.perf_counter() - start_time}")
From df2ccb59862850d0f65ee449d9c8c060b47027cb Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Wed, 4 Sep 2024 14:35:57 +0100
Subject: [PATCH 2/2] lint

---
 .../pandas_queries.py | 20 +++++++++----------
 .../polars_queries.py | 12 ++++++-----
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/m5-forecasting-feature-engineering/pandas_queries.py b/m5-forecasting-feature-engineering/pandas_queries.py
index 53599b0..401014b 100644
--- a/m5-forecasting-feature-engineering/pandas_queries.py
+++ b/m5-forecasting-feature-engineering/pandas_queries.py
@@ -1,5 +1,5 @@
-import os
 import time
+from pathlib import Path
 
 import numpy as np
 import pandas as pd
 import pyarrow
@@ -20,20 +20,20 @@
 # Set this to True if you just want to test that everything runs
 SMALL = True
 if SMALL:
-    PATH = os.path.join(PROCESSED_DATA_DIR, "grid_part_1_small.parquet")
+    PATH = Path(PROCESSED_DATA_DIR) / 
"grid_part_1_small.parquet" else: - PATH = os.path.join(PROCESSED_DATA_DIR, "grid_part_1.parquet") + PATH = Path(PROCESSED_DATA_DIR) / "grid_part_1.parquet" -LAG_DAYS = [col for col in range(SHIFT_DAY, SHIFT_DAY + 15)] +LAG_DAYS = list(range(SHIFT_DAY, SHIFT_DAY + 15)) def q1_pandas(df): return df.assign( **{ - f"{TARGET}_lag_{l}": df.groupby(["id"], observed=True)[TARGET].transform( - lambda x: x.shift(l) + f"{TARGET}_lag_{lag}": df.groupby(["id"], observed=True)[TARGET].transform( + lambda x: x.shift(lag) # noqa: B023 ) - for l in LAG_DAYS + for lag in LAG_DAYS } ) @@ -42,11 +42,11 @@ def q2_pandas(df): for i in [7, 14, 30, 60, 180]: df["rolling_mean_" + str(i)] = df.groupby(["id"], observed=True)[ TARGET - ].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).mean()) + ].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).mean()) # noqa: B023 for i in [7, 14, 30, 60, 180]: df["rolling_std_" + str(i)] = df.groupby(["id"], observed=True)[ TARGET - ].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).std()) + ].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).std()) # noqa: B023 return df @@ -55,7 +55,7 @@ def q3_pandas(df): for d_window in [7, 14, 30, 60]: col_name = "rolling_mean_" + str(d_shift) + "_" + str(d_window) df[col_name] = df.groupby(["id"], observed=True)[TARGET].transform( - lambda x: x.shift(d_shift).rolling(d_window).mean() + lambda x: x.shift(d_shift).rolling(d_window).mean() # noqa: B023 ) return df diff --git a/m5-forecasting-feature-engineering/polars_queries.py b/m5-forecasting-feature-engineering/polars_queries.py index e7a9afa..557228e 100644 --- a/m5-forecasting-feature-engineering/polars_queries.py +++ b/m5-forecasting-feature-engineering/polars_queries.py @@ -1,5 +1,5 @@ -import os import time +from pathlib import Path import polars as pl @@ -10,19 +10,21 @@ TARGET = "sales" SHIFT_DAY = 28 + # Set this to True if you just want to test that everything runs SMALL = True if SMALL: - PATH = os.path.join(PROCESSED_DATA_DIR, 
"grid_part_1_small.parquet") + PATH = Path(PROCESSED_DATA_DIR) / "grid_part_1_small.parquet" else: - PATH = os.path.join(PROCESSED_DATA_DIR, "grid_part_1.parquet") + PATH = Path(PROCESSED_DATA_DIR) / "grid_part_1.parquet" -LAG_DAYS = [col for col in range(SHIFT_DAY, SHIFT_DAY + 15)] +LAG_DAYS = list(range(SHIFT_DAY, SHIFT_DAY + 15)) def q1_polars(df): return df.with_columns( - pl.col(TARGET).shift(l).over("id").alias(f"{TARGET}_lag_{l}") for l in LAG_DAYS + pl.col(TARGET).shift(lag).over("id").alias(f"{TARGET}_lag_{lag}") + for lag in LAG_DAYS )