From 7e3cd159b5bd6ee3b49a8a95e6eb900c949bdecb Mon Sep 17 00:00:00 2001
From: Konstantin Malanchev <hombit@gmail.com>
Date: Sat, 4 May 2024 07:50:34 -0400
Subject: [PATCH] Bench: massive nested series assignment

---
 benchmarks/benchmarks.py | 50 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py
index 12d36f1..bbbb575 100644
--- a/benchmarks/benchmarks.py
+++ b/benchmarks/benchmarks.py
@@ -35,7 +35,8 @@ def setup(self):
         )
         self.series = pd.Series(
             [original_df] * self.n_objects,
-            # Sorting is happening somewhere, so we need to order by field name here
+            # When we had NestedExtensionArray inheriting ArrowExtensionArray, it sorted the fields, so we
+            # need to order by field name here for backwards compatibility.
             dtype=NestedDtype.from_fields({"band": pa.string(), "flux": pa.float64(), "time": pa.float64()}),
         )
 
@@ -50,3 +51,50 @@ def time_run(self):
     def peakmem_run(self):
         """Benchmark the memory usage of changing a single nested series element."""
         self.run()
+
+
+class ReassignHalfOfNestedSeries:
+    """Benchmark the performance of reassigning half of the elements of a nested series."""
+
+    n_objects = 10_000  # number of elements in the series being modified
+    n_sources = 100  # number of rows in each nested dataframe
+    series: pd.Series
+    new_series: pd.Series
+
+    def setup(self):
+        """Build the series to modify and the half-length series of replacement values."""
+        # When we had NestedExtensionArray inheriting ArrowExtensionArray, it sorted the fields, so we need to
+        # order by field name here for backwards compatibility.
+        dtype = NestedDtype.from_fields({"band": pa.string(), "flux": pa.float64(), "time": pa.float64()})
+        original_df = pd.DataFrame(
+            {
+                "time": np.linspace(0, 1, self.n_sources),
+                "flux": np.arange(self.n_sources, dtype=np.float64),
+                # np.full repeats the band label once per source row.
+                "band": np.full(self.n_sources, "sdssu"),
+            }
+        )
+        self.series = pd.Series([original_df] * self.n_objects, dtype=dtype)
+
+        new_df = pd.DataFrame(
+            {
+                "time": np.arange(self.n_sources, dtype=np.float64),
+                "flux": np.linspace(0, 1, self.n_sources),
+                "band": np.full(self.n_sources, "lsstg"),
+            }
+        )
+        # `run` assigns into the `[::2]` slice, which selects n_objects // 2 positions (n_objects is even),
+        # so the replacement series holds exactly that many elements.
+        self.new_series = pd.Series([new_df] * (self.n_objects // 2), dtype=dtype)
+
+    def run(self):
+        """Overwrite every second element of the series with the replacement series."""
+        self.series[::2] = self.new_series
+
+    def time_run(self):
+        """Benchmark the runtime of reassigning half of the nested series elements."""
+        self.run()
+
+    def peakmem_run(self):
+        """Benchmark the memory usage of reassigning half of the nested series elements."""
+        self.run()