From 7e3cd159b5bd6ee3b49a8a95e6eb900c949bdecb Mon Sep 17 00:00:00 2001
From: Konstantin Malanchev
Date: Sat, 4 May 2024 07:50:34 -0400
Subject: [PATCH] Bench: massive nested series assignment

---
 benchmarks/benchmarks.py | 50 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py
index 12d36f1..bbbb575 100644
--- a/benchmarks/benchmarks.py
+++ b/benchmarks/benchmarks.py
@@ -35,7 +35,8 @@ def setup(self):
         )
         self.series = pd.Series(
             [original_df] * self.n_objects,
-            # Sorting is happening somewhere, so we need to order by field name here
+            # When we had NestedExtensionArray inheriting ArrowExtensionArray, it sorted the fields, so we
+            # need to order by field name here for backwards compatibility.
             dtype=NestedDtype.from_fields({"band": pa.string(), "flux": pa.float64(), "time": pa.float64()}),
         )
 
@@ -50,3 +51,50 @@ def time_run(self):
     def peakmem_run(self):
         """Benchmark the memory usage of changing a single nested series element."""
         self.run()
+
+
+class ReassignHalfOfNestedSeries:
+    """Benchmark the performance of changing a lot of nested series elements"""
+
+    n_objects = 10_000
+    n_sources = 100
+    series: pd.Series
+    new_series: pd.Series
+
+    def setup(self):
+        """Set up the benchmark environment."""
+        # When we had NestedExtensionArray inheriting ArrowExtensionArray, it sorted the fields, so we need to
+        # order by field name here for backwards compatibility.
+        dtype = NestedDtype.from_fields({"band": pa.string(), "flux": pa.float64(), "time": pa.float64()})
+        original_df = pd.DataFrame(
+            {
+                "time": np.linspace(0, 1, self.n_sources),
+                "flux": np.arange(self.n_sources, dtype=np.float64),
+                "band": np.full(self.n_sources, "sdssu"),
+            }
+        )
+        self.series = pd.Series(
+            [original_df] * self.n_objects,
+            dtype=dtype,
+        )
+
+        new_df = pd.DataFrame(
+            {
+                "time": np.arange(self.n_sources, dtype=np.float64),
+                "flux": np.linspace(0, 1, self.n_sources),
+                "band": np.full(self.n_sources, "lsstg"),
+            }
+        )
+        self.new_series = pd.Series([new_df] * (self.n_objects // 2), dtype=dtype)
+
+    def run(self):
+        """Run the benchmark."""
+        self.series[::2] = self.new_series
+
+    def time_run(self):
+        """Benchmark the runtime of reassigning every second nested series element."""
+        self.run()
+
+    def peakmem_run(self):
+        """Benchmark the memory usage of reassigning every second nested series element."""
+        self.run()