Merge pull request #54 from lincc-frameworks/bench-assign

Benchmarks for nested element assignment
lincc-frameworks · May 7, 2024 · ab5fc4c · ab5fc4c
2 parents f2d488f + 7e3cd15
commit ab5fc4c
Showing 1 changed file with 91 additions and 7 deletions.
diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py
@@ -3,14 +3,98 @@
 For more information on writing benchmarks:
 https://asv.readthedocs.io/en/stable/writing_benchmarks.html."""
 
-from nested_pandas import example_benchmarks
+import numpy as np
+import pandas as pd
+import pyarrow as pa
+from nested_pandas import NestedDtype
 
 
-def time_computation():
-    """Time computations are prefixed with 'time'."""
-    example_benchmarks.runtime_computation()
+class AssignSingleDfToNestedSeries:
+    """Benchmark replacing a single element of a nested series with a new data frame."""
 
+    n_objects = 10_000  # number of elements in the benchmarked series
+    n_sources = 100  # number of rows in every nested data frame
+    new_df: pd.DataFrame
+    series: pd.Series
 
-def mem_list():
-    """Memory computations are prefixed with 'mem' or 'peakmem'."""
-    return example_benchmarks.memory_computation()
+    def setup(self):
+        """Build the replacement data frame and the nested series that gets modified."""
+        self.new_df = pd.DataFrame(
+            {
+                "time": np.arange(self.n_sources, dtype=np.float64),
+                "flux": np.linspace(0, 1, self.n_sources),
+                "band": np.full(self.n_sources, "lsstg"),
+            }
+        )
+        original_df = pd.DataFrame(
+            {
+                "time": np.linspace(0, 1, self.n_sources),
+                "flux": np.arange(self.n_sources, dtype=np.float64),
+                "band": np.full(self.n_sources, "sdssu"),
+            }
+        )
+        self.series = pd.Series(
+            [original_df] * self.n_objects,
+            # When NestedExtensionArray inherited from ArrowExtensionArray it sorted the fields, so we
+            # need to order by field name here for backwards compatibility.
+            dtype=NestedDtype.from_fields({"band": pa.string(), "flux": pa.float64(), "time": pa.float64()}),
+        )
+
+    def run(self):
+        """Overwrite the middle element of the series with the replacement data frame."""
+        self.series[self.n_objects // 2] = self.new_df
+
+    def time_run(self):
+        """Benchmark the runtime of changing a single nested series element."""
+        self.run()
+
+    def peakmem_run(self):
+        """Benchmark the peak memory usage of changing a single nested series element."""
+        self.run()
+
+
+class ReassignHalfOfNestedSeries:
+    """Benchmark replacing every second element of a nested series in one assignment."""
+
+    n_objects = 10_000  # number of elements in the benchmarked series
+    n_sources = 100  # number of rows in every nested data frame
+    series: pd.Series
+    new_series: pd.Series
+
+    def setup(self):
+        """Build the nested series to modify and a half-length series of replacement values."""
+        # When NestedExtensionArray inherited from ArrowExtensionArray it sorted the fields, so we need to
+        # order by field name here for backwards compatibility.
+        dtype = NestedDtype.from_fields({"band": pa.string(), "flux": pa.float64(), "time": pa.float64()})
+        original_df = pd.DataFrame(
+            {
+                "time": np.linspace(0, 1, self.n_sources),
+                "flux": np.arange(self.n_sources, dtype=np.float64),
+                "band": np.full(self.n_sources, "sdssu"),
+            }
+        )
+        self.series = pd.Series(
+            [original_df] * self.n_objects,
+            dtype=dtype,
+        )
+
+        new_df = pd.DataFrame(
+            {
+                "time": np.arange(self.n_sources, dtype=np.float64),
+                "flux": np.linspace(0, 1, self.n_sources),
+                "band": np.full(self.n_sources, "lsstg"),
+            }
+        )
+        self.new_series = pd.Series([new_df] * (self.n_objects // 2), dtype=dtype)
+
+    def run(self):
+        """Assign the replacement series to every second element of the series."""
+        self.series[::2] = self.new_series
+
+    def time_run(self):
+        """Benchmark the runtime of reassigning half of the nested series elements."""
+        self.run()
+
+    def peakmem_run(self):
+        """Benchmark the peak memory usage of reassigning half of the nested series elements."""
+        self.run()