Skip to content

Commit

Permalink
Merge pull request #54 from lincc-frameworks/bench-assign
Browse files Browse the repository at this point in the history
Benchmarks for nested element assignment
  • Loading branch information
hombit authored May 7, 2024
2 parents f2d488f + 7e3cd15 commit ab5fc4c
Showing 1 changed file with 91 additions and 7 deletions.
98 changes: 91 additions & 7 deletions benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,98 @@
For more information on writing benchmarks:
https://asv.readthedocs.io/en/stable/writing_benchmarks.html."""

from nested_pandas import example_benchmarks
import numpy as np
import pandas as pd
import pyarrow as pa
from nested_pandas import NestedDtype


def time_computation():
    """Run the example runtime computation as a timing benchmark.

    Timing benchmarks are recognised by the ``time`` prefix in their name.
    """
    example_benchmarks.runtime_computation()
class AssignSingleDfToNestedSeries:
    """Benchmark the performance of changing a single nested series element.

    ``setup`` builds a nested series of ``n_objects`` elements, each holding
    ``n_sources`` rows, plus one replacement frame; ``run`` writes that frame
    into the middle element of the series.
    """

    # Number of elements in the nested series.
    n_objects = 10_000
    # Number of rows in every nested element.
    n_sources = 100
    # Replacement frame assigned by ``run``; created in ``setup``.
    new_df: pd.DataFrame
    # Nested series that ``run`` modifies; created in ``setup``.
    series: pd.Series

    @staticmethod
    def _sources_df(time, flux, band):
        """Build one nested element with "time", "flux" and a constant "band" column."""
        return pd.DataFrame(
            {
                "time": time,
                "flux": flux,
                # np.full(shape, fill_value) gives one copy of the band name per row.
                # (np.full_like would take the string as the prototype array and the
                # row count as the fill value, producing a single "<count>" string.)
                "band": np.full(len(time), band),
            }
        )

    def setup(self):
        """Set up the benchmark environment."""
        self.new_df = self._sources_df(
            time=np.arange(self.n_sources, dtype=np.float64),
            flux=np.linspace(0, 1, self.n_sources),
            band="lsstg",
        )
        original_df = self._sources_df(
            time=np.linspace(0, 1, self.n_sources),
            flux=np.arange(self.n_sources, dtype=np.float64),
            band="sdssu",
        )
        self.series = pd.Series(
            [original_df] * self.n_objects,
            # When we had NestedExtensionArray inheriting ArrowExtensionArray, it sorted the fields, so we
            # need to order by field name here for backwards compatibility.
            dtype=NestedDtype.from_fields({"band": pa.string(), "flux": pa.float64(), "time": pa.float64()}),
        )

    def run(self):
        """Run the benchmark: replace the middle element of the series with ``new_df``."""
        self.series[self.n_objects // 2] = self.new_df

    def time_run(self):
        """Benchmark the runtime of changing a single nested series element."""
        self.run()

    def peakmem_run(self):
        """Benchmark the memory usage of changing a single nested series element."""
        self.run()


def mem_list():
    """Memory computations are prefixed with 'mem' or 'peakmem'."""
    return example_benchmarks.memory_computation()


class ReassignHalfOfNestedSeries:
    """Benchmark the performance of changing a lot of nested series elements.

    ``setup`` builds a nested series of ``n_objects`` elements and a second
    nested series half that long; ``run`` assigns the second series to every
    other element of the first.
    """

    # Number of elements in the nested series being modified.
    n_objects = 10_000
    # Number of rows in every nested element.
    n_sources = 100
    # Nested series that ``run`` modifies; created in ``setup``.
    series: pd.Series
    # Replacement values (``n_objects // 2`` elements); created in ``setup``.
    new_series: pd.Series

    @staticmethod
    def _sources_df(time, flux, band):
        """Build one nested element with "time", "flux" and a constant "band" column."""
        return pd.DataFrame(
            {
                "time": time,
                "flux": flux,
                # np.full(shape, fill_value) gives one copy of the band name per row.
                # (np.full_like would take the string as the prototype array and the
                # row count as the fill value, producing a single "<count>" string.)
                "band": np.full(len(time), band),
            }
        )

    def setup(self):
        """Set up the benchmark environment."""
        # When we had NestedExtensionArray inheriting ArrowExtensionArray, it sorted the fields, so we need to
        # order by field name here for backwards compatibility.
        dtype = NestedDtype.from_fields({"band": pa.string(), "flux": pa.float64(), "time": pa.float64()})
        original_df = self._sources_df(
            time=np.linspace(0, 1, self.n_sources),
            flux=np.arange(self.n_sources, dtype=np.float64),
            band="sdssu",
        )
        self.series = pd.Series(
            [original_df] * self.n_objects,
            dtype=dtype,
        )

        new_df = self._sources_df(
            time=np.arange(self.n_sources, dtype=np.float64),
            flux=np.linspace(0, 1, self.n_sources),
            band="lsstg",
        )
        self.new_series = pd.Series([new_df] * (self.n_objects // 2), dtype=dtype)

    def run(self):
        """Run the benchmark: overwrite every second element with ``new_series``."""
        self.series[::2] = self.new_series

    def time_run(self):
        """Benchmark the runtime of changing half of the nested series elements."""
        self.run()

    def peakmem_run(self):
        """Benchmark the memory usage of changing half of the nested series elements."""
        self.run()

0 comments on commit ab5fc4c

Please sign in to comment.