-
Notifications
You must be signed in to change notification settings - Fork 12
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
using dataclasses #40
base: main
Are you sure you want to change the base?
Conversation
I made a similar effort to use dataclasses shortly after 2.0's release, but decided against it because it reduced performance for a pretty arguable code improvement. Your approach seems to perform even worse than mine, perhaps because of the use of reflection:
|
Missing slots - I did not consider this to be optimized for performance. But nice comparison - what did you use for the numbers? |
Results are generated with hyperfine. I tried |
Dropping reflection and looking at the raw performance of creating an object and accessing its properties for normal/slots/dataclasses:
import dataclasses
import timeit
import datetime
import copy
import typing
class Dataset:
    """
    ZFS dataset statistics over a timespan in seconds.

    The class-level attributes act as defaults: ``__init__`` only stores
    ``name`` on the instance, every other field is stored on the instance
    once it is assigned.
    """

    name = ''
    reads = 0
    nread = 0
    writes = 0
    nwritten = 0
    nunlinks = 0
    nunlinked = 0
    timespan = 0

    def __init__(self, name=''):
        self.name = name

    @classmethod
    def from_dict(cls, data):
        """
        Build an instance from a mapping of raw values.

        ``dataset_name``, ``reads``, ``nread``, ``writes``, ``nwritten`` and
        ``timestamp`` are read with ``data[...]`` (a missing key raises
        ``KeyError``); ``nunlinks`` and ``nunlinked`` fall back to 0.
        Counters are converted with ``int()``; ``timespan`` is
        ``int(timestamp) / 1e9``.
        """
        stats = cls(data['dataset_name'])
        stats.reads = int(data['reads'])
        stats.nread = int(data['nread'])
        stats.writes = int(data['writes'])
        stats.nwritten = int(data['nwritten'])
        # The unlink counters are optional in the input.
        stats.nunlinks = int(data.get('nunlinks', 0))
        stats.nunlinked = int(data.get('nunlinked', 0))
        stats.timespan = int(data['timestamp']) / 1e9
        return stats

    def __add__(self, other):
        """
        Return a new object holding the counter sums of ``self`` and ``other``.

        The result starts as a shallow copy of ``self`` (so it keeps
        ``self``'s name); ``timespan`` becomes the larger of the two
        timespans rather than their sum.
        """
        total = copy.copy(self)
        total.reads += other.reads
        total.nread += other.nread
        total.writes += other.writes
        total.nwritten += other.nwritten
        total.nunlinks += other.nunlinks
        total.nunlinked += other.nunlinked
        total.timespan = max(total.timespan, other.timespan)
        return total
class OptimizedDataset:
    """
    ZFS dataset statistics over a timespan in seconds.

    Unlike a class relying on class-level defaults, every field is passed
    to and stored by ``__init__``.
    """

    def __init__(self, name='', reads=0, nread=0, writes=0, nwritten=0,
                 nunlinks=0, nunlinked=0, timespan=0):
        self.name = name
        self.reads = reads
        self.nread = nread
        self.writes = writes
        self.nwritten = nwritten
        self.nunlinks = nunlinks
        self.nunlinked = nunlinked
        self.timespan = timespan

    @classmethod
    def from_dict(cls, data):
        """
        Build an instance from a mapping of raw values.

        ``nunlinks`` and ``nunlinked`` are optional (default 0); all other
        keys are required. Counters go through ``int()`` and ``timespan``
        is ``int(timestamp) / 1e9``.
        """
        # Arguments are positional; their order mirrors __init__.
        return cls(
            data['dataset_name'],
            int(data['reads']),
            int(data['nread']),
            int(data['writes']),
            int(data['nwritten']),
            int(data.get('nunlinks', 0)),
            int(data.get('nunlinked', 0)),
            int(data['timestamp']) / 1e9,
        )

    def __add__(self, other):
        """
        Return a new OptimizedDataset with the counters of both operands summed.

        The name comes from ``self``; ``timespan`` is the maximum of the two
        timespans instead of their sum.
        """
        # Arguments are positional; their order mirrors __init__.
        return OptimizedDataset(
            self.name,
            self.reads + other.reads,
            self.nread + other.nread,
            self.writes + other.writes,
            self.nwritten + other.nwritten,
            self.nunlinks + other.nunlinks,
            self.nunlinked + other.nunlinked,
            max(self.timespan, other.timespan),
        )
class SlotsDataset:
    """
    ZFS dataset statistics over a timespan in seconds.

    Instances use ``__slots__`` instead of a per-instance ``__dict__``, so
    only the listed attributes can be set.
    """

    # The docstring has to be the first statement of the class body; placed
    # after __slots__ it is just a discarded string and __doc__ stays None.
    __slots__ = ["name", "reads", "nread", "writes", "nwritten",
                 "nunlinks", "nunlinked", "timespan"]

    def __init__(self, name='', reads=0, nread=0, writes=0, nwritten=0,
                 nunlinks=0, nunlinked=0, timespan=0):
        self.name = name
        self.reads = reads
        self.nread = nread
        self.writes = writes
        self.nwritten = nwritten
        self.nunlinks = nunlinks
        self.nunlinked = nunlinked
        self.timespan = timespan

    @classmethod
    def from_dict(cls, data):
        """
        Build an instance from a mapping of raw values.

        ``nunlinks`` and ``nunlinked`` are optional (default 0); all other
        keys are required and a missing one raises ``KeyError``. Counters
        go through ``int()`` and ``timespan`` is ``int(timestamp) / 1e9``.
        """
        # Arguments are positional; their order mirrors __init__.
        d = cls(data['dataset_name'],
                int(data['reads']),
                int(data['nread']),
                int(data['writes']),
                int(data['nwritten']),
                int(data.get('nunlinks', 0)),
                int(data.get('nunlinked', 0)),
                int(data['timestamp']) / 1e9)
        return d

    def __add__(self, other):
        """
        Return a new SlotsDataset with this one's values added to other.

        Note timespan is set to the maximum of the two values instead of
        being added. The name is taken from ``self``.
        """
        d = SlotsDataset(self.name,
                         self.reads + other.reads,
                         self.nread + other.nread,
                         self.writes + other.writes,
                         self.nwritten + other.nwritten,
                         self.nunlinks + other.nunlinks,
                         self.nunlinked + other.nunlinked,
                         max(self.timespan, other.timespan))
        return d
@dataclasses.dataclass
class DataclassDataset:
    """
    ZFS dataset statistics over a timespan in seconds

    The Dataset fields are "documented" here:
    https://github.com/openzfs/zfs/blob/9681de4657686d0ed19ca18d578513e74395f00f/module/zfs/dataset_kstats.c#L32
    """

    # NOTE: the write counters are declared *before* the read counters, so
    # the generated __init__ must not be fed reads/nread/writes/nwritten
    # positionally. All construction below uses keyword arguments.
    timestamp: int = 0
    dataset_name: str = ''
    writes: int = 0
    nwritten: int = 0
    reads: int = 0
    nread: int = 0
    nunlinks: int = 0
    nunlinked: int = 0
    # Holds int(timestamp) / 1e9, i.e. a float once set by from_dict().
    timespan: float = 0

    @classmethod
    def from_dict(cls, data):
        """
        Build an instance from a mapping of raw values.

        ``nunlinks`` and ``nunlinked`` are optional (default 0); all other
        keys are required and a missing one raises ``KeyError``. Counters
        and ``timestamp`` go through ``int()``; ``timespan`` is
        ``int(timestamp) / 1e9``.
        """
        timestamp = int(data['timestamp'])
        return cls(
            timestamp=timestamp,
            dataset_name=data['dataset_name'],
            reads=int(data['reads']),
            nread=int(data['nread']),
            writes=int(data['writes']),
            nwritten=int(data['nwritten']),
            nunlinks=int(data.get('nunlinks', 0)),
            nunlinked=int(data.get('nunlinked', 0)),
            timespan=timestamp / 1e9,
        )

    def __add__(self, other):
        """
        Return a new DataclassDataset with this one's values added to other

        Note timespan is set to the maximum of the two values instead of being
        added. ``timestamp`` and ``dataset_name`` are taken from ``self``.
        """
        return DataclassDataset(
            timestamp=self.timestamp,
            dataset_name=self.dataset_name,
            reads=self.reads + other.reads,
            nread=self.nread + other.nread,
            writes=self.writes + other.writes,
            nwritten=self.nwritten + other.nwritten,
            nunlinks=self.nunlinks + other.nunlinks,
            nunlinked=self.nunlinked + other.nunlinked,
            timespan=max(self.timespan, other.timespan),
        )
@dataclasses.dataclass(slots=True)
class DataclassSlotsDataset:
"""
ZFS dataset statistics over a timespan in seconds
The Dataset fields are "documented" here:
https://github.com/openzfs/zfs/blob/9681de4657686d0ed19ca18d578513e74395f00f/module/zfs/dataset_kstats.c#L32
"""
timestamp: int = 0
dataset_name: str = ''
writes: int = 0
nwritten: int = 0
reads: int = 0
nread: int = 0
nunlinks: int = 0
nunlinked: int = 0
timespan: int = 0
@classmethod
def from_dict(cls, data):
d = cls(int(data['timestamp']),
data['dataset_name'],
int(data['reads']),
int(data['nread']),
int(data['writes']),
int(data['nwritten']),
int(data.get('nunlinks', 0)),
int(data.get('nunlinked', 0)),
int(data['timestamp']) / 1e9)
return d
def __add__(self, other):
"""
Return a new Dataset with this one's values added to other
Note timespan is set to the maximum of the two values instead of being
added.
"""
d = DataclassSlotsDataset(self.timestamp, self.dataset_name,
self.reads + other.reads,
self.nread + other.nread,
self.writes + other.writes,
self.nwritten + other.nwritten,
self.nunlinks + other.nunlinks,
self.nunlinked + other.nunlinked,
self.timespan + other.timespan )
return d
def main():
    """
    Time object creation (``from_dict``) and addition (``v + v``) for each
    Dataset variant and print the raw ``timeit`` results.

    Output is the line "create" followed by one timing per class, then the
    line "add" followed by one timing per class, always in the order
    Dataset, OptimizedDataset, SlotsDataset, DataclassDataset,
    DataclassSlotsDataset.
    """
    # Synthetic input: every slot name is mapped to its index.
    data = {field: position
            for position, field in enumerate(SlotsDataset.__slots__)}
    data["dataset_name"] = "test"
    data["timestamp"] = datetime.datetime.now().timestamp()
    # from_dict() reads 'dataset_name' and 'timestamp', not these two keys.
    del data["name"]
    del data["timespan"]

    cnt = 1000000
    variants = (
        Dataset,
        OptimizedDataset,
        SlotsDataset,
        DataclassDataset,
        DataclassSlotsDataset,
    )

    print("create")
    for variant in variants:
        # The lambda is executed by timeit before the loop advances.
        print(timeit.timeit(lambda: variant.from_dict(data), number=cnt))

    print("add")
    for variant in variants:
        v = variant.from_dict(data)
        print(timeit.timeit(lambda: v + v, number=cnt))
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()
# NOTE(review): the original line read "main() Dataset"; the trailing word
# "Dataset" looks like page text fused onto the code during extraction and is
# not valid Python there — kept in this comment for reference, TODO confirm.
Python 3.10 dataclasses with slots=True outperform the alternatives when doing the first. I think the order (combined init & add) would be along:
|
Our Python MSV is 3.7, so Dataclass slots is a non-starter for now. These long positional argument lists are quite nasty - they're begging for an easy-to-miss mixup and I'd rather avoid them. I added versions with keyword arguments, plus a version that avoids
Also, damn my CPU sucks. |
You've been competing with I've updated the PR to use |
They're teenagers now. I knew they sucked, but I didn't quite expect a similarly-clocked 15W laptop part to beat them by that margin. |
- no kwargs in __init__ - fixes requested & required
due to clock - the numbers for 7742 (ubuntu 23.04/py3.10) are 10-14% worse than my ryzen mobile:
And - there is a new revision available. |
Looking like a 5-10% performance uplift on my hardware, plus a 0.5-2% drop in memory use. |
Hi,
I started on my own version of this, but found yours basically at the last moment …
My name of choice was objset, but the poll is finished I guess.
I used dataclasses - ported here.
And updated the fields to match current OpenZFS git.