From 99d67a5e43a6db61849456c45217e52c63dc4a04 Mon Sep 17 00:00:00 2001 From: Tim Hoffmann Date: Thu, 28 Sep 2023 12:01:40 +0200 Subject: [PATCH] ENH: propagating attrs always uses deepcopy Always using a deepcopy prevents shared state and thus unintentional modification of the attrs of other objects. IMHO this safety has a higher priority than the slight performance cost of the deepcopy. The implementation now skips the copying if *attrs* are not used (i.e. an empty dict). This check takes only ~20ns. Thus, the attrs propagation mechanism has no performance impact if attrs are not used. Closes #54134. --- pandas/core/generic.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 738f4cbe6bc430..f46242bc37258f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2,6 +2,7 @@ from __future__ import annotations import collections +import copy import datetime as dt from functools import partial import gc @@ -364,6 +365,11 @@ def attrs(self) -> dict[Hashable, Any]: attrs is experimental and may change without warning. + Many operations that create new datasets will copy *attrs*. Copies are + always deep so that changing *attrs* will only affect the present + dataset. `pandas.concatenate` copies *attrs* only if all input + datasets have the same *attrs*. + See Also -------- DataFrame.flags : Global flags applying to this object. @@ -6191,8 +6197,12 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: stable across pandas releases. """ if isinstance(other, NDFrame): - for name in other.attrs: - self.attrs[name] = other.attrs[name] + if other.attrs: + # One could make the deepcopy unconditionally, however a deepcopy + # of an empty dict is 50x more expensive than the empty check. + # However, we want attrs propagation to have minimal performance + # impact it attrs are not used; i.e. attrs is an empty dict. + self.attrs = copy.deepcopy(other.attrs) self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels # For subclasses using _metadata. @@ -6201,11 +6211,13 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: object.__setattr__(self, name, getattr(other, name, None)) if method == "concat": - attrs = other.objs[0].attrs - check_attrs = all(objs.attrs == attrs for objs in other.objs[1:]) - if check_attrs: - for name in attrs: - self.attrs[name] = attrs[name] + # propagate attrs only if all concat arguments have the same attrs + if all(bool(obj.attrs) for obj in other.objs): + # all concatenate arguments have non-empty attrs + attrs = other.objs[0].attrs + have_same_attrs = all(obj.attrs == attrs for obj in other.objs[1:]) + if have_same_attrs: + self.attrs = copy.deepcopy(attrs) allows_duplicate_labels = all( x.flags.allows_duplicate_labels for x in other.objs