Skip to content

Commit

Permalink
PERF-#6753: Preserve dtypes cache on '.__setitem__()'
Browse files Browse the repository at this point in the history
Signed-off-by: Dmitry Chigarev <[email protected]>
  • Loading branch information
dchigarev committed Nov 20, 2023
1 parent adfd2bb commit 4ea385c
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 6 deletions.
11 changes: 9 additions & 2 deletions modin/core/dataframe/pandas/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2802,6 +2802,7 @@ def apply_full_axis_select_indices(
new_index=None,
new_columns=None,
keep_remaining=False,
new_dtypes=None,
):
"""
Apply a function across an entire axis for a subset of the data.
Expand All @@ -2824,6 +2825,10 @@ def apply_full_axis_select_indices(
advance, and if not provided it must be computed.
keep_remaining : boolean, default: False
Whether or not to drop the data that is not computed over.
new_dtypes : ModinDtypes or pandas.Series, optional
The data types of the result. This is an optimization
because there are functions that always result in a particular data
type, and allows us to avoid (re)computing it.
Returns
-------
Expand Down Expand Up @@ -2852,7 +2857,9 @@ def apply_full_axis_select_indices(
new_index = self.index if axis == 1 else None
if new_columns is None:
new_columns = self.columns if axis == 0 else None
return self.__constructor__(new_partitions, new_index, new_columns, None, None)
return self.__constructor__(
new_partitions, new_index, new_columns, None, None, dtypes=new_dtypes
)

@lazy_metadata_decorator(apply_axis="both")
def apply_select_indices(
Expand Down Expand Up @@ -2906,7 +2913,7 @@ def apply_select_indices(
new_index = self.index if axis == 1 else None
if new_columns is None:
new_columns = self.columns if axis == 0 else None
if new_columns is not None and new_dtypes is not None:
if new_columns is not None and isinstance(new_dtypes, pandas.Series):
assert new_dtypes.index.equals(
new_columns
), f"{new_dtypes=} doesn't have the same columns as in {new_columns=}"
Expand Down
13 changes: 11 additions & 2 deletions modin/core/dataframe/pandas/metadata/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -527,14 +527,23 @@ class ModinDtypes:
Parameters
----------
value : pandas.Series or callable
value : pandas.Series, callable, DtypesDescriptor or ModinDtypes, optional
"""

def __init__(self, value: Union[Callable, pandas.Series, DtypesDescriptor]):
def __init__(
self,
value: Optional[
Union[Callable, pandas.Series, DtypesDescriptor, "ModinDtypes"]
],
):
if callable(value) or isinstance(value, pandas.Series):
self._value = value
elif isinstance(value, DtypesDescriptor):
self._value = value.to_series() if value.is_materialized else value
elif isinstance(value, type(self)):
self._value = value.copy()._value
elif isinstance(value, None):
self._value = DtypesDescriptor()

Check warning on line 546 in modin/core/dataframe/pandas/metadata/dtypes.py

View check run for this annotation

Codecov / codecov/patch

modin/core/dataframe/pandas/metadata/dtypes.py#L545-L546

Added lines #L545 - L546 were not covered by tests
else:
raise ValueError(f"ModinDtypes doesn't work with '{value}'")

Expand Down
24 changes: 24 additions & 0 deletions modin/core/storage_formats/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
SeriesGroupByDefault,
)
from modin.core.dataframe.base.dataframe.utils import join_columns
from modin.core.dataframe.pandas.metadata import DtypesDescriptor, ModinDtypes
from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler
from modin.error_message import ErrorMessage
from modin.utils import (
Expand Down Expand Up @@ -2911,6 +2912,27 @@ def setitem_builder(df, internal_indices=[]): # pragma: no cover
idx = self.get_axis(axis ^ 1).get_indexer_for([key])[0]
return self.insert_item(axis ^ 1, idx, value, how, replace=True)

if axis == 0:
if hasattr(value, "dtype"):
value_dtype = value.dtype
elif is_scalar(value):
value_dtype = np.dtype(type(value))
else:
value_dtype = np.array(value).dtype

old_columns = self.columns.difference(pandas.Index([key]))
old_dtypes = ModinDtypes(self._modin_frame._dtypes).lazy_get(old_columns)
new_dtypes = ModinDtypes.concat(
[
old_dtypes,
DtypesDescriptor({key: value_dtype}, cols_with_unknown_dtypes=[]),
]
# get dtypes in a proper order
).lazy_get(self.columns)
else:
# TODO: apply 'find_common_dtype' to the value's dtype and old column dtypes
new_dtypes = None

Check warning on line 2934 in modin/core/storage_formats/pandas/query_compiler.py

View check run for this annotation

Codecov / codecov/patch

modin/core/storage_formats/pandas/query_compiler.py#L2934

Added line #L2934 was not covered by tests

# TODO: rework by passing list-like values to `apply_select_indices`
# as an item to distribute
if is_list_like(value):
Expand All @@ -2921,6 +2943,7 @@ def setitem_builder(df, internal_indices=[]): # pragma: no cover
new_index=self.index,
new_columns=self.columns,
keep_remaining=True,
new_dtypes=new_dtypes,
)
else:
new_modin_frame = self._modin_frame.apply_select_indices(
Expand All @@ -2929,6 +2952,7 @@ def setitem_builder(df, internal_indices=[]): # pragma: no cover
[key],
new_index=self.index,
new_columns=self.columns,
new_dtypes=new_dtypes,
keep_remaining=True,
)
return self.__constructor__(new_modin_frame)
Expand Down
61 changes: 59 additions & 2 deletions modin/test/storage_formats/pandas/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import modin.pandas as pd
from modin.config import Engine, ExperimentalGroupbyImpl, MinPartitionSize, NPartitions
from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe
from modin.core.dataframe.pandas.dataframe.utils import ColumnInfo, ShuffleSortFunctions
from modin.core.dataframe.pandas.metadata import (
DtypesDescriptor,
Expand Down Expand Up @@ -1946,9 +1947,65 @@ class TestZeroComputationDtypes:
Test cases that shouldn't trigger dtypes computation during their execution.
"""

def test_get_dummies_case(self):
from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe
@pytest.mark.parametrize("self_dtype", ["materialized", "partial", "unknown"])
@pytest.mark.parametrize(
"value, value_dtype",
[
[3.5, np.dtype(float)],
[[3.5, 2.4], np.dtype(float)],
[np.array([3.5, 2.4]), np.dtype(float)],
[pd.Series([3.5, 2.4]), np.dtype(float)],
],
)
def test_preserve_dtypes_setitem(self, self_dtype, value, value_dtype):
"""
Test that ``df[single_existing_column] = value`` preserves dtypes cache.
"""
with mock.patch.object(PandasDataframe, "_compute_dtypes") as patch:
df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [3, 4]})
if self_dtype == "materialized":
assert df._query_compiler._modin_frame.has_materialized_dtypes
elif self_dtype == "partial":
df._query_compiler._modin_frame.set_dtypes_cache(
ModinDtypes(
DtypesDescriptor(
{"a": np.dtype(int)}, cols_with_unknown_dtypes=["b", "c"]
)
)
)
elif self_dtype == "unknown":
df._query_compiler._modin_frame.set_dtypes_cache(None)
else:
raise NotImplementedError(self_dtype)

df["b"] = value

if self_dtype == "materialized":
result_dtype = pandas.Series(
[np.dtype(int), value_dtype, np.dtype(int)], index=["a", "b", "c"]
)
assert df._query_compiler._modin_frame.has_materialized_dtypes
assert df.dtypes.equals(result_dtype)
elif self_dtype == "partial":
result_dtype = DtypesDescriptor(
{"a": np.dtype(int), "b": value_dtype},
cols_with_unknown_dtypes=["c"],
columns_order={0: "a", 1: "b", 2: "c"},
)
df._query_compiler._modin_frame._dtypes._value.equals(result_dtype)
elif self_dtype == "unknown":
result_dtype = DtypesDescriptor(
{"b": value_dtype},
cols_with_unknown_dtypes=["a", "b"],
columns_order={0: "a", 1: "b", 2: "c"},
)
df._query_compiler._modin_frame._dtypes._value.equals(result_dtype)
else:
raise NotImplementedError(self_dtype)

patch.assert_not_called()

def test_get_dummies_case(self):
with mock.patch.object(PandasDataframe, "_compute_dtypes") as patch:
df = pd.DataFrame(
{"items": [1, 2, 3, 4], "b": [3, 3, 4, 4], "c": [1, 0, 0, 1]}
Expand Down

0 comments on commit 4ea385c

Please sign in to comment.