diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 7031b269554..c8fceaa401e 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -2802,6 +2802,7 @@ def apply_full_axis_select_indices( new_index=None, new_columns=None, keep_remaining=False, + new_dtypes=None, ): """ Apply a function across an entire axis for a subset of the data. @@ -2824,6 +2825,10 @@ def apply_full_axis_select_indices( advance, and if not provided it must be computed. keep_remaining : boolean, default: False Whether or not to drop the data that is not computed over. + new_dtypes : ModinDtypes or pandas.Series, optional + The data types of the result. This is an optimization + because there are functions that always result in a particular data + type, and allows us to avoid (re)computing it. Returns ------- @@ -2852,7 +2857,9 @@ def apply_full_axis_select_indices( new_index = self.index if axis == 1 else None if new_columns is None: new_columns = self.columns if axis == 0 else None - return self.__constructor__(new_partitions, new_index, new_columns, None, None) + return self.__constructor__( + new_partitions, new_index, new_columns, None, None, dtypes=new_dtypes + ) @lazy_metadata_decorator(apply_axis="both") def apply_select_indices( @@ -2906,7 +2913,7 @@ def apply_select_indices( new_index = self.index if axis == 1 else None if new_columns is None: new_columns = self.columns if axis == 0 else None - if new_columns is not None and new_dtypes is not None: + if new_columns is not None and isinstance(new_dtypes, pandas.Series): assert new_dtypes.index.equals( new_columns ), f"{new_dtypes=} doesn't have the same columns as in {new_columns=}" diff --git a/modin/core/dataframe/pandas/metadata/dtypes.py b/modin/core/dataframe/pandas/metadata/dtypes.py index fc6cc211fac..74f2afb2e96 100644 --- a/modin/core/dataframe/pandas/metadata/dtypes.py +++ b/modin/core/dataframe/pandas/metadata/dtypes.py @@ -527,14 +527,23 @@ class ModinDtypes: Parameters ---------- - value : pandas.Series or callable + value : pandas.Series, callable, DtypesDescriptor or ModinDtypes, optional """ - def __init__(self, value: Union[Callable, pandas.Series, DtypesDescriptor]): + def __init__( + self, + value: Optional[ + Union[Callable, pandas.Series, DtypesDescriptor, "ModinDtypes"] + ], + ): if callable(value) or isinstance(value, pandas.Series): self._value = value elif isinstance(value, DtypesDescriptor): self._value = value.to_series() if value.is_materialized else value + elif isinstance(value, type(self)): + self._value = value.copy()._value + elif isinstance(value, None): + self._value = DtypesDescriptor() else: raise ValueError(f"ModinDtypes doesn't work with '{value}'") diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index ba907a8b036..c0efe217188 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -58,6 +58,7 @@ SeriesGroupByDefault, ) from modin.core.dataframe.base.dataframe.utils import join_columns +from modin.core.dataframe.pandas.metadata import DtypesDescriptor, ModinDtypes from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler from modin.error_message import ErrorMessage from modin.utils import ( @@ -2911,6 +2912,27 @@ def setitem_builder(df, internal_indices=[]): # pragma: no cover idx = self.get_axis(axis ^ 1).get_indexer_for([key])[0] return self.insert_item(axis ^ 1, idx, value, how, replace=True) + if axis == 0: + if hasattr(value, "dtype"): + value_dtype = value.dtype + elif is_scalar(value): + value_dtype = np.dtype(type(value)) + else: + value_dtype = np.array(value).dtype + + old_columns = self.columns.difference(pandas.Index([key])) + old_dtypes = ModinDtypes(self._modin_frame._dtypes).lazy_get(old_columns) + new_dtypes = ModinDtypes.concat( + [ + old_dtypes, + DtypesDescriptor({key: value_dtype}, cols_with_unknown_dtypes=[]), + ] + # get dtypes in a proper order + ).lazy_get(self.columns) + else: + # TODO: apply 'find_common_dtype' to the value's dtype and old column dtypes + new_dtypes = None + # TODO: rework by passing list-like values to `apply_select_indices` # as an item to distribute if is_list_like(value): @@ -2921,6 +2943,7 @@ def setitem_builder(df, internal_indices=[]): # pragma: no cover new_index=self.index, new_columns=self.columns, keep_remaining=True, + new_dtypes=new_dtypes, ) else: new_modin_frame = self._modin_frame.apply_select_indices( @@ -2929,6 +2952,7 @@ def setitem_builder(df, internal_indices=[]): # pragma: no cover [key], new_index=self.index, new_columns=self.columns, + new_dtypes=new_dtypes, keep_remaining=True, ) return self.__constructor__(new_modin_frame) diff --git a/modin/test/storage_formats/pandas/test_internals.py b/modin/test/storage_formats/pandas/test_internals.py index b459f050263..3946d9672ee 100644 --- a/modin/test/storage_formats/pandas/test_internals.py +++ b/modin/test/storage_formats/pandas/test_internals.py @@ -21,6 +21,7 @@ import modin.pandas as pd from modin.config import Engine, ExperimentalGroupbyImpl, MinPartitionSize, NPartitions +from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe from modin.core.dataframe.pandas.dataframe.utils import ColumnInfo, ShuffleSortFunctions from modin.core.dataframe.pandas.metadata import ( DtypesDescriptor, @@ -1946,9 +1947,65 @@ class TestZeroComputationDtypes: Test cases that shouldn't trigger dtypes computation during their execution. """ - def test_get_dummies_case(self): - from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe + @pytest.mark.parametrize("self_dtype", ["materialized", "partial", "unknown"]) + @pytest.mark.parametrize( + "value, value_dtype", + [ + [3.5, np.dtype(float)], + [[3.5, 2.4], np.dtype(float)], + [np.array([3.5, 2.4]), np.dtype(float)], + [pd.Series([3.5, 2.4]), np.dtype(float)], + ], + ) + def test_preserve_dtypes_setitem(self, self_dtype, value, value_dtype): + """ + Test that ``df[single_existing_column] = value`` preserves dtypes cache. + """ + with mock.patch.object(PandasDataframe, "_compute_dtypes") as patch: + df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [3, 4]}) + if self_dtype == "materialized": + assert df._query_compiler._modin_frame.has_materialized_dtypes + elif self_dtype == "partial": + df._query_compiler._modin_frame.set_dtypes_cache( + ModinDtypes( + DtypesDescriptor( + {"a": np.dtype(int)}, cols_with_unknown_dtypes=["b", "c"] + ) + ) + ) + elif self_dtype == "unknown": + df._query_compiler._modin_frame.set_dtypes_cache(None) + else: + raise NotImplementedError(self_dtype) + + df["b"] = value + + if self_dtype == "materialized": + result_dtype = pandas.Series( + [np.dtype(int), value_dtype, np.dtype(int)], index=["a", "b", "c"] + ) + assert df._query_compiler._modin_frame.has_materialized_dtypes + assert df.dtypes.equals(result_dtype) + elif self_dtype == "partial": + result_dtype = DtypesDescriptor( + {"a": np.dtype(int), "b": value_dtype}, + cols_with_unknown_dtypes=["c"], + columns_order={0: "a", 1: "b", 2: "c"}, + ) + df._query_compiler._modin_frame._dtypes._value.equals(result_dtype) + elif self_dtype == "unknown": + result_dtype = DtypesDescriptor( + {"b": value_dtype}, + cols_with_unknown_dtypes=["a", "b"], + columns_order={0: "a", 1: "b", 2: "c"}, + ) + df._query_compiler._modin_frame._dtypes._value.equals(result_dtype) + else: + raise NotImplementedError(self_dtype) + + patch.assert_not_called() + def test_get_dummies_case(self): with mock.patch.object(PandasDataframe, "_compute_dtypes") as patch: df = pd.DataFrame( {"items": [1, 2, 3, 4], "b": [3, 3, 4, 4], "c": [1, 0, 0, 1]}