Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT-#5836: Introduce 'partial' dtypes cache #6663

Merged
merged 10 commits into from
Nov 17, 2023
7 changes: 4 additions & 3 deletions modin/core/dataframe/algebra/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,9 +245,10 @@ def try_compute_new_dtypes(first, second, infer_dtypes=None, result_dtype=None,
elif infer_dtypes == "common_cast":
dtypes = maybe_compute_dtypes_common_cast(first, second, axis=axis)
elif infer_dtypes == "float":
dtypes = maybe_compute_dtypes_common_cast(first, second, axis=axis)
if dtypes is not None:
dtypes = dtypes.apply(coerce_int_to_float64)
dtypes = maybe_build_dtypes_series(first, second, dtype=np.dtype(float))
# dtypes = maybe_compute_dtypes_common_cast(first, second, axis=axis)
# if dtypes is not None:
# dtypes = dtypes.apply(coerce_int_to_float64)
else:
# For now we only know how to handle `result_dtype == bool` as that's
# the only value that is being passed here right now, it's unclear
Expand Down
100 changes: 76 additions & 24 deletions modin/core/dataframe/pandas/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
lazy_metadata_decorator,
)
from modin.core.dataframe.pandas.metadata import (
DtypesDescriptor,
LazyProxyCategoricalDtype,
ModinDtypes,
ModinIndex,
Expand Down Expand Up @@ -314,25 +315,39 @@ def _maybe_update_proxies(self, dtypes, new_parent=None):
new_parent : object, optional
A new parent to link the proxies to. If not specified
will consider the `self` to be a new parent.

Returns
-------
pandas.Series, ModinDtypes or callable
Comment on lines +319 to +321
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The method now returns the updated value for the caller's convenience.

"""
new_parent = new_parent or self
if isinstance(dtypes, pandas.Series) or (
isinstance(dtypes, ModinDtypes) and dtypes.is_materialized
):
if isinstance(dtypes, ModinDtypes):
dtypes = dtypes.maybe_specify_new_frame_ref(new_parent)
if isinstance(dtypes, pandas.Series):
for key, value in dtypes.items():
if isinstance(value, LazyProxyCategoricalDtype):
dtypes[key] = value._update_proxy(new_parent, column_name=key)
dchigarev marked this conversation as resolved.
Show resolved Hide resolved
return dtypes

def set_dtypes_cache(self, dtypes):
"""
Set dtypes cache.

Parameters
----------
dtypes : pandas.Series, ModinDtypes or callable
"""
self._maybe_update_proxies(dtypes)
if isinstance(dtypes, ModinDtypes) or dtypes is None:
dtypes : pandas.Series, ModinDtypes, callable or None
"""
dtypes = self._maybe_update_proxies(dtypes)
if dtypes is None and self.has_materialized_columns:
try:
self._dtypes = ModinDtypes(
DtypesDescriptor(
cols_with_unknown_dtypes=self.columns.tolist(), parent_df=self
)
)
except NotImplementedError:
self._dtypes = None
elif isinstance(dtypes, ModinDtypes) or dtypes is None:
self._dtypes = dtypes
else:
self._dtypes = ModinDtypes(dtypes)
Expand All @@ -354,6 +369,18 @@ def dtypes(self):
self.set_dtypes_cache(dtypes)
return dtypes

def get_dtypes_set(self):
    """
    Return the distinct dtypes present in this dataframe.

    When the dtypes cache is a ``ModinDtypes`` object, the request is
    delegated to its ``get_dtypes_set()``; otherwise the set is built
    from the values of ``self.dtypes``.

    Returns
    -------
    set
    """
    dtypes_cache = self._dtypes
    if not isinstance(dtypes_cache, ModinDtypes):
        # No ModinDtypes wrapper to ask: fall back to the `dtypes` property.
        return set(self.dtypes.values)
    return dtypes_cache.get_dtypes_set()

def _compute_dtypes(self, columns=None):
"""
Compute the data types via TreeReduce pattern for the specified columns.
Expand All @@ -376,7 +403,13 @@ def dtype_builder(df):
if columns is not None:
# Sorting positions to request columns in the order they're stored (it's more efficient)
numeric_indices = sorted(self.columns.get_indexer_for(columns))
obj = self._take_2d_positional(col_positions=numeric_indices)
Copy link
Collaborator Author

@dchigarev dchigarev Nov 12, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`_take_2d_positional` doesn't apply deferred labels, which caused some of the new tests to fail. I added this simple workaround until we figure out whether we need `lazy_metadata_decorator` there or not (#0000 TODO: raise an issue)

if any(pos < 0 for pos in numeric_indices):
raise KeyError(
f"Some of the columns are not in index: subset={columns}; columns={self.columns}"
)
obj = self.take_2d_labels_or_positional(
col_labels=self.columns[numeric_indices].tolist()
)
else:
obj = self

Expand Down Expand Up @@ -675,8 +708,11 @@ def _set_columns(self, new_columns):
):
return
new_columns = self._validate_set_axis(new_columns, self._columns_cache)
if self.has_materialized_dtypes:
self.dtypes.index = new_columns
if isinstance(self._dtypes, ModinDtypes):
new_value = self._dtypes.set_index(new_columns)
self.set_dtypes_cache(new_value)
elif isinstance(self._dtypes, pandas.Series):
self.dtypes.index = new_columns
self.set_columns_cache(new_columns)
self.synchronize_labels(axis=1)

Expand Down Expand Up @@ -1146,6 +1182,13 @@ def _take_2d_positional(

if self.has_materialized_dtypes:
new_dtypes = self.dtypes.iloc[monotonic_col_idx]
elif isinstance(self._dtypes, ModinDtypes):
try:
new_dtypes = self._dtypes.lazy_get(
monotonic_col_idx, numeric_index=True
)
except NotImplementedError:
dchigarev marked this conversation as resolved.
Show resolved Hide resolved
new_dtypes = None
else:
new_dtypes = None
else:
Expand Down Expand Up @@ -1441,6 +1484,11 @@ def _reorder_labels(self, row_positions=None, col_positions=None):
col_idx = self.columns[col_positions]
if self.has_materialized_dtypes:
new_dtypes = self.dtypes.iloc[col_positions]
elif isinstance(self._dtypes, ModinDtypes):
try:
new_dtypes = self._dtypes.lazy_get(col_idx)
except NotImplementedError:
new_dtypes = None

if len(col_idx) != len(self.columns):
# The frame was re-partitioned along the 1 axis during reordering using
Expand Down Expand Up @@ -3253,22 +3301,24 @@ def broadcast_apply_full_axis(
kw = {"row_lengths": None, "column_widths": None}
if isinstance(dtypes, str) and dtypes == "copy":
kw["dtypes"] = self.copy_dtypes_cache()
elif isinstance(dtypes, DtypesDescriptor):
kw["dtypes"] = ModinDtypes(dtypes)
elif dtypes is not None:
if isinstance(dtypes, (pandas.Series, ModinDtypes)):
kw["dtypes"] = dtypes.copy()
else:
if new_columns is None:
(
new_columns,
kw["column_widths"],
) = self._compute_axis_labels_and_lengths(1, new_partitions)
kw["dtypes"] = (
pandas.Series(dtypes, index=new_columns)
if is_list_like(dtypes)
else pandas.Series(
[np.dtype(dtypes)] * len(new_columns), index=new_columns
kw["dtypes"] = ModinDtypes(
DtypesDescriptor(remaining_dtype=np.dtype(dtypes))
)
else:
kw["dtypes"] = (
pandas.Series(dtypes, index=new_columns)
if is_list_like(dtypes)
else pandas.Series(
[np.dtype(dtypes)] * len(new_columns), index=new_columns
)
)
)

if not keep_partitioning:
if kw["row_lengths"] is None and new_index is not None:
Expand Down Expand Up @@ -3662,10 +3712,12 @@ def _compute_new_widths():
if all(obj.has_materialized_columns for obj in (self, *others)):
new_columns = self.columns.append([other.columns for other in others])
new_index = joined_index
if self.has_materialized_dtypes and all(
o.has_materialized_dtypes for o in others
):
new_dtypes = pandas.concat([self.dtypes] + [o.dtypes for o in others])
try:
new_dtypes = ModinDtypes.concat(
[self.copy_dtypes_cache()] + [o.copy_dtypes_cache() for o in others]
)
except NotImplementedError:
Fixed Show fixed Hide fixed
new_dtypes = None
# If we have already cached the width of each column in at least one
# of the column's partitions, we can build new_widths for the new
# frame. Typically, if we know the width for any partition in a
Expand Down
4 changes: 2 additions & 2 deletions modin/core/dataframe/pandas/metadata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

"""Utilities and classes to handle work with metadata."""

from .dtypes import LazyProxyCategoricalDtype, ModinDtypes
from .dtypes import DtypesDescriptor, LazyProxyCategoricalDtype, ModinDtypes
from .index import ModinIndex

__all__ = ["ModinDtypes", "ModinIndex", "LazyProxyCategoricalDtype"]
__all__ = ["ModinDtypes", "ModinIndex", "LazyProxyCategoricalDtype", "DtypesDescriptor"]
Loading
Loading