From c31febaafb389fe42b44a73a952003a072111d51 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 15 Apr 2024 14:28:03 +0200 Subject: [PATCH] FIX-#6227: Make sure `Series.unique()` with pyarrow dtype returns `ArrowExtensionArray` (#7042) Signed-off-by: Anatoly Myachev --- .../storage_formats/pandas/query_compiler.py | 14 ++++++-- .../hdk_on_native/dataframe/dataframe.py | 2 +- .../hdk_on_native/dataframe/utils.py | 35 +++++++++++++++++-- modin/pandas/series.py | 10 ++++-- modin/tests/pandas/test_series.py | 23 ++++++++++++ 5 files changed, 76 insertions(+), 8 deletions(-) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 5f98683425b..7fd07d5fe91 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -2018,7 +2018,10 @@ def str_split(self, pat=None, n=-1, expand=False, regex=None): def unique(self, keep="first", ignore_index=True, subset=None): # kernels with 'pandas.Series.unique()' work faster can_use_unique_kernel = ( - subset is None and ignore_index and len(self.columns) == 1 and keep + subset is None + and ignore_index + and len(self.columns) == 1 + and keep is not False ) if not can_use_unique_kernel and not RangePartitioning.get(): @@ -2028,7 +2031,11 @@ def unique(self, keep="first", ignore_index=True, subset=None): new_modin_frame = self._modin_frame._apply_func_to_range_partitioning( key_columns=self.columns.tolist() if subset is None else subset, func=( - (lambda df: pandas.DataFrame(df.squeeze(axis=1).unique())) + ( + lambda df: pandas.DataFrame( + df.squeeze(axis=1).unique(), columns=["__reduced__"] + ) + ) if can_use_unique_kernel else ( lambda df: df.drop_duplicates( @@ -2039,6 +2046,9 @@ def unique(self, keep="first", ignore_index=True, subset=None): preserve_columns=True, ) else: + # return self.to_pandas().squeeze(axis=1).unique() works faster + # but returns pandas type instead of query compiler + # TODO: https://github.com/modin-project/modin/issues/7182 new_modin_frame = self._modin_frame.apply_full_axis( 0, lambda x: x.squeeze(axis=1).unique(), diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py index e398db23045..09ffe8011a6 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py @@ -2862,7 +2862,7 @@ def to_pandas(self): obj = obj.cast(schema) # concatenate() is called by _partition_mgr_cls.to_pandas # to preserve the categorical dtypes - df = concatenate([arrow_to_pandas(obj)]) + df = concatenate([arrow_to_pandas(obj, self._dtypes)]) else: df = obj.copy() diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py index a8e0531ab63..f99cc256baa 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py @@ -13,9 +13,11 @@ """Utilities for internal use by the ``HdkOnNativeDataframe``.""" +from __future__ import annotations + import re from functools import lru_cache -from typing import Any, Dict, List, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import numpy as np import pandas @@ -27,6 +29,9 @@ from modin.pandas.indexing import is_range_like from modin.utils import MODIN_UNNAMED_SERIES_LABEL +if TYPE_CHECKING: + from modin.core.dataframe.pandas.metadata import ModinDtypes + EMPTY_ARROW_TABLE = pa.Table.from_pandas(pandas.DataFrame({})) @@ -579,7 +584,9 @@ def ensure_supported_dtype(dtype): raise NotImplementedError(f"Type {dtype}") -def arrow_to_pandas(at: pa.Table) -> pandas.DataFrame: +def arrow_to_pandas( + at: pa.Table, dtypes: Optional[Union[ModinDtypes, pandas.Series]] = None +) -> pandas.DataFrame: """ Convert the specified arrow table to pandas. @@ -587,6 +594,8 @@ def arrow_to_pandas(at: pa.Table) -> pandas.DataFrame: ---------- at : pyarrow.Table The table to convert. + dtypes : Union[ModinDtypes, pandas.Series], optional + Dtypes are used to correctly map PyArrow types to pandas. Returns ------- @@ -597,6 +606,28 @@ def mapper(at): if is_dictionary(at) and isinstance(at.value_type, ArrowIntervalType): # The default mapper fails with TypeError: unhashable type: 'dict' return _CategoricalDtypeMapper + elif dtypes is not None and any( + ( + isinstance(dtype, pandas.core.dtypes.dtypes.ArrowDtype) + for dtype in dtypes + ) + ): + # for pandas types that are backed by pyarrow, for example: uint8[pyarrow] + dtype_mapping = { + pa.int8(): pandas.ArrowDtype(pa.int8()), + pa.int16(): pandas.ArrowDtype(pa.int16()), + pa.int32(): pandas.ArrowDtype(pa.int32()), + pa.int64(): pandas.ArrowDtype(pa.int64()), + pa.uint8(): pandas.ArrowDtype(pa.uint8()), + pa.uint16(): pandas.ArrowDtype(pa.uint16()), + pa.uint32(): pandas.ArrowDtype(pa.uint32()), + pa.uint64(): pandas.ArrowDtype(pa.uint64()), + pa.bool_(): pandas.ArrowDtype(pa.bool_()), + pa.float32(): pandas.ArrowDtype(pa.float32()), + pa.float64(): pandas.ArrowDtype(pa.float64()), + pa.string(): pandas.ArrowDtype(pa.string()), + } + return dtype_mapping.get(at, None) return None df = at.to_pandas(types_mapper=mapper) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 70371b922e3..b3cfc065e2c 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -2107,9 +2107,13 @@ def unique(self) -> ArrayLike: # noqa: RT01, D200 """ Return unique values of Series object. """ - return self.__constructor__( - query_compiler=self._query_compiler.unique() - ).to_numpy() + # `values` can't be used here because it performs unnecessary conversion, + # after which the result type does not match the pandas + return ( + self.__constructor__(query_compiler=self._query_compiler.unique()) + .modin.to_pandas() + ._values + ) def update(self, other) -> None: # noqa: PR01, D200 """ diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py index cad8879b0c0..b079ce586dd 100644 --- a/modin/tests/pandas/test_series.py +++ b/modin/tests/pandas/test_series.py @@ -3691,11 +3691,13 @@ def test_unique(data): pandas_result = pandas_series.unique() comparator(modin_result, pandas_result) assert modin_result.shape == pandas_result.shape + assert type(modin_result) is type(pandas_result) modin_result = pd.Series([2, 1, 3, 3], name="A").unique() pandas_result = pandas.Series([2, 1, 3, 3], name="A").unique() comparator(modin_result, pandas_result) assert modin_result.shape == pandas_result.shape + assert type(modin_result) is type(pandas_result) modin_result = pd.Series([pd.Timestamp("2016-01-01") for _ in range(3)]).unique() pandas_result = pandas.Series( @@ -3703,6 +3705,7 @@ def test_unique(data): ).unique() comparator(modin_result, pandas_result) assert modin_result.shape == pandas_result.shape + assert type(modin_result) is type(pandas_result) modin_result = pd.Series( [pd.Timestamp("2016-01-01", tz="US/Eastern") for _ in range(3)] @@ -3712,11 +3715,13 @@ def test_unique(data): ).unique() comparator(modin_result, pandas_result) assert modin_result.shape == pandas_result.shape + assert type(modin_result) is type(pandas_result) modin_result = pandas.Series(pd.Categorical(list("baabc"))).unique() pandas_result = pd.Series(pd.Categorical(list("baabc"))).unique() comparator(modin_result, pandas_result) assert modin_result.shape == pandas_result.shape + assert type(modin_result) is type(pandas_result) modin_result = pd.Series( pd.Categorical(list("baabc"), categories=list("abc"), ordered=True) @@ -3726,6 +3731,24 @@ def test_unique(data): ).unique() comparator(modin_result, pandas_result) assert modin_result.shape == pandas_result.shape + assert type(modin_result) is type(pandas_result) + + +def test_unique_pyarrow_dtype(): + # See #6227 for details + modin_series, pandas_series = create_test_series( + [1, 0, pd.NA], dtype="uint8[pyarrow]" + ) + + def comparator(df1, df2): + # Perform our own non-strict version of dtypes equality check + df_equals(df1, df2) + # to be sure `unique` return `ArrowExtensionArray` + assert type(df1) is type(df2) + + eval_general( + modin_series, pandas_series, lambda df: df.unique(), comparator=comparator + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)