From c96417b0dfa1e2495743058aaaa768b16431cf10 Mon Sep 17 00:00:00 2001 From: Naren Krishna Date: Wed, 26 Jun 2024 08:52:38 -0700 Subject: [PATCH] SNOW-1445832: Added support for `DataFrame.stack` (#1821) Signed-off-by: Naren Krishna Co-authored-by: Devin Petersohn --- CHANGELOG.md | 1 + docs/source/modin/dataframe.rst | 9 +-- .../modin/supported/dataframe_supported.rst | 3 +- .../snowpark/modin/pandas/dataframe.py | 32 ++++++--- .../compiler/snowflake_query_compiler.py | 70 ++++++++++++++++++- .../modin/plugin/docstrings/dataframe.py | 53 ++++++++++++++ tests/integ/modin/frame/test_stack.py | 54 ++++++++++++++ tests/unit/modin/test_unsupported.py | 1 - 8 files changed, 208 insertions(+), 15 deletions(-) create mode 100644 tests/integ/modin/frame/test_stack.py diff --git a/CHANGELOG.md b/CHANGELOG.md index e6d5613577c..9239d660cc1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -71,6 +71,7 @@ - Added support for `Series.case_when` except when condition or replacement is callable. - Added documentation pages for `Index` and its APIs. - Added support for `DataFrame.assign`. +- Added support for `DataFrame.stack`. #### Bug Fixes diff --git a/docs/source/modin/dataframe.rst b/docs/source/modin/dataframe.rst index 2f97bb074fb..a16f02fe276 100644 --- a/docs/source/modin/dataframe.rst +++ b/docs/source/modin/dataframe.rst @@ -183,13 +183,14 @@ DataFrame .. 
autosummary:: :toctree: pandas_api/ - DataFrame.pivot_table - DataFrame.sort_values - DataFrame.sort_index + DataFrame.melt DataFrame.nlargest DataFrame.nsmallest - DataFrame.melt + DataFrame.pivot_table + DataFrame.sort_index + DataFrame.sort_values DataFrame.squeeze + DataFrame.stack DataFrame.T DataFrame.transpose diff --git a/docs/source/modin/supported/dataframe_supported.rst b/docs/source/modin/supported/dataframe_supported.rst index 58cef720cf0..89adf093e41 100644 --- a/docs/source/modin/supported/dataframe_supported.rst +++ b/docs/source/modin/supported/dataframe_supported.rst @@ -389,7 +389,8 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``squeeze`` | Y | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``stack`` | N | | | +| ``stack`` | P | ``level``, | ``N`` for MultiIndex | +| | | ``future_stack`` is ignored | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``std`` | P | | ``N`` if ``ddof`` is not 0 or 1 | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index abecc053adb..b01dee741df 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -2354,23 +2354,39 @@ def squeeze(self, axis: Axis | None = None): return Series(query_compiler=self.T._query_compiler) return self.copy() - @dataframe_not_implemented() - def stack(self, level=-1, dropna=True): # noqa: PR01, RT01, D200 + def stack( + self, + level: int | str | list = -1, + dropna: bool | 
NoDefault = no_default, + sort: bool | NoDefault = no_default, + future_stack: bool = False, # ignored + ): """ Stack the prescribed level(s) from columns to index. """ # TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions - if not isinstance(self.columns, pandas.MultiIndex) or ( - isinstance(self.columns, pandas.MultiIndex) - and is_list_like(level) - and len(level) == self.columns.nlevels + if future_stack is not False: + WarningMessage.ignored_argument( # pragma: no cover + operation="DataFrame.stack", + argument="future_stack", + message="future_stack parameter has been ignored with Snowflake execution engine", + ) + if dropna is no_default: + dropna = True # pragma: no cover + if sort is no_default: + sort = True # pragma: no cover + + # This ensures that non-pandas MultiIndex objects are caught. + is_multiindex = len(self.columns.names) > 1 + if not is_multiindex or ( + is_multiindex and is_list_like(level) and len(level) == self.columns.nlevels ): return self._reduce_dimension( - query_compiler=self._query_compiler.stack(level, dropna) + query_compiler=self._query_compiler.stack(level, dropna, sort) ) else: return self.__constructor__( - query_compiler=self._query_compiler.stack(level, dropna) + query_compiler=self._query_compiler.stack(level, dropna, sort) ) def sub( diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index b2fa32334f0..a82375cc27e 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -2503,7 +2503,7 @@ def sort_rows_by_column_values( na_position: Puts NaNs at the beginning if 'first'; 'last' puts NaNs at the end. Defaults to 'last' ignore_index: If True, existing index is ignored and new index is generated which is a gap free sequence from 0 to n-1. Defaults to False. 
- key: Apply the key function to the values before sorting. Fallback to native pandas if key is provided. + key: Apply the key function to the values before sorting. Returns: A new SnowflakeQueryCompiler instance after applying the sort. @@ -14815,3 +14815,71 @@ def pct_change( } ).frame ) + + def stack( + self, + level: Union[int, str, list] = -1, + dropna: bool = True, + sort: bool = True, + ) -> "SnowflakeQueryCompiler": + """ + Stack the prescribed level(s) from columns to index. + + Return a reshaped DataFrame or Series having a multi-level index with one + or more new inner-most levels compared to the current DataFrame. The new inner-most + levels are created by pivoting the columns of the current dataframe: + - if the columns have a single level, the output is a Series. + - if the columns have multiple levels, the new index level(s) is (are) + taken from the prescribed level(s) and the output is a DataFrame. + + Parameters + ---------- + level : int, str, list, default -1 + Level(s) to stack from the column axis onto the index axis, + defined as one index or label, or a list of indices or labels. + + dropna : bool, default True + Whether to drop rows in the resulting Frame/Series with missing values. Stacking a + column level onto the index axis can create combinations of index and column values + that are missing from the original dataframe. + + sort : bool, default True + Whether to sort the levels of the resulting MultiIndex. 
+ """ + if level != -1: + ErrorMessage.not_implemented( + "Snowpark pandas doesn't yet support 'level != -1' in stack API", + ) + if self._modin_frame.is_multiindex(axis=1): + ErrorMessage.not_implemented( + "Snowpark pandas doesn't support multiindex columns in stack API" + ) + + index_names = ["index"] + # Stack is equivalent to doing df.melt() with index reset, sorting the values, then setting the index + # Note that we always use sort_rows_by_column_values even if sort is False + qc = ( + self.reset_index() + .melt( + id_vars=index_names, + value_vars=self.columns, + var_name="index_second_level", + value_name=MODIN_UNNAMED_SERIES_LABEL, + ignore_index=False, + ) + .sort_rows_by_column_values( + columns=index_names, # type: ignore + ascending=[True], + kind="stable", + na_position="last", + ignore_index=False, + ) + .replace(to_replace=UNPIVOT_NULL_REPLACE_VALUE, value=np.nan) + .set_index_from_columns(index_names + ["index_second_level"]) # type: ignore + .set_index_names([None, None]) + ) + + if dropna: + return qc.dropna(axis=0, how="any", thresh=None) + else: + return qc diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py index d09c764fcff..e873ae88769 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py @@ -3227,6 +3227,59 @@ def squeeze(): def stack(): """ Stack the prescribed level(s) from columns to index. + + Return a reshaped DataFrame or Series having a multi-level index with one + or more new inner-most levels compared to the current DataFrame. The new inner-most + levels are created by pivoting the columns of the current dataframe. + If the columns have a single level, the output is a Series. + If the columns have multiple levels, the new index level(s) is (are) + taken from the prescribed level(s) and the output is a DataFrame. 
+ + Parameters + ---------- + level : int, str, list, default -1 + Level(s) to stack from the column axis onto the index axis, + defined as one index or label, or a list of indices or labels. + + dropna : bool, default True + Whether to drop rows in the resulting Frame/Series with missing values. Stacking a + column level onto the index axis can create combinations of index and column values + that are missing from the original dataframe. + + sort : bool, default True + Whether to sort the levels of the resulting MultiIndex. + + future_stack : bool, default False + This argument is ignored in Snowpark pandas. + + Returns + ------- + DataFrame or Series + Stacked dataframe or series. + + Notes + ----- + level != -1 and MultiIndex dataframes are not yet supported by Snowpark pandas. + + See Also + -------- + DataFrame.unstack : Unstack prescribed level(s) from index axis onto column axis. + DataFrame.pivot : Reshape dataframe from long format to wide format. + DataFrame.pivot_table : Create a spreadsheet-style pivot table as a DataFrame. + + Examples + -------- + >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]], index=['cat', 'dog'], columns=['weight', 'height']) + >>> df_single_level_cols + weight height + cat 0 1 + dog 2 3 + >>> df_single_level_cols.stack() + cat weight 0 + height 1 + dog weight 2 + height 3 + dtype: int64 """ def sub(): diff --git a/tests/integ/modin/frame/test_stack.py b/tests/integ/modin/frame/test_stack.py new file mode 100644 index 00000000000..81743028faa --- /dev/null +++ b/tests/integ/modin/frame/test_stack.py @@ -0,0 +1,54 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. 
+# + +import modin.pandas as pd +import numpy as np +import pytest + +from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.utils import create_test_dfs, eval_snowpark_pandas_result + + +@pytest.mark.parametrize( + "data, index, columns", + [ + ([[0, 1], [2, 3]], ["cat", "dog"], ["weight", "height"]), + ([[0, np.nan], [np.nan, 3]], ["cat", "dog"], ["weight", "height"]), + ], +) +@pytest.mark.parametrize("dropna", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +@sql_count_checker(query_count=1) +def test_stack(data, index, columns, dropna, sort): + eval_snowpark_pandas_result( + *create_test_dfs(data=data, index=index, columns=columns), + lambda df: df.stack(dropna=dropna, sort=sort), + ) + + +@sql_count_checker(query_count=0) +def test_stack_level_unsupported(): + df_single_level_cols = pd.DataFrame( + [[0, 1], [2, 3]], index=["cat", "dog"], columns=["weight", "height"] + ) + + with pytest.raises( + NotImplementedError, + match="Snowpark pandas doesn't yet support 'level != -1' in stack API", + ): + df_single_level_cols.stack(level=0) + + +@sql_count_checker(query_count=0) +def test_stack_multiindex_unsupported(): + multicol1 = pd.MultiIndex.from_tuples([("weight", "kg"), ("weight", "pounds")]) + df_multi_level_cols1 = pd.DataFrame( + [[1, 2], [2, 4]], index=["cat", "dog"], columns=multicol1 + ) + + with pytest.raises( + NotImplementedError, + match="Snowpark pandas doesn't support multiindex columns in stack API", + ): + df_multi_level_cols1.stack() diff --git a/tests/unit/modin/test_unsupported.py b/tests/unit/modin/test_unsupported.py index 70e1d3a93ec..e17c9f14b7d 100644 --- a/tests/unit/modin/test_unsupported.py +++ b/tests/unit/modin/test_unsupported.py @@ -104,7 +104,6 @@ def test_unsupported_general(general_method, kwargs): ["reorder_levels", {"order": ""}], ["sem", {}], ["set_flags", {}], - ["stack", {}], ["style", {}], ["swapaxes", {"axis1": "", "axis2": ""}], ["swaplevel", {}],