From f0205f336c547e9cd6792ff066db641488850fde Mon Sep 17 00:00:00 2001
From: Jonathan Shi <149419494+sfc-gh-joshi@users.noreply.github.com>
Date: Wed, 1 May 2024 14:05:33 -0700
Subject: [PATCH] SNOW-1347394: Remove BaseQueryCompiler + error in QC default to pandas (#1454)

Please answer these questions before submitting your pull request. Thanks!

1. What GitHub issue is this PR addressing? Make sure that there is an accompanying issue to your PR.

   Fixes SNOW-1347394

2. Fill out the following pre-review checklist:
   - [ ] I am adding a new automated test(s) to verify correctness of my new code
   - [ ] I am adding new logging messages
   - [ ] I am adding a new telemetry message
   - [ ] I am adding new credentials
   - [ ] I am adding a new dependency

3. Please describe how your code solves the related issue.

   This PR removes our vendored copy of the `BaseQueryCompiler` class and instead inherits the class from upstream Modin. Similarly, it removes all of the operator registration classes defined in `snowflake.snowpark.modin.core.dataframe.algebra.default2pandas`, with one exception: upstream Modin does not properly render the names of `property` objects (https://github.com/modin-project/modin/issues/7233), so we keep and install an override of `DefaultMethod.register` as a workaround until the issue is fixed upstream (a short sketch of the idea follows the patch body below).

   This PR incidentally removes `Series.dt.week` and `Series.dt.weekofyear`, which were already removed in pandas 2.0.

---------

Co-authored-by: Naren Krishna
---
 .../algebra/default2pandas/__init__.py | 36 -
 .../algebra/default2pandas/binary.py | 78 -
 .../dataframe/algebra/default2pandas/cat.py | 48 -
 .../algebra/default2pandas/dataframe.py | 36 -
 .../algebra/default2pandas/datetime.py | 48 -
 .../algebra/default2pandas/groupby.py | 728 ---
 .../algebra/default2pandas/resample.py | 106 -
 .../algebra/default2pandas/rolling.py | 159 -
 .../algebra/default2pandas/series.py | 49 -
 .../dataframe/algebra/default2pandas/str.py | 48 -
 .../execution/dispatching/factories/baseio.py | 2 +-
 .../snowpark/modin/pandas/__init__.py | 12 +
 .../snowpark/modin/pandas/general.py | 3 +-
 .../snowpark/modin/pandas/series_utils.py | 8 -
 src/snowflake/snowpark/modin/pandas/utils.py | 2 +-
 .../snowpark/modin/plugin/PANDAS_CHANGELOG.md | 1 +
 .../modin/plugin/_internal/transpose_utils.py | 4 +-
 .../modin/plugin/compiler/__init__.py | 4 +-
 .../modin/plugin/compiler/query_compiler.py | 3980 -----------------
 .../compiler/snowflake_query_compiler.py | 55 +-
 tests/integ/modin/frame/test_value_counts.py | 16 +-
 tests/integ/modin/test_telemetry.py | 77 +-
 ...efault2pandas.py => test_unimplemented.py} | 169 +-
 tests/integ/modin/tools/test_to_datetime.py | 81 +-
 tests/integ/modin/tools/test_to_numeric.py | 68 +-
 tests/unit/modin/test_series_dt.py | 22 +-
 tests/unit/modin/test_series_strings.py | 12 +-
 27 files changed, 123 insertions(+), 5729 deletions(-)
 delete mode 100644 src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/binary.py
 delete mode 100644 src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/cat.py
 delete mode 100644 src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/dataframe.py
 delete mode 100644 src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/datetime.py
 delete mode 100644 src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/groupby.py
 delete mode 100644 src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/resample.py
 delete mode 100644 src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/rolling.py
 delete mode 100644
src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/series.py delete mode 100644 src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/str.py delete mode 100644 src/snowflake/snowpark/modin/plugin/compiler/query_compiler.py rename tests/integ/modin/{test_default2pandas.py => test_unimplemented.py} (58%) diff --git a/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/__init__.py b/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/__init__.py index 90a7794a316..2f188e62f8b 100644 --- a/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/__init__.py +++ b/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/__init__.py @@ -21,46 +21,10 @@ """Module default2pandas provides templates for a query compiler default-to-pandas methods.""" -from snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.binary import ( - BinaryDefault, -) -from snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.cat import ( - CatDefault, -) -from snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.dataframe import ( - DataFrameDefault, -) -from snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.datetime import ( - DateTimeDefault, -) from snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.default import ( DefaultMethod, ) -from snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.groupby import ( - GroupByDefault, -) -from snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.resample import ( - ResampleDefault, -) -from snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.rolling import ( - RollingDefault, -) -from snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.series import ( - SeriesDefault, -) -from snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.str import ( - StrDefault, -) __all__ = [ - "DataFrameDefault", - "DateTimeDefault", - "SeriesDefault", - "StrDefault", - "BinaryDefault", - "ResampleDefault", - "RollingDefault", "DefaultMethod", - "CatDefault", - "GroupByDefault", ] diff --git a/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/binary.py b/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/binary.py deleted file mode 100644 index 4052c113b25..00000000000 --- a/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/binary.py +++ /dev/null @@ -1,78 +0,0 @@ -# -# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. -# - -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -# Code in this file may constitute partial or total reimplementation, or modification of -# existing code originally distributed by the Modin project, under the Apache License, -# Version 2.0. 
- -"""Module houses default binary functions builder class.""" -from typing import Any, Callable, Union - -import pandas -from pandas._typing import AnyArrayLike, Scalar - -from snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.default import ( - DefaultMethod, -) - - -class BinaryDefault(DefaultMethod): - """Build default-to-pandas methods which executes binary functions.""" - - @classmethod - def build_default_to_pandas(cls, fn: Callable, fn_name: str) -> Callable: - """ - Build function that do fallback to pandas for passed binary `fn`. - - Parameters - ---------- - fn : callable - Binary function to apply to the casted to pandas frame and other operand. - fn_name : str - Function name which will be shown in default-to-pandas warning message. - - Returns - ------- - callable - Function that takes query compiler, does fallback to pandas and applies binary `fn` - to the casted to pandas frame. - """ - - def bin_ops_wrapper( - df: pandas.DataFrame, - other: Union[pandas.DataFrame, pandas.Series, Scalar, AnyArrayLike], - *args: Any, - **kwargs: Any - ) -> pandas.DataFrame: - """Apply specified binary function to the passed operands.""" - squeeze_other = kwargs.pop("broadcast", False) or kwargs.pop( - "squeeze_other", False - ) - squeeze_self = kwargs.pop("squeeze_self", False) - - if squeeze_other: - other = other.squeeze(axis=1) - - if squeeze_self: - df = df.squeeze(axis=1) - - result = fn(df, other, *args, **kwargs) - if not isinstance(result, pandas.DataFrame): # pragma: no cover - result = pandas.DataFrame(result) - return result - - return super().build_default_to_pandas(bin_ops_wrapper, fn_name) diff --git a/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/cat.py b/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/cat.py deleted file mode 100644 index 6a239a7a55c..00000000000 --- a/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/cat.py +++ /dev/null @@ -1,48 +0,0 @@ -# -# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. -# - -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -# Code in this file may constitute partial or total reimplementation, or modification of -# existing code originally distributed by the Modin project, under the Apache License, -# Version 2.0. - -"""Module houses default applied-on-category functions builder class.""" -import pandas - -from snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.series import ( - SeriesDefault, -) - - -class CatDefault(SeriesDefault): - """Builder for default-to-pandas methods which is executed under category accessor.""" - - @classmethod - def frame_wrapper( - cls, df: pandas.DataFrame - ) -> pandas.core.arrays.categorical.CategoricalAccessor: - """ - Get category accessor of the passed frame. 
- - Parameters - ---------- - df : pandas.DataFrame - - Returns - ------- - pandas.core.arrays.categorical.CategoricalAccessor - """ - return df.squeeze(axis=1).cat diff --git a/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/dataframe.py b/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/dataframe.py deleted file mode 100644 index f22f80f130b..00000000000 --- a/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/dataframe.py +++ /dev/null @@ -1,36 +0,0 @@ -# -# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. -# - -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -# Code in this file may constitute partial or total reimplementation, or modification of -# existing code originally distributed by the Modin project, under the Apache License, -# Version 2.0. - -"""Module houses default DataFrame functions builder class.""" - -import pandas - -from snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.default import ( - DefaultMethod, -) - -# from modin.utils import _inherit_docstrings -from snowflake.snowpark.modin.utils import _inherit_docstrings - - -@_inherit_docstrings(DefaultMethod) -class DataFrameDefault(DefaultMethod): - DEFAULT_OBJECT_TYPE = pandas.DataFrame diff --git a/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/datetime.py b/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/datetime.py deleted file mode 100644 index d7aefcd165e..00000000000 --- a/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/datetime.py +++ /dev/null @@ -1,48 +0,0 @@ -# -# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. -# - -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -# Code in this file may constitute partial or total reimplementation, or modification of -# existing code originally distributed by the Modin project, under the Apache License, -# Version 2.0. 
- -"""Module houses default applied-on-datetime functions builder class.""" -import pandas - -from snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.series import ( - SeriesDefault, -) - - -class DateTimeDefault(SeriesDefault): - """Builder for default-to-pandas methods which is executed under datetime accessor.""" - - @classmethod - def frame_wrapper( - cls, df: pandas.DataFrame - ) -> pandas.core.indexes.accessors.DatetimeProperties: - """ - Get datetime accessor of the passed frame. - - Parameters - ---------- - df : pandas.DataFrame - - Returns - ------- - pandas.core.indexes.accessors.DatetimeProperties - """ - return df.squeeze(axis=1).dt diff --git a/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/groupby.py b/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/groupby.py deleted file mode 100644 index ad5d96dc6a0..00000000000 --- a/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/groupby.py +++ /dev/null @@ -1,728 +0,0 @@ -# -# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. -# - -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -# Code in this file may constitute partial or total reimplementation, or modification of -# existing code originally distributed by the Modin project, under the Apache License, -# Version 2.0. - -"""Module houses default GroupBy functions builder class.""" -from typing import Any, Callable, Optional, Union - -import pandas -from pandas.core.dtypes.common import is_list_like - -# Defines a set of string names of functions that are executed in a transform-way in groupby -from pandas.core.groupby.base import transformation_kernels - -from snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.default import ( - DefaultMethod, -) -from snowflake.snowpark.modin.utils import ( - MODIN_UNNAMED_SERIES_LABEL, - hashable, - is_property, -) - - -# FIXME: there is no sence of keeping `GroupBy` and `GroupByDefault` logic in a different -# classes. They should be combined. -class GroupBy: - """Builder for GroupBy aggregation functions.""" - - agg_aliases = [ - "agg", - "dict_agg", - pandas.core.groupby.DataFrameGroupBy.agg, - pandas.core.groupby.DataFrameGroupBy.aggregate, - ] - - @staticmethod - def is_transformation_kernel(agg_func: Any) -> bool: - """ - Check whether a passed aggregation function is a transformation. - - Transformation means that the result of the function will be broadcasted - to the frame's original shape. 
- - Parameters - ---------- - agg_func : Any - - Returns - ------- - bool - """ - return ( - hashable(agg_func) and agg_func in transformation_kernels - ) # pragma: no cover - - @classmethod - def _call_groupby( - cls, df: Union[pandas.DataFrame, pandas.Series], *args: Any, **kwargs: Any - ) -> Union[pandas.core.groupby.DataFrameGroupBy, pandas.core.groupby.SeriesGroupBy]: - """Call .groupby() on passed `df`.""" - return df.groupby(*args, **kwargs) # pragma: no cover - - @classmethod - def validate_by(cls, by: Any) -> Any: - """ - Build valid `by` parameter for `pandas.DataFrame.groupby`. - - Cast all DataFrames in `by` parameter to Series or list of Series in case - of multi-column frame. - - Parameters - ---------- - by : DateFrame, Series, index label or list of such - Object which indicates groups for GroupBy. - - Returns - ------- - Series, index label or list of such - By parameter with all DataFrames casted to Series. - """ - - def try_cast_series(df: Any) -> Any: # pragma: no cover - """Cast one-column frame to Series.""" - if isinstance(df, pandas.DataFrame): - df = df.squeeze(axis=1) - if not isinstance(df, pandas.Series): - return df - if df.name == MODIN_UNNAMED_SERIES_LABEL: - df.name = None - return df - - if isinstance(by, pandas.DataFrame): - by = [try_cast_series(column) for _, column in by.items()] - elif isinstance(by, pandas.Series): - by = [try_cast_series(by)] - elif isinstance(by, list): - by = [try_cast_series(o) for o in by] - return by - - @classmethod - def inplace_applyier_builder( - cls, key: Callable, func: Optional[Union[Callable, str]] = None - ) -> Callable: - """ - Bind actual aggregation function to the GroupBy aggregation method. - - Parameters - ---------- - key : callable - Function that takes GroupBy object and evaluates passed aggregation function. - func : callable or str, optional - Function that takes DataFrame and aggregate its data. Will be applied - to each group at the grouped frame. - - Returns - ------- - callable, - Function that executes aggregation under GroupBy object. - """ - inplace_args = [] if func is None else [func] - - def inplace_applyier( # pragma: no cover - grp: Union[ - pandas.core.groupby.DataFrameGroupBy, pandas.core.groupby.SeriesGroupBy - ], - *func_args: Any, - **func_kwargs: Any, - ) -> Callable: - return key(grp, *inplace_args, *func_args, **func_kwargs) # type: ignore[operator] # pragma: no cover - - return inplace_applyier - - @classmethod - def get_func(cls, key: Callable, **kwargs: Any) -> Callable: - """ - Extract aggregation function from groupby arguments. - - Parameters - ---------- - key : callable or str - Default aggregation function. If aggregation function is not specified - via groupby arguments, then `key` function is used. - **kwargs : dict - GroupBy arguments that may contain aggregation function. - - Returns - ------- - callable - Aggregation function. - - Notes - ----- - There are two ways of how groupby aggregation can be invoked: - 1. Explicitly with query compiler method: `qc.groupby_sum()`. - 2. By passing aggregation function as an argument: `qc.groupby_agg("sum")`. - Both are going to produce the same result, however in the first case actual aggregation - function can be extracted from the method name, while for the second only from the method arguments. 
- """ - if "agg_func" in kwargs: - return cls.inplace_applyier_builder(key, kwargs["agg_func"]) - elif "func_dict" in kwargs: - return cls.inplace_applyier_builder(key, kwargs["func_dict"]) - else: - return cls.inplace_applyier_builder(key) - - @classmethod - def build_aggregate_method(cls, key: Callable) -> Callable: - """ - Build function for `QueryCompiler.groupby_agg` that can be executed as default-to-pandas. - - Parameters - ---------- - key : callable or str - Default aggregation function. If aggregation function is not specified - via groupby arguments, then `key` function is used. - - Returns - ------- - callable - Function that executes groupby aggregation. - """ - - def fn( - df: Union[pandas.DataFrame, pandas.Series], - by: Any, - axis: int, - groupby_kwargs: dict[str, Any], - agg_args: Any, - agg_kwargs: dict[str, Any], - **kwargs: Any, - ) -> Any: - """Group DataFrame and apply aggregation function to each group.""" - by = cls.validate_by(by) - - grp = cls._call_groupby( - df, by, axis=axis, **groupby_kwargs - ) # pragma: no cover - agg_func = cls.get_func(key, **kwargs) - result = agg_func(grp, *agg_args, **agg_kwargs) - - return result - - return fn - - @classmethod - def build_groupby_reduce_method(cls, agg_func: Any) -> Callable: - """ - Build function for `QueryCompiler.groupby_*` that can be executed as default-to-pandas. - - Parameters - ---------- - agg_func : callable or str - Default aggregation function. If aggregation function is not specified - via groupby arguments, then `agg_func` function is used. - - Returns - ------- - callable - Function that executes groupby aggregation. - """ - - def fn( - df: Union[pandas.DataFrame, pandas.Series], - by: Any, - axis: int, - groupby_kwargs: dict[str, Any], - agg_args: Any, - agg_kwargs: dict[str, Any], - drop: bool = False, - **kwargs: Any, - ) -> Any: - """Group DataFrame and apply aggregation function to each group.""" - if not isinstance(by, (pandas.Series, pandas.DataFrame)): - by = cls.validate_by(by) - grp = cls._call_groupby( - df, by, axis=axis, **groupby_kwargs - ) # pragma: no cover - grp_agg_func = cls.get_func(agg_func, **kwargs) - return grp_agg_func( - grp, - *agg_args, - **agg_kwargs, - ) - - if isinstance(by, pandas.DataFrame): - by = by.squeeze(axis=1) - if ( - drop - and isinstance(by, pandas.Series) - and by.name in df - and df[by.name].equals(by) - ): - by = [by.name] - if isinstance(by, pandas.DataFrame): - df = pandas.concat([df] + [by[[o for o in by if o not in df]]], axis=1) - by = list(by.columns) - - groupby_kwargs = groupby_kwargs.copy() - as_index = groupby_kwargs.pop("as_index", True) - groupby_kwargs["as_index"] = True - - grp = cls._call_groupby( - df, by, axis=axis, **groupby_kwargs - ) # pragma: no cover - func = cls.get_func(agg_func, **kwargs) - result = func(grp, *agg_args, **agg_kwargs) - method = kwargs.get("method") - - if isinstance(result, pandas.Series): - result = result.to_frame( # pragma: no cover - MODIN_UNNAMED_SERIES_LABEL if result.name is None else result.name - ) - - if not as_index: - if isinstance(by, pandas.Series): - # 1. If `drop` is True then 'by' Series represents a column from the - # source frame and so the 'by' is internal. - # 2. If method is 'size' then any 'by' is considered to be internal. 
- # This is a hacky legacy from the ``groupby_size`` implementation: - # https://github.com/modin-project/modin/issues/3739 - internal_by = (by.name,) if drop or method == "size" else tuple() - else: - internal_by = by - - cls.handle_as_index_for_dataframe( - result, - internal_by, - by_cols_dtypes=( - df.index.dtypes.values - if isinstance(df.index, pandas.MultiIndex) - else (df.index.dtype,) - ), - by_length=len(by), - drop=drop, - method=method, - inplace=True, - ) - - if result.index.name == MODIN_UNNAMED_SERIES_LABEL: - result.index.name = None - - return result - - return fn - - @classmethod - def is_aggregate(cls, key: Union[Callable, str, property]) -> bool: - """Check whether `key` is an alias for pandas.GroupBy.aggregation method.""" - return key in cls.agg_aliases - - @classmethod - def build_property_method(cls, property: property) -> Callable: - """ - Build function for `SnowflakeQueryCompiler.` that can be executed as default-to-pandas - - Parameters - ---------- - property: property - property of groupby object. - - Returns - ------- - callable - Function that executes groupby aggregation and returns property. - """ - - def fn( - df: Union[pandas.DataFrame, pandas.Series], - by: Any, - axis: int, - groupby_kwargs: dict[str, Any], - ) -> Any: - """Group DataFrame and apply aggregation function to each group.""" - by = cls.validate_by(by) # pragma: no cover - - grp = cls._call_groupby( - df, by, axis=axis, **groupby_kwargs - ) # pragma: no cover - - return property.fget(grp) # type: ignore[misc] - - return fn - - @classmethod - def build_groupby(cls, func: Union[Callable, property]) -> Callable: - """ - Build function that groups DataFrame and applies aggregation function to the every group. - - Parameters - ---------- - func : callable or str or property - Default aggregation function. If aggregation function is not specified - via groupby arguments, then `func` function is used. - - Returns - ------- - callable - Function that takes pandas DataFrame and does GroupBy aggregation. - """ - - if is_property(func): - return cls.build_property_method(func) # type: ignore[arg-type] - if cls.is_aggregate(func): - return cls.build_aggregate_method(func) # type: ignore[arg-type] - return cls.build_groupby_reduce_method( - func - ) # pragma: no cover # type: ignore[arg-type] - - @classmethod - def handle_as_index_for_dataframe( - cls, - result: pandas.DataFrame, - internal_by_cols: Any, - by_cols_dtypes: Optional[Any] = None, - by_length: Optional[int] = None, - selection: Optional[Any] = None, - partition_idx: int = 0, - drop: bool = True, - method: Optional[str] = None, - inplace: bool = False, - ) -> pandas.DataFrame: - """ - Handle `as_index=False` parameter for the passed GroupBy aggregation result. - - Parameters - ---------- - result : DataFrame - Frame containing GroupBy aggregation result computed with `as_index=True` - parameter (group names are located at the frame's index). - internal_by_cols : list-like - Internal 'by' columns. - by_cols_dtypes : list-like, optional - Data types of the internal 'by' columns. Required to do special casing - in case of categorical 'by'. If not specified, assume that there is no - categorical data in 'by'. - by_length : int, optional - Amount of keys to group on (including frame columns and external objects like list, Series, etc.) - If not specified, consider `by_length` to be equal ``len(internal_by_cols)``. 
- selection : label or list of labels, optional - Set of columns that were explicitly selected for aggregation (for example - via dict-aggregation). If not specified assuming that aggregation was - applied to all of the available columns. - partition_idx : int, default: 0 - Positional index of the current partition. - drop : bool, default: True - Indicates whether or not any of the `by` data came from the same frame. - method : str, optional - Name of the groupby function. This is a hint to be able to do special casing. - Note: this parameter is a legacy from the ``groupby_size`` implementation, - it's a hacky one and probably will be removed in the future: https://github.com/modin-project/modin/issues/3739. - inplace : bool, default: False - Modify the DataFrame in place (do not create a new object). - - Returns - ------- - DataFrame - GroupBy aggregation result with the considered `as_index=False` parameter. - """ - if not inplace: - result = result.copy() - - ( - reset_index, - drop, - lvls_to_drop, - cols_to_drop, - ) = cls.handle_as_index( # pragma: no cover - result_cols=result.columns, - result_index_names=result.index.names, - internal_by_cols=internal_by_cols, - by_cols_dtypes=by_cols_dtypes, - by_length=by_length, - selection=selection, - partition_idx=partition_idx, - drop=drop, - method=method, - ) - - if len(lvls_to_drop) > 0: - result.index = result.index.droplevel(lvls_to_drop) - if len(cols_to_drop) > 0: - result.drop(columns=cols_to_drop, inplace=True) - if reset_index: - result.reset_index(drop=drop, inplace=True) - return result - - @staticmethod - def handle_as_index( - result_cols: pandas.Index, - result_index_names: Any, - internal_by_cols: Any, - by_cols_dtypes: Optional[Any] = None, - by_length: Optional[int] = None, - selection: Optional[Any] = None, - partition_idx: int = 0, - drop: bool = True, - method: Optional[str] = None, - ) -> tuple[bool, bool, list[int], Any]: - """ - Compute hints to process ``as_index=False`` parameter for the GroupBy result. - - This function resolves naming conflicts of the index levels to insert and the column labels - for the GroupBy result. The logic of this function assumes that the initial GroupBy result - was computed as ``as_index=True``. - - Parameters - ---------- - result_cols : pandas.Index - Columns of the GroupBy result. - result_index_names : list-like - Index names of the GroupBy result. - internal_by_cols : list-like - Internal 'by' columns. - by_cols_dtypes : list-like, optional - Data types of the internal 'by' columns. Required to do special casing - in case of categorical 'by'. If not specified, assume that there is no - categorical data in 'by'. - by_length : int, optional - Amount of keys to group on (including frame columns and external objects like list, Series, etc.) - If not specified, consider `by_length` to be equal ``len(internal_by_cols)``. - selection : label or list of labels, optional - Set of columns that were explicitly selected for aggregation (for example - via dict-aggregation). If not specified assuming that aggregation was - applied to all of the available columns. - partition_idx : int, default: 0 - Positional index of the current partition. - drop : bool, default: True - Indicates whether or not any of the `by` data came from the same frame. - method : str, optional - Name of the groupby function. This is a hint to be able to do special casing. 
- Note: this parameter is a legacy from the ``groupby_size`` implementation, - it's a hacky one and probably will be removed in the future: https://github.com/modin-project/modin/issues/3739. - - Returns - ------- - reset_index : bool - Indicates whether to reset index to the default one (0, 1, 2 ... n) at this partition. - drop_index : bool - If `reset_index` is True, indicates whether to drop all index levels (True) or insert them into the - resulting columns (False). - lvls_to_drop : list of ints - Contains numeric indices of the levels of the result index to drop as intersected. - cols_to_drop : list of labels - Contains labels of the columns to drop from the result as intersected. - - Examples - -------- - >>> groupby_result = compute_groupby_without_processing_as_index_parameter() - >>> if not as_index: - >>> reset_index, drop, lvls_to_drop, cols_to_drop = handle_as_index(**extract_required_params(groupby_result)) - >>> if len(lvls_to_drop) > 0: - >>> groupby_result.index = groupby_result.index.droplevel(lvls_to_drop) - >>> if len(cols_to_drop) > 0: - >>> groupby_result = groupby_result.drop(columns=cols_to_drop) - >>> if reset_index: - >>> groupby_result_with_processed_as_index_parameter = groupby_result.reset_index(drop=drop) - >>> else: - >>> groupby_result_with_processed_as_index_parameter = groupby_result - """ - if by_length is None: - by_length = len(internal_by_cols) - - reset_index = method != "transform" and ( - by_length > 0 or selection is not None - ) # pragma: no cover - - # If the method is "size" then the result contains only one unique named column - # and we don't have to worry about any naming conflicts, so inserting all of - # the "by" into the result (just a fast-path) - if method == "size": - return reset_index, False, [], [] - - # pandas logic of resolving naming conflicts is the following: - # 1. If any categorical is in 'by' and 'by' is multi-column, then the categorical - # index is prioritized: drop intersected columns and insert all of the 'by' index - # levels to the frame as columns. - # 2. Otherwise, aggregation result is prioritized: drop intersected index levels and - # insert the filtered ones to the frame as columns. - if by_cols_dtypes is not None: - keep_index_levels = ( - by_length > 1 - and selection is None - and any(isinstance(x, pandas.CategoricalDtype) for x in by_cols_dtypes) - ) - else: - keep_index_levels = False - - # 1. We insert 'by'-columns to the result at the beginning of the frame and so only to the - # first partition, if partition_idx != 0 we just drop the index. If there are no columns - # that are required to drop (keep_index_levels is True) then we can exit here. - # 2. We don't insert 'by'-columns to the result if 'by'-data came from a different - # frame (drop is False), there's only one exception for this rule: if the `method` is "size", - # so if (drop is False) and method is not "size" we just drop the index and so can exit here. 
- if (not keep_index_levels and partition_idx != 0) or ( - not drop and method != "size" - ): - return reset_index, True, [], [] - - if not isinstance(internal_by_cols, pandas.Index): - if not is_list_like(internal_by_cols): - internal_by_cols = [internal_by_cols] - internal_by_cols = pandas.Index(internal_by_cols) - - internal_by_cols = ( - internal_by_cols[ - ~internal_by_cols.str.startswith(MODIN_UNNAMED_SERIES_LABEL, na=False) - ] - if hasattr(internal_by_cols, "str") - else internal_by_cols - ) - - if selection is not None and not isinstance(selection, pandas.Index): - selection = pandas.Index(selection) - - lvls_to_drop: list[int] = [] # pragma: no cover - cols_to_drop: Any = [] # pragma: no cover - - if not keep_index_levels: - # We want to insert only these internal-by-cols that are not presented - # in the result in order to not create naming conflicts - if selection is None: - cols_to_insert = frozenset(internal_by_cols) - frozenset(result_cols) - else: - cols_to_insert = frozenset( - # We have to use explicit 'not in' check and not just difference - # of sets because of specific '__contains__' operator in case of - # scalar 'col' and MultiIndex 'selection'. - col - for col in internal_by_cols - if col not in selection - ) - else: - cols_to_insert = internal_by_cols - # We want to drop such internal-by-cols that are presented - # in the result in order to not create naming conflicts - cols_to_drop = frozenset(internal_by_cols) & frozenset(result_cols) - - if partition_idx == 0: - lvls_to_drop = [ - i - for i, name in enumerate(result_index_names) - if name not in cols_to_insert - ] - else: - lvls_to_drop = result_index_names - - drop = False - if len(lvls_to_drop) == len(result_index_names): - drop = True - lvls_to_drop = [] - - return reset_index, drop, lvls_to_drop, cols_to_drop - - -class SeriesGroupBy(GroupBy): - """Builder for GroupBy aggregation functions for Series.""" - - @classmethod - def _call_groupby(cls, df: pandas.DataFrame, *args: Any, **kwargs: Any) -> Callable: - """Call .groupby() on passed `df` squeezed to Series.""" - # We can end up here by two means - either by "true" call - # like Series().groupby() or by df.groupby()[item]. - - if len(df.columns) == 1: # pragma: no cover - # Series().groupby() case - return df.squeeze(axis=1).groupby(*args, **kwargs) # pragma: no cover - # In second case surrounding logic will supplement grouping columns, - # so we need to drop them after grouping is over; our originally - # selected column is always the first, so use it - return df.groupby(*args, **kwargs)[df.columns[0]] # pragma: no cover - - -class GroupByDefault(DefaultMethod): - """Builder for default-to-pandas GroupBy aggregation functions.""" - - _groupby_cls = GroupBy - - OBJECT_TYPE = "GroupBy" - - @classmethod - def register(cls, func: Callable, **kwargs: Any) -> Callable: - """ - Build default-to-pandas GroupBy aggregation function. - - Parameters - ---------- - func : callable or str - Default aggregation function. If aggregation function is not specified - via groupby arguments, then `func` function is used. - **kwargs : kwargs - Additional arguments that will be passed to function builder. - - Returns - ------- - callable - Functiom that takes query compiler and defaults to pandas to do GroupBy - aggregation. 
- """ - return super().register( - cls._groupby_cls.build_groupby(func), - fn_name=cls.get_func_name_for_registered_method(func), - **kwargs, - ) - - # This specifies a `pandas.DataFrameGroupBy` method to pass the `agg_func` to, - # it's based on `how` to apply it. Going by pandas documentation: - # 1. `.aggregate(func)` applies func row/column wise. - # 2. `.apply(func)` applies func to a DataFrames, holding a whole group (group-wise). - # 3. `.transform(func)` is the same as `.apply()` but also broadcast the `func` - # result to the group's original shape. - # 4. 'direct' mode means that the passed `func` has to be applied directly - # to the `pandas.DataFrameGroupBy` object. - _aggregation_methods_dict = { - "axis_wise": pandas.core.groupby.DataFrameGroupBy.aggregate, - "group_wise": pandas.core.groupby.DataFrameGroupBy.apply, - "transform": pandas.core.groupby.DataFrameGroupBy.transform, - "direct": lambda grp, func, *args, **kwargs: func(grp, *args, **kwargs), - } - - @classmethod - def get_aggregation_method(cls, how: str) -> Callable: - """ - Return `pandas.DataFrameGroupBy` method that implements the passed `how` UDF applying strategy. - - Parameters - ---------- - how : {"axis_wise", "group_wise", "transform"} - `how` parameter of the ``BaseQueryCompiler.groupby_agg``. - - Returns - ------- - callable(pandas.DataFrameGroupBy, callable, *args, **kwargs) -> [pandas.DataFrame | pandas.Series] - - Notes - ----- - Visit ``BaseQueryCompiler.groupby_agg`` doc-string for more information about `how` parameter. - """ - return cls._aggregation_methods_dict[how] # pragma: no cover - - -class SeriesGroupByDefault(GroupByDefault): - """Builder for default-to-pandas GroupBy aggregation functions for Series.""" - - _groupby_cls = SeriesGroupBy - - _aggregation_methods_dict = { - "axis_wise": pandas.core.groupby.SeriesGroupBy.aggregate, - "group_wise": pandas.core.groupby.SeriesGroupBy.apply, - "transform": pandas.core.groupby.SeriesGroupBy.transform, - "direct": lambda grp, func, *args, **kwargs: func(grp, *args, **kwargs), - } diff --git a/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/resample.py b/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/resample.py deleted file mode 100644 index 559c1a9cb3b..00000000000 --- a/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/resample.py +++ /dev/null @@ -1,106 +0,0 @@ -# -# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. -# - -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -# Code in this file may constitute partial or total reimplementation, or modification of -# existing code originally distributed by the Modin project, under the Apache License, -# Version 2.0. 
- -"""Module houses default Resamle functions builder class.""" -from typing import Any, Callable, Union - -import pandas - -from snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.default import ( - DefaultMethod, -) - - -# FIXME: there is no sence of keeping `Resampler` and `ResampleDefault` logic in a different -# classes. They should be combined. -class Resampler: - """Builder class for resampled aggregation functions.""" - - @classmethod - def build_resample(cls, func: Union[Callable, property], squeeze_self: bool) -> Any: - """ - Build function that resamples time-series data and does aggregation. - - Parameters - ---------- - func : callable - Aggregation function to execute under resampled frame. - squeeze_self : bool - Whether or not to squeeze frame before resampling. - - Returns - ------- - callable - Function that takes pandas DataFrame and applies aggregation - to resampled time-series data. - """ - - def fn( # pragma: no cover - df: pandas.DataFrame, - resample_kwargs: dict[str, Any], - *args: Any, - **kwargs: Any - ) -> Any: - """Resample time-series data of the passed frame and apply specified aggregation.""" - if squeeze_self: - df = df.squeeze(axis=1) - resampler = df.resample(**resample_kwargs) - - if type(func) == property: - return func.fget(resampler) # type: ignore[misc] # pragma: no cover - - return func(resampler, *args, **kwargs) # type: ignore[operator] # pragma: no cover - - return fn - - -class ResampleDefault(DefaultMethod): - """Builder for default-to-pandas resampled aggregation functions.""" - - OBJECT_TYPE = "Resampler" - - @classmethod - def register( - cls, func: Callable, squeeze_self: bool = False, **kwargs: Any - ) -> Callable: - """ - Build function that do fallback to pandas and aggregate resampled data. - - Parameters - ---------- - func : callable - Aggregation function to execute under resampled frame. - squeeze_self : bool, default: False - Whether or not to squeeze frame before resampling. - **kwargs : kwargs - Additional arguments that will be passed to function builder. - - Returns - ------- - callable - Function that takes query compiler and does fallback to pandas to resample - time-series data and apply aggregation on it. - """ - return super().register( - Resampler.build_resample(func, squeeze_self), - fn_name=func.__name__, - **kwargs - ) diff --git a/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/rolling.py b/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/rolling.py deleted file mode 100644 index be662920c80..00000000000 --- a/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/rolling.py +++ /dev/null @@ -1,159 +0,0 @@ -# -# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. -# - -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. 
See the License for the specific language -# governing permissions and limitations under the License. - -# Code in this file may constitute partial or total reimplementation, or modification of -# existing code originally distributed by the Modin project, under the Apache License, -# Version 2.0. - -"""Module houses default Rolling functions builder class.""" - -from typing import Any, Callable, Union - -import pandas - -from snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.default import ( - DefaultMethod, -) - - -class RollingDefault(DefaultMethod): - """Builder for default-to-pandas aggregation on a rolling window functions.""" - - OBJECT_TYPE = "Rolling" - - @classmethod - def _build_rolling(cls, func: Union[Callable, property]) -> Callable: - """ - Build function that creates a rolling window and executes `func` on it. - - Parameters - ---------- - func : callable - Function to execute on a rolling window. - - Returns - ------- - callable - Function that takes pandas DataFrame and applies `func` on a rolling window. - """ - - def fn( # pragma: no cover - df: pandas.DataFrame, - rolling_kwargs: dict[str, Any], - *args: Any, - **kwargs: Any - ) -> Any: - """Create rolling window for the passed frame and execute specified `func` on it.""" - roller = df.rolling(**rolling_kwargs) # pragma: no cover - - if type(func) == property: # pragma: no cover - return func.fget(roller) # type: ignore[misc] # pragma: no cover - - return func(roller, *args, **kwargs) # type: ignore[operator] # pragma: no cover - - return fn # pragma: no cover - - @classmethod - def register(cls, func: Callable, **kwargs: Any) -> Callable: - """ - Build function that do fallback to pandas to apply `func` on a rolling window. - - Parameters - ---------- - func : callable - Function to execute on a rolling window. - **kwargs : kwargs - Additional arguments that will be passed to function builder. - - Returns - ------- - callable - Function that takes query compiler and defaults to pandas to apply aggregation - `func` on a rolling window. - """ - return super().register( # pragma: no cover - cls._build_rolling(func), fn_name=func.__name__, **kwargs - ) - - -class ExpandingDefault(DefaultMethod): - """Builder for default-to-pandas aggregation on an expanding window functions.""" - - OBJECT_TYPE = "Expanding" - - @classmethod - def _build_expanding( - cls, func: Union[Callable, property], squeeze_self: bool - ) -> Callable: - """ - Build function that creates an expanding window and executes `func` on it. - - Parameters - ---------- - func : callable - Function to execute on a expanding window. - squeeze_self : bool - Whether or not to squeeze frame before executing the window function. - - Returns - ------- - callable - Function that takes pandas DataFrame and applies `func` on a expanding window. 
- """ - - def fn( # pragma: no cover - df: pandas.DataFrame, rolling_args: Any, *args: Any, **kwargs: Any - ) -> Any: - """Create rolling window for the passed frame and execute specified `func` on it.""" - if squeeze_self: # pragma: no cover - df = df.squeeze(axis=1) # pragma: no cover - roller = df.expanding(*rolling_args) # pragma: no cover - - if type(func) == property: # pragma: no cover - return func.fget(roller) # type: ignore[misc] # pragma: no cover - - return func(roller, *args, **kwargs) # type: ignore[operator] # pragma: no cover - - return fn # pragma: no cover - - @classmethod - def register( - cls, func: Callable, squeeze_self: bool = False, **kwargs: Any - ) -> Callable: - """ - Build function that do fallback to pandas to apply `func` on a expanding window. - - Parameters - ---------- - func : callable - Function to execute on an expanding window. - squeeze_self : bool, default: False - Whether or not to squeeze frame before executing the window function. - **kwargs : kwargs - Additional arguments that will be passed to function builder. - - Returns - ------- - callable - Function that takes query compiler and defaults to pandas to apply aggregation - `func` on an expanding window. - """ - return super().register( # pragma: no cover - cls._build_expanding(func, squeeze_self=squeeze_self), - fn_name=func.__name__, - **kwargs - ) diff --git a/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/series.py b/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/series.py deleted file mode 100644 index 455360cef59..00000000000 --- a/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/series.py +++ /dev/null @@ -1,49 +0,0 @@ -# -# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. -# - -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -# Code in this file may constitute partial or total reimplementation, or modification of -# existing code originally distributed by the Modin project, under the Apache License, -# Version 2.0. - -"""Module houses default Series functions builder class.""" -import pandas - -from snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.default import ( - DefaultMethod, -) - - -class SeriesDefault(DefaultMethod): - """Builder for default-to-pandas methods which is executed under Series.""" - - OBJECT_TYPE = "Series" - - @classmethod - def frame_wrapper(cls, df: pandas.DataFrame) -> pandas.Series: - """ - Squeeze passed DataFrame to be able to process Series-specific functions on it. - - Parameters - ---------- - df : pandas.DataFrame - One-column DataFrame to squeeze. 
- - Returns - ------- - pandas.Series - """ - return df.squeeze(axis=1) diff --git a/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/str.py b/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/str.py deleted file mode 100644 index 4de39a3c084..00000000000 --- a/src/snowflake/snowpark/modin/core/dataframe/algebra/default2pandas/str.py +++ /dev/null @@ -1,48 +0,0 @@ -# -# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. -# - -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -# Code in this file may constitute partial or total reimplementation, or modification of -# existing code originally distributed by the Modin project, under the Apache License, -# Version 2.0. - -"""Module houses default applied-on-str functions builder class.""" -import pandas - -from snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.series import ( - SeriesDefault, -) - - -class StrDefault(SeriesDefault): - """Builder for default-to-pandas methods which is executed under `str` accessor.""" - - @classmethod - def frame_wrapper( - cls, df: pandas.DataFrame - ) -> pandas.core.strings.accessor.StringMethods: - """ - Get `str` accessor of the passed frame. - - Parameters - ---------- - df : pandas.DataFrame - - Returns - ------- - pandas.core.strings.accessor.StringMethods - """ - return df.squeeze(axis=1).str diff --git a/src/snowflake/snowpark/modin/core/execution/dispatching/factories/baseio.py b/src/snowflake/snowpark/modin/core/execution/dispatching/factories/baseio.py index 6825cf30c23..2dc0c510130 100644 --- a/src/snowflake/snowpark/modin/core/execution/dispatching/factories/baseio.py +++ b/src/snowflake/snowpark/modin/core/execution/dispatching/factories/baseio.py @@ -30,9 +30,9 @@ from typing import Any import pandas +from modin.core.storage_formats import BaseQueryCompiler # pragma: no cover from pandas.util._decorators import doc -from snowflake.snowpark.modin.plugin.compiler import BaseQueryCompiler from snowflake.snowpark.modin.utils import _inherit_docstrings _doc_default_io_method = """ diff --git a/src/snowflake/snowpark/modin/pandas/__init__.py b/src/snowflake/snowpark/modin/pandas/__init__.py index dbda8bb8ae5..8ddfc84ba73 100644 --- a/src/snowflake/snowpark/modin/pandas/__init__.py +++ b/src/snowflake/snowpark/modin/pandas/__init__.py @@ -331,3 +331,15 @@ def __getattr__(name: str) -> Any: for name in _EXTENSION_ATTRS: _ext.register_pd_accessor(name)(getattr(pd_extensions, name)) + + +# TODO: https://github.com/modin-project/modin/issues/7233 +# Upstream Modin does not properly render property names in default2pandas warnings, so we need +# to override DefaultMethod.register. 
+import modin.core.dataframe.algebra.default2pandas # noqa: E402 + +import snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.default # noqa: E402 + +modin.core.dataframe.algebra.default2pandas.default.DefaultMethod.register = ( + snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.default.DefaultMethod.register +) diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index d0ac54caeea..7334730f729 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -73,7 +73,6 @@ VALID_TO_DATETIME_UNIT, ) from snowflake.snowpark.modin.plugin._typing import ListLike, ListLikeOfFloats -from snowflake.snowpark.modin.plugin.compiler import BaseQueryCompiler from snowflake.snowpark.modin.plugin.compiler.snowflake_query_compiler import ( SnowflakeQueryCompiler, ) @@ -86,6 +85,8 @@ # linking to `snowflake.snowpark.DataFrame`, we need to explicitly # qualify return types in this file with `snowflake.snowpark.modin.pandas.DataFrame`. # SNOW-1233342: investigate how to fix these links without using absolute paths + from modin.core.storage_formats import BaseQueryCompiler # pragma: no cover + import snowflake # pragma: no cover _logger = getLogger(__name__) diff --git a/src/snowflake/snowpark/modin/pandas/series_utils.py b/src/snowflake/snowpark/modin/pandas/series_utils.py index 0a513955ed0..c775bc17273 100644 --- a/src/snowflake/snowpark/modin/pandas/series_utils.py +++ b/src/snowflake/snowpark/modin/pandas/series_utils.py @@ -1340,14 +1340,6 @@ def microsecond(self): def nanosecond(self): return Series(query_compiler=self._query_compiler.dt_nanosecond()) - @property - def week(self): - return Series(query_compiler=self._query_compiler.dt_week()) - - @property - def weekofyear(self): - return Series(query_compiler=self._query_compiler.dt_weekofyear()) - @property def dayofweek(self): return Series(query_compiler=self._query_compiler.dt_dayofweek()) diff --git a/src/snowflake/snowpark/modin/pandas/utils.py b/src/snowflake/snowpark/modin/pandas/utils.py index f458a1d970b..4a4ccb215a2 100644 --- a/src/snowflake/snowpark/modin/pandas/utils.py +++ b/src/snowflake/snowpark/modin/pandas/utils.py @@ -27,6 +27,7 @@ import numpy as np import pandas +from modin.core.storage_formats import BaseQueryCompiler # pragma: no cover from pandas._typing import ( AggFuncType, AggFuncTypeBase, @@ -46,7 +47,6 @@ from snowflake.snowpark.modin.plugin._internal.aggregation_utils import ( get_pandas_aggr_func_name, ) -from snowflake.snowpark.modin.plugin.compiler import BaseQueryCompiler from snowflake.snowpark.modin.plugin.utils.error_message import ErrorMessage from snowflake.snowpark.modin.utils import hashable diff --git a/src/snowflake/snowpark/modin/plugin/PANDAS_CHANGELOG.md b/src/snowflake/snowpark/modin/plugin/PANDAS_CHANGELOG.md index 9bccdf74793..384c3244840 100644 --- a/src/snowflake/snowpark/modin/plugin/PANDAS_CHANGELOG.md +++ b/src/snowflake/snowpark/modin/plugin/PANDAS_CHANGELOG.md @@ -30,6 +30,7 @@ - `dot` binary operation between `DataFrame/Series`. - `xor` binary operation between `DataFrame/Series`. - All `DataFrame/Series.groupby` operations if either `axis == 1`, both `by` and `level` are configured, or `by` contains any non-pandas hashable labels. +- Removed `Series.dt.week` and `Series.dt.weekofyear` to align Snowpark pandas with the pandas 2.2.1 API. 
### Behavior Changes - As a part of the transition to pandas 2.2.1, pandas `df.loc` and `__setitem__` have buggy behavior when a column key is used to assign a DataFrame item to a DataFrame (a scalar column key and DataFrame item are used for assignment (https://github.com/pandas-dev/pandas/issues/58482)). Snowpark pandas deviates from this behavior and will maintain the same behavior as pandas from versions 1.5.x. diff --git a/src/snowflake/snowpark/modin/plugin/_internal/transpose_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/transpose_utils.py index e59ce0fc9a9..4538752f279 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/transpose_utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/transpose_utils.py @@ -5,11 +5,9 @@ from typing import Union import pandas as native_pd +from modin.core.dataframe.algebra.default2pandas import DataFrameDefault from snowflake.snowpark.functions import any_value, get -from snowflake.snowpark.modin.core.dataframe.algebra.default2pandas import ( - DataFrameDefault, -) from snowflake.snowpark.modin.plugin._internal.frame import InternalFrame from snowflake.snowpark.modin.plugin._internal.ordered_dataframe import ( OrderedDataFrame, diff --git a/src/snowflake/snowpark/modin/plugin/compiler/__init__.py b/src/snowflake/snowpark/modin/plugin/compiler/__init__.py index 7be4dbcff73..ab057dfff5a 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/__init__.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/__init__.py @@ -21,6 +21,4 @@ """The module represents the base query compiler that defines the common query compiler API.""" -from snowflake.snowpark.modin.plugin.compiler.query_compiler import BaseQueryCompiler - -__all__ = ["BaseQueryCompiler"] +__all__ = [] diff --git a/src/snowflake/snowpark/modin/plugin/compiler/query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/query_compiler.py deleted file mode 100644 index 271a6732469..00000000000 --- a/src/snowflake/snowpark/modin/plugin/compiler/query_compiler.py +++ /dev/null @@ -1,3980 +0,0 @@ -# -# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. -# - -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -# Code in this file may constitute partial or total reimplementation, or modification of -# existing code originally distributed by the Modin project, under the Apache License, -# Version 2.0. - -""" -Module contains class ``BaseQueryCompiler``. - -``BaseQueryCompiler`` is a parent abstract class for any other query compiler class. 
-""" - -import abc -from collections.abc import Hashable -from typing import Any, Optional - -import numpy as np -import pandas -import pandas.core.resample -from pandas._libs.lib import no_default -from pandas._typing import Axis, IndexLabel, Suffixes -from pandas.core.dtypes.common import is_scalar - -from snowflake.snowpark.modin.core.dataframe.algebra.default2pandas import ( - BinaryDefault, - CatDefault, - DataFrameDefault, - DateTimeDefault, - GroupByDefault, - ResampleDefault, - RollingDefault, - SeriesDefault, - StrDefault, -) -from snowflake.snowpark.modin.plugin.compiler import doc_utils -from snowflake.snowpark.modin.plugin.utils.error_message import ErrorMessage -from snowflake.snowpark.modin.utils import ( - MODIN_UNNAMED_SERIES_LABEL, - try_cast_to_pandas, -) - - -# FIXME: many of the BaseQueryCompiler methods are hiding actual arguments -# by using *args and **kwargs. They should be spread into actual parameters. -# Currently actual arguments are placed in the methods docstrings, but since they're -# not presented in the function's signature it makes linter to raise `PR02: unknown parameters` -# warning. For now, they're silenced by using `noqa` (Modin issue #3108). -class BaseQueryCompiler(abc.ABC): - """ - Abstract class that handles the queries to Modin dataframes. - - This class defines common query compilers API, most of the methods - are already implemented and defaulting to pandas. - - Attributes - ---------- - _shape_hint : {"row", "column", None}, default: None - Shape hint for frames known to be a column or a row, otherwise None. - - Notes - ----- - See the Abstract Methods and Fields section immediately below this - for a list of requirements for subclassing this object. - """ - - def default_to_pandas(self, pandas_op, *args, **kwargs): - """ - Do fallback to pandas for the passed function. - - Parameters - ---------- - pandas_op : callable(pandas.DataFrame) -> object - Function to apply to the casted to pandas frame. - *args : iterable - Positional arguments to pass to `pandas_op`. - **kwargs : dict - Key-value arguments to pass to `pandas_op`. - - Returns - ------- - BaseQueryCompiler - The result of the `pandas_op`, converted back to ``BaseQueryCompiler``. - """ - args = try_cast_to_pandas(args) - kwargs = try_cast_to_pandas(kwargs) - - result = pandas_op(try_cast_to_pandas(self), *args, **kwargs) - if isinstance(result, pandas.Series): - if result.name is None: - result.name = MODIN_UNNAMED_SERIES_LABEL - result = result.to_frame() - if isinstance(result, pandas.DataFrame): - return self.from_pandas(result, type(self._modin_frame)) - else: - return result - - # Abstract Methods and Fields: Must implement in children classes - # In some cases, there you may be able to use the same implementation for - # some of these abstract methods, but for the sake of generality they are - # treated differently. - - _shape_hint = None - - # Metadata modification abstract methods - def add_prefix(self, prefix, axis=1): - """ - Add string prefix to the index labels along specified axis. - - Parameters - ---------- - prefix : str - The string to add before each label. - axis : {0, 1}, default: 1 - Axis to add prefix along. 0 is for index and 1 is for columns. - - Returns - ------- - BaseQueryCompiler - New query compiler with updated labels. 
- """ - if axis: - return DataFrameDefault.register(pandas.DataFrame.add_prefix)( - self, prefix=prefix - ) - else: - return SeriesDefault.register(pandas.Series.add_prefix)(self, prefix=prefix) - - def add_suffix(self, suffix, axis=1): - """ - Add string suffix to the index labels along specified axis. - - Parameters - ---------- - suffix : str - The string to add after each label. - axis : {0, 1}, default: 1 - Axis to add suffix along. 0 is for index and 1 is for columns. - - Returns - ------- - BaseQueryCompiler - New query compiler with updated labels. - """ - if axis: - return DataFrameDefault.register(pandas.DataFrame.add_suffix)( - self, suffix=suffix - ) - else: - return SeriesDefault.register(pandas.Series.add_suffix)(self, suffix=suffix) - - # END Metadata modification abstract methods - - # Abstract copy - - def copy(self): - """ - Make a copy of this object. - - Returns - ------- - BaseQueryCompiler - Copy of self. - - Notes - ----- - For copy, we don't want a situation where we modify the metadata of the - copies if we end up modifying something here. We copy all of the metadata - to prevent that. - """ - raise NotImplementedError # pragma: no cover - - # END Abstract copy - - # Data Management Methods - @abc.abstractmethod - def free(self): - """Trigger a cleanup of this object.""" - pass - - @abc.abstractmethod - def finalize(self): - """Finalize constructing the dataframe calling all deferred functions which were used to build it.""" - pass - - # END Data Management Methods - - # To/From pandas - @abc.abstractmethod - def to_pandas( - self, - *, - statement_params: Optional[dict[str, str]] = None, - **kwargs: Any, - ) -> pandas.DataFrame: - """ - Convert underlying query compilers data to ``pandas.DataFrame``. - - Args: - statement_params: Dictionary of statement level parameters to be set while executing this action. - - Returns: - pandas.DataFrame - The QueryCompiler converted to pandas.""" - pass - - @classmethod - @abc.abstractmethod - def from_pandas(cls, df, data_cls): - """ - Build QueryCompiler from pandas DataFrame. - - Parameters - ---------- - df : pandas.DataFrame - The pandas DataFrame to convert from. - data_cls : type - :py:class:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe` class - (or its descendant) to convert to. - - Returns - ------- - BaseQueryCompiler - QueryCompiler containing data from the pandas DataFrame. - """ - pass - - # END To/From pandas - - # From Arrow - @classmethod - @abc.abstractmethod - def from_arrow(cls, at, data_cls): - """ - Build QueryCompiler from Arrow Table. - - Parameters - ---------- - at : Arrow Table - The Arrow Table to convert from. - data_cls : type - :py:class:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe` class - (or its descendant) to convert to. - - Returns - ------- - BaseQueryCompiler - QueryCompiler containing data from the pandas DataFrame. - """ - pass - - # END From Arrow - - # Dataframe exchange protocol - - @abc.abstractmethod - def to_dataframe(self, nan_as_null: bool = False, allow_copy: bool = True): - """ - Get a DataFrame exchange protocol object representing data of the Modin DataFrame. - - See more about the protocol in https://data-apis.org/dataframe-protocol/latest/index.html. - - Parameters - ---------- - nan_as_null : bool, default: False - A keyword intended for the consumer to tell the producer - to overwrite null values in the data with ``NaN`` (or ``NaT``). 
- This currently has no effect; once support for nullable extension - dtypes is added, this value should be propagated to columns. - allow_copy : bool, default: True - A keyword that defines whether or not the library is allowed - to make a copy of the data. For example, copying data would be necessary - if a library supports strided buffers, given that this protocol - specifies contiguous buffers. Currently, if the flag is set to ``False`` - and a copy is needed, a ``RuntimeError`` will be raised. - - Returns - ------- - ProtocolDataframe - A dataframe object following the DataFrame protocol specification. - """ - pass - - @classmethod - @abc.abstractmethod - def from_dataframe(cls, df, data_cls): - """ - Build QueryCompiler from a DataFrame object supporting the dataframe exchange protocol `__dataframe__()`. - - Parameters - ---------- - df : DataFrame - The DataFrame object supporting the dataframe exchange protocol. - data_cls : type - :py:class:`~modin.core.dataframe.pandas.dataframe.dataframe.PandasDataframe` class - (or its descendant) to convert to. - - Returns - ------- - BaseQueryCompiler - QueryCompiler containing data from the DataFrame. - """ - pass - - # END Dataframe exchange protocol - - # Abstract inter-data operations (e.g. add, sub) - # These operations require two DataFrames and will change the shape of the - # data if the index objects don't match. An outer join + op is performed, - # such that columns/rows that don't have an index on the other DataFrame - # result in NaN values. - - @doc_utils.doc_binary_method(operation="addition", sign="+") - def add(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.add)(self, other=other, **kwargs) - - @doc_utils.add_refer_to("DataFrame.combine") - def combine(self, other, **kwargs): # noqa: PR02 - """ - Perform column-wise combine with another QueryCompiler with passed `func`. - - If axes are not equal, perform frames alignment first. - - Parameters - ---------- - other : BaseQueryCompiler - Left operand of the binary operation. - func : callable(pandas.Series, pandas.Series) -> pandas.Series - Function that takes two ``pandas.Series`` with aligned axes - and returns one ``pandas.Series`` as resulting combination. - fill_value : float or None - Value to fill missing values with after frame alignment occurred. - overwrite : bool - If True, columns in `self` that do not exist in `other` - will be overwritten with NaNs. - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - BaseQueryCompiler - Result of combine. - """ - return BinaryDefault.register(pandas.DataFrame.combine)( - self, other=other, **kwargs - ) - - @doc_utils.add_refer_to("DataFrame.combine_first") - def combine_first(self, other, **kwargs): # noqa: PR02 - """ - Fill null elements of `self` with value in the same location in `other`. - - If axes are not equal, perform frames alignment first. - - Parameters - ---------- - other : BaseQueryCompiler - Provided frame to use to fill null values from. - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. 
- - Returns - ------- - BaseQueryCompiler - """ - return BinaryDefault.register(pandas.DataFrame.combine_first)( - self, other=other, **kwargs - ) - - @doc_utils.doc_binary_method(operation="equality comparison", sign="==") - def eq(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.eq)(self, other=other, **kwargs) - - @doc_utils.doc_binary_method(operation="integer division", sign="//") - def floordiv(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.floordiv)( - self, other=other, **kwargs - ) - - @doc_utils.doc_binary_method( - operation="greater than or equal comparison", sign=">=", op_type="comparison" - ) - def ge(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.ge)(self, other=other, **kwargs) - - @doc_utils.doc_binary_method( - operation="greater than comparison", sign=">", op_type="comparison" - ) - def gt(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.gt)(self, other=other, **kwargs) - - @doc_utils.doc_binary_method( - operation="less than or equal comparison", sign="<=", op_type="comparison" - ) - def le(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.le)(self, other=other, **kwargs) - - @doc_utils.doc_binary_method( - operation="less than comparison", sign="<", op_type="comparison" - ) - def lt(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.lt)(self, other=other, **kwargs) - - @doc_utils.doc_binary_method(operation="modulo", sign="%") - def mod(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.mod)(self, other=other, **kwargs) - - @doc_utils.doc_binary_method(operation="multiplication", sign="*") - def mul(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.mul)(self, other=other, **kwargs) - - @doc_utils.doc_binary_method( - operation="multiplication", sign="*", self_on_right=True - ) - def rmul(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.rmul)( - self, other=other, **kwargs - ) - - @doc_utils.add_refer_to("DataFrame.corr") - def corr(self, **kwargs): # noqa: PR02 - """ - Compute pairwise correlation of columns, excluding NA/null values. - - Parameters - ---------- - method : {'pearson', 'kendall', 'spearman'} or callable(pandas.Series, pandas.Series) -> pandas.Series - Correlation method. - min_periods : int - Minimum number of observations required per pair of columns - to have a valid result. If fewer than `min_periods` non-NA values - are present the result will be NA. - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - BaseQueryCompiler - Correlation matrix. - """ - return DataFrameDefault.register(pandas.DataFrame.corr)(self, **kwargs) - - @doc_utils.add_refer_to("DataFrame.cov") - def cov(self, **kwargs): # noqa: PR02 - """ - Compute pairwise covariance of columns, excluding NA/null values. - - Parameters - ---------- - min_periods : int - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - BaseQueryCompiler - Covariance matrix. - """ - return DataFrameDefault.register(pandas.DataFrame.cov)(self, **kwargs) - - def dot(self, other, **kwargs): # noqa: PR02 - """ - Compute the matrix multiplication of `self` and `other`. 
- - Parameters - ---------- - other : BaseQueryCompiler or NumPy array - The other query compiler or NumPy array to matrix multiply with `self`. - squeeze_self : boolean - If `self` is a one-column query compiler, indicates whether it represents a Series object. - squeeze_other : boolean - If `other` is a one-column query compiler, indicates whether it represents a Series object. - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - BaseQueryCompiler - A new query compiler that contains the result of the matrix multiplication. - """ - if kwargs.get("squeeze_self", False): - applier = pandas.Series.dot - else: - applier = pandas.DataFrame.dot - return BinaryDefault.register(applier)(self, other=other, **kwargs) - - @doc_utils.doc_binary_method( - operation="not equal comparison", sign="!=", op_type="comparison" - ) - def ne(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.ne)(self, other=other, **kwargs) - - @doc_utils.doc_binary_method(operation="exponential power", sign="**") - def pow(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.pow)(self, other=other, **kwargs) - - @doc_utils.doc_binary_method(operation="addition", sign="+", self_on_right=True) - def radd(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.radd)( - self, other=other, **kwargs - ) - - @doc_utils.doc_binary_method( - operation="integer division", sign="//", self_on_right=True - ) - def rfloordiv(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.rfloordiv)( - self, other=other, **kwargs - ) - - @doc_utils.doc_binary_method(operation="modulo", sign="%", self_on_right=True) - def rmod(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.rmod)( - self, other=other, **kwargs - ) - - @doc_utils.doc_binary_method( - operation="exponential power", sign="**", self_on_right=True - ) - def rpow(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.rpow)( - self, other=other, **kwargs - ) - - @doc_utils.doc_binary_method(operation="subtraction", sign="-", self_on_right=True) - def rsub(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.rsub)( - self, other=other, **kwargs - ) - - @doc_utils.doc_binary_method(operation="division", sign="/", self_on_right=True) - def rtruediv(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.rtruediv)( - self, other=other, **kwargs - ) - - @doc_utils.doc_binary_method(operation="subtraction", sign="-") - def sub(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.sub)(self, other=other, **kwargs) - - @doc_utils.doc_binary_method(operation="division", sign="/") - def truediv(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.truediv)( - self, other=other, **kwargs - ) - - @doc_utils.doc_binary_method(operation="conjunction", sign="&", op_type="logical") - def __and__(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.__and__)( - self, other=other, **kwargs - ) - - @doc_utils.doc_binary_method(operation="disjunction", sign="|", op_type="logical") - def __or__(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.__or__)( - self, other=other, **kwargs - ) - - @doc_utils.doc_binary_method( - operation="conjunction", sign="&", op_type="logical",
self_on_right=True - ) - def __rand__(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.__rand__)( - self, other=other, **kwargs - ) - - @doc_utils.doc_binary_method( - operation="disjunction", sign="|", op_type="logical", self_on_right=True - ) - def __ror__(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.__ror__)( - self, other=other, **kwargs - ) - - @doc_utils.doc_binary_method( - operation="exclusive or", sign="^", op_type="logical", self_on_right=True - ) - def __rxor__(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.__rxor__)( - self, other=other, **kwargs - ) - - @doc_utils.doc_binary_method(operation="exclusive or", sign="^", op_type="logical") - def __xor__(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.__xor__)( - self, other=other, **kwargs - ) - - # FIXME: query compiler shouldn't care about differences between Frame and Series. - # We should combine `df_update` and `series_update` into one method (Modin issue #3101). - @doc_utils.add_refer_to("DataFrame.update") - def df_update(self, other, **kwargs): # noqa: PR02 - """ - Update values of `self` using non-NA values of `other` at the corresponding positions. - - If axes are not equal, perform frames alignment first. - - Parameters - ---------- - other : BaseQueryCompiler - Frame to grab replacement values from. - join : {"left"} - Specify type of join to align frames if axes are not equal - (note: currently only one type of join is implemented). - overwrite : bool - Whether to overwrite every corresponding value of self, or only if it's NaN. - filter_func : callable(pandas.Series, pandas.Series) -> numpy.ndarray - Function that takes a column of self and returns a boolean mask of the values that - should be overwritten in the self frame. - errors : {"raise", "ignore"} - If "raise", will raise a ``ValueError`` if `self` and `other` both contain - non-NA data in the same place. - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - BaseQueryCompiler - New QueryCompiler with updated values. - """ - return BinaryDefault.register(pandas.DataFrame.update, inplace=True)( - self, other=other, **kwargs - ) - - @doc_utils.add_refer_to("Series.update") - def series_update(self, other, **kwargs): # noqa: PR02 - """ - Update values of `self` using values of `other` at the corresponding indices. - - Parameters - ---------- - other : BaseQueryCompiler - One-column query compiler with updated values. - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - BaseQueryCompiler - New QueryCompiler with updated values. - """ - return BinaryDefault.register(pandas.Series.update, inplace=True)( - self, - other=other, - squeeze_self=True, - **kwargs, - ) - - @doc_utils.add_refer_to("DataFrame.clip") - def clip(self, lower, upper, **kwargs): # noqa: PR02 - """ - Trim values at input threshold. - - Parameters - ---------- - lower : float or list-like - upper : float or list-like - axis : {0, 1} - inplace : {False} - This parameter serves the compatibility purpose. Always has to be False. - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - BaseQueryCompiler - QueryCompiler with values limited by the specified thresholds.
- """ - if isinstance(lower, BaseQueryCompiler): - lower = lower.to_pandas().squeeze(1) - if isinstance(upper, BaseQueryCompiler): - upper = upper.to_pandas().squeeze(1) - return DataFrameDefault.register(pandas.DataFrame.clip)( - self, lower=lower, upper=upper, **kwargs - ) - - @doc_utils.add_refer_to("DataFrame.merge") - def merge(self, right, **kwargs): # noqa: PR02 - """ - Merge QueryCompiler objects using a database-style join. - - Parameters - ---------- - right : BaseQueryCompiler - QueryCompiler of the right frame to merge with. - how : {"left", "right", "outer", "inner", "cross"} - on : label or list of such - left_on : label or list of such - right_on : label or list of such - left_index : bool - right_index : bool - sort : bool - suffixes : list-like - copy : bool - indicator : bool or str - validate : str - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - BaseQueryCompiler - QueryCompiler that contains result of the merge. - """ - raise NotImplementedError - - def _get_column_as_pandas_series(self, key): - """ - Get column data by label as pandas.Series. - - Parameters - ---------- - key : Any - Column label. - - Returns - ------- - pandas.Series - """ - result = self.getitem_array([key]).to_pandas().squeeze(axis=1) - if not isinstance(result, pandas.Series): - raise RuntimeError( - f"Expected getting column {key} to give " - + f"pandas.Series, but instead got {type(result)}" - ) - return result - - def merge_asof( - self, - right: "BaseQueryCompiler", - left_on: Optional[IndexLabel] = None, - right_on: Optional[IndexLabel] = None, - left_index: bool = False, - right_index: bool = False, - left_by=None, - right_by=None, - suffixes: Suffixes = ("_x", "_y"), - tolerance=None, - allow_exact_matches: bool = True, - direction: str = "backward", - ): - # pandas fallbacks for tricky cases: - if ( - # No idea how this works or why it does what it does; and in fact - # there's a pandas bug suggesting it's wrong: - # https://github.com/pandas-dev/pandas/issues/33463 - (left_index and right_on is not None) - # This is the case where by is a list of columns. If we're copying lots - # of columns out of pandas, maybe not worth trying our path, it's not - # clear it's any better: - or not (left_by is None or is_scalar(left_by)) - or not (right_by is None or is_scalar(right_by)) - # The implementation below assumes that the right index is unique - # because it uses merge_asof to map each position in the merged - # index to the label of the one right row that should be merged - # at that row position. 
- or not right.index.is_unique - ): - return self.default_to_pandas( - pandas.merge_asof, - right, - left_on=left_on, - right_on=right_on, - left_index=left_index, - right_index=right_index, - left_by=left_by, - right_by=right_by, - suffixes=suffixes, - tolerance=tolerance, - allow_exact_matches=allow_exact_matches, - direction=direction, - ) - - if left_on is None: - left_column = self.index - else: - left_column = self._get_column_as_pandas_series(left_on) - - if right_on is None: - right_column = right.index - else: - right_column = right._get_column_as_pandas_series(right_on) - - left_pandas_limited = {"on": left_column} - right_pandas_limited = {"on": right_column, "right_labels": right.index} - extra_kwargs = {} # extra arguments to pandas merge_asof # pragma: no cover - - if left_by is not None or right_by is not None: - extra_kwargs["by"] = "by" - left_pandas_limited["by"] = self._get_column_as_pandas_series(left_by) - right_pandas_limited["by"] = right._get_column_as_pandas_series(right_by) - - # 1. Construct pandas DataFrames with just the 'on' and optional 'by' - # columns, and the index as another column. - left_pandas_limited = pandas.DataFrame(left_pandas_limited, index=self.index) - right_pandas_limited = pandas.DataFrame(right_pandas_limited) - - # 2. Use pandas' merge_asof to figure out how to map labels on left to - # labels on the right. - merged = pandas.merge_asof( - left_pandas_limited, - right_pandas_limited, - on="on", - direction=direction, - allow_exact_matches=allow_exact_matches, - tolerance=tolerance, - **extra_kwargs, - ) - # Now merged["right_labels"] shows which labels from right map to left's index. - - # 3. Re-index right using the merged["right_labels"]; at this point right - # should be same length and (semantically) same order as left: - right_subset = right.reindex( - axis=0, labels=pandas.Index(merged["right_labels"]) - ) - if not right_index: - right_subset = right_subset.drop(columns=[right_on]) - if right_by is not None and left_by == right_by: - right_subset = right_subset.drop(columns=[right_by]) - right_subset.index = self.index - - # 4. Merge left and the new shrunken right: - result = self.merge( - right_subset, - left_index=True, - right_index=True, - suffixes=suffixes, - how="left", - ) - - # 5. Clean up to match pandas output: - if left_on is not None and right_index: - result = result.insert( - # In theory this could use get_indexer_for(), but that causes an error: - list(result.columns).index(left_on + suffixes[0]), - left_on, - result.getitem_array([left_on + suffixes[0]]), - ) - if not left_index and not right_index: - result = result.reset_index(drop=True) - - return result - - # END Abstract inter-data operations - - def is_series_like(self): - raise NotImplementedError # pragma: no cover - - # END Abstract Transpose - - # Abstract reindex/reset_index (may shuffle data) - @doc_utils.add_refer_to("DataFrame.reindex") - def reindex(self, axis, labels, **kwargs): # noqa: PR02 - """ - Align QueryCompiler data with a new index along specified axis. - - Parameters - ---------- - axis : {0, 1} - Axis to align labels along. 0 is for index, 1 is for columns. - labels : list-like - Index-labels to align with. - method : {None, "backfill"/"bfill", "pad"/"ffill", "nearest"} - Method to use for filling holes in reindexed frame. - fill_value : scalar - Value to use for missing values in the resulted frame. - limit : int - tolerance : int - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. 
- - Returns - ------- - BaseQueryCompiler - QueryCompiler with aligned axis. - """ - return DataFrameDefault.register(pandas.DataFrame.reindex)( - self, axis=axis, labels=labels, **kwargs - ) - - @doc_utils.add_refer_to("DataFrame.reset_index") - def reset_index(self, **kwargs): # noqa: PR02 - """ - Reset the index, or a level of it. - - Parameters - ---------- - drop : bool - Whether to drop the reset index or insert it at the beginning of the frame. - level : int or label, optional - Level to remove from index. Removes all levels by default. - col_level : int or label - If the columns have multiple levels, determines which level the labels - are inserted into. - col_fill : label - If the columns have multiple levels, determines how the other levels - are named. - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - BaseQueryCompiler - QueryCompiler with reset index. - """ - return DataFrameDefault.register(pandas.DataFrame.reset_index)(self, **kwargs) - - # END Abstract reindex/reset_index - - # Full Reduce operations - # - # These operations result in a reduced dimensionality of data. - # Currently, this means a pandas Series will be returned, but in the future - # we will implement a Distributed Series, and this will be returned - # instead. - - def is_monotonic_increasing(self): - """ - Return boolean if values in the object are monotonically increasing. - - Returns - ------- - bool - """ - return SeriesDefault.register(pandas.Series.is_monotonic_increasing)(self) - - def is_monotonic_decreasing(self): - """ - Return boolean if values in the object are monotonically decreasing. - - Returns - ------- - bool - """ - return SeriesDefault.register(pandas.Series.is_monotonic_decreasing)(self) - - @doc_utils.doc_reduce_agg( - method="production", - refer_to="prod", - extra_params=["**kwargs"], - params="axis : {0, 1}", - ) - def prod(self, **kwargs): # noqa: PR02 - return DataFrameDefault.register(pandas.DataFrame.prod)(self, **kwargs) - - # END Abstract full Reduce operations - - # Abstract map partitions operations - # These operations are operations that apply a function to every partition. - def abs(self): - """ - Get absolute numeric value of each element. - - Returns - ------- - BaseQueryCompiler - QueryCompiler with absolute numeric value of each element. - """ - return DataFrameDefault.register(pandas.DataFrame.abs)(self) - - # FIXME: `**kwargs` which follows `numpy.conj` signature was inherited - # from ``PandasQueryCompiler``; we should get rid of this dependency. - # (Modin issue #3108) - def conj(self, **kwargs): - """ - Get the complex conjugate for every element of self. - - Parameters - ---------- - **kwargs : dict - - Returns - ------- - BaseQueryCompiler - QueryCompiler with conjugate applied element-wise. - - Notes - ----- - Please refer to ``numpy.conj`` for parameters description. - """ - - def conj(df, *args, **kwargs): - return pandas.DataFrame(np.conj(df)) - - return DataFrameDefault.register(conj)(self, **kwargs) - - # FIXME: this method is not supposed to take any parameters (Modin issue #3108). - def negative(self, **kwargs): - """ - Change the sign for every value of self. - - Parameters - ---------- - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - BaseQueryCompiler - - Notes - ----- - Be aware that all QueryCompiler values have to be numeric.
- """ - return DataFrameDefault.register(pandas.DataFrame.__neg__)(self, **kwargs) - - @doc_utils.add_one_column_warning - # FIXME: adding refer-to note will create two instances of the "Notes" section; - # this breaks numpydoc style rules and also crashes the doc-style checker script. - # For now manually added the refer-to message. - # @doc_utils.add_refer_to("Series.view") - def series_view(self, **kwargs): # noqa: PR02 - """ - Reinterpret underlying data with new dtype. - - Parameters - ---------- - dtype : dtype - Data type to reinterpret underlying data with. - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - BaseQueryCompiler - New QueryCompiler of the same data in memory, with reinterpreted values. - - Notes - ----- - - Be aware that if this method falls back to pandas, the newly created - QueryCompiler will be a copy of the original data. - - Please refer to ``modin.pandas.Series.view`` for more information - about parameters and output format. - """ - return SeriesDefault.register(pandas.Series.view)(self, **kwargs) - - @doc_utils.add_one_column_warning - @doc_utils.add_refer_to("to_timedelta") - def to_timedelta(self, unit="ns", errors="raise"): # noqa: PR02 - """ - Convert argument to timedelta. - - Parameters - ---------- - unit : str, default: "ns" - Denotes the unit of the arg for numeric arg. Defaults to "ns". - errors : {"ignore", "raise", "coerce"}, default: "raise" - - Returns - ------- - BaseQueryCompiler - New QueryCompiler with values converted to timedelta. - """ - return SeriesDefault.register(pandas.to_timedelta)( - self, unit=unit, errors=errors - ) - - @doc_utils.add_one_column_warning - @doc_utils.add_refer_to("Series.searchsorted") - def searchsorted(self, **kwargs): # noqa: PR02 - """ - Find positions in a sorted `self` where `value` should be inserted to maintain order. - - Parameters - ---------- - value : list-like - side : {"left", "right"} - sorter : list-like, optional - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - BaseQueryCompiler - One-column QueryCompiler which contains indices to insert. - """ - return SeriesDefault.register(pandas.Series.searchsorted)(self, **kwargs) - - # END Abstract map partitions operations - - @doc_utils.add_refer_to("DataFrame.stack") - def stack(self, level, dropna): - """ - Stack the prescribed level(s) from columns to index. - - Parameters - ---------- - level : int or label - dropna : bool - - Returns - ------- - BaseQueryCompiler - """ - return DataFrameDefault.register(pandas.DataFrame.stack)( - self, level=level, dropna=dropna - ) - - def infer_objects(self): - """ - Attempt to infer better dtypes for object columns. - - Attempts soft conversion of object-dtyped columns, leaving non-object - and unconvertible columns unchanged. The inference rules are the same - as during normal Series/DataFrame construction. - - Returns - ------- - BaseQueryCompiler - New query compiler with updated dtypes. - """ - return DataFrameDefault.register(pandas.DataFrame.infer_objects)(self) - - @property - def dtypes(self): - """ - Get columns dtypes. - - Returns - ------- - pandas.Series - Series with dtypes of each column. - """ - return self.to_pandas().dtypes - - # END Abstract map partitions across select indices - - # Abstract column/row partitions reduce operations - # - # These operations result in a reduced dimensionality of data.
- # Currently, this means a pandas Series will be returned, but in the future - # we will implement a Distributed Series, and this will be returned - # instead. - - # FIXME: we're handling level parameter at front-end, it shouldn't - # propagate to the query compiler (Modin issue #3102) - @doc_utils.add_refer_to("DataFrame.all") - def all(self, **kwargs): # noqa: PR02 - """ - Return whether all the elements are true, potentially over an axis. - - Parameters - ---------- - axis : {0, 1}, optional - bool_only : bool, optional - skipna : bool - level : int or label - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - BaseQueryCompiler - If axis was specified return one-column QueryCompiler with index labels - of the specified axis, where each row contains boolean of whether all elements - at the corresponding row or column are True. Otherwise return QueryCompiler - with a single bool of whether all elements are True. - """ - return DataFrameDefault.register(pandas.DataFrame.all)(self, **kwargs) - - @doc_utils.add_refer_to("DataFrame.any") - def any(self, **kwargs): # noqa: PR02 - """ - Return whether any element is true, potentially over an axis. - - Parameters - ---------- - axis : {0, 1}, optional - bool_only : bool, optional - skipna : bool - level : int or label - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - BaseQueryCompiler - If axis was specified return one-column QueryCompiler with index labels - of the specified axis, where each row contains boolean of whether any element - at the corresponding row or column is True. Otherwise return QueryCompiler - with a single bool of whether any element is True. - """ - return DataFrameDefault.register(pandas.DataFrame.any)(self, **kwargs) - - @doc_utils.add_refer_to("DataFrame.memory_usage") - def memory_usage(self, **kwargs): # noqa: PR02 - """ - Return the memory usage of each column in bytes. - - Parameters - ---------- - index : bool - deep : bool - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - BaseQueryCompiler - One-column QueryCompiler with index labels of `self`, where each row - contains the memory usage for the corresponding column. - """ - return DataFrameDefault.register(pandas.DataFrame.memory_usage)(self, **kwargs) - - @doc_utils.doc_reduce_agg( - method="value at the given quantile", - refer_to="quantile", - params=""" - q : float - axis : {0, 1} - numeric_only : bool - interpolation : {"linear", "lower", "higher", "midpoint", "nearest"}""", - extra_params=["**kwargs"], - ) - def quantile_for_single_value(self, **kwargs): # noqa: PR02 - return DataFrameDefault.register(pandas.DataFrame.quantile)(self, **kwargs) - - @doc_utils.doc_reduce_agg( - method="unbiased skew", refer_to="skew", extra_params=["skipna", "**kwargs"] - ) - def skew(self, **kwargs): # noqa: PR02 - return DataFrameDefault.register(pandas.DataFrame.skew)(self, **kwargs) - - @doc_utils.doc_reduce_agg( - method="standard deviation of the mean", - refer_to="sem", - extra_params=["skipna", "ddof", "**kwargs"], - ) - def sem(self, **kwargs): # noqa: PR02 - return DataFrameDefault.register(pandas.DataFrame.sem)(self, **kwargs) - - # END Abstract column/row partitions reduce operations - - # Abstract column/row partitions reduce operations over select indices - # - # These operations result in a reduced dimensionality of data. 
- # Currently, this means a pandas Series will be returned, but in the future - # we will implement a Distributed Series, and this will be returned - # instead. - @doc_utils.add_refer_to("DataFrame.describe") - def describe(self, **kwargs): # noqa: PR02 - """ - Generate descriptive statistics. - - Parameters - ---------- - percentiles : list-like - include : "all" or list of dtypes, optional - exclude : list of dtypes, optional - datetime_is_numeric : bool - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - BaseQueryCompiler - QueryCompiler object containing the descriptive statistics - of the underlying data. - """ - return DataFrameDefault.register(pandas.DataFrame.describe)(self, **kwargs) - - # END Abstract column/row partitions reduce operations over select indices - - # Map across rows/columns - # These operations require some global knowledge of the full column/row - # that is being operated on. This means that we have to put all of that - # data in the same place. - - @doc_utils.doc_cum_agg(method="sum", refer_to="cumsum") - def cumsum(self, fold_axis, **kwargs): # noqa: PR02 - return DataFrameDefault.register(pandas.DataFrame.cumsum)(self, **kwargs) - - @doc_utils.doc_cum_agg(method="maximum", refer_to="cummax") - def cummax(self, fold_axis, **kwargs): # noqa: PR02 - return DataFrameDefault.register(pandas.DataFrame.cummax)(self, **kwargs) - - @doc_utils.doc_cum_agg(method="minimum", refer_to="cummin") - def cummin(self, fold_axis, **kwargs): # noqa: PR02 - return DataFrameDefault.register(pandas.DataFrame.cummin)(self, **kwargs) - - @doc_utils.doc_cum_agg(method="product", refer_to="cumprod") - def cumprod(self, fold_axis, **kwargs): # noqa: PR02 - return DataFrameDefault.register(pandas.DataFrame.cumprod)(self, **kwargs) - - @doc_utils.add_refer_to("DataFrame.diff") - def diff(self, fold_axis, **kwargs): # noqa: PR02 - """ - First discrete difference of element. - - Parameters - ---------- - periods : int - fold_axis : {0, 1} - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - BaseQueryCompiler - QueryCompiler of the same shape as `self`, where each element is the difference - between the corresponding value and the previous value in this row or column. - """ - return DataFrameDefault.register(pandas.DataFrame.diff)(self, **kwargs) - - @doc_utils.add_refer_to("DataFrame.nlargest") - def nlargest(self, n=5, columns=None, keep="first"): - """ - Return the first `n` rows ordered by `columns` in descending order. - - Parameters - ---------- - n : int, default: 5 - columns : list of labels, optional - Column labels to order by. - (note: this parameter can be omitted only for a single-column query compiler - representing a Series object; otherwise `columns` has to be specified). - keep : {"first", "last", "all"}, default: "first" - - Returns - ------- - BaseQueryCompiler - """ - if columns is None: - return SeriesDefault.register(pandas.Series.nlargest)(self, n=n, keep=keep) - else: - return DataFrameDefault.register(pandas.DataFrame.nlargest)( - self, n=n, columns=columns, keep=keep - ) - - @doc_utils.add_refer_to("DataFrame.nsmallest") - def nsmallest(self, n=5, columns=None, keep="first"): - """ - Return the first `n` rows ordered by `columns` in ascending order. - - Parameters - ---------- - n : int, default: 5 - columns : list of labels, optional - Column labels to order by.
- (note: this parameter can be omitted only for a single-column query compiler - representing a Series object; otherwise `columns` has to be specified). - keep : {"first", "last", "all"}, default: "first" - - Returns - ------- - BaseQueryCompiler - """ - if columns is None: - return SeriesDefault.register(pandas.Series.nsmallest)(self, n=n, keep=keep) - else: - return DataFrameDefault.register(pandas.DataFrame.nsmallest)( - self, n=n, columns=columns, keep=keep - ) - - @doc_utils.add_refer_to("DataFrame.eval") - def eval(self, expr, **kwargs): - """ - Evaluate string expression on QueryCompiler columns. - - Parameters - ---------- - expr : str - **kwargs : dict - - Returns - ------- - BaseQueryCompiler - QueryCompiler containing the result of evaluation. - """ - return DataFrameDefault.register(pandas.DataFrame.eval)( - self, expr=expr, **kwargs - ) - - @doc_utils.add_refer_to("DataFrame.mode") - def mode(self, **kwargs): # noqa: PR02 - """ - Get the modes for every column or row. - - Parameters - ---------- - axis : {0, 1} - numeric_only : bool - dropna : bool - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - BaseQueryCompiler - New QueryCompiler with modes calculated along given axis. - """ - return DataFrameDefault.register(pandas.DataFrame.mode)(self, **kwargs) - - @doc_utils.add_refer_to("DataFrame.query") - def query(self, expr, **kwargs): - """ - Query columns of the QueryCompiler with a boolean expression. - - Parameters - ---------- - expr : str - **kwargs : dict - - Returns - ------- - BaseQueryCompiler - New QueryCompiler containing the rows where the boolean expression is satisfied. - """ - return DataFrameDefault.register(pandas.DataFrame.query)( - self, expr=expr, **kwargs - ) - - @doc_utils.add_refer_to("DataFrame.rank") - def rank(self, **kwargs): # noqa: PR02 - """ - Compute numerical rank along the specified axis. - - By default, equal values are assigned a rank that is the average of the ranks - of those values; this behavior can be changed via the `method` parameter. - - Parameters - ---------- - axis : {0, 1} - method : {"average", "min", "max", "first", "dense"} - numeric_only : bool - na_option : {"keep", "top", "bottom"} - ascending : bool - pct : bool - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - BaseQueryCompiler - QueryCompiler of the same shape as `self`, where each element is the - numerical rank of the corresponding value along row or column. - """ - return DataFrameDefault.register(pandas.DataFrame.rank)(self, **kwargs) - - @doc_utils.add_refer_to("DataFrame.melt") - def melt(self, *args, **kwargs): # noqa: PR02 - """ - Unpivot QueryCompiler data from wide to long format. - - Parameters - ---------- - id_vars : list of labels, optional - value_vars : list of labels, optional - var_name : label - value_name : label - col_level : int or label - ignore_index : bool - *args : iterable - Serves the compatibility purpose. Does not affect the result. - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - BaseQueryCompiler - New QueryCompiler with unpivoted data. - """ - return DataFrameDefault.register(pandas.DataFrame.melt)(self, *args, **kwargs) - - # END Abstract map across rows/columns - - # Map across rows/columns - # These operations require some global knowledge of the full column/row - # that is being operated on. This means that we have to put all of that - # data in the same place.
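The split between `quantile_for_single_value` above and `quantile_for_list_of_values` below mirrors pandas itself, where the shape of the result depends on whether `q` is a scalar or list-like. A standalone sketch of that behavior (plain pandas, made-up data):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [10, 20, 30, 40]})

# A scalar q reduces each column to a single value, producing a Series.
print(df.quantile(0.5))           # Series indexed by the column labels

# A list-like q keeps two dimensions: one row per requested quantile.
print(df.quantile([0.25, 0.75]))  # DataFrame indexed by the quantiles
```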
- @doc_utils.doc_reduce_agg( - method="value at the given quantile", - refer_to="quantile", - params=""" - q : list-like - axis : {0, 1} - numeric_only : bool - interpolation : {"linear", "lower", "higher", "midpoint", "nearest"}""", - extra_params=["**kwargs"], - ) - def quantile_for_list_of_values(self, **kwargs): # noqa: PR02 - return DataFrameDefault.register(pandas.DataFrame.quantile)(self, **kwargs) - - # END Abstract map across rows/columns - - # Abstract __getitem__ methods - def getitem_array(self, key): - """ - Mask QueryCompiler with `key`. - - Parameters - ---------- - key : BaseQueryCompiler, np.ndarray or list of column labels - Boolean mask represented by QueryCompiler or ``np.ndarray`` of the same - shape as `self`, or enumerable of columns to pick. - - Returns - ------- - BaseQueryCompiler - New masked QueryCompiler. - """ - if isinstance(key, type(self)): - key = key.to_pandas().squeeze(axis=1) - - def getitem_array(df, key): - return df[key] - - return DataFrameDefault.register(getitem_array)(self, key) - - # END Abstract __getitem__ methods - - # Abstract insert - # This method changes the shape of the resulting data. In pandas, this - # operation is always inplace, but this object is immutable, so we just - # return a new one from here and let the front end handle the inplace - # update. - def insert(self, loc, column, value): - """ - Insert new column. - - Parameters - ---------- - loc : int - Insertion position. - column : label - Label of the new column. - value : One-column BaseQueryCompiler, 1D array or scalar - Data to fill new column with. - - Returns - ------- - BaseQueryCompiler - QueryCompiler with new column inserted. - """ - raise NotImplementedError - - # END Abstract insert - - def explode(self, column): - """ - Explode the given columns. - - Parameters - ---------- - column : Union[Hashable, Sequence[Hashable]] - The columns to explode. - - Returns - ------- - BaseQueryCompiler - QueryCompiler that contains the results of execution. For each row - in the input QueryCompiler, if the selected columns each contain M - items, there will be M rows created by exploding the columns. - """ - return DataFrameDefault.register(pandas.DataFrame.explode)(self, column) - - # END UDF - - # Manual Partitioning methods (e.g. merge, groupby) - # These methods require some sort of manual partitioning due to their - # nature. They require certain data to exist on the same partition, and - # after the shuffle, there should be only a local map required. - - # FIXME: `map_args` and `reduce_args` leaked there from `PandasQueryCompiler.groupby_*`, - # pandas storage format implements groupby via TreeReduce approach, but for other storage formats these - # parameters make no sense, they shouldn't be present in a base class. 
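Each of the `groupby_*` defaults that follow lowers to the same eager pandas call that `GroupByDefault.register` wraps. A minimal standalone sketch of the computation `groupby_count`, the first of them, defaults to (plain pandas, made-up data):

```python
import pandas as pd

df = pd.DataFrame({"key": ["x", "x", "y"], "val": [1.0, None, 3.0]})

# GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.count)
# boils down to materializing the frame and running the pandas groupby:
# the count of non-null values per group and column.
print(df.groupby("key").count())
#      val
# key
# x      1
# y      1
```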
- - @doc_utils.doc_groupby_method( - action="count non-null values", - result="number of non-null values", - refer_to="count", - ) - def groupby_count( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.count)( - self, - by=by, - axis=axis, - groupby_kwargs=groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - ) - - @doc_utils.doc_groupby_method( - action="check whether any element is True", - result="boolean of whether there is any element which is True", - refer_to="any", - ) - def groupby_any( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.any)( - self, - by=by, - axis=axis, - groupby_kwargs=groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - ) - - @doc_utils.doc_groupby_method( - action="get the minimum value", result="minimum value", refer_to="min" - ) - def groupby_min( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.min)( - self, - by=by, - axis=axis, - groupby_kwargs=groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - ) - - @doc_utils.doc_groupby_method(result="product", refer_to="prod") - def groupby_prod( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.prod)( - self, - by=by, - axis=axis, - groupby_kwargs=groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - ) - - @doc_utils.doc_groupby_method( - action="get the maximum value", result="maximum value", refer_to="max" - ) - def groupby_max( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.max)( - self, - by=by, - axis=axis, - groupby_kwargs=groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - ) - - @doc_utils.doc_groupby_method( - action="check whether all elements are True", - result="boolean of whether all elements are True", - refer_to="all", - ) - def groupby_all( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.all)( - self, - by=by, - axis=axis, - groupby_kwargs=groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - ) - - @doc_utils.doc_groupby_method(result="sum", refer_to="sum") - def groupby_sum( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.sum)( - self, - by=by, - axis=axis, - groupby_kwargs=groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - ) - - @doc_utils.doc_groupby_method( - action="get the number of elements", - result="number of elements", - refer_to="size", - ) - def groupby_size( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - result = GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.size)( - self, - by=by, - axis=axis, - groupby_kwargs=groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - method="size", - ) - if not groupby_kwargs.get("as_index", False): - # Renaming 'MODIN_UNNAMED_SERIES_LABEL' to a proper name - result.columns = 
result.columns[:-1].append(pandas.Index(["size"])) - return result - - @doc_utils.add_refer_to("GroupBy.aggregate") - def groupby_agg( - self, - by, - agg_func, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - how="axis_wise", - drop=False, - ): - raise NotImplementedError # pragma: no cover - - @doc_utils.doc_groupby_method( - action="compute the mean value", result="mean value", refer_to="mean" - ) - def groupby_mean( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - return self.groupby_agg( - by=by, - agg_func="mean", - axis=axis, - groupby_kwargs=groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - ) - - @doc_utils.doc_groupby_method( - action="compute unbiased skew", result="unbiased skew", refer_to="skew" - ) - def groupby_skew( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - return self.groupby_agg( - by=by, - agg_func="skew", - axis=axis, - groupby_kwargs=groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - ) - - @doc_utils.doc_groupby_method( - action="get cumulative production", - result="production of all the previous values", - refer_to="cumprod", - ) - def groupby_cumprod( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - return self.groupby_agg( - by=by, - agg_func="cumprod", - axis=axis, - groupby_kwargs=groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - ) - - @doc_utils.doc_groupby_method( - action="compute standard deviation", result="standard deviation", refer_to="std" - ) - def groupby_std( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - return self.groupby_agg( - by=by, - agg_func="std", - axis=axis, - groupby_kwargs=groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - ) - - @doc_utils.doc_groupby_method( - action="compute numerical rank", result="numerical rank", refer_to="rank" - ) - def groupby_rank( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - return self.groupby_agg( - by=by, - agg_func="rank", - axis=axis, - groupby_kwargs=groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - ) - - @doc_utils.doc_groupby_method( - action="compute variance", result="variance", refer_to="var" - ) - def groupby_var( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - return self.groupby_agg( - by=by, - agg_func="var", - axis=axis, - groupby_kwargs=groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - ) - - @doc_utils.doc_groupby_method( - action="get the number of unique values", - result="number of unique values", - refer_to="nunique", - ) - def groupby_nunique( - self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, **kwargs - ): - return self.groupby_agg( - by=by, - agg_func="nunique", - axis=axis, - groupby_kwargs=groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - ) - - @doc_utils.doc_groupby_method( - action="get the median value", result="median value", refer_to="median" - ) - def groupby_median( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - return self.groupby_agg( - by=by, - agg_func="median", - axis=axis, - groupby_kwargs=groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - ) - - @doc_utils.doc_groupby_method( - action="compute specified quantile", - result="quantile value", - refer_to="quantile", - ) - 
def groupby_quantile( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - raise NotImplementedError # pragma: no cover - - @doc_utils.doc_groupby_method( - action="fill NaN values", - result="`fill_value` if it was NaN, original value otherwise", - refer_to="fillna", - ) - def groupby_fillna( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - return self.groupby_agg( - by=by, - agg_func="fillna", - axis=axis, - groupby_kwargs=groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - ) - - @doc_utils.doc_groupby_method( - action="get data types", result="data type", refer_to="dtypes" - ) - def groupby_dtypes( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - return self.groupby_agg( - by=by, - agg_func="dtypes", - axis=axis, - groupby_kwargs=groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - ) - - @doc_utils.doc_groupby_method( - action="shift data with the specified settings", - result="shifted value", - refer_to="shift", - ) - def groupby_shift( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - return self.groupby_agg( - by=by, - agg_func="shift", - axis=axis, - groupby_kwargs=groupby_kwargs, - agg_args=agg_args, - agg_kwargs=agg_kwargs, - drop=drop, - ) - - # END Manual Partitioning methods - - @doc_utils.add_refer_to("DataFrame.unstack") - def unstack(self, level, fill_value): - """ - Pivot a level of the (necessarily hierarchical) index labels. - - Parameters - ---------- - level : int or label - fill_value : scalar or dict - - Returns - ------- - BaseQueryCompiler - """ - return DataFrameDefault.register(pandas.DataFrame.unstack)( - self, level=level, fill_value=fill_value - ) - - @doc_utils.add_refer_to("DataFrame.pivot") - def pivot(self, index, columns, values): - """ - Produce pivot table based on column values. - - Parameters - ---------- - index : label or list of such, pandas.Index, optional - columns : label or list of such - values : label or list of such, optional - - Returns - ------- - BaseQueryCompiler - New QueryCompiler containing pivot table. - """ - return DataFrameDefault.register(pandas.DataFrame.pivot)( - self, index=index, columns=columns, values=values - ) - - @doc_utils.add_refer_to("DataFrame.pivot_table") - def pivot_table( - self, - index, - values, - columns, - aggfunc, - fill_value, - margins, - dropna, - margins_name, - observed, - sort, - ): - """ - Create a spreadsheet-style pivot table from underlying data. - - Parameters - ---------- - index : label, pandas.Grouper, array or list of such - values : label, optional - columns : column, pandas.Grouper, array or list of such - aggfunc : callable(pandas.Series) -> scalar, dict of list of such - fill_value : scalar, optional - margins : bool - dropna : bool - margins_name : str - observed : bool - sort : bool - - Returns - ------- - BaseQueryCompiler - """ - return DataFrameDefault.register(pandas.DataFrame.pivot_table)( - self, - index=index, - values=values, - columns=columns, - aggfunc=aggfunc, - fill_value=fill_value, - margins=margins, - dropna=dropna, - margins_name=margins_name, - observed=observed, - sort=sort, - ) - - @doc_utils.add_refer_to("get_dummies") - def get_dummies(self, columns, **kwargs): # noqa: PR02 - """ - Convert categorical variables to dummy variables for certain columns. - - Parameters - ---------- - columns : label or list of such - Columns to convert. 
- prefix : str or list of such - prefix_sep : str - dummy_na : bool - drop_first : bool - dtype : dtype - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - BaseQueryCompiler - New QueryCompiler with categorical variables converted to dummy. - """ - - def get_dummies(df, columns, **kwargs): - return pandas.get_dummies(df, columns=columns, **kwargs) - - return DataFrameDefault.register(get_dummies)(self, columns=columns, **kwargs) - - @doc_utils.add_one_column_warning - @doc_utils.add_refer_to("Series.repeat") - def repeat(self, repeats): - """ - Repeat each element of one-column QueryCompiler given number of times. - - Parameters - ---------- - repeats : int or array of ints - The number of repetitions for each element. This should be a - non-negative integer. Repeating 0 times will return an empty - QueryCompiler. - - Returns - ------- - BaseQueryCompiler - New QueryCompiler with repeated elements. - """ - return SeriesDefault.register(pandas.Series.repeat)(self, repeats=repeats) - - def get_axis(self, axis): - """ - Return index labels of the specified axis. - - Parameters - ---------- - axis : {0, 1} - Axis to return labels on. - 0 is for index, when 1 is for columns. - - Returns - ------- - pandas.Index - """ - return self.index if axis == 0 else self.columns - - # TODO SNOW-884220: deprecate this function when loc getitem is supported. - # Note: reference the latest modin when deprecating this function. - def get_positions_from_labels(self, row_loc, col_loc): - """ - Compute index and column positions from their respective locators. - - Inputs to this method are arguments the the pandas user could pass to loc. - This function will compute the corresponding index and column positions - that the user could equivalently pass to iloc. - - Parameters - ---------- - row_loc : scalar, slice, list, array or tuple - Row locator. - col_loc : scalar, slice, list, array or tuple - Columns locator. - - Returns - ------- - row_lookup : slice(None) if full axis grab, pandas.RangeIndex if repetition is detected, numpy.ndarray otherwise - List of index labels. - col_lookup : slice(None) if full axis grab, pandas.RangeIndex if repetition is detected, numpy.ndarray otherwise - List of columns labels. - - Notes - ----- - Usage of `slice(None)` as a resulting lookup is a hack to pass information about - full-axis grab without computing actual indices that triggers lazy computations. - Ideally, this API should get rid of using slices as indexers and either use a - common ``Indexer`` object or range and ``np.ndarray`` only. 
- """ - from snowflake.snowpark.modin.pandas.indexing import ( - boolean_mask_to_numeric, - is_boolean_array, - is_list_like, - is_range_like, - ) - - lookups = [] - for axis, axis_loc in enumerate((row_loc, col_loc)): - if is_scalar(axis_loc): - axis_loc = np.array([axis_loc]) - if isinstance(axis_loc, slice) or is_range_like(axis_loc): - if isinstance(axis_loc, slice) and axis_loc == slice(None): - axis_lookup = axis_loc - else: - axis_labels = self.get_axis(axis) - # `slice_indexer` returns a fully-defined numeric slice for a non-fully-defined labels-based slice - axis_lookup = axis_labels.slice_indexer( - axis_loc.start, axis_loc.stop, axis_loc.step - ) - # Converting negative indices to their actual positions: - axis_lookup = pandas.RangeIndex( - start=( - axis_lookup.start - if axis_lookup.start >= 0 - else axis_lookup.start + len(axis_labels) - ), - stop=( - axis_lookup.stop - if axis_lookup.stop >= 0 - else axis_lookup.stop + len(axis_labels) - ), - step=axis_lookup.step, - ) - elif self.has_multiindex(axis): - # `Index.get_locs` raises an IndexError by itself if missing labels were provided, - # we don't have to do missing-check for the received `axis_lookup`. - if isinstance(axis_loc, pandas.MultiIndex): - axis_lookup = self.get_axis(axis).get_indexer_for(axis_loc) - else: - axis_lookup = self.get_axis(axis).get_locs(axis_loc) - elif is_boolean_array(axis_loc): - axis_lookup = boolean_mask_to_numeric(axis_loc) - else: - axis_labels = self.get_axis(axis) - if is_list_like(axis_loc) and not isinstance( - axis_loc, (np.ndarray, pandas.Index) - ): - # `Index.get_indexer_for` works much faster with numpy arrays than with python lists, - # so although we lose some time here on converting to numpy, `Index.get_indexer_for` - # speedup covers the loss that we gain here. - axis_loc = np.array(axis_loc, dtype=axis_labels.dtype) - axis_lookup = axis_labels.get_indexer_for(axis_loc) - # `Index.get_indexer_for` sets -1 value for missing labels, we have to verify whether - # there are any -1 in the received indexer to raise a KeyError here. - missing_mask = axis_lookup == -1 - if missing_mask.any(): - missing_labels = ( - axis_loc[missing_mask] - if is_list_like(axis_loc) - # If `axis_loc` is not a list-like then we can't select certain - # labels that are missing and so printing the whole indexer - else axis_loc - ) - raise KeyError(missing_labels) - - if isinstance(axis_lookup, pandas.Index) and not is_range_like(axis_lookup): - axis_lookup = axis_lookup.values - - lookups.append(axis_lookup) - return lookups - - @abc.abstractmethod - def take_2d_positional(self, index, columns): # pragma: no cover - """ - Index QueryCompiler with passed keys. - - Parameters - ---------- - index : list-like of ints, - Positional indices of rows to grab. - columns : list-like of ints, - Positional indices of columns to grab. - - Returns - ------- - BaseQueryCompiler - New masked QueryCompiler. - """ - pass - - # END Abstract methods for QueryCompiler - - @pandas.util.cache_readonly - def __constructor__(self): - """ - Get query compiler constructor. - - By default, constructor method will invoke an init. - - Returns - ------- - callable - """ - return type(self) - - # __delitem__ - # This will change the shape of the resulting data. - def delitem(self, key): - """ - Drop `key` column. - - Parameters - ---------- - key : label - Column name to drop. - - Returns - ------- - BaseQueryCompiler - New QueryCompiler without `key` column. 
- """ - return self.drop(columns=[key]) - - # END __delitem__ - - @abc.abstractmethod - def has_multiindex(self, axis=0): # pragma: no cover - pass - - def get_index_name(self, axis=0): - # TODO (SNOW-850751): clean this up and add implementation in snowflake query compiler - """ - Get index name of specified axis. - - Parameters - ---------- - axis : {0, 1}, default: 0 - Axis to get index name on. - - Returns - ------- - hashable - Index name, None for MultiIndex. - """ - return self.get_axis(axis).name - - def set_index_name(self, name, axis=0): - # TODO (SNOW-850751): clean this up and add implementation in snowflake query compiler - """ - Set index name for the specified axis. - - Parameters - ---------- - name : hashable - New index name. - axis : {0, 1}, default: 0 - Axis to set name along. - """ - self.get_axis(axis).name = name - - def get_index_names(self, axis=0): - """ - Get index names of specified axis. - - Parameters - ---------- - axis : {0, 1}, default: 0 - Axis to get index names on. - - Returns - ------- - list - Index names. - """ - raise NotImplementedError - - # DateTime methods - - @doc_utils.doc_dt_round(refer_to="ceil") - def dt_ceil(self, freq, ambiguous="raise", nonexistent="raise"): - return DateTimeDefault.register(pandas.Series.dt.ceil)( - self, freq, ambiguous, nonexistent - ) - - @doc_utils.add_one_column_warning - @doc_utils.add_refer_to("Series.dt.components") - def dt_components(self): - """ - Spread each date-time value into its components (days, hours, minutes...). - - Returns - ------- - BaseQueryCompiler - """ - return DateTimeDefault.register(pandas.Series.dt.components)(self) - - @doc_utils.doc_dt_timestamp( - prop="the date without timezone information", refer_to="date" - ) - def dt_date(self): - return DateTimeDefault.register(pandas.Series.dt.date)(self) - - @doc_utils.doc_dt_timestamp(prop="day component", refer_to="day") - def dt_day(self): - return DateTimeDefault.register(pandas.Series.dt.day)(self) - - @doc_utils.doc_dt_timestamp( - prop="day name", refer_to="day_name", params="locale : str, optional" - ) - def dt_day_name(self, locale=None): - return DateTimeDefault.register(pandas.Series.dt.day_name)(self, locale) - - @doc_utils.doc_dt_timestamp(prop="integer day of week", refer_to="dayofweek") - # FIXME: `dt_dayofweek` is an alias for `dt_weekday`, one of them should - # be removed (Modin issue #3107). - def dt_dayofweek(self): - return DateTimeDefault.register(pandas.Series.dt.dayofweek)(self) - - @doc_utils.doc_dt_timestamp(prop="day of year", refer_to="dayofyear") - def dt_dayofyear(self): - return DateTimeDefault.register(pandas.Series.dt.dayofyear)(self) - - @doc_utils.doc_dt_interval(prop="days", refer_to="days") - def dt_days(self): - return DateTimeDefault.register(pandas.Series.dt.days)(self) - - @doc_utils.doc_dt_timestamp( - prop="number of days in month", refer_to="days_in_month" - ) - # FIXME: `dt_days_in_month` is an alias for `dt_daysinmonth`, one of them should - # be removed (Modin issue #3107). 
- def dt_days_in_month(self): - return DateTimeDefault.register(pandas.Series.dt.days_in_month)(self) - - @doc_utils.doc_dt_timestamp(prop="number of days in month", refer_to="daysinmonth") - def dt_daysinmonth(self): - return DateTimeDefault.register(pandas.Series.dt.daysinmonth)(self) - - @doc_utils.doc_dt_period(prop="the timestamp of end time", refer_to="end_time") - def dt_end_time(self): - return DateTimeDefault.register(pandas.Series.dt.end_time)(self) - - @doc_utils.doc_dt_round(refer_to="floor") - def dt_floor(self, freq, ambiguous="raise", nonexistent="raise"): - return DateTimeDefault.register(pandas.Series.dt.floor)( - self, freq, ambiguous, nonexistent - ) - - @doc_utils.add_one_column_warning - @doc_utils.add_refer_to("Series.dt.freq") - def dt_freq(self): - """ - Get the time frequency of the underlying time-series data. - - Returns - ------- - BaseQueryCompiler - QueryCompiler containing a single value, the frequency of the data. - """ - return DateTimeDefault.register(pandas.Series.dt.freq)(self) - - @doc_utils.doc_dt_timestamp(prop="hour", refer_to="hour") - def dt_hour(self): - return DateTimeDefault.register(pandas.Series.dt.hour)(self) - - @doc_utils.doc_dt_timestamp( - prop="the boolean of whether corresponding year is leap", - refer_to="is_leap_year", - ) - def dt_is_leap_year(self): - return DateTimeDefault.register(pandas.Series.dt.is_leap_year)(self) - - @doc_utils.doc_dt_timestamp( - prop="the boolean of whether the date is the last day of the month", - refer_to="is_month_end", - ) - def dt_is_month_end(self): - return DateTimeDefault.register(pandas.Series.dt.is_month_end)(self) - - @doc_utils.doc_dt_timestamp( - prop="the boolean of whether the date is the first day of the month", - refer_to="is_month_start", - ) - def dt_is_month_start(self): - return DateTimeDefault.register(pandas.Series.dt.is_month_start)(self) - - @doc_utils.doc_dt_timestamp( - prop="the boolean of whether the date is the last day of the quarter", - refer_to="is_quarter_end", - ) - def dt_is_quarter_end(self): - return DateTimeDefault.register(pandas.Series.dt.is_quarter_end)(self) - - @doc_utils.doc_dt_timestamp( - prop="the boolean of whether the date is the first day of the quarter", - refer_to="is_quarter_start", - ) - def dt_is_quarter_start(self): - return DateTimeDefault.register(pandas.Series.dt.is_quarter_start)(self) - - @doc_utils.doc_dt_timestamp( - prop="the boolean of whether the date is the last day of the year", - refer_to="is_year_end", - ) - def dt_is_year_end(self): - return DateTimeDefault.register(pandas.Series.dt.is_year_end)(self) - - @doc_utils.doc_dt_timestamp( - prop="the boolean of whether the date is the first day of the year", - refer_to="is_year_start", - ) - def dt_is_year_start(self): - return DateTimeDefault.register(pandas.Series.dt.is_year_start)(self) - - @doc_utils.doc_dt_timestamp(prop="microseconds component", refer_to="microsecond") - def dt_microsecond(self): - return DateTimeDefault.register(pandas.Series.dt.microsecond)(self) - - @doc_utils.doc_dt_interval(prop="microseconds component", refer_to="microseconds") - def dt_microseconds(self): - return DateTimeDefault.register(pandas.Series.dt.microseconds)(self) - - @doc_utils.doc_dt_timestamp(prop="minute component", refer_to="minute") - def dt_minute(self): - return DateTimeDefault.register(pandas.Series.dt.minute)(self) - - @doc_utils.doc_dt_timestamp(prop="month component", refer_to="month") - def dt_month(self): - return DateTimeDefault.register(pandas.Series.dt.month)(self) - - 
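
[Editorial sketch] Every `dt_*` method in this deleted block follows the same template: `DateTimeDefault.register(pandas.Series.dt.<prop>)` builds a callable that materializes the data as a native pandas Series, evaluates the accessor attribute, and wraps the result back into a query compiler. A minimal sketch of that mechanism, with a hypothetical `register_dt_default` factory standing in for the real registration class (which also handles callables vs. properties, extra kwargs, and re-wrapping):

    # Toy stand-in for DateTimeDefault.register; not Modin's implementation.
    import pandas

    def register_dt_default(prop_name):
        """Build a function that evaluates Series.dt.<prop_name> natively."""

        def applier(series: pandas.Series) -> pandas.Series:
            # In Modin the query compiler converts itself to a native
            # Series first; here we already start from one.
            return getattr(series.dt, prop_name)

        return applier

    dt_month = register_dt_default("month")
    s = pandas.Series(pandas.to_datetime(["2024-01-31", "2024-05-01"]))
    print(dt_month(s).tolist())  # [1, 5]
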
@doc_utils.doc_dt_timestamp( - prop="the month name", refer_to="month name", params="locale : str, optional" - ) - def dt_month_name(self, locale=None): - return DateTimeDefault.register(pandas.Series.dt.month_name)(self, locale) - - @doc_utils.doc_dt_timestamp(prop="nanoseconds component", refer_to="nanosecond") - def dt_nanosecond(self): - return DateTimeDefault.register(pandas.Series.dt.nanosecond)(self) - - @doc_utils.doc_dt_interval(prop="nanoseconds component", refer_to="nanoseconds") - def dt_nanoseconds(self): - return DateTimeDefault.register(pandas.Series.dt.nanoseconds)(self) - - @doc_utils.add_one_column_warning - @doc_utils.add_refer_to("Series.dt.normalize") - def dt_normalize(self): - """ - Set the time component of each date-time value to midnight. - - Returns - ------- - BaseQueryCompiler - New QueryCompiler containing date-time values with midnight time. - """ - return DateTimeDefault.register(pandas.Series.dt.normalize)(self) - - @doc_utils.doc_dt_timestamp(prop="quarter component", refer_to="quarter") - def dt_quarter(self): - return DateTimeDefault.register(pandas.Series.dt.quarter)(self) - - @doc_utils.doc_dt_period(prop="the fiscal year", refer_to="qyear") - def dt_qyear(self): - return DateTimeDefault.register(pandas.Series.dt.qyear)(self) - - @doc_utils.doc_dt_round(refer_to="round") - def dt_round(self, freq, ambiguous="raise", nonexistent="raise"): - return DateTimeDefault.register(pandas.Series.dt.round)( - self, freq, ambiguous, nonexistent - ) - - @doc_utils.doc_dt_timestamp(prop="seconds component", refer_to="second") - def dt_second(self): - return DateTimeDefault.register(pandas.Series.dt.second)(self) - - @doc_utils.doc_dt_interval(prop="seconds component", refer_to="seconds") - def dt_seconds(self): - return DateTimeDefault.register(pandas.Series.dt.seconds)(self) - - @doc_utils.doc_dt_period(prop="the timestamp of start time", refer_to="start_time") - def dt_start_time(self): - return DateTimeDefault.register(pandas.Series.dt.start_time)(self) - - @doc_utils.add_refer_to("Series.dt.strftime") - def dt_strftime(self, date_format): - """ - Format underlying date-time data using specified format. - - Parameters - ---------- - date_format : str - - Returns - ------- - BaseQueryCompiler - New QueryCompiler containing formatted date-time values. - """ - return DateTimeDefault.register(pandas.Series.dt.strftime)(self, date_format) - - @doc_utils.doc_dt_timestamp(prop="time component", refer_to="time") - def dt_time(self): - return DateTimeDefault.register(pandas.Series.dt.time)(self) - - @doc_utils.doc_dt_timestamp( - prop="time component with timezone information", refer_to="timetz" - ) - def dt_timetz(self): - return DateTimeDefault.register(pandas.Series.dt.timetz)(self) - - @doc_utils.add_one_column_warning - @doc_utils.add_refer_to("Series.dt.to_period") - def dt_to_period(self, freq=None): - """ - Convert underlying data to the period at a particular frequency. - - Parameters - ---------- - freq : str, optional - - Returns - ------- - BaseQueryCompiler - New QueryCompiler containing period data. - """ - return DateTimeDefault.register(pandas.Series.dt.to_period)(self, freq) - - @doc_utils.add_one_column_warning - @doc_utils.add_refer_to("Series.dt.to_pydatetime") - def dt_to_pydatetime(self): - """ - Convert underlying data to array of python native ``datetime``. - - Returns - ------- - BaseQueryCompiler - New QueryCompiler containing 1D array of ``datetime`` objects. 
- """ - return DateTimeDefault.register(pandas.Series.dt.to_pydatetime)(self) - - # FIXME: there are no references to this method, we should either remove it - # or add a call reference at the DataFrame level (Modin issue #3103). - @doc_utils.add_one_column_warning - @doc_utils.add_refer_to("Series.dt.to_pytimedelta") - def dt_to_pytimedelta(self): - """ - Convert underlying data to array of python native ``datetime.timedelta``. - - Returns - ------- - BaseQueryCompiler - New QueryCompiler containing 1D array of ``datetime.timedelta``. - """ - return DateTimeDefault.register(pandas.Series.dt.to_pytimedelta)(self) - - @doc_utils.doc_dt_period( - prop="the timestamp representation", refer_to="to_timestamp" - ) - def dt_to_timestamp(self): - return DateTimeDefault.register(pandas.Series.dt.to_timestamp)(self) - - @doc_utils.doc_dt_interval(prop="duration in seconds", refer_to="total_seconds") - def dt_total_seconds(self): - return DateTimeDefault.register(pandas.Series.dt.total_seconds)(self) - - @doc_utils.add_one_column_warning - @doc_utils.add_refer_to("Series.dt.tz") - def dt_tz(self): - """ - Get the time-zone of the underlying time-series data. - - Returns - ------- - BaseQueryCompiler - QueryCompiler containing a single value, time-zone of the data. - """ - return DateTimeDefault.register(pandas.Series.dt.tz)(self) - - @doc_utils.add_one_column_warning - @doc_utils.add_refer_to("Series.dt.tz_convert") - def dt_tz_convert(self, tz): - """ - Convert time-series data to the specified time zone. - - Parameters - ---------- - tz : str, pytz.timezone - - Returns - ------- - BaseQueryCompiler - New QueryCompiler containing values with converted time zone. - """ - return DateTimeDefault.register(pandas.Series.dt.tz_convert)(self, tz) - - @doc_utils.add_one_column_warning - @doc_utils.add_refer_to("Series.dt.tz_localize") - def dt_tz_localize(self, tz, ambiguous="raise", nonexistent="raise"): - """ - Localize tz-naive to tz-aware. - - Parameters - ---------- - tz : str, pytz.timezone, optional - ambiguous : {"raise", "inner", "NaT"} or bool mask, default: "raise" - nonexistent : {"raise", "shift_forward", "shift_backward, "NaT"} or pandas.timedelta, default: "raise" - - Returns - ------- - BaseQueryCompiler - New QueryCompiler containing values with localized time zone. - """ - return DateTimeDefault.register(pandas.Series.dt.tz_localize)( - self, tz, ambiguous, nonexistent - ) - - @doc_utils.doc_dt_timestamp(prop="week component", refer_to="week") - def dt_week(self): - return DateTimeDefault.register(pandas.Series.dt.week)(self) - - @doc_utils.doc_dt_timestamp(prop="integer day of week", refer_to="weekday") - def dt_weekday(self): - return DateTimeDefault.register(pandas.Series.dt.weekday)(self) - - @doc_utils.doc_dt_timestamp(prop="week of year", refer_to="weekofyear") - def dt_weekofyear(self): - return DateTimeDefault.register(pandas.Series.dt.weekofyear)(self) - - @doc_utils.doc_dt_timestamp(prop="year component", refer_to="year") - def dt_year(self): - return DateTimeDefault.register(pandas.Series.dt.year)(self) - - # End of DateTime methods - - # Resample methods - - # FIXME: - # 1. Query Compiler shouldn't care about differences between Series and DataFrame - # so `resample_agg_df` and `resample_agg_ser` should be combined (Modin issue #3104). - # 2. In DataFrame API `Resampler.aggregate` is an alias for `Resampler.apply` - # we should remove one of these methods: `resample_agg_*` or `resample_app_*` (Modin issue #3107). 
- @doc_utils.doc_resample_agg( - action="apply passed aggregation function", - params="func : str, dict, callable(pandas.Series) -> scalar, or list of such", - output="function names", - refer_to="agg", - ) - def resample_agg_df(self, resample_kwargs, func, *args, **kwargs): - return ResampleDefault.register(pandas.core.resample.Resampler.aggregate)( - self, resample_kwargs, func, *args, **kwargs - ) - - @doc_utils.add_deprecation_warning(replacement_method="resample_agg_df") - @doc_utils.doc_resample_agg( - action="apply passed aggregation function in a one-column query compiler", - params="func : str, dict, callable(pandas.Series) -> scalar, or list of such", - output="function names", - refer_to="agg", - ) - def resample_agg_ser(self, resample_kwargs, func, *args, **kwargs): - return ResampleDefault.register( - pandas.core.resample.Resampler.aggregate, squeeze_self=True - )(self, resample_kwargs, func, *args, **kwargs) - - @doc_utils.add_deprecation_warning(replacement_method="resample_agg_df") - @doc_utils.doc_resample_agg( - action="apply passed aggregation function", - params="func : str, dict, callable(pandas.Series) -> scalar, or list of such", - output="function names", - refer_to="apply", - ) - def resample_app_df(self, resample_kwargs, func, *args, **kwargs): - return ResampleDefault.register(pandas.core.resample.Resampler.apply)( - self, resample_kwargs, func, *args, **kwargs - ) - - @doc_utils.add_deprecation_warning(replacement_method="resample_agg_df") - @doc_utils.doc_resample_agg( - action="apply passed aggregation function in a one-column query compiler", - params="func : str, dict, callable(pandas.Series) -> scalar, or list of such", - output="function names", - refer_to="apply", - ) - def resample_app_ser(self, resample_kwargs, func, *args, **kwargs): - return ResampleDefault.register( - pandas.core.resample.Resampler.apply, squeeze_self=True - )(self, resample_kwargs, func, *args, **kwargs) - - def resample_asfreq(self, resample_kwargs, fill_value): - """ - Resample time-series data and get the values at the new frequency. - - Group data into intervals by time-series row/column with - a specified frequency and get values at the new frequency. - - Parameters - ---------- - resample_kwargs : dict - Resample parameters as expected by ``modin.pandas.DataFrame.resample`` signature. - fill_value : scalar - - Returns - ------- - BaseQueryCompiler - New QueryCompiler containing values at the specified frequency. - """ - return ResampleDefault.register(pandas.core.resample.Resampler.asfreq)( - self, resample_kwargs, fill_value - ) - - # FIXME: `resample_backfill` is an alias for `resample_bfill`, one of these method - # should be removed (Modin issue #3107). 
- @doc_utils.doc_resample_fillna(method="back-fill", refer_to="backfill") - def resample_backfill(self, resample_kwargs, limit): - return ResampleDefault.register(pandas.core.resample.Resampler.backfill)( - self, resample_kwargs, limit - ) - - @doc_utils.doc_resample_fillna(method="back-fill", refer_to="bfill") - def resample_bfill(self, resample_kwargs, limit): - return ResampleDefault.register(pandas.core.resample.Resampler.bfill)( - self, resample_kwargs, limit - ) - - @doc_utils.doc_resample_reduce( - result="number of non-NA values", refer_to="count", compatibility_params=False - ) - def resample_count(self, resample_kwargs): - return ResampleDefault.register(pandas.core.resample.Resampler.count)( - self, resample_kwargs - ) - - # FIXME: `resample_ffill` is an alias for `resample_pad`, one of these method - # should be removed (Modin issue #3107). - @doc_utils.doc_resample_fillna(method="forward-fill", refer_to="ffill") - def resample_ffill(self, resample_kwargs, limit): - return ResampleDefault.register(pandas.core.resample.Resampler.ffill)( - self, resample_kwargs, limit - ) - - # FIXME: we should combine all resample fillna methods into `resample_fillna` - # (Modin issue #3107) - @doc_utils.doc_resample_fillna( - method="specified", refer_to="fillna", params="method : str" - ) - def resample_fillna(self, resample_kwargs, method, limit): - return ResampleDefault.register(pandas.core.resample.Resampler.fillna)( - self, resample_kwargs, method, limit - ) - - @doc_utils.doc_resample_reduce(result="first element", refer_to="first") - def resample_first(self, resample_kwargs, numeric_only, min_count, *args, **kwargs): - return ResampleDefault.register(pandas.core.resample.Resampler.first)( - self, - resample_kwargs, - numeric_only=numeric_only, - min_count=min_count, - *args, # noqa: B026 - **kwargs, - ) - - # FIXME: This function takes Modin DataFrame via `obj` parameter, - # we should avoid leaking of the high-level objects to the query compiler level. - # (Modin issue #3106) - def resample_get_group(self, resample_kwargs, name, obj): - """ - Resample time-series data and get the specified group. - - Group data into intervals by time-series row/column with - a specified frequency and get the values of the specified group. - - Parameters - ---------- - resample_kwargs : dict - Resample parameters as expected by ``modin.pandas.DataFrame.resample`` signature. - name : object - obj : modin.pandas.DataFrame, optional - - Returns - ------- - BaseQueryCompiler - New QueryCompiler containing the values from the specified group. - """ - return ResampleDefault.register(pandas.core.resample.Resampler.get_group)( - self, resample_kwargs, name, obj - ) - - @doc_utils.doc_resample_fillna( - method="specified interpolation", - refer_to="interpolate", - params=""" - method : str - axis : {0, 1} - limit : int - inplace : {False} - This parameter serves the compatibility purpose. Always has to be False. 
- limit_direction : {"forward", "backward", "both"} - limit_area : {None, "inside", "outside"} - downcast : str, optional - **kwargs : dict - """, - overwrite_template_params=True, - ) - def resample_interpolate( - self, - resample_kwargs, - method, - axis, - limit, - inplace, - limit_direction, - limit_area, - downcast, - **kwargs, - ): - return ResampleDefault.register(pandas.core.resample.Resampler.interpolate)( - self, - resample_kwargs, - method, - axis, - limit, - inplace, - limit_direction, - limit_area, - downcast, - **kwargs, - ) - - @doc_utils.doc_resample_reduce(result="last element", refer_to="last") - def resample_last(self, resample_kwargs, numeric_only, min_count, *args, **kwargs): - return ResampleDefault.register(pandas.core.resample.Resampler.last)( - self, - resample_kwargs, - numeric_only=numeric_only, - min_count=min_count, - *args, # noqa: B026 - **kwargs, - ) - - @doc_utils.doc_resample_reduce(result="maximum value", refer_to="max") - def resample_max(self, resample_kwargs, numeric_only, min_count, *args, **kwargs): - return ResampleDefault.register(pandas.core.resample.Resampler.max)( - self, - resample_kwargs, - numeric_only=numeric_only, - min_count=min_count, - *args, # noqa: B026 - **kwargs, - ) - - @doc_utils.doc_resample_reduce(result="mean value", refer_to="mean") - def resample_mean(self, resample_kwargs, numeric_only, *args, **kwargs): - return ResampleDefault.register(pandas.core.resample.Resampler.mean)( - self, - resample_kwargs, - numeric_only=numeric_only, - *args, # noqa: B026 - **kwargs, - ) - - @doc_utils.doc_resample_reduce(result="median value", refer_to="median") - def resample_median(self, resample_kwargs, numeric_only, *args, **kwargs): - return ResampleDefault.register(pandas.core.resample.Resampler.median)( - self, - resample_kwargs, - numeric_only=numeric_only, - *args, # noqa: B026 - **kwargs, - ) - - @doc_utils.doc_resample_reduce(result="minimum value", refer_to="min") - def resample_min(self, resample_kwargs, numeric_only, min_count, *args, **kwargs): - return ResampleDefault.register(pandas.core.resample.Resampler.min)( - self, - resample_kwargs, - numeric_only=numeric_only, - min_count=min_count, - *args, # noqa: B026 - **kwargs, - ) - - @doc_utils.doc_resample_fillna(method="'nearest'", refer_to="nearest") - def resample_nearest(self, resample_kwargs, limit): - return ResampleDefault.register(pandas.core.resample.Resampler.nearest)( - self, resample_kwargs, limit - ) - - @doc_utils.doc_resample_reduce(result="number of unique values", refer_to="nunique") - def resample_nunique(self, resample_kwargs, *args, **kwargs): - return ResampleDefault.register(pandas.core.resample.Resampler.nunique)( - self, resample_kwargs, *args, **kwargs - ) - - # FIXME: Query Compiler shouldn't care about differences between Series and DataFrame - # so `resample_ohlc_df` and `resample_ohlc_ser` should be combined (Modin issue #3104). 
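
[Editorial sketch] The `*_ser` variants that follow differ from their `*_df` twins only in passing `squeeze_self=True`, i.e. the one-column frame is squeezed to a Series before the pandas method runs. A toy illustration of that squeeze step; the `apply_squeezed` helper is invented for this sketch:

    import pandas

    def apply_squeezed(df, func):
        # One-column DataFrame -> Series, mirroring squeeze_self=True.
        return func(df.squeeze(axis=1))

    idx = pandas.date_range("2024-01-01", periods=4, freq="D")
    df = pandas.DataFrame({"x": [1.0, 2.0, 3.0, 4.0]}, index=idx)
    # Series.resample(...).ohlc() yields open/high/low/close columns.
    print(apply_squeezed(df, lambda s: s.resample("2D").ohlc()))
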
- @doc_utils.doc_resample_agg( - action="compute open, high, low and close values", - output="labels of columns containing computed values", - refer_to="ohlc", - ) - def resample_ohlc_df(self, resample_kwargs, *args, **kwargs): - return ResampleDefault.register(pandas.core.resample.Resampler.ohlc)( - self, resample_kwargs, *args, **kwargs - ) - - @doc_utils.doc_resample_agg( - action="compute open, high, low and close values", - output="labels of columns containing computed values", - refer_to="ohlc", - ) - def resample_ohlc_ser(self, resample_kwargs, *args, **kwargs): - return ResampleDefault.register( - pandas.core.resample.Resampler.ohlc, squeeze_self=True - )(self, resample_kwargs, *args, **kwargs) - - @doc_utils.doc_resample_fillna(method="'pad'", refer_to="pad") - def resample_pad(self, resample_kwargs, limit): - return ResampleDefault.register(pandas.core.resample.Resampler.pad)( - self, resample_kwargs, limit - ) - - # FIXME: This method require us to build high-level resampler object - # which we shouldn't do at the query compiler. We need to move this at the front. - # (Modin issue #3105) - @doc_utils.add_refer_to("Resampler.pipe") - def resample_pipe(self, resample_kwargs, func, *args, **kwargs): - """ - Resample time-series data and apply aggregation on it. - - Group data into intervals by time-series row/column with - a specified frequency, build equivalent ``pandas.Resampler`` object - and apply passed function to it. - - Parameters - ---------- - resample_kwargs : dict - Resample parameters as expected by ``modin.pandas.DataFrame.resample`` signature. - func : callable(pandas.Resampler) -> object or tuple(callable, str) - *args : iterable - Positional arguments to pass to function. - **kwargs : dict - Keyword arguments to pass to function. - - Returns - ------- - BaseQueryCompiler - New QueryCompiler containing the result of passed function. 
- """ - return ResampleDefault.register(pandas.core.resample.Resampler.pipe)( - self, resample_kwargs, func, *args, **kwargs - ) - - @doc_utils.doc_resample_reduce( - result="product", - params=""" - numeric_only: bool - min_count : int - """, - refer_to="prod", - ) - def resample_prod(self, resample_kwargs, numeric_only, min_count, *args, **kwargs): - return ResampleDefault.register(pandas.core.resample.Resampler.prod)( - self, - resample_kwargs, - numeric_only=numeric_only, - min_count=min_count, - *args, # noqa: B026 - **kwargs, - ) - - @doc_utils.doc_resample_reduce( - result="quantile", params="q : float", refer_to="quantile" - ) - def resample_quantile(self, resample_kwargs, q, *args, **kwargs): - return ResampleDefault.register(pandas.core.resample.Resampler.quantile)( - self, resample_kwargs, q, *args, **kwargs - ) - - @doc_utils.doc_resample_reduce( - result="standard error of the mean", - params=""" - ddof : int - numeric_only: bool - """, - refer_to="sem", - ) - def resample_sem(self, resample_kwargs, ddof, numeric_only, *args, **kwargs): - return ResampleDefault.register(pandas.core.resample.Resampler.sem)( - self, - resample_kwargs, - ddof=ddof, - numeric_only=numeric_only, - *args, # noqa: B026 - **kwargs, - ) - - @doc_utils.doc_resample_reduce( - result="number of elements in a group", refer_to="size" - ) - def resample_size(self, resample_kwargs, *args, **kwargs): - return ResampleDefault.register(pandas.core.resample.Resampler.size)( - self, resample_kwargs, *args, **kwargs - ) - - @doc_utils.doc_resample_reduce( - result="standard deviation", - params=""" - ddof : int - numeric_only: bool - """, - refer_to="std", - ) - def resample_std(self, resample_kwargs, ddof, numeric_only, *args, **kwargs): - return ResampleDefault.register(pandas.core.resample.Resampler.std)( - self, resample_kwargs, ddof, numeric_only, *args, **kwargs - ) - - @doc_utils.doc_resample_reduce( - result="sum", - params=""" - numeric_only: bool - min_count : int - """, - refer_to="sum", - ) - def resample_sum(self, resample_kwargs, numeric_only, min_count, *args, **kwargs): - return ResampleDefault.register(pandas.core.resample.Resampler.sum)( - self, - resample_kwargs, - numeric_only=numeric_only, - min_count=min_count, - *args, # noqa: B026 - **kwargs, - ) - - def resample_transform(self, resample_kwargs, arg, *args, **kwargs): - """ - Resample time-series data and apply aggregation on it. - - Group data into intervals by time-series row/column with - a specified frequency and call passed function on each group. - In contrast to ``resample_app_df`` apply function to the whole group, - instead of a single axis. - - Parameters - ---------- - resample_kwargs : dict - Resample parameters as expected by ``modin.pandas.DataFrame.resample`` signature. - arg : callable(pandas.DataFrame) -> pandas.Series - *args : iterable - Positional arguments to pass to function. - **kwargs : dict - Keyword arguments to pass to function. - - Returns - ------- - BaseQueryCompiler - New QueryCompiler containing the result of passed function. 
- """ - return ResampleDefault.register(pandas.core.resample.Resampler.transform)( - self, resample_kwargs, arg, *args, **kwargs - ) - - @doc_utils.doc_resample_reduce( - result="variance", - params=""" - ddof : int - numeric_only: bool - """, - refer_to="var", - ) - def resample_var(self, resample_kwargs, ddof, numeric_only, *args, **kwargs): - return ResampleDefault.register(pandas.core.resample.Resampler.var)( - self, resample_kwargs, ddof, numeric_only, *args, **kwargs - ) - - # End of Resample methods - - # Str methods - - @doc_utils.doc_str_method(refer_to="capitalize", params="") - def str_capitalize(self): - return StrDefault.register(pandas.Series.str.capitalize)(self) - - @doc_utils.doc_str_method( - refer_to="center", - params=""" - width : int - fillchar : str, default: ' '""", - ) - def str_center(self, width, fillchar=" "): - return StrDefault.register(pandas.Series.str.center)(self, width, fillchar) - - @doc_utils.doc_str_method( - refer_to="contains", - params=""" - pat : str - case : bool, default: True - flags : int, default: 0 - na : object, default: np.NaN - regex : bool, default: True""", - ) - def str_contains(self, pat, case=True, flags=0, na=np.NaN, regex=True): - return StrDefault.register(pandas.Series.str.contains)( - self, pat, case, flags, na, regex - ) - - @doc_utils.doc_str_method( - refer_to="count", - params=""" - pat : str - flags : int, default: 0 - **kwargs : dict""", - ) - def str_count(self, pat, flags=0, **kwargs): - return StrDefault.register(pandas.Series.str.count)(self, pat, flags, **kwargs) - - @doc_utils.doc_str_method( - refer_to="endswith", - params=""" - pat : str - na : object, default: np.NaN""", - ) - def str_endswith(self, pat, na=np.NaN): - return StrDefault.register(pandas.Series.str.endswith)(self, pat, na) - - @doc_utils.doc_str_method( - refer_to="find", - params=""" - sub : str - start : int, default: 0 - end : int, optional""", - ) - def str_find(self, sub, start=0, end=None): - return StrDefault.register(pandas.Series.str.find)(self, sub, start, end) - - @doc_utils.doc_str_method( - refer_to="findall", - params=""" - pat : str - flags : int, default: 0 - **kwargs : dict""", - ) - def str_findall(self, pat, flags=0, **kwargs): - return StrDefault.register(pandas.Series.str.findall)( - self, pat, flags, **kwargs - ) - - @doc_utils.doc_str_method(refer_to="get", params="i : int") - def str_get(self, i): - return StrDefault.register(pandas.Series.str.get)(self, i) - - @doc_utils.doc_str_method(refer_to="get_dummies", params="sep : str") - def str_get_dummies(self, sep): - return StrDefault.register(pandas.Series.str.get_dummies)(self, sep) - - @doc_utils.doc_str_method( - refer_to="index", - params=""" - sub : str - start : int, default: 0 - end : int, optional""", - ) - def str_index(self, sub, start=0, end=None): - return StrDefault.register(pandas.Series.str.index)(self, sub, start, end) - - @doc_utils.doc_str_method(refer_to="isalnum", params="") - def str_isalnum(self): - return StrDefault.register(pandas.Series.str.isalnum)(self) - - @doc_utils.doc_str_method(refer_to="isalpha", params="") - def str_isalpha(self): - return StrDefault.register(pandas.Series.str.isalpha)(self) - - @doc_utils.doc_str_method(refer_to="isdecimal", params="") - def str_isdecimal(self): - return StrDefault.register(pandas.Series.str.isdecimal)(self) - - @doc_utils.doc_str_method(refer_to="isdigit", params="") - def str_isdigit(self): - return StrDefault.register(pandas.Series.str.isdigit)(self) - - @doc_utils.doc_str_method(refer_to="islower", 
params="") - def str_islower(self): - return StrDefault.register(pandas.Series.str.islower)(self) - - @doc_utils.doc_str_method(refer_to="isnumeric", params="") - def str_isnumeric(self): - return StrDefault.register(pandas.Series.str.isnumeric)(self) - - @doc_utils.doc_str_method(refer_to="isspace", params="") - def str_isspace(self): - return StrDefault.register(pandas.Series.str.isspace)(self) - - @doc_utils.doc_str_method(refer_to="istitle", params="") - def str_istitle(self): - return StrDefault.register(pandas.Series.str.istitle)(self) - - @doc_utils.doc_str_method(refer_to="isupper", params="") - def str_isupper(self): - return StrDefault.register(pandas.Series.str.isupper)(self) - - @doc_utils.doc_str_method(refer_to="join", params="sep : str") - def str_join(self, sep): - return StrDefault.register(pandas.Series.str.join)(self, sep) - - @doc_utils.doc_str_method(refer_to="len", params="") - def str_len(self): - return StrDefault.register(pandas.Series.str.len)(self) - - @doc_utils.doc_str_method( - refer_to="ljust", - params=""" - width : int - fillchar : str, default: ' '""", - ) - def str_ljust(self, width, fillchar=" "): - return StrDefault.register(pandas.Series.str.ljust)(self, width, fillchar) - - @doc_utils.doc_str_method(refer_to="lower", params="") - def str_lower(self): - return StrDefault.register(pandas.Series.str.lower)(self) - - @doc_utils.doc_str_method(refer_to="lstrip", params="to_strip : str, optional") - def str_lstrip(self, to_strip=None): - return StrDefault.register(pandas.Series.str.lstrip)(self, to_strip) - - @doc_utils.doc_str_method( - refer_to="match", - params=""" - pat : str - case : bool, default: True - flags : int, default: 0 - na : object, default: np.NaN""", - ) - def str_match(self, pat, case=True, flags=0, na=np.NaN): - return StrDefault.register(pandas.Series.str.match)(self, pat, case, flags, na) - - @doc_utils.doc_str_method( - refer_to="extract", - params=""" - pat : str - flags : int, default: 0 - expand : bool, default: True""", - ) - def str_extract(self, pat, flags=0, expand=True): - return StrDefault.register(pandas.Series.str.extract)(self, pat, flags, expand) - - @doc_utils.doc_str_method( - refer_to="extractall", - params=""" - pat : str - flags : int, default: 0""", - ) - def str_extractall(self, pat, flags=0): - return StrDefault.register(pandas.Series.str.extractall)(self, pat, flags) - - @doc_utils.doc_str_method( - refer_to="normalize", params="form : {'NFC', 'NFKC', 'NFD', 'NFKD'}" - ) - def str_normalize(self, form): - return StrDefault.register(pandas.Series.str.normalize)(self, form) - - @doc_utils.doc_str_method( - refer_to="pad", - params=""" - width : int - side : {'left', 'right', 'both'}, default: 'left' - fillchar : str, default: ' '""", - ) - def str_pad(self, width, side="left", fillchar=" "): - return StrDefault.register(pandas.Series.str.pad)(self, width, side, fillchar) - - @doc_utils.doc_str_method( - refer_to="partition", - params=""" - sep : str, default: ' ' - expand : bool, default: True""", - ) - def str_partition(self, sep=" ", expand=True): - return StrDefault.register(pandas.Series.str.partition)(self, sep, expand) - - @doc_utils.doc_str_method(refer_to="removeprefix", params="prefix : str") - def str_removeprefix(self, prefix): - return StrDefault.register(pandas.Series.str.removeprefix)(self, prefix) - - @doc_utils.doc_str_method(refer_to="removesuffix", params="suffix : str") - def str_removesuffix(self, suffix): - return StrDefault.register(pandas.Series.str.removesuffix)(self, suffix) - - 
@doc_utils.doc_str_method(refer_to="repeat", params="repeats : int") - def str_repeat(self, repeats): - return StrDefault.register(pandas.Series.str.repeat)(self, repeats) - - @doc_utils.doc_str_method( - refer_to="replace", - params=""" - pat : str - repl : str or callable - n : int, default: -1 - case : bool, optional - flags : int, default: 0 - regex : bool, default: True""", - ) - def str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): - return StrDefault.register(pandas.Series.str.replace)( - self, pat, repl, n, case, flags, regex - ) - - @doc_utils.doc_str_method( - refer_to="rfind", - params=""" - sub : str - start : int, default: 0 - end : int, optional""", - ) - def str_rfind(self, sub, start=0, end=None): - return StrDefault.register(pandas.Series.str.rfind)(self, sub, start, end) - - @doc_utils.doc_str_method( - refer_to="rindex", - params=""" - sub : str - start : int, default: 0 - end : int, optional""", - ) - def str_rindex(self, sub, start=0, end=None): - return StrDefault.register(pandas.Series.str.rindex)(self, sub, start, end) - - @doc_utils.doc_str_method( - refer_to="rjust", - params=""" - width : int - fillchar : str, default: ' '""", - ) - def str_rjust(self, width, fillchar=" "): - return StrDefault.register(pandas.Series.str.rjust)(self, width, fillchar) - - @doc_utils.doc_str_method( - refer_to="rpartition", - params=""" - sep : str, default: ' ' - expand : bool, default: True""", - ) - def str_rpartition(self, sep=" ", expand=True): - return StrDefault.register(pandas.Series.str.rpartition)(self, sep, expand) - - @doc_utils.doc_str_method( - refer_to="rsplit", - params=""" - pat : str, optional - n : int, default: -1 - expand : bool, default: False""", - ) - def str_rsplit(self, pat=None, n=-1, expand=False): - return StrDefault.register(pandas.Series.str.rsplit)(self, pat, n, expand) - - @doc_utils.doc_str_method(refer_to="rstrip", params="to_strip : str, optional") - def str_rstrip(self, to_strip=None): - return StrDefault.register(pandas.Series.str.rstrip)(self, to_strip) - - @doc_utils.doc_str_method( - refer_to="slice", - params=""" - start : int, optional - stop : int, optional - step : int, optional""", - ) - def str_slice(self, start=None, stop=None, step=None): - return StrDefault.register(pandas.Series.str.slice)(self, start, stop, step) - - @doc_utils.doc_str_method( - refer_to="slice_replace", - params=""" - start : int, optional - stop : int, optional - repl : str or callable, optional""", - ) - def str_slice_replace(self, start=None, stop=None, repl=None): - return StrDefault.register(pandas.Series.str.slice_replace)( - self, start, stop, repl - ) - - @doc_utils.doc_str_method( - refer_to="split", - params=""" - pat : str, optional - n : int, default: -1 - expand : bool, default: False""", - ) - def str_split(self, pat=None, n=-1, expand=False, regex=None): - return StrDefault.register(pandas.Series.str.split)( - self, pat, n=n, expand=expand, regex=regex - ) - - @doc_utils.doc_str_method( - refer_to="startswith", - params=""" - pat : str - na : object, default: np.NaN""", - ) - def str_startswith(self, pat, na=np.NaN): - return StrDefault.register(pandas.Series.str.startswith)(self, pat, na) - - @doc_utils.doc_str_method(refer_to="strip", params="to_strip : str, optional") - def str_strip(self, to_strip=None): - return StrDefault.register(pandas.Series.str.strip)(self, to_strip) - - @doc_utils.doc_str_method(refer_to="swapcase", params="") - def str_swapcase(self): - return StrDefault.register(pandas.Series.str.swapcase)(self) - - 
@doc_utils.doc_str_method(refer_to="title", params="") - def str_title(self): - return StrDefault.register(pandas.Series.str.title)(self) - - @doc_utils.doc_str_method(refer_to="translate", params="table : dict") - def str_translate(self, table): - return StrDefault.register(pandas.Series.str.translate)(self, table) - - @doc_utils.doc_str_method(refer_to="upper", params="") - def str_upper(self): - return StrDefault.register(pandas.Series.str.upper)(self) - - @doc_utils.doc_str_method( - refer_to="wrap", - params=""" - width : int - **kwargs : dict""", - ) - def str_wrap(self, width, **kwargs): - return StrDefault.register(pandas.Series.str.wrap)(self, width, **kwargs) - - @doc_utils.doc_str_method(refer_to="zfill", params="width : int") - def str_zfill(self, width): - return StrDefault.register(pandas.Series.str.zfill)(self, width) - - @doc_utils.doc_str_method( - refer_to="encode", params="encoding: str, errors: str, optional" - ) - def str_encode(self, encoding, errors="strict"): - return StrDefault.register(pandas.Series.str.encode)(self, encoding, errors) - - @doc_utils.doc_str_method( - refer_to="decode", params="encoding: str, errors: str, optional" - ) - def str_decode(self, encoding, errors="strict"): - return StrDefault.register(pandas.Series.str.decode)(self, encoding, errors) - - @doc_utils.doc_str_method( - refer_to="cat", - params=""" - others : Series, Index, DataFrame, np.ndarray or list-like, - sep : str, default: '', - na_rep : str or None, default: None, - join : {'left', 'right', 'outer', 'inner'}, default: 'left'""", - ) - def str_cat(self, others, sep=None, na_rep=None, join="left"): - return StrDefault.register(pandas.Series.str.cat)( - self, others, sep, na_rep, join - ) - - @doc_utils.doc_str_method(refer_to="__getitem__", params="key : object") - def str___getitem__(self, key): - return StrDefault.register(pandas.Series.str.__getitem__)(self, key) - - # End of Str methods - - # Rolling methods - - def shift( - self, - periods: int = 1, - freq=None, - axis: Axis = 0, - fill_value: Hashable = no_default, - ) -> "BaseQueryCompiler": - - # TODO: implement generic Modin version - ErrorMessage.not_implemented( # pragma: no cover - "base method shift not implemented" # pragma: no cover - ) # pragma: no cover - - # FIXME: most of the rolling/window methods take *args and **kwargs parameters - # which are only needed for the compatibility with numpy, this behavior is inherited - # from the API level, we should get rid of it (Modin issue #3108). - - @doc_utils.doc_window_method( - result="the result of passed functions", - action="apply specified functions", - refer_to="aggregate", - params=""" - func : str, dict, callable(pandas.Series) -> scalar, or list of such - *args : iterable - **kwargs : dict""", - build_rules="udf_aggregation", - ) - def rolling_aggregate(self, fold_axis, rolling_args, func, *args, **kwargs): - return RollingDefault.register(pandas.core.window.rolling.Rolling.aggregate)( - self, rolling_args, func, *args, **kwargs - ) - - # FIXME: at the query compiler method `rolling_apply` is an alias for `rolling_aggregate`, - # one of these should be removed (Modin issue #3107). - @doc_utils.add_deprecation_warning(replacement_method="rolling_aggregate") - @doc_utils.doc_window_method( - result="the result of passed function", - action="apply specified function", - refer_to="apply", - params=""" - func : callable(pandas.Series) -> scalar - raw : bool, default: False - engine : None, default: None - This parameters serves the compatibility purpose. 
Always has to be None. - engine_kwargs : None, default: None - This parameters serves the compatibility purpose. Always has to be None. - args : tuple, optional - kwargs : dict, optional""", - build_rules="udf_aggregation", - ) - def rolling_apply( - self, - fold_axis, - rolling_args, - func, - raw=False, - engine=None, - engine_kwargs=None, - args=None, - kwargs=None, - ): - return RollingDefault.register(pandas.core.window.rolling.Rolling.apply)( - self, rolling_args, func, raw, engine, engine_kwargs, args, kwargs - ) - - @doc_utils.doc_window_method( - result="correlation", - refer_to="corr", - params=""" - other : modin.pandas.Series, modin.pandas.DataFrame, list-like, optional - pairwise : bool, optional - *args : iterable - **kwargs : dict""", - ) - def rolling_corr( - self, fold_axis, rolling_args, other=None, pairwise=None, *args, **kwargs - ): - return RollingDefault.register(pandas.core.window.rolling.Rolling.corr)( - self, rolling_args, other, pairwise, *args, **kwargs - ) - - @doc_utils.doc_window_method(result="number of non-NA values", refer_to="count") - def rolling_count(self, fold_axis, rolling_args): - return RollingDefault.register(pandas.core.window.rolling.Rolling.count)( - self, rolling_args - ) - - @doc_utils.doc_window_method( - result="covariance", - refer_to="cov", - params=""" - other : modin.pandas.Series, modin.pandas.DataFrame, list-like, optional - pairwise : bool, optional - ddof : int, default: 1 - **kwargs : dict""", - ) - def rolling_cov( - self, fold_axis, rolling_args, other=None, pairwise=None, ddof=1, **kwargs - ): - return RollingDefault.register(pandas.core.window.rolling.Rolling.cov)( - self, rolling_args, other, pairwise, ddof, **kwargs - ) - - @doc_utils.doc_window_method( - result="unbiased kurtosis", refer_to="kurt", params="**kwargs : dict" - ) - def rolling_kurt(self, fold_axis, rolling_args, **kwargs): - return RollingDefault.register(pandas.core.window.rolling.Rolling.kurt)( - self, rolling_args, **kwargs - ) - - @doc_utils.doc_window_method( - result="maximum value", - refer_to="max", - params=""" - *args : iterable - **kwargs : dict""", - ) - def rolling_max(self, fold_axis, rolling_args, *args, **kwargs): - return RollingDefault.register(pandas.core.window.rolling.Rolling.max)( - self, rolling_args, *args, **kwargs - ) - - @doc_utils.doc_window_method( - result="mean value", - refer_to="mean", - params=""" - *args : iterable - **kwargs : dict""", - ) - def rolling_mean(self, fold_axis, rolling_args, *args, **kwargs): - return RollingDefault.register(pandas.core.window.rolling.Rolling.mean)( - self, rolling_args, *args, **kwargs - ) - - @doc_utils.doc_window_method( - result="median value", refer_to="median", params="**kwargs : dict" - ) - def rolling_median(self, fold_axis, rolling_args, **kwargs): - return RollingDefault.register(pandas.core.window.rolling.Rolling.median)( - self, rolling_args, **kwargs - ) - - @doc_utils.doc_window_method( - result="minimum value", - refer_to="min", - params=""" - *args : iterable - **kwargs : dict""", - ) - def rolling_min(self, fold_axis, rolling_args, *args, **kwargs): - return RollingDefault.register(pandas.core.window.rolling.Rolling.min)( - self, rolling_args, *args, **kwargs - ) - - @doc_utils.doc_window_method( - result="quantile", - refer_to="quantile", - params=""" - quantile : float - interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}, default: 'linear' - **kwargs : dict""", - ) - def rolling_quantile( - self, fold_axis, rolling_args, quantile, interpolation="linear", 
**kwargs - ): - return RollingDefault.register(pandas.core.window.rolling.Rolling.quantile)( - self, rolling_args, quantile, interpolation, **kwargs - ) - - @doc_utils.doc_window_method( - result="unbiased skewness", refer_to="skew", params="**kwargs : dict" - ) - def rolling_skew(self, fold_axis, rolling_args, **kwargs): - return RollingDefault.register(pandas.core.window.rolling.Rolling.skew)( - self, rolling_args, **kwargs - ) - - @doc_utils.doc_window_method( - result="standard deviation", - refer_to="std", - params=""" - ddof : int, default: 1 - *args : iterable - **kwargs : dict""", - ) - def rolling_std(self, fold_axis, rolling_args, ddof=1, *args, **kwargs): - return RollingDefault.register(pandas.core.window.rolling.Rolling.std)( - self, rolling_args, ddof, *args, **kwargs - ) - - @doc_utils.doc_window_method( - result="sum", - refer_to="sum", - params=""" - *args : iterable - **kwargs : dict""", - ) - def rolling_sum(self, fold_axis, rolling_args, *args, **kwargs): - return RollingDefault.register(pandas.core.window.rolling.Rolling.sum)( - self, rolling_args, *args, **kwargs - ) - - @doc_utils.doc_window_method( - result="variance", - refer_to="var", - params=""" - ddof : int, default: 1 - *args : iterable - **kwargs : dict""", - ) - def rolling_var(self, fold_axis, rolling_args, ddof=1, *args, **kwargs): - return RollingDefault.register(pandas.core.window.rolling.Rolling.var)( - self, rolling_args, ddof, *args, **kwargs - ) - - # End of Rolling methods - - # Window methods - - @doc_utils.doc_window_method( - win_type="window of the specified type", - result="mean", - refer_to="mean", - params=""" - *args : iterable - **kwargs : dict""", - ) - def window_mean(self, fold_axis, window_args, *args, **kwargs): - return RollingDefault.register(pandas.core.window.Window.mean)( - self, window_args, *args, **kwargs - ) - - @doc_utils.doc_window_method( - win_type="window of the specified type", - result="standard deviation", - refer_to="std", - params=""" - ddof : int, default: 1 - *args : iterable - **kwargs : dict""", - ) - def window_std(self, fold_axis, window_args, ddof=1, *args, **kwargs): - return RollingDefault.register(pandas.core.window.Window.std)( - self, window_args, ddof, *args, **kwargs - ) - - @doc_utils.doc_window_method( - win_type="window of the specified type", - result="sum", - refer_to="sum", - params=""" - *args : iterable - **kwargs : dict""", - ) - def window_sum(self, fold_axis, window_args, *args, **kwargs): - return RollingDefault.register(pandas.core.window.Window.sum)( - self, window_args, *args, **kwargs - ) - - @doc_utils.doc_window_method( - win_type="window of the specified type", - result="variance", - refer_to="var", - params=""" - ddof : int, default: 1 - *args : iterable - **kwargs : dict""", - ) - def window_var(self, fold_axis, window_args, ddof=1, *args, **kwargs): - return RollingDefault.register(pandas.core.window.Window.var)( - self, window_args, ddof, *args, **kwargs - ) - - # End of Window methods - - # Categories methods - - @doc_utils.add_one_column_warning - @doc_utils.add_refer_to("Series.cat.codes") - def cat_codes(self): - """ - Convert underlying categories data into its codes. - - Returns - ------- - BaseQueryCompiler - New QueryCompiler containing the integer codes of the underlying - categories. 
- """ - return CatDefault.register(pandas.Series.cat.codes)(self) - - # End of Categories methods - - # DataFrame methods - - @doc_utils.doc_reduce_agg( - method="mean absolute deviation", - params=""" - axis : {0, 1} - skipna : bool - level : None, default: None - Serves the compatibility purpose. Always has to be None.""", - refer_to="mad", - ) - def mad(self, axis, skipna, level=None): - return DataFrameDefault.register(pandas.DataFrame.mad)( - self, axis=axis, skipna=skipna, level=level - ) - - @doc_utils.doc_reduce_agg( - method="unbiased kurtosis", refer_to="kurt", extra_params=["skipna", "**kwargs"] - ) - def kurt(self, axis, level=None, numeric_only=None, skipna=True, **kwargs): - return DataFrameDefault.register(pandas.DataFrame.kurt)( - self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs - ) - - sum_min_count = sum - prod_min_count = prod - - @doc_utils.add_refer_to("DataFrame.compare") - def compare(self, other, align_axis, keep_shape, keep_equal, result_names): - """ - Compare data of two QueryCompilers and highlight the difference. - - Parameters - ---------- - other : BaseQueryCompiler - Query compiler to compare with. Have to be the same shape and the same - labeling as `self`. - align_axis : {0, 1} - keep_shape : bool - keep_equal : bool - result_names : tuple - - Returns - ------- - BaseQueryCompiler - New QueryCompiler containing the differences between `self` and passed - query compiler. - """ - return DataFrameDefault.register(pandas.DataFrame.compare)( - self, - other=other, - align_axis=align_axis, - keep_shape=keep_shape, - keep_equal=keep_equal, - result_names=result_names, - ) - - def repartition(self, axis=None): - """ - Repartitioning QueryCompiler objects to get ideal partitions inside. - - Allows to improve performance where the query compiler can't improve - yet by doing implicit repartitioning. - - Parameters - ---------- - axis : {0, 1, None}, optional - The axis along which the repartitioning occurs. - `None` is used for repartitioning along both axes. - - Returns - ------- - BaseQueryCompiler - The repartitioned BaseQueryCompiler. 
- """ - - axes = [0, 1] if axis is None else [axis] - - new_query_compiler = self - for _ax in axes: - new_query_compiler = new_query_compiler.__constructor__( - new_query_compiler._modin_frame.apply_full_axis( - _ax, - lambda df: df, - new_index=self._modin_frame._index_cache, - new_columns=self._modin_frame._columns_cache, - keep_partitioning=False, - sync_labels=False, - ) - ) - return new_query_compiler - - # End of DataFrame methods diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index f740fbf3b4d..5c9b3bfb64c 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -7,12 +7,13 @@ import re from collections.abc import Hashable, Iterable, Mapping, Sequence from datetime import tzinfo -from typing import Any, Callable, Literal, Optional, Union, get_args +from typing import Any, Callable, Literal, NoReturn, Optional, Union, get_args import numpy as np import numpy.typing as npt import pandas as native_pd import pandas.core.resample +from modin.core.storage_formats import BaseQueryCompiler from numpy import dtype from pandas._libs import lib from pandas._libs.lib import no_default @@ -227,10 +228,6 @@ rule_to_snowflake_width_and_slice_unit, validate_resample_supported_by_snowflake, ) -from snowflake.snowpark.modin.plugin._internal.telemetry import ( - SnowparkPandasTelemetryField, - TelemetryField, -) from snowflake.snowpark.modin.plugin._internal.timestamp_utils import ( VALID_TO_DATETIME_DF_KEYS, DateTimeOrigin, @@ -299,10 +296,6 @@ PandasLabelToSnowflakeIdentifierPair, SnowflakeSupportedFileTypeLit, ) -from snowflake.snowpark.modin.plugin.compiler.query_compiler import BaseQueryCompiler -from snowflake.snowpark.modin.plugin.default2pandas.stored_procedure_utils import ( - StoredProcedureDefault, -) from snowflake.snowpark.modin.plugin.utils.error_message import ErrorMessage from snowflake.snowpark.modin.plugin.utils.warning_message import WarningMessage from snowflake.snowpark.modin.utils import MODIN_UNNAMED_SERIES_LABEL @@ -646,6 +639,9 @@ def finalize(self) -> None: def free(self) -> None: pass + def execute(self) -> None: + pass + def to_numpy( self, dtype: Optional[npt.DTypeLike] = None, @@ -664,7 +660,7 @@ def repartition(self, axis: Any = None) -> "SnowflakeQueryCompiler": def default_to_pandas( self, pandas_op: Callable, *args: Any, **kwargs: Any - ) -> "SnowflakeQueryCompiler": + ) -> NoReturn: func_name = pandas_op.__name__ # this is coming from Modin's encoding scheme in default.py:build_default_to_pandas @@ -672,21 +668,18 @@ def default_to_pandas( # extract DataFrame operation, following extraction fails if not adhering to above format object_type, fn_name = func_name[len("". - # Not following this format will cause exception. 
- fn.__name__ = f"" - with SqlCounter(query_count=7, fallback_count=1, sproc_count=1): - new_query_compiler = df._query_compiler.default_to_pandas(fn, 2) - assert new_query_compiler.snowpark_pandas_api_calls == [ - {"is_fallback": True, "name": "DataFrame.mod"} - ] - - # Test NotImplementedError - fn.__name__ = f"" - df._query_compiler.snowpark_pandas_api_calls.clear() - with SqlCounter(query_count=7, fallback_count=1, sproc_count=1): - new_query_compiler = df._query_compiler.default_to_pandas(fn, 2) - assert df._query_compiler.snowpark_pandas_api_calls == [] - assert new_query_compiler.snowpark_pandas_api_calls == [ - {"is_fallback": True, "name": "Series.mod"} - ] - - # another fallback example - with SqlCounter(query_count=7, fallback_count=1, sproc_count=1): - df2 = df.dropna(axis=1) - assert df2._query_compiler.snowpark_pandas_api_calls == [ - {"name": "DataFrame.dropna", "is_fallback": True}, - {"name": "DataFrame.DataFrame.dropna", "argument": ["axis"]}, - ] - - @patch.object(TelemetryClient, "send") @sql_count_checker(query_count=0) def test_send_snowpark_pandas_telemetry_helper(send_mock): diff --git a/tests/integ/modin/test_default2pandas.py b/tests/integ/modin/test_unimplemented.py similarity index 58% rename from tests/integ/modin/test_default2pandas.py rename to tests/integ/modin/test_unimplemented.py index f2d61617a1f..fa876af85f5 100644 --- a/tests/integ/modin/test_default2pandas.py +++ b/tests/integ/modin/test_unimplemented.py @@ -1,13 +1,10 @@ # # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. # -import logging -import re from collections.abc import Generator from typing import Callable, Union import modin.pandas as pd -import numpy as np import pandas as native_pd import pytest from _pytest.logging import LogCaptureFixture @@ -19,10 +16,7 @@ PACKAGING_REQUIREMENT, SNOWPARK_PANDAS_IMPORT, ) -from snowflake.snowpark.modin.plugin.utils.warning_message import WarningMessage -from tests.integ.conftest import running_on_public_ci from tests.integ.modin.sql_counter import sql_count_checker -from tests.integ.modin.utils import assert_snowpark_pandas_equal_to_pandas def eval_and_validate_unsupported_methods( @@ -50,44 +44,9 @@ def eval_and_validate_unsupported_methods( native_pd_args = native_pd_args[0] if len(native_pd_args) == 1 else native_pd_args snow_pd_args = snow_pd_args[0] if len(snow_pd_args) == 1 else snow_pd_args - # verify SnowflakeQueryCompiler default_to_pandas is called - caplog.clear() - # Normally, the warning message only appears once per Python process for - # each unique message. Clear the set of printed warnings so that the - # warning appears for each test case. - WarningMessage.printed_warnings.clear() - with caplog.at_level(logging.DEBUG): - result = func(snow_pd_args) - if inplace: - result = snow_pd_args[0] - # This phrase is from the internal message that appears at DEBUG level. - assert any( - record.levelno == logging.DEBUG - and "Default to (native) pandas" in record.message - for record in caplog.records - ) - # This phrase is from the WARNING log message that the user will see. 
- assert any( - record.levelno == logging.WARNING - and re.search( - r"Falling back to native pandas with a stored procedure for .* Execution of this method could be slow", - record.message, - ) - for record in caplog.records - ) - - assert func_name in caplog.text - - native_result = func(native_pd_args) - if inplace: - native_result = native_pd_args[0] - # verify the result for snowpark and native pandas after the operation - if isinstance(native_result, (native_pd.DataFrame, native_pd.Series)): - assert_snowpark_pandas_equal_to_pandas(result, native_result, check_dtype=False) - elif isinstance(native_result, np.ndarray): - np.testing.assert_array_equal(result, native_result) - else: - assert native_result == result + func(native_pd_args) + with pytest.raises(NotImplementedError): + func(snow_pd_args) # unsupported methods for both dataframe and series @@ -119,17 +78,11 @@ def eval_and_validate_unsupported_methods( # When any unsupported method gets supported, we should run the test to verify (expect failure) # and remove the corresponding method in the above list. # When most of the methods are supported, we should run all unsupported methods -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") @pytest.mark.parametrize( "func, func_name", UNSUPPORTED_DATAFRAME_SERIES_METHODS + UNSUPPORTED_DATAFRAME_METHODS, ) -@sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) +@sql_count_checker(query_count=0) def test_unsupported_dataframe_methods(func, func_name, caplog): data = {"a": [1, 2, 3], "b": [4, 5, 6]} # Native pandas @@ -137,77 +90,21 @@ def test_unsupported_dataframe_methods(func, func_name, caplog): eval_and_validate_unsupported_methods(func, func_name, [native_df], caplog) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@sql_count_checker( - query_count=14, - fallback_count=2, - sproc_count=2, - # expect high count because we're falling back to pandas twice. - expect_high_count=True, -) -def test_unsupported_dataframe_method_only_warns_once(caplog): - caplog.clear() - # Ideally, we would run this test method in a separate pytest process, - # e.g. using the @pytest.mark.forked decorator from pytest-forked. However, - # pytest-forked doesn't seem to work with the Snowflake connection object. - # It seems that when one process closes the connection object, the other - # connection can no longer use its connection object. - # Instead, clear WarningMessage.printed_warnings so that there is no record - # of already logging a warning for this method. - WarningMessage.printed_warnings.clear() - df = pd.DataFrame([1, 2]) - df.cumprod() - assert any( - record.levelno == logging.WARNING - and "Falling back to native pandas with a stored procedure for " - + "<function DataFrame.cumprod>. Execution of this method could be " - + "slow" - in record.message - for record in caplog.records - ) - caplog.clear() - df.cumprod() - assert not any( - record.levelno == logging.WARNING - and "Falling back to native pandas with a stored procedure for " - + "<function DataFrame.cumprod>. 
Execution of this method could be " - + "slow" - in record.message - for record in caplog.records - ) - - -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") @pytest.mark.parametrize( "func, func_name", UNSUPPORTED_SERIES_METHODS + UNSUPPORTED_DATAFRAME_SERIES_METHODS, ) -@sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) +@sql_count_checker(query_count=0) def test_unsupported_series_methods(func, func_name, caplog) -> None: native_series = native_pd.Series([5, 4, 0, 6, 6, 4]) eval_and_validate_unsupported_methods(func, func_name, [native_series], caplog) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") @pytest.mark.parametrize( "func, func_name", UNSUPPORTED_BINARY_METHODS, ) -@sql_count_checker(query_count=10, fallback_count=1, sproc_count=1) +@sql_count_checker(query_count=0) def test_unsupported_dataframe_binary_methods(func, func_name, caplog) -> None: # Native pandas native_df1 = native_pd.DataFrame([[0, 1], [2, 3]]) @@ -222,17 +119,11 @@ def test_unsupported_dataframe_binary_methods(func, func_name, caplog) -> None: ) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") @pytest.mark.parametrize( "func, func_name", UNSUPPORTED_BINARY_METHODS, ) -@sql_count_checker(query_count=10, fallback_count=1, sproc_count=1) +@sql_count_checker(query_count=1) def test_unsupported_series_binary_methods(func, func_name, caplog) -> None: native_se1 = native_pd.Series([1, 2, 3, 0, 2]) native_se2 = native_pd.Series([2, 3, 10, 0, 1]) @@ -253,17 +144,11 @@ def test_unsupported_series_binary_methods(func, func_name, caplog) -> None: ] -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") @pytest.mark.parametrize( "func, func_name", UNSUPPORTED_STR_METHODS, ) -@sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) +@sql_count_checker(query_count=0) def test_unsupported_str_methods(func, func_name, caplog) -> None: native_series = native_pd.Series(["bat.aB", "com.fcc", "foo", "bar"]) eval_and_validate_unsupported_methods(func, func_name, [native_series], caplog) @@ -277,17 +162,11 @@ def test_unsupported_str_methods(func, func_name, caplog) -> None: ] -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") @pytest.mark.parametrize( "func, func_name", UNSUPPORTED_DT_METHODS, ) -@sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) +@sql_count_checker(query_count=0) def test_unsupported_dt_methods(func, func_name, caplog) -> None: datetime_series = native_pd.Series( native_pd.date_range("2000-01-01", periods=3, freq="h") @@ -295,36 +174,6 @@ def test_unsupported_dt_methods(func, func_name, caplog) -> None: eval_and_validate_unsupported_methods(func, 
func_name, [datetime_series], caplog) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=SnowparkSQLException, -) -@sql_count_checker(query_count=4, fallback_count=0, sproc_count=1) -def test_fallback_in_stored_proc(session): - def func(session: Session) -> int: - df = pd.DataFrame([1, 2, 3]) - df.apply(lambda x: x) # trigger fallback - df[0].apply(lambda x: x) - # will call transpose (see negative test below) - # return df.sum()[0] - return df[0][0] - - packages = list(session.get_packages().values()) - if "pandas" not in packages: - packages = [native_pd] + packages - if "snowflake-snowpark-python" not in packages: - packages = packages + ["snowflake-snowpark-python"] - if PACKAGING_REQUIREMENT not in packages: - packages.append(PACKAGING_REQUIREMENT) - func_proc = session.sproc.register( - func, - imports=[SNOWPARK_PANDAS_IMPORT], - packages=packages, - ) - assert func_proc() == 1 - - # Negative test for SNOW-972740 - Apply on a series changes causes errors in a later transpose @sql_count_checker(query_count=3, fallback_count=0, sproc_count=0) def test_fallback_transpose_after_apply_in_stored_proc_negative(session): diff --git a/tests/integ/modin/tools/test_to_datetime.py b/tests/integ/modin/tools/test_to_datetime.py index 74be6299133..79fe0bcf087 100644 --- a/tests/integ/modin/tools/test_to_datetime.py +++ b/tests/integ/modin/tools/test_to_datetime.py @@ -23,7 +23,6 @@ SnowparkFetchDataException, SnowparkSQLException, ) -from tests.integ.conftest import running_on_public_ci from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import ( assert_series_equal, @@ -101,23 +100,18 @@ def test_to_datetime_format_scalar(self, cache, arg, expected, format): expected = Timestamp(expected) assert result == expected - @pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, - ) - @pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") @pytest.mark.parametrize( "arg, format", [ ["1/1/2000", "%d/%w/%Y"], ], ) - @sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) - def test_to_datetime_format_fallback(self, cache, arg, format): - assert to_datetime(arg, format=format, cache=cache) == native_pd.to_datetime( - arg, format=format, cache=cache - ) + @sql_count_checker(query_count=0) + def test_to_datetime_format_unimplemented(self, cache, arg, format): + with pytest.raises(NotImplementedError): + assert to_datetime( + arg, format=format, cache=cache + ) == native_pd.to_datetime(arg, format=format, cache=cache) @pytest.mark.parametrize( "arg, format", @@ -382,25 +376,16 @@ def test_to_datetime_format_microsecond(self, cache): def test_to_datetime_format_time(self, cache, value, format, dt): assert to_datetime(value, format=format, cache=cache) == dt - @pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, - ) - @pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") - @sql_count_checker(query_count=16, fallback_count=2, sproc_count=2) - def test_to_datetime_with_non_exact_fallback(self, cache): + @sql_count_checker(query_count=0) + def test_to_datetime_with_non_exact_unimplemented(self, cache): # GH 10834 # 8904 # exact kw ser = Series( ["19MAY11", "foobar19MAY11", "19MAY11:00:00:00", "19MAY11 00:00:00Z"] ) - result = 
to_datetime(ser, format="%d%b%y", exact=False, cache=cache) - expected = to_datetime( - ser.str.extract(r"(\d+\w+\d+)", expand=False), format="%d%b%y", cache=cache - ) - assert_series_equal(result, expected) + with pytest.raises(NotImplementedError): + to_datetime(ser, format="%d%b%y", exact=False, cache=cache) @pytest.mark.parametrize( "arg", @@ -421,11 +406,6 @@ def test_parse_nanoseconds_with_formula(self, cache, arg): result = to_datetime(arg, format="%Y-%m-%d %H:%M:%S.%f", cache=cache) assert result == expected - @pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, - ) @pytest.mark.parametrize( "value,fmt,expected", [ @@ -433,9 +413,10 @@ def test_parse_nanoseconds_with_formula(self, cache, arg): ["2013020", "%Y%U%w", Timestamp("2013-01-13")], ], ) - @sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) + @sql_count_checker(query_count=0) def test_to_datetime_format_weeks(self, value, fmt, expected, cache): - assert to_datetime(value, format=fmt, cache=cache) == expected + with pytest.raises(NotImplementedError): + assert to_datetime(value, format=fmt, cache=cache) == expected @pytest.mark.parametrize( "fmt,dates,expected_dates", @@ -477,11 +458,6 @@ def test_to_datetime_parse_tzname_or_tzoffset(self, fmt, dates, expected_dates): # with SqlCounter(query_count=1): tm.assert_equal(result, expected_dates) - @pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, - ) @pytest.mark.parametrize( "fmt,dates,expected_dates", [ @@ -505,15 +481,13 @@ def test_to_datetime_parse_tzname_or_tzoffset(self, fmt, dates, expected_dates): ], ], ) - @sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) - @pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") + @sql_count_checker(query_count=0) def test_to_datetime_parse_tzname_or_tzoffset_fallback( self, fmt, dates, expected_dates ): # GH 13486 - result = to_datetime(dates, format=fmt).to_list() - # with SqlCounter(query_count=1): - tm.assert_equal(result, expected_dates) + with pytest.raises(NotImplementedError): + to_datetime(dates, format=fmt).to_list() @sql_count_checker(query_count=4) def test_to_datetime_parse_tzname_or_tzoffset_different_tz_to_utc(self): @@ -684,12 +658,6 @@ def test_to_datetime_dict(self, sample): native_pd.to_datetime(sample), ) - @pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, - ) - @pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") @pytest.mark.parametrize( "sample", [ @@ -700,15 +668,16 @@ def test_to_datetime_dict(self, sample): }, # non int types ], ) - @sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) + @sql_count_checker(query_count=0) def test_to_datetime_df_fallback(self, sample): - eval_snowpark_pandas_result( - pd.DataFrame(sample), - native_pd.DataFrame(sample), - lambda df: pd.to_datetime(df) - if isinstance(df, pd.DataFrame) - else native_pd.to_datetime(df), - ) + with pytest.raises(NotImplementedError): + eval_snowpark_pandas_result( + pd.DataFrame(sample), + native_pd.DataFrame(sample), + lambda df: pd.to_datetime(df) + if isinstance(df, pd.DataFrame) + else native_pd.to_datetime(df), + ) @pytest.mark.parametrize( "origin,unit", diff --git a/tests/integ/modin/tools/test_to_numeric.py 
b/tests/integ/modin/tools/test_to_numeric.py index c9497103842..e2176aee126 100644 --- a/tests/integ/modin/tools/test_to_numeric.py +++ b/tests/integ/modin/tools/test_to_numeric.py @@ -1,6 +1,7 @@ # # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. # +import contextlib import logging from datetime import date, time @@ -27,14 +28,7 @@ def downcast(request): @pytest.fixture( params=[ - pytest.param( - "ignore", - marks=pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, - ), - ), + "ignore", "raise", "coerce", ] @@ -176,15 +170,9 @@ def test_to_numeric_errors(errors): expect_exception_match="Numeric value 'apple' is not recognized", ) else: - if errors == "ignore": - expected_query_count = 8 - expected_fallback_count = 1 - else: - expected_query_count = 1 - expected_fallback_count = 0 - with SqlCounter( - query_count=expected_query_count, fallback_count=expected_fallback_count - ): + with SqlCounter(query_count=0 if errors == "ignore" else 1), pytest.raises( + NotImplementedError + ) if errors == "ignore" else contextlib.nullcontext(): eval_snowpark_pandas_result( pd.Series(input), native_pd.Series(input), @@ -202,34 +190,19 @@ def test_to_numeric_errors(errors): ], ) @pytest.mark.parametrize( - "errors, expected_query_count, expected_fallback_count", + "errors, expected_query_count", [ - pytest.param( - "ignore", - 7, - 1, - marks=pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, - ), - ), - ["coerce", 0, 0], + ["ignore", 0], + ["coerce", 0], ], ) -def test_to_numeric_errors_dtype( - input, errors, expected_query_count, expected_fallback_count -): - with SqlCounter( - query_count=expected_query_count, fallback_count=expected_fallback_count - ): +def test_to_numeric_errors_dtype(input, errors, expected_query_count): + with SqlCounter(query_count=expected_query_count), pytest.raises( + NotImplementedError + ) if errors == "ignore" else contextlib.nullcontext(): ret = pd.to_numeric(input, errors=errors) - if errors == "ignore": - # since it includes original value so the dtype is object - assert ret.dtype == np.dtype("object") - else: - # since invalid parsing will be treated as null, the dtype will be float64 - assert ret.dtype == np.dtype("float64") + # since invalid parsing will be treated as null, the dtype will be float64 + assert ret.dtype == np.dtype("float64") @sql_count_checker(query_count=0) @@ -301,16 +274,9 @@ def test_type_check(): def test_datetime_like(errors): input = native_pd.date_range("20130101", periods=3) - if errors == "ignore": - expected_query_count = 8 - expected_fallback_count = 1 - else: - expected_query_count = 1 - expected_fallback_count = 0 - - with SqlCounter( - query_count=expected_query_count, fallback_count=expected_fallback_count - ): + with SqlCounter(query_count=0 if errors == "ignore" else 1), pytest.raises( + NotImplementedError + ) if errors == "ignore" else contextlib.nullcontext(): eval_snowpark_pandas_result( pd.Series(input), native_pd.Series(input), diff --git a/tests/unit/modin/test_series_dt.py b/tests/unit/modin/test_series_dt.py index 9a59cbab5cb..fe1ad7193b8 100644 --- a/tests/unit/modin/test_series_dt.py +++ b/tests/unit/modin/test_series_dt.py @@ -34,9 +34,7 @@ def mock_query_compiler_for_dt_series() -> SnowflakeQueryCompiler: return fake_query_compiler -@mock.patch( - 
"snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.DateTimeDefault.register" -) +@mock.patch("modin.core.dataframe.algebra.default2pandas.DateTimeDefault.register") @pytest.mark.parametrize( "func, func_name", [ @@ -44,8 +42,6 @@ def mock_query_compiler_for_dt_series() -> SnowflakeQueryCompiler: (lambda s: s.dt.timetz, "timetz"), (lambda s: s.dt.microsecond, "microsecond"), (lambda s: s.dt.nanosecond, "nanosecond"), - (lambda s: s.dt.week, "week"), - (lambda s: s.dt.weekofyear, "weekofyear"), (lambda s: s.dt.dayofweek, "dayofweek"), (lambda s: s.dt.weekday, "weekday"), (lambda s: s.dt.dayofyear, "dayofyear"), @@ -92,9 +88,7 @@ def test_dt_methods( assert res._query_compiler == mock_query_compiler_for_dt_series, func_name -@mock.patch( - "snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.DateTimeDefault.register" -) +@mock.patch("modin.core.dataframe.algebra.default2pandas.DateTimeDefault.register") def test_dt_components(mock_datetime_register, mock_query_compiler_for_dt_series): mock_series = pd.Series(query_compiler=mock_query_compiler_for_dt_series) return_callable = mock.create_autospec(Callable) @@ -106,9 +100,7 @@ def test_dt_components(mock_datetime_register, mock_query_compiler_for_dt_series assert res._query_compiler == mock_query_compiler_for_dt_series -@mock.patch( - "snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.DateTimeDefault.register" -) +@mock.patch("modin.core.dataframe.algebra.default2pandas.DateTimeDefault.register") def test_dt_to_pytimedelta(mock_datetime_register, mock_query_compiler_for_dt_series): mock_series = pd.Series(query_compiler=mock_query_compiler_for_dt_series) result_query_compiler = mock.create_autospec(SnowflakeQueryCompiler) @@ -133,9 +125,7 @@ def test_dt_to_pytimedelta(mock_datetime_register, mock_query_compiler_for_dt_se assert res.tolist() == np.array([datetime.timedelta(0), 0]).tolist() -@mock.patch( - "snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.DateTimeDefault.register" -) +@mock.patch("modin.core.dataframe.algebra.default2pandas.DateTimeDefault.register") def test_dt_to_pydatetime(mock_datetime_register, mock_query_compiler_for_dt_series): mock_series = pd.Series(query_compiler=mock_query_compiler_for_dt_series) result_query_compiler = mock.create_autospec(SnowflakeQueryCompiler) @@ -164,9 +154,7 @@ def test_dt_tz(): assert res == time_type.tz -@mock.patch( - "snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.DateTimeDefault.register" -) +@mock.patch("modin.core.dataframe.algebra.default2pandas.DateTimeDefault.register") def test_dt_freq(mock_datetime_register, mock_query_compiler_for_dt_series): mock_series = pd.Series(query_compiler=mock_query_compiler_for_dt_series) result_query_compiler = mock.create_autospec(SnowflakeQueryCompiler) diff --git a/tests/unit/modin/test_series_strings.py b/tests/unit/modin/test_series_strings.py index 1b6d45c19aa..1c0754c537c 100644 --- a/tests/unit/modin/test_series_strings.py +++ b/tests/unit/modin/test_series_strings.py @@ -14,9 +14,7 @@ ) -@mock.patch( - "snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.StrDefault.register" -) +@mock.patch("modin.core.dataframe.algebra.default2pandas.StrDefault.register") def test_str_cat_no_others(mock_str_register, mock_series): result_query_compiler = mock.create_autospec(SnowflakeQueryCompiler) result_query_compiler.to_pandas.return_value = native_pd.DataFrame(["abc"]) @@ -27,9 +25,7 @@ def test_str_cat_no_others(mock_str_register, mock_series): assert res == "abc" -@mock.patch( - 
"snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.StrDefault.register" -) +@mock.patch("modin.core.dataframe.algebra.default2pandas.StrDefault.register") @pytest.mark.parametrize( "func, func_name", [ @@ -85,9 +81,7 @@ def test_str_methods_with_series_return( assert res._query_compiler == mock_single_col_query_compiler, func_name -@mock.patch( - "snowflake.snowpark.modin.core.dataframe.algebra.default2pandas.StrDefault.register" -) +@mock.patch("modin.core.dataframe.algebra.default2pandas.StrDefault.register") @pytest.mark.parametrize( "func, func_name", [