From 1309d807c273b92837fba297f054dff26f0f6ba9 Mon Sep 17 00:00:00 2001 From: Naresh Kumar <113932371+sfc-gh-nkumar@users.noreply.github.com> Date: Mon, 29 Apr 2024 22:07:01 -0700 Subject: [PATCH] SNOW-1316977: Remove fallbacks and raise not implemented error (#1435) Please answer these questions before submitting your pull requests. Thanks! 1. What GitHub issue is this PR addressing? Make sure that there is an accompanying issue to your PR. Fixes SNOW-1316977 2. Fill out the following pre-review checklist: - [x] I am adding a new automated test(s) to verify correctness of my new code - [ ] I am adding new logging messages - [ ] I am adding a new telemetry message - [ ] I am adding new credentials - [ ] I am adding a new dependency 3. Please describe how your code solves the related issue. Remove fallbacks for all DataFrame and Series APIs. This list is primarily driven from https://docs.google.com/document/d/1uMwNgLqFhtoAFeEj59XjR3uKQ77QjA84aTmv8Erbvw0/edit#heading=h.qhdbmdhgqdh5. There still might be more fallbacks which are not documented. These will be cleaned up in https://snowflakecomputing.atlassian.net/browse/SNOW-1347394 NOTE: I have kept the existing tests mostly unmodified even though there is redundancy. This can be useful when we implement these APIs. NOTE: This also requires updates in docs/*supported.rst files. These files are not yet ported from snowpandas, so we will do a follow-up PR to update the docs. 
--- .../snowpark/modin/pandas/dataframe.py | 4 +- src/snowflake/snowpark/modin/pandas/series.py | 4 +- .../snowpark/modin/plugin/PANDAS_CHANGELOG.md | 22 + .../modin/plugin/_internal/timestamp_utils.py | 39 +- .../modin/plugin/compiler/query_compiler.py | 34 -- .../compiler/snowflake_query_compiler.py | 517 +++++++----------- .../binary/test_binary_default2pandas.py | 81 --- .../binary/test_binary_not_implemented.py | 54 ++ tests/integ/modin/frame/test_all_any.py | 99 +--- tests/integ/modin/frame/test_apply.py | 159 +----- tests/integ/modin/frame/test_applymap.py | 25 +- tests/integ/modin/frame/test_dropna.py | 20 +- tests/integ/modin/frame/test_fillna.py | 20 +- tests/integ/modin/frame/test_filter.py | 17 +- tests/integ/modin/frame/test_join.py | 33 +- tests/integ/modin/frame/test_mask.py | 25 +- tests/integ/modin/frame/test_merge.py | 54 +- tests/integ/modin/frame/test_nunique.py | 19 +- tests/integ/modin/frame/test_rename.py | 143 ++--- tests/integ/modin/frame/test_sort_values.py | 45 +- tests/integ/modin/frame/test_where.py | 25 +- tests/integ/modin/series/test_all_any.py | 81 +-- tests/integ/modin/series/test_apply.py | 91 +-- tests/integ/modin/series/test_astype.py | 17 +- tests/integ/modin/series/test_rename.py | 45 +- tests/integ/modin/series/test_sort_values.py | 24 +- .../integ/modin/strings/test_case_justify.py | 17 +- tests/unit/modin/test_series_strings.py | 3 +- 28 files changed, 561 insertions(+), 1156 deletions(-) delete mode 100644 tests/integ/modin/binary/test_binary_default2pandas.py create mode 100644 tests/integ/modin/binary/test_binary_not_implemented.py diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index a84748d7e8a..f1353834189 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -802,7 +802,9 @@ def dot(self, other): # noqa: PR01, RT01, D200 Compute the matrix multiplication between the ``DataFrame`` and 
`other`. """ # TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions - ErrorMessage.not_implemented() # pragma: no cover + ErrorMessage.not_implemented( + "Snowpark pandas doesn't yet support 'dot' binary operation" + ) if isinstance(other, BasePandasDataset): common = self.columns.union(other.index) diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index 2e52624814c..3120e10e60a 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -998,7 +998,9 @@ def dot(self, other): # noqa: PR01, RT01, D200 Compute the dot product between the Series and the columns of `other`. """ # TODO: SNOW-1063347: Modin upgrade - modin.pandas.Series functions - ErrorMessage.not_implemented() # pragma: no cover + ErrorMessage.not_implemented( + "Snowpark pandas doesn't yet support 'dot' binary operation" + ) if isinstance(other, BasePandasDataset): common = self.index.union(other.index) diff --git a/src/snowflake/snowpark/modin/plugin/PANDAS_CHANGELOG.md b/src/snowflake/snowpark/modin/plugin/PANDAS_CHANGELOG.md index a48ca9a795f..14d2785de34 100644 --- a/src/snowflake/snowpark/modin/plugin/PANDAS_CHANGELOG.md +++ b/src/snowflake/snowpark/modin/plugin/PANDAS_CHANGELOG.md @@ -7,6 +7,28 @@ - Fixed DataFrame's `__getitem__` with boolean DataFrame key. - Fixed incorrect regex used in `DataFrame/Series.replace`. +### Behavior Changes +- Raise not implemented error instead of falling back to pandas in the following APIs: + - `pd.merge`, `DataFrame.merge` and `DataFrame.join` if given the `validate` parameter. + - `pd.to_numeric` if `errors == 'ignore'`. + - `pd.to_datetime` if `format` is given but not supported in Snowflake or if `exact`, `infer_datetime_format` parameters are given or `origin == 'julian'` or `errors == 'ignore'`. + - `DataFrame/Series.all` if called on non-integer/boolean columns. + - `DataFrame/Series.any` if called on non-integer/boolean columns. 
+ - `DataFrame/Series.astype` if casting from string to datetime or `errors == 'ignore'`. + - `DataFrame/Series.dropna` if `axis == 1`. + - `DataFrame/Series.mask` if given `axis` or `level` parameters. + - `DataFrame/Series.rename` if `mapper` is callable or the DataFrame/Series has MultiIndex. + - `DataFrame/Series.sort_values` if given the `key` parameter. + - `DataFrame/Series.sort_index` if given the `key` parameter. + - `DataFrame.nunique` if `axis == 1`. + - `DataFrame.apply` if `axis == 0` or `func` is not callable or `result_type` is given or `args` and `kwargs` contain DataFrame or Series. + - `Series.apply` if `axis == 0` or `func` is not callable or `result_type` is given. + - `Series.applymap` if `na_action == 'ignore'`. + - `DataFrame/Series.ffill` if given the `limit` or `downcast` parameter. + - `DataFrame/Series.fillna` if given the `limit` or `downcast` parameter. + - `dot` binary operation between `DataFrame/Series`. + - `xor` binary operation between `DataFrame/Series`. 
+ ## 1.14.0a2 (2024-04-18) ### Behavior Changes diff --git a/src/snowflake/snowpark/modin/plugin/_internal/timestamp_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/timestamp_utils.py index d5354343ca3..55eaf9f8b16 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/timestamp_utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/timestamp_utils.py @@ -25,6 +25,7 @@ to_decimal, ) from snowflake.snowpark.modin.plugin._internal.utils import pandas_lit +from snowflake.snowpark.modin.plugin.utils.error_message import ErrorMessage from snowflake.snowpark.modin.plugin.utils.warning_message import WarningMessage from snowflake.snowpark.types import ( BooleanType, @@ -304,44 +305,48 @@ def generate_timestamp_col( return new_col -def to_datetime_require_fallback( +def raise_if_to_datetime_not_supported( format: str, exact: Union[bool, lib.NoDefault] = lib.no_default, infer_datetime_format: Union[lib.NoDefault, bool] = lib.no_default, origin: DateTimeOrigin = "unix", errors: DateTimeErrorChoices = "raise", -) -> bool: +) -> None: """ - check whether to_datetime requires fallback + Raise not implemented error to_datetime API has any unsupported parameter or + parameter value Args: format: the format argument for to_datetime exact: the exact argument for to_datetime infer_datetime_format: the infer_datetime_format argument for to_datetime origin: the origin argument for to_datetime errors: the errors argument for to_datetime - - Returns: - True if fallback is required; otherwise False """ + error_message = None if format is not None and not is_snowflake_timestamp_format_valid( to_snowflake_timestamp_format(format) ): # if format is not given, Snowflake's auto format detection may be different from pandas behavior - return True - - if not exact: + error_message = ( + f"Snowpark pandas to_datetime API doesn't yet support given format {format}" + ) + elif not exact: # Snowflake does not allow the format to match anywhere in the target string when exact is 
False - return True - if infer_datetime_format != lib.no_default: + error_message = "Snowpark pandas to_datetime API doesn't yet support non exact format matching" + elif infer_datetime_format != lib.no_default: # infer_datetime_format is deprecated since version 2.0.0 - return True - if origin == "julian": + error_message = "Snowpark pandas to_datetime API doesn't support 'infer_datetime_format' parameter" + elif origin == "julian": # default for julian calendar support - return True - if errors == "ignore": + error_message = ( + "Snowpark pandas to_datetime API doesn't yet support julian calendar" + ) + elif errors == "ignore": # ignore requires return the whole original input which is not applicable in Snowfalke - return True - return False + error_message = "Snowpark pandas to_datetime API doesn't yet support 'ignore' value for errors parameter" + + if error_message: + ErrorMessage.not_implemented(error_message) def convert_dateoffset_to_interval( diff --git a/src/snowflake/snowpark/modin/plugin/compiler/query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/query_compiler.py index a85f2be2245..271a6732469 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/query_compiler.py @@ -1476,33 +1476,6 @@ def melt(self, *args, **kwargs): # noqa: PR02 """ return DataFrameDefault.register(pandas.DataFrame.melt)(self, *args, **kwargs) - @doc_utils.add_refer_to("DataFrame.sort_values") - def sort_columns_by_row_values(self, rows, ascending=True, **kwargs): # noqa: PR02 - """ - Reorder the columns based on the lexicographic order of the given rows. - - Parameters - ---------- - rows : label or list of labels - The row or rows to sort by. - ascending : bool, default: True - Sort in ascending order (True) or descending order (False). 
- kind : {"quicksort", "mergesort", "heapsort"} - na_position : {"first", "last"} - ignore_index : bool - key : callable(pandas.Index) -> pandas.Index, optional - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. - - Returns - ------- - BaseQueryCompiler - New QueryCompiler that contains result of the sort. - """ - return DataFrameDefault.register(pandas.DataFrame.sort_values)( - self, by=rows, axis=1, ascending=ascending, **kwargs - ) - # END Abstract map across rows/columns # Map across rows/columns @@ -3610,13 +3583,6 @@ def str_cat(self, others, sep=None, na_rep=None, join="left"): self, others, sep, na_rep, join ) - @doc_utils.doc_str_method( - refer_to="casefold", - params="", - ) - def str_casefold(self): - return StrDefault.register(pandas.Series.str.casefold)(self) - @doc_utils.doc_str_method(refer_to="__getitem__", params="key : object") def str___getitem__(self, key): return StrDefault.register(pandas.Series.str.__getitem__)(self, key) diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index fb845dad90a..30fba43335e 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -111,10 +111,7 @@ year, ) from snowflake.snowpark.modin.core.dataframe.algebra.default2pandas import ( - BinaryDefault, - DataFrameDefault, GroupByDefault, - SeriesDefault, ) from snowflake.snowpark.modin.plugin._internal import ( concat_utils, @@ -241,7 +238,7 @@ VALID_TO_DATETIME_DF_KEYS, DateTimeOrigin, generate_timestamp_col, - to_datetime_require_fallback, + raise_if_to_datetime_not_supported, to_snowflake_timestamp_format, ) from snowflake.snowpark.modin.plugin._internal.transpose_utils import ( @@ -1556,33 +1553,6 @@ def get_index_names(self, axis: int = 0) -> list[Hashable]: else self._modin_frame.data_column_pandas_index_names ) - 
def _binary_op_fallback( - self, - op: str, - other: Union[Scalar, AnyArrayLike, "pd.Series", "pd.DataFrame"], - axis: int, - squeeze_self: bool = False, - **kwargs: Any, - ) -> "SnowflakeQueryCompiler": - """this is a fallback till all binary operations are implemented.""" - - if axis == 1: - kwargs["axis"] = axis - if squeeze_self: - kwargs["squeeze_self"] = squeeze_self - pandas_op = getattr( - native_pd.Series if squeeze_self else native_pd.DataFrame, op - ) - - # Hotfix: SNOW-1062890 For some reason kwargs are passed for DataFrame.__and__ and DataFrame.__or__. - # However, pandas does not accept kwargs nor args for this function. Reset them here manually for affected - # operators. Could also add additional condition in axis==1 if condition, but better be explicit here - # for operator behavior. - if op in ["__and__", "__rand__", "__or__", "__ror__"]: - kwargs = {} - - return BinaryDefault.register(pandas_op)(self, other, **kwargs) - def _binary_op_scalar_rhs( self, op: str, other: Scalar, fill_value: Scalar ) -> "SnowflakeQueryCompiler": @@ -1787,11 +1757,11 @@ def binary_op( # In native pandas, single element list-like objects can be used as fill_value, however this does not # match pandas documentation; hence it is omitted in the Snowpark pandas implementation. raise ValueError("Only scalars can be used as fill_value.") - # add fill_value to kwargs in case fallback is called - kwargs["fill_value"] = fill_value if not is_binary_op_supported(op): - return self._binary_op_fallback(op, other, axis, squeeze_self, **kwargs) + ErrorMessage.not_implemented( + f"Snowpark pandas doesn't yet support '{op}' binary operation" + ) if is_scalar(other): # (Case 1): other is scalar @@ -1825,38 +1795,6 @@ def binary_op( # 10 NaN # dtype: float64 - # The logic should produce valid results but we do only have tests for the AND/OR/+ scenario, so - # conservatively use fallback here. TODO SNOW-913842 will remove this and add extensive testing. 
- if op not in { - "__or__", - "__ror__", - "__and__", - "__rand__", - "add", - "radd", - "sub", - "rsub", - "mul", - "rmul", - "truediv", - "rtruediv", - "floordiv", - "rfloordiv", - "mod", - "rmod", - "pow", - "rpow", - "eq", - "ne", - "gt", - "lt", - "ge", - "le", - }: - return self._binary_op_fallback( # pragma: no cover - op, other, axis, squeeze_self, **kwargs - ) - lhs_frame = self._modin_frame rhs_frame = other._query_compiler._modin_frame @@ -1969,17 +1907,11 @@ def _bool_reduce_helper( if not empty_columns and not all( is_bool_dtype(t) or is_integer_dtype(t) for t in self.dtypes ): - # Default if columns are non-integer/boolean - return DataFrameDefault.register( - native_pd.DataFrame.all - if reduce_op == "and" - else native_pd.DataFrame.any - )( - self, - axis=axis, - bool_only=_bool_only, - skipna=skipna, - ) # pragma: no cover + api_name = "all" if reduce_op == "and" else "any" + # Raise error if columns are non-integer/boolean + ErrorMessage.not_implemented( + f"Snowpark pandas {api_name} API doesn't yet support non-integer/boolean columns" + ) if axis == 1: # append a new column representing the reduction of all the columns @@ -2388,6 +2320,11 @@ def sort_index( ErrorMessage.not_implemented( "sort_index is not supported yet with inplace=True in Snowpark pandas." ) + if key: + ErrorMessage.not_implemented( + "Snowpark pandas sort_index API doesn't yet support 'key' parameter" + ) + if self._modin_frame.is_multiindex() or level is not None: ErrorMessage.not_implemented( "sort_index() with multi index is not supported yet in Snowpark pandas." @@ -2402,6 +2339,25 @@ def sort_index( key=key, ) + def sort_columns_by_row_values(self, rows, ascending=True, **kwargs): + """ + Reorder the columns based on the lexicographic order of the given rows. + + Args: + rows : label or list of labels + The row or rows to sort by. + ascending : bool, default: True + Sort in ascending order (True) or descending order (False). 
+ **kwargs : dict + Serves the compatibility purpose. Does not affect the result. + + Returns: + New QueryCompiler that contains result of the sort. + """ + ErrorMessage.not_implemented( + "Snowpark pandas sort_values API doesn't yet support axis == 1" + ) + def sort_rows_by_column_values( self, columns: list[Hashable], @@ -2433,20 +2389,8 @@ def sort_rows_by_column_values( return self if key: - # TODO SNOW-828589: Move all warning messages to single place. - logging.warning( - "Snowpark pandas doesn't currently support distributed computation of sort_values with 'key'." - ) - # This method will execute the sort operation using fallback on stored proc/vectorized udf. - return DataFrameDefault.register(native_pd.DataFrame.sort_values)( - self, - by=columns, - axis=0, - ascending=ascending, - kind=kind, - na_position=na_position, - ignore_index=ignore_index, - key=key, + ErrorMessage.not_implemented( + "Snowpark pandas sort_values API doesn't yet support 'key' parameter" ) # In native pandas, 'kind' option is only applied when sorting on a single column or label. @@ -4659,22 +4603,19 @@ def rename( # generate the new index columns in parallel if callable(index_renamer): # TODO: use df.apply() to handle callable - # currently use fallback and have to pull all index values - internal_frame = DataFrameDefault.register(native_pd.DataFrame.rename)( - self, index=index_renamer, level=level, errors=errors - )._modin_frame + ErrorMessage.not_implemented( + "Snowpark pandas rename API doesn't yet support callable mapper" + ) else: - # TODO: SNOW-841607 support multiindex in join_utils.join. Now all multiindex cases are fallback to SP + # TODO: SNOW-841607 support multiindex in join_utils.join. Now all multiindex cases are not supported. 
if ( self._modin_frame.is_multiindex(axis=0) or self._modin_frame.is_multiindex(axis=1) or index_renamer._query_compiler._modin_frame.is_multiindex(axis=0) ): - internal_frame = DataFrameDefault.register( - native_pd.DataFrame.rename - )( - self, index=index_renamer, level=level, errors=errors - )._modin_frame + ErrorMessage.not_implemented( + "Snowpark pandas rename API is not yet supported for multi-index objects" + ) else: index_col_id = ( internal_frame.index_column_snowflake_quoted_identifiers[0] @@ -4788,21 +4729,9 @@ def dataframe_to_datetime( SnowflakeQueryCompiler: QueryCompiler with a single data column converted to datetime dtype. """ - if to_datetime_require_fallback( + raise_if_to_datetime_not_supported( format, exact, infer_datetime_format, origin, errors - ): - return DataFrameDefault.register(native_pd.to_datetime)( - self, - errors=errors, - dayfirst=dayfirst, - yearfirst=yearfirst, - utc=utc, - format=format, - exact=exact, - unit=unit, - infer_datetime_format=infer_datetime_format, - origin=origin, - ) + ) if origin != "unix": """ Non-default values of the `origin` argument are only valid for scalars and 1D arrays. @@ -4847,22 +4776,13 @@ def dataframe_to_datetime( ) id_to_sf_type_map = self._modin_frame.quoted_identifier_to_snowflake_type() - # fallback if the original data type is not integer. Note pandas will always cast other types to integer and + # Raise error if the original data type is not integer. Note pandas will always cast other types to integer and # the way it does is not quite straightforward to implement. For example, a month value 3.1 will be cast to # March with 10 days and the 10 days will be added with what values in the day column. 
for sf_type in id_to_sf_type_map.values(): if not isinstance(sf_type, _IntegralType): - return DataFrameDefault.register(native_pd.to_datetime)( - self, - errors=errors, - dayfirst=dayfirst, - yearfirst=yearfirst, - utc=utc, - format=format, - exact=exact, - unit=unit, - infer_datetime_format=infer_datetime_format, - origin=origin, + ErrorMessage.not_implemented( + "Snowpark pandas to_datetime API doesn't yet support non integer types" ) # if the column is already integer, we can use Snowflake timestamp_ntz_from_parts function to handle it # since timestamp_ntz_from_parts only allows nanosecond as the fraction input, we generate it from the @@ -4938,21 +4858,9 @@ def series_to_datetime( SnowflakeQueryCompiler: QueryCompiler with a single data column converted to datetime dtype. """ - if to_datetime_require_fallback( + raise_if_to_datetime_not_supported( format, exact, infer_datetime_format, origin, errors - ): - return SeriesDefault.register(native_pd.to_datetime)( - self, - errors=errors, - dayfirst=dayfirst, - yearfirst=yearfirst, - utc=utc, - format=format, - exact=exact, - unit=unit, - infer_datetime_format=infer_datetime_format, - origin=origin, - ) + ) # convert format to sf_format which will be valid to use by to_timestamp functions in Snowflake sf_format = ( to_snowflake_timestamp_format(format) if format is not None else None @@ -5423,19 +5331,8 @@ def merge( SnowflakeQueryCompiler instance with merged result. """ if validate: - return DataFrameDefault.register(native_pd.DataFrame.merge)( - self, - right=right, - how=how, - left_on=left_on, - right_on=right_on, - left_index=left_index, - right_index=right_index, - sort=sort, - suffixes=suffixes, - copy=copy, - indicator=indicator, - validate=validate, + ErrorMessage.not_implemented( + "Snowpark pandas merge API doesn't yet support 'validate' parameter" ) left = self @@ -6061,79 +5958,77 @@ def apply( Keyword arguments to pass to `func`. 
""" - # axis=0 will always be processed with stored procedure fallback. + # axis=0 is not supported, raise error. + if axis == 0: + ErrorMessage.not_implemented( + "Snowpark pandas apply API doesn't yet support axis == 0" + ) # Only callables are supported for axis=1 mode for now. - require_fallback = ( - (not callable(func) and not isinstance(func, UserDefinedFunction)) - or axis == 0 - or result_type is not None - or check_snowpark_pandas_object_in_arg(args) - or check_snowpark_pandas_object_in_arg(kwargs) - ) - if require_fallback: - return DataFrameDefault.register(native_pd.DataFrame.apply)( - self, - func=func, - axis=axis, - raw=raw, - result_type=result_type, - args=args, - **kwargs, + if not callable(func) and not isinstance(func, UserDefinedFunction): + ErrorMessage.not_implemented( + "Snowpark pandas apply API only supports callables func" + ) + if result_type is not None: + ErrorMessage.not_implemented( + "Snowpark pandas apply API doesn't yet support 'result_type' parameter" + ) + if check_snowpark_pandas_object_in_arg( + args + ) or check_snowpark_pandas_object_in_arg(kwargs): + ErrorMessage.not_implemented( + "Snowpark pandas apply API doesn't yet support DataFrame or Series in 'args' or 'kwargs' of 'func'" ) - else: - # Any logic below applies only to the axis = 1 case. Ensure this is true. 
- assert axis == 1 - - # get input types of all data columns from the dataframe directly - input_types = [ - datatype - for quoted_identifier, datatype in self._modin_frame.quoted_identifier_to_snowflake_type().items() - if quoted_identifier - in self._modin_frame.data_column_snowflake_quoted_identifiers - ] - # current columns - column_index = self._modin_frame.data_columns_index + # get input types of all data columns from the dataframe directly + input_types = [ + datatype + for quoted_identifier, datatype in self._modin_frame.quoted_identifier_to_snowflake_type().items() + if quoted_identifier + in self._modin_frame.data_column_snowflake_quoted_identifiers + ] - # Extract return type from annotations (or lookup for known pandas functions) for func object, - # if not return type could be extracted the variable will hold None. - return_type = deduce_return_type_from_function(func) + # current columns + column_index = self._modin_frame.data_columns_index - # Check whether return_type has been extracted. If return type is not - # a Series, tuple or list object, compute df.apply using a vUDF. In this case no column expansion needs to - # be performed which means that the result of df.apply(axis=1) is always a Series object. - if return_type and not ( - isinstance(return_type, PandasSeriesType) - or isinstance(return_type, ArrayType) - ): - return self._apply_udf_row_wise_and_reduce_to_series_along_axis_1( - func, - column_index, - input_types, - return_type, - udf_args=args, - udf_kwargs=kwargs, - session=self._modin_frame.ordered_dataframe.session, - ) - else: - # Issue actionable warning for users to consider annotating UDF with type annotations - # for better performance. - function_name = ( - func.__name__ if isinstance(func, Callable) else str(func) # type: ignore[arg-type] - ) - WarningMessage.single_warning( - f"Function {function_name} passed to apply does not have type annotations," - f" or Snowpark pandas could not extract type annotations. 
Executing apply" - f" in slow code path which may result in decreased performance. " - f"To disable this warning and improve performance, consider annotating" - f" {function_name} with type annotations." - ) + # Extract return type from annotations (or lookup for known pandas functions) for func object, + # if not return type could be extracted the variable will hold None. + return_type = deduce_return_type_from_function(func) - # Result may need to get expanded into multiple columns, or return type of func is not known. - # Process using UDTF together with dynamic pivot for either case. - return self._apply_with_udtf_and_dynamic_pivot_along_axis_1( - func, raw, result_type, args, column_index, input_types, **kwargs - ) + # Check whether return_type has been extracted. If return type is not + # a Series, tuple or list object, compute df.apply using a vUDF. In this case no column expansion needs to + # be performed which means that the result of df.apply(axis=1) is always a Series object. + if return_type and not ( + isinstance(return_type, PandasSeriesType) + or isinstance(return_type, ArrayType) + ): + return self._apply_udf_row_wise_and_reduce_to_series_along_axis_1( + func, + column_index, + input_types, + return_type, + udf_args=args, + udf_kwargs=kwargs, + session=self._modin_frame.ordered_dataframe.session, + ) + else: + # Issue actionable warning for users to consider annotating UDF with type annotations + # for better performance. + function_name = ( + func.__name__ if isinstance(func, Callable) else str(func) # type: ignore[arg-type] + ) + WarningMessage.single_warning( + f"Function {function_name} passed to apply does not have type annotations," + f" or Snowpark pandas could not extract type annotations. Executing apply" + f" in slow code path which may result in decreased performance. " + f"To disable this warning and improve performance, consider annotating" + f" {function_name} with type annotations." 
+ ) + + # Result may need to get expanded into multiple columns, or return type of func is not known. + # Process using UDTF together with dynamic pivot for either case. + return self._apply_with_udtf_and_dynamic_pivot_along_axis_1( + func, raw, result_type, args, column_index, input_types, **kwargs + ) def applymap( self, @@ -6155,14 +6050,11 @@ def applymap( """ # Currently, NULL values are always passed into the udtf even if strict=True, # which is a bug on the server side SNOW-880105. - # The fix will not land soon, so in order to implement na_action=ignore, - # we will use fallback solution for now. + # The fix will not land soon, so we are going to raise not implemented error for now. + # TODO SNOW-1332314: linked jira is fixed now. Verify and enable this. if na_action == "ignore": - return DataFrameDefault.register(native_pd.DataFrame.applymap)( - self, - func=func, - na_action=na_action, - **kwargs, + ErrorMessage.not_implemented( + "Snowpark pandas applymap API doesn't yet support na_action == 'ignore'" ) return_type = deduce_return_type_from_function(func) if not return_type: @@ -6199,15 +6091,16 @@ def map( # TODO SNOW-801847: support series.map when arg is a dict/series # Currently, NULL values are always passed into the udtf even if strict=True, # which is a bug on the server side SNOW-880105. - # The fix will not land soon, so in order to implement na_action=ignore, - # we will use fallback solution for now. - if not callable(arg) or na_action == "ignore": - return SeriesDefault.register(native_pd.Series.map)( - self, - arg=arg, - na_action=na_action, + # The fix will not land soon, so we are going to raise not implemented error for now. + # TODO SNOW-1332314: linked jira is fixed now. Verify and enable this. 
+ if na_action == "ignore": + ErrorMessage.not_implemented( + "Snowpark pandas map API doesn't yet support na_action == 'ignore'" + ) + if not callable(arg): + ErrorMessage.not_implemented( + "Snowpark pandas map API doesn't yet support non callable 'arg'" ) - return self.applymap(func=arg, na_action=na_action) def apply_on_series( @@ -6228,16 +6121,15 @@ def apply_on_series( assert self.is_series_like() # TODO SNOW-856682: support other types (str, list, dict) of func - if ( - not callable(func) - or check_snowpark_pandas_object_in_arg(args) - or check_snowpark_pandas_object_in_arg(kwargs) - ): - return SeriesDefault.register(native_pd.Series.apply)( - self, - func=func, - args=args, - **kwargs, + if not callable(func): + ErrorMessage.not_implemented( + "Snowpark pandas apply API only supports callables func" + ) + if check_snowpark_pandas_object_in_arg( + args + ) or check_snowpark_pandas_object_in_arg(kwargs): + ErrorMessage.not_implemented( + "Snowpark pandas apply API doesn't yet support DataFrame or Series in 'args' or 'kwargs' of 'func'" ) return self.applymap(func, args=args, **kwargs) @@ -6664,10 +6556,10 @@ def nunique( if not isinstance(dropna, bool): raise ValueError("dropna must be of type bool") # support axis=0 only where unique values per column are counted using COUNT(DISTINCT) - # fallback for axis=1 where unique values row-wise are counted + # raise error for axis=1 where unique values row-wise are counted if axis == 1: - return DataFrameDefault.register(native_pd.DataFrame.nunique)( - self, axis=axis, dropna=dropna, **kwargs + ErrorMessage.not_implemented( + "Snowpark pandas nunique API doesn't yet support axis == 1" ) else: # Result is basically a series with the column labels as index and the distinct count as values @@ -6741,9 +6633,12 @@ def to_numeric( return self if errors == "ignore": - # if any value is failed to parse, to_numeric returns the original series when error = 'ignore'. 
This - # requirement is hard to implement in Snowpark pandas so fallback for now. - return SeriesDefault.register(native_pd.to_numeric)(self, errors=errors) + # if any value is failed to parse, to_numeric returns the original series + # when error = 'ignore'. This requirement is hard to implement in Snowpark + # pandas raise error for now. + ErrorMessage.not_implemented( + "Snowpark pandas to_numeric API doesn't yet support errors == 'ignore'" + ) new_col = col(col_id) new_col_type_is_numeric = False @@ -7146,8 +7041,8 @@ def astype( New QueryCompiler with updated dtypes. """ if errors != "raise": - return DataFrameDefault.register(native_pd.DataFrame.astype)( - self, col_dtypes_map, errors=errors + ErrorMessage.not_implemented( + f"Snowpark pandas astype API doesn't yet support errors == '{errors}'" ) col_dtypes_curr = { k: v for k, v in self.dtypes.to_dict().items() if k in col_dtypes_map @@ -7403,13 +7298,13 @@ def mask( other : Optional Scalar or SnowflakeQueryCompiler Entries where cond is True are replaced with corresponding value from other. To keep things simple if the other is not a SnowflakeQueryCompiler or scalar primitive like int, float, str, bool then we - go through the fallback path. + raise not implemented error. axis : int, default None - Alignment axis if needed. This will fallback if not the default. + Alignment axis if needed. This will raise not implemented error if not the default. level : int, default None - Alignment level if needed. This will fallback if not the default. + Alignment level if needed. This will raise not implemented error if not the default. needs_positional_join_for_cond : bool, default False Align condition and self by position rather than labels. Necessary when condition is a NumPy object. @@ -7464,13 +7359,13 @@ def where( other : Optional Scalar or SnowflakeQueryCompiler Entries where cond is False are replaced with corresponding value from other. 
To keep things simple if the other is not a SnowflakeQueryCompiler or scalar primitive like int, float, str, bool then we - go through the fallback path. + raise not implemented error. axis : int, default None - Alignment axis if needed. This will fallback if not the default. + Alignment axis if needed. This will raise not implemented error if not the default. level : int, default None - Alignment level if needed. This will fallback if not the default. + Alignment level if needed. This will raise not implemented error if not the default. needs_positional_join_for_cond : bool, default False Align condition and self by position rather than labels. Necessary when condition is a NumPy object. @@ -7496,29 +7391,30 @@ def where( SnowflakeQueryCompiler New SnowflakeQueryCompiler with where result. """ - # Go through fallback path if level is specified, or other is not snowflake query compiler or - # involves more complex scalar type (not simple scalar types like int or float) then we defer to the fallback - # case to ensure better consistency with pandas. 
+ # Raise not implemented error if level is specified, or other is not snowflake query compiler or + # involves more complex scalar type (not simple scalar types like int or float) from snowflake.snowpark.modin.pandas.utils import is_scalar other_is_series_self_is_not = (getattr(self, "_shape_hint", None) is None) and ( getattr(other, "_shape_hint", None) == "column" ) - if ( - (axis is not None and not other_is_series_self_is_not) - or level is not None - or ( - other is not None - and not isinstance(other, SnowflakeQueryCompiler) - and not is_scalar(other) + if axis is not None and not other_is_series_self_is_not: + ErrorMessage.not_implemented( + "Snowpark pandas where API doesn't yet support axis parameter when 'other' is Series" ) + + if level is not None: + ErrorMessage.not_implemented( + "Snowpark pandas where API doesn't yet support level parameter" + ) + + if ( + other is not None + and not isinstance(other, SnowflakeQueryCompiler) + and not is_scalar(other) ): - return DataFrameDefault.register(native_pd.DataFrame.where)( - self, - cond=cond, - other=other if other else None, - axis=axis, - level=level, + ErrorMessage.not_implemented( + "Snowpark pandas where API only supports scalar, DataFrame and Series as 'other' parameter" ) frame = self._modin_frame @@ -7922,15 +7818,15 @@ def fillna( BaseQueryCompiler New QueryCompiler with all null values filled. 
""" - default_class = SeriesDefault if self_is_series else DataFrameDefault - fallback_func = ( - native_pd.Series.fillna if self_is_series else native_pd.DataFrame.fillna - ) - use_fallback = False - if limit or downcast: - # fallback before having parallel implementation - # TODO: SNOW-891788 support limit - use_fallback = True + # TODO: SNOW-891788 support limit + if limit: + ErrorMessage.not_implemented( + "Snowpark pandas fillna API doesn't yet support 'limit' parameter" + ) + if downcast: + ErrorMessage.not_implemented( + "Snowpark pandas fillna API doesn't yet support 'downcast' parameter" + ) # case 1: fillna df with another df or fillna series with another series/dict if (self_is_series and isinstance(value, (dict, pd.Series))) or ( @@ -7940,16 +7836,6 @@ def fillna( value = pd.Series(value) return self.where(cond=self.notna(), other=value._query_compiler) - if use_fallback: - return default_class.register(fallback_func)( - self, - value, - method=method, - axis=axis, - limit=limit, - downcast=downcast, - ) - # case 2: fillna with a method if method is not None: method = FillNAMethod.get_enum_for_string_method(method) @@ -8048,12 +7934,8 @@ def dropna( New QueryCompiler with null values dropped along given axis. """ if axis == 1: - return DataFrameDefault.register(native_pd.DataFrame.dropna)( - self, - axis=axis, - how=how, - thresh=thresh, - subset=subset, + ErrorMessage.not_implemented( + "Snowpark pandas dropna API doesn't yet support axis == 1" ) # reuse Snowpark Dataframe's dropna API and make sure to define subset correctly, i.e., only contain data @@ -8145,29 +8027,13 @@ def setitem( New QueryCompiler with updated `key` value. """ - # use fallback for axis=1 which is similar to loc functionality. Setitem for axis=1 + # raise error for axis=1 which is similar to loc functionality. Setitem for axis=1 # should be done as part of write scenarios for .loc tracked in SNOW-812522. # Efficient implementation requires transpose of single-row. 
if 1 == axis: - - def setitem( - df: pd.DataFrame, - key: IndexLabel, - value: Union["SnowflakeQueryCompiler", list[Any], Any], - ) -> pd.DataFrame: - # no cover here, because executed remotely - from snowflake.snowpark.modin.pandas.utils import ( - is_scalar, # pragma: no cover - ) - - if is_scalar(key) and isinstance( - value, native_pd.DataFrame - ): # pragma: no cover - value = value.squeeze() # pragma: no cover - df.loc[key] = value # pragma: no cover - return df # pragma: no cover - - return DataFrameDefault.register(setitem)(self, key=key, value=value) + ErrorMessage.not_implemented( + "Snowpark pandas setitem API doesn't yet support axis == 1" + ) # for axis=0, update column for key loc = self._modin_frame.data_column_pandas_labels.index(key) @@ -12396,3 +12262,12 @@ def cut( ) return bins, SnowflakeQueryCompiler(ret_frame) + + def str_casefold(self) -> "SnowflakeQueryCompiler": + """ + Returns: + New query compiler with updated values. + """ + ErrorMessage.not_implemented( + "Snowpark pandas doesn't yet support casefold method" + ) diff --git a/tests/integ/modin/binary/test_binary_default2pandas.py b/tests/integ/modin/binary/test_binary_default2pandas.py deleted file mode 100644 index 759089beed1..00000000000 --- a/tests/integ/modin/binary/test_binary_default2pandas.py +++ /dev/null @@ -1,81 +0,0 @@ -# -# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. 
-# - -import modin.pandas as pd -import numpy as np -import pandas as native_pd -import pytest - -import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.conftest import running_on_public_ci -from tests.integ.modin.sql_counter import sql_count_checker -from tests.integ.modin.utils import ( - assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, - eval_snowpark_pandas_result, -) - - -@pytest.fixture(scope="function") -def snow_and_native_df(): - data = [[1, 2], [3, 4]] - snow_df = pd.DataFrame(data) - native_df = native_pd.DataFrame(data) - return snow_df, native_df - - -@pytest.fixture(scope="function") -def snow_and_native_df_nan(): - data = [[1, 2], [3, np.nan]] - snow_df = pd.DataFrame(data) - native_df = native_pd.DataFrame(data) - return snow_df, native_df - - -# TODO: SNOW-1056369 : Implement binary operation __xor__ -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.parametrize("func", [lambda df: df.__xor__([-1, 0])]) -@sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) -def test_binary_op_on_list_like_value(snow_and_native_df, func): - eval_snowpark_pandas_result(*snow_and_native_df, func) - - -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") -@sql_count_checker(query_count=10, fallback_count=1, sproc_count=1) -def test_binary_xor_on_df(snow_and_native_df): - eval_snowpark_pandas_result(*snow_and_native_df, lambda df: df ^ df) - - -@pytest.mark.skip( - reason="TODO: SNOW-896220 support dot. 
It raises NotImplementedError today" -) -@pytest.mark.parametrize( - "func", - [ - lambda df: df.dot(df), - lambda df: df[0].dot(df[1]), - lambda df: df[0] @ df, - lambda df: df @ df[1], - ], -) -def test_binary_op_dot(snow_and_native_df, func): - def compare(snow_result, pd_result, **kwargs): - if not isinstance( - pd_result, (native_pd.DataFrame, native_pd.Series) - ) and not isinstance(snow_result, (pd.DataFrame, pd.Series)): - assert pd_result == snow_result - else: - assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( - snow_result, pd_result, **kwargs - ) - - eval_snowpark_pandas_result(*snow_and_native_df, func, comparator=compare) diff --git a/tests/integ/modin/binary/test_binary_not_implemented.py b/tests/integ/modin/binary/test_binary_not_implemented.py new file mode 100644 index 00000000000..a23e91055a5 --- /dev/null +++ b/tests/integ/modin/binary/test_binary_not_implemented.py @@ -0,0 +1,54 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. 
+# + +import modin.pandas as pd +import numpy as np +import pandas as native_pd +import pytest + +import snowflake.snowpark.modin.plugin # noqa: F401 +from tests.integ.modin.sql_counter import sql_count_checker + + +@pytest.fixture(scope="function") +def snow_and_native_df(): + data = [[1, 2], [3, 4]] + snow_df = pd.DataFrame(data) + native_df = native_pd.DataFrame(data) + return snow_df, native_df + + +@pytest.fixture(scope="function") +def snow_and_native_df_nan(): + data = [[1, 2], [3, np.nan]] + snow_df = pd.DataFrame(data) + native_df = native_pd.DataFrame(data) + return snow_df, native_df + + +# TODO: SNOW-1056369 : Implement binary operation __xor__ +@pytest.mark.parametrize("func", [lambda df: df.__xor__([-1, 0]), lambda df: df ^ df]) +@sql_count_checker(query_count=0) +def test_binary_op_xor(snow_and_native_df, func): + snow_df, _ = snow_and_native_df + msg = "Snowpark pandas doesn't yet support '__xor__' binary operation" + with pytest.raises(NotImplementedError, match=msg): + func(snow_df) + + +@pytest.mark.parametrize( + "func", + [ + lambda df: df.dot(df), + lambda df: df[0].dot(df[1]), + lambda df: df[0] @ df, + lambda df: df @ df[1], + ], +) +@sql_count_checker(query_count=0) +def test_binary_op_dot(snow_and_native_df, func): + snow_df, _ = snow_and_native_df + msg = "Snowpark pandas doesn't yet support 'dot' binary operation" + with pytest.raises(NotImplementedError, match=msg): + func(snow_df) diff --git a/tests/integ/modin/frame/test_all_any.py b/tests/integ/modin/frame/test_all_any.py index c69b6adf4ac..c3a8befe5d7 100644 --- a/tests/integ/modin/frame/test_all_any.py +++ b/tests/integ/modin/frame/test_all_any.py @@ -10,8 +10,7 @@ from pytest import param import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.conftest import running_on_public_ci -from tests.integ.modin.sql_counter import SqlCounter +from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( 
assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, assert_values_equal, @@ -249,12 +248,6 @@ def test_any_bool_only(data, axis): ) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") @pytest.mark.parametrize("axis", [0, 1, None]) @pytest.mark.parametrize( "data", @@ -264,26 +257,14 @@ def test_any_bool_only(data, axis): ], ) @pytest.mark.parametrize("skipna", [True, False]) -def test_all_float_fallback(data, axis, skipna): - # Because axis=None calls the method with axis=0 twice, it incurs an extra query - # to check the length of the index after the first call is handled by a fallback - with SqlCounter( - query_count=9 if axis is None else 8, fallback_count=1, sproc_count=1 - ): - eval_snowpark_pandas_result( - pd.DataFrame(data), - native_pd.DataFrame(data), - lambda df: df.all(axis=axis, skipna=skipna), - comparator=boolagg_comparator(axis), - ) +@sql_count_checker(query_count=0) +def test_all_float_not_implemented(data, axis, skipna): + df = pd.DataFrame(data) + msg = "Snowpark pandas all API doesn't yet support non-integer/boolean columns" + with pytest.raises(NotImplementedError, match=msg): + df.all(axis=axis, skipna=skipna) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") @pytest.mark.parametrize("axis", [0, 1, None]) @pytest.mark.parametrize( "data", @@ -293,26 +274,14 @@ def test_all_float_fallback(data, axis, skipna): ], ) @pytest.mark.parametrize("skipna", [True, False]) -def test_any_float_fallback(data, axis, skipna): - # Because axis=None calls the method with axis=0 twice, it incurs an extra query - # to check the length of the index after the first call is handled by 
a fallback - with SqlCounter( - query_count=9 if axis is None else 8, fallback_count=1, sproc_count=1 - ): - eval_snowpark_pandas_result( - pd.DataFrame(data), - native_pd.DataFrame(data), - lambda df: df.any(axis=axis, skipna=skipna), - comparator=boolagg_comparator(axis), - ) +@sql_count_checker(query_count=0) +def test_any_float_not_implemented(data, axis, skipna): + df = pd.DataFrame(data) + msg = "Snowpark pandas any API doesn't yet support non-integer/boolean columns" + with pytest.raises(NotImplementedError, match=msg): + df.any(axis=axis, skipna=skipna) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") @pytest.mark.parametrize("axis", [0, 1, None]) @pytest.mark.parametrize( "data", @@ -320,26 +289,14 @@ def test_any_float_fallback(data, axis, skipna): {"a": ["", "b", "c"], "b": ["d", "e", "f"]}, ], ) -def test_all_str_fallback(data, axis): - # Because axis=None calls the method with axis=0 twice, it incurs an extra query - # to check the length of the index after the first call is handled by a fallback - with SqlCounter( - query_count=9 if axis is None else 8, fallback_count=1, sproc_count=1 - ): - eval_snowpark_pandas_result( - pd.DataFrame(data), - native_pd.DataFrame(data), - lambda df: df.all(axis=axis), - comparator=boolagg_comparator(axis), - ) +@sql_count_checker(query_count=0) +def test_all_str_not_implemented(data, axis): + df = pd.DataFrame(data) + msg = "Snowpark pandas all API doesn't yet support non-integer/boolean columns" + with pytest.raises(NotImplementedError, match=msg): + df.all(axis=axis) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") 
@pytest.mark.parametrize("axis", [0, 1, None]) @pytest.mark.parametrize( "data", @@ -347,15 +304,9 @@ def test_all_str_fallback(data, axis): {"a": ["", "b", "c"], "b": ["", "e", "f"]}, ], ) -def test_any_str_fallback(data, axis): - # Because axis=None calls the method with axis=0 twice, it incurs an extra query - # to check the length of the index after the first call is handled by a fallback - with SqlCounter( - query_count=9 if axis is None else 8, fallback_count=1, sproc_count=1 - ): - eval_snowpark_pandas_result( - pd.DataFrame(data), - native_pd.DataFrame(data), - lambda df: df.any(axis=axis), - comparator=boolagg_comparator(axis), - ) +@sql_count_checker(query_count=0) +def test_any_str_not_implemented(data, axis): + df = pd.DataFrame(data) + msg = "Snowpark pandas any API doesn't yet support non-integer/boolean columns" + with pytest.raises(NotImplementedError, match=msg): + df.any(axis=axis) diff --git a/tests/integ/modin/frame/test_apply.py b/tests/integ/modin/frame/test_apply.py index e304c0f2507..83c64825871 100644 --- a/tests/integ/modin/frame/test_apply.py +++ b/tests/integ/modin/frame/test_apply.py @@ -21,7 +21,6 @@ DEFAULT_UDTF_PARTITION_SIZE, ) from snowflake.snowpark.types import DoubleType, PandasSeriesType -from tests.integ.conftest import running_on_public_ci from tests.integ.modin.series.test_apply import create_func_with_return_type_hint from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( @@ -365,58 +364,34 @@ def f(x, y, z=1) -> int: ) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") -class TestDefault2Pandas: - @pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, - ) +class TestNotImplemented: @pytest.mark.parametrize("data, func, return_type", BASIC_DATA_FUNC_RETURN_TYPE_MAP) - @sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) + 
@sql_count_checker(query_count=0) def test_axis_0(self, data, func, return_type): - native_df = native_pd.DataFrame(data) snow_df = pd.DataFrame(data) - eval_snowpark_pandas_result(snow_df, native_df, lambda x: x.apply(func)) + msg = "Snowpark pandas apply API doesn't yet support axis == 0" + with pytest.raises(NotImplementedError, match=msg): + snow_df.apply(func) - @pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, - ) @pytest.mark.parametrize("result_type", ["reduce", "expand", "broadcast"]) - @sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) + @sql_count_checker(query_count=0) def test_result_type(self, result_type): snow_df = pd.DataFrame([[1, 2], [3, 4]]) - native_df = native_pd.DataFrame([[1, 2], [3, 4]]) - eval_snowpark_pandas_result( - snow_df, - native_df, - lambda x: x.apply(lambda x: [1, 2], result_type=result_type), - ) + msg = "Snowpark pandas apply API doesn't yet support 'result_type' parameter" + with pytest.raises(NotImplementedError, match=msg): + snow_df.apply(lambda x: [1, 2], axis=1, result_type=result_type) - @pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, - ) - @sql_count_checker( - query_count=20, fallback_count=2, sproc_count=2, expect_high_count=True - ) + @sql_count_checker(query_count=0) def test_axis_1_apply_args_kwargs_with_snowpandas_object(self): def f(x, y=None) -> native_pd.Series: return x + (y if y is not None else 0) - native_df = native_pd.DataFrame([[1, 2], [3, 4]]) snow_df = pd.DataFrame([[1, 2], [3, 4]]) - assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( - snow_df.apply(f, axis=1, args=(pd.Series([1, 2]),)), - native_df.apply(f, axis=1, args=(native_pd.Series([1, 2]),)), - ) - assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( - snow_df.apply(f, axis=1, 
y=pd.Series([1, 2])), - native_df.apply(f, axis=1, y=native_pd.Series([1, 2])), - ) + msg = "Snowpark pandas apply API doesn't yet support DataFrame or Series in 'args' or 'kwargs' of 'func'" + with pytest.raises(NotImplementedError, match=msg): + snow_df.apply(f, axis=1, args=(pd.Series([1, 2]),)) + with pytest.raises(NotImplementedError, match=msg): + snow_df.apply(f, axis=1, y=pd.Series([1, 2])) TEST_INDEX_1 = native_pd.MultiIndex.from_tuples( @@ -624,19 +599,13 @@ def g(v): ] -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) @pytest.mark.parametrize("data, apply_func", TRANSFORM_DATA_FUNC_MAP) -@sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) +@sql_count_checker(query_count=0) def test_basic_dataframe_transform(data, apply_func): - snow_df = pd.DataFrame(data) - native_df = native_pd.DataFrame(data) - eval_snowpark_pandas_result( - snow_df, native_df, lambda x: x.transform(apply_func), atol=0.1 - ) + msg = "Snowpark pandas apply API doesn't yet support axis == 0" + with pytest.raises(NotImplementedError, match=msg): + snow_df = pd.DataFrame(data) + snow_df.transform(apply_func) AGGREGATION_FUNCTIONS = [ @@ -661,17 +630,12 @@ def test_dataframe_transform_aggregation_negative(func): snow_df.transform(func) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@sql_count_checker(query_count=4) +@sql_count_checker(query_count=0) def test_dataframe_transform_invalid_function_name_negative(session): snow_df = pd.DataFrame([[0, 1, 2], [1, 2, 3]]) with pytest.raises( - SnowparkSQLException, - match="Python Interpreter Error", + NotImplementedError, + match="Snowpark pandas apply API doesn't yet support axis == 0", ): snow_df.transform("mxyzptlk") @@ -876,83 +840,6 @@ def 
test_apply_axis_1_frame_with_column_of_all_nulls_snow_1233832(null_value): import scipy.stats # noqa: E402 -# used for testing -import statsmodels # noqa: E402 - - -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=SnowparkSQLException, -) -@pytest.mark.parametrize( - "packages,expected_query_count", - [ - (["statsmodels", "numpy"], 11), - (["statsmodels==0.14.0", "numpy>=1.0"], 11), - pytest.param( - [statsmodels, np], - 15, - marks=pytest.mark.xfail( - reason="Snowpark package resolver mismatch, resolved to 2.2.1 from statsmodels but clashes with Snowpark pandas version 2.1.4.", - ), - ), - ], -) -def test_apply_axis0_with_3rd_party_libraries_and_decorator( - packages, expected_query_count -): - x = np.linspace(0, 5, 100) - y = x + np.random.normal(size=len(x)) - data = {"XY": list(zip(list(x), list(y)))} - - with SqlCounter( - query_count=expected_query_count, - fallback_count=1, - sproc_count=1, - high_count_expected=True, - high_count_reason="package upload", - ): - - df = pd.DataFrame(data) - # Capture setting. - custom_package_usage_config = pd.session.custom_package_usage_config.get( - "enabled", False - ) - - try: - pd.session.custom_package_usage_config["enabled"] = True - - @udf(packages=packages, return_type=PandasSeriesType(DoubleType())) - def func(column): - import pandas as pd # noqa: F401 - import statsmodels.api as sm - from statsmodels.stats.outliers_influence import OLSInfluence - - X = column.apply(lambda t: t[0]) - y = column.apply(lambda t: t[1]) - - X = sm.add_constant(X) - fit = sm.OLS(y, X).fit() - - influence = OLSInfluence(fit) - return influence.resid - - ans = df.apply(func, axis=0) - finally: - pd.session.clear_packages() - pd.session.clear_imports() - - # Restore setting. 
- pd.session.custom_package_usage_config[ - "enabled" - ] = custom_package_usage_config - - # apply same function via native pandas and compare results - native_ans = native_pd.DataFrame(data).apply(func.func, axis=0) - - assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(ans, native_ans) - @pytest.mark.parametrize( "packages,expected_query_count", diff --git a/tests/integ/modin/frame/test_applymap.py b/tests/integ/modin/frame/test_applymap.py index 87a98e70782..95506d1bebc 100644 --- a/tests/integ/modin/frame/test_applymap.py +++ b/tests/integ/modin/frame/test_applymap.py @@ -93,30 +93,17 @@ def test_applymap_numpy(func): eval_snowpark_pandas_result(snow_df, native_df, lambda x: x.applymap(func)) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@sql_count_checker( - query_count=16, fallback_count=2, sproc_count=2, expect_high_count=True -) +@sql_count_checker(query_count=0) def test_applymap_na_action_ignore(): snow_df = pd.DataFrame([1, 1.1, "NaN", None], dtype="Float64") - - # In native pandas, the last two elements are NaN and pd.NA - assert snow_df.applymap( - lambda x: x is None, na_action="ignore" - ).values.tolist() == [[False], [False], [None], [None]] + msg = "Snowpark pandas applymap API doesn't yet support na_action == 'ignore'" + with pytest.raises(NotImplementedError, match=msg): + snow_df.applymap(lambda x: x is None, na_action="ignore") data = ["cat", "dog", np.nan, "rabbit"] snow_df = pd.DataFrame(data) - native_df = native_pd.DataFrame(data) - eval_snowpark_pandas_result( - snow_df, - native_df, - lambda x: x.applymap("I am a {}".format, na_action="ignore"), - ) + with pytest.raises(NotImplementedError, match=msg): + snow_df.applymap("I am a {}".format, na_action="ignore") @pytest.mark.parametrize("invalid_input", ["min", [np.min], {"a": np.max}]) diff --git a/tests/integ/modin/frame/test_dropna.py 
b/tests/integ/modin/frame/test_dropna.py index 3c600c70070..e5fb2085417 100644 --- a/tests/integ/modin/frame/test_dropna.py +++ b/tests/integ/modin/frame/test_dropna.py @@ -8,7 +8,6 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.conftest import running_on_public_ci from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import eval_snowpark_pandas_result @@ -76,19 +75,12 @@ def test_how_all_with_subset(test_dropna_df): ) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") -@sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) -def test_axis_1_fallback(test_dropna_df): - eval_snowpark_pandas_result( - pd.DataFrame(test_dropna_df), - test_dropna_df, - lambda df: df.dropna(axis="columns"), - ) +@sql_count_checker(query_count=0) +def test_axis_1_not_implemented(test_dropna_df): + msg = "Snowpark pandas dropna API doesn't yet support axis == 1" + with pytest.raises(NotImplementedError, match=msg): + df = pd.DataFrame(test_dropna_df) + df.dropna(axis="columns") @sql_count_checker(query_count=1) diff --git a/tests/integ/modin/frame/test_fillna.py b/tests/integ/modin/frame/test_fillna.py index c1e26a8e0da..d3b8d0a523b 100644 --- a/tests/integ/modin/frame/test_fillna.py +++ b/tests/integ/modin/frame/test_fillna.py @@ -11,7 +11,6 @@ import snowflake.snowpark.modin.plugin # noqa: F401 from snowflake.snowpark.exceptions import SnowparkSQLException -from tests.integ.conftest import running_on_public_ci from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import eval_snowpark_pandas_result @@ -261,19 +260,12 @@ def test_value_scalar_inplace(test_fillna_df): ) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is 
available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") -@sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) -def test_value_scalar_limit_fallback(test_fillna_df): - eval_snowpark_pandas_result( - pd.DataFrame(test_fillna_df), - test_fillna_df, - lambda df: df.fillna(1, limit=1), - ) +@sql_count_checker(query_count=0) +def test_value_scalar_limit_not_implemented(test_fillna_df): + df = pd.DataFrame(test_fillna_df) + msg = "Snowpark pandas fillna API doesn't yet support 'limit' parameter" + with pytest.raises(NotImplementedError, match=msg): + df.fillna(1, limit=1) @sql_count_checker(query_count=0) diff --git a/tests/integ/modin/frame/test_filter.py b/tests/integ/modin/frame/test_filter.py index 352e55f1ef0..6c062e27f2c 100644 --- a/tests/integ/modin/frame/test_filter.py +++ b/tests/integ/modin/frame/test_filter.py @@ -71,28 +71,25 @@ def test_filtering_with_self(func): eval_snowpark_pandas_result(snow_df, native_df, func) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@sql_count_checker(query_count=16, join_count=1, fallback_count=1, sproc_count=1) +@sql_count_checker(query_count=0) @pytest.mark.parametrize( "func", [ # Note 1: can't use B here e.g., because column contains None - but Snowflake would allow this. - # Note 2: Make sure that some unsupported operation is used below for the fallback to happen. 
+ # Note 2: Make sure that some unsupported operation is used below lambda df: df[df.A.str.casefold().str.startswith("P")], lambda df: df[df.A.str.casefold().str.lower() == "zebra"], ], ) -def test_filtering_with_self_fallback( +def test_filtering_with_self_not_implemented( func, ): data = _generate_data() snow_df = pd.DataFrame(data) - native_df = native_pd.DataFrame(data) - eval_snowpark_pandas_result(snow_df, native_df, func) + with pytest.raises( + NotImplementedError, match="Snowpark pandas doesn't yet support casefold method" + ): + func(snow_df) @pytest.mark.parametrize( diff --git a/tests/integ/modin/frame/test_join.py b/tests/integ/modin/frame/test_join.py index b992ff7951b..91500189d12 100644 --- a/tests/integ/modin/frame/test_join.py +++ b/tests/integ/modin/frame/test_join.py @@ -7,8 +7,6 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from snowflake.snowpark.exceptions import SnowparkSQLException -from tests.integ.conftest import running_on_public_ci from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import assert_frame_equal, eval_snowpark_pandas_result @@ -215,12 +213,6 @@ def test_cross_join(left, right): assert_frame_equal(result, expected) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") @pytest.mark.parametrize( "lvalues, rvalues, validate", # 'one' should also validate as 'many'. 
If actual join is one-to-one @@ -239,26 +231,15 @@ def test_cross_join(left, right): ([1, 2, 1], [2, 3, 2], "m:m"), # m:m join ], ) -@sql_count_checker(query_count=12, fallback_count=1, sproc_count=1) +@sql_count_checker(query_count=0) def test_join_validate(lvalues, rvalues, validate): left = pd.DataFrame({"A": [1, 1, 2]}, index=lvalues) right = pd.DataFrame({"B": [1, 4, 2]}, index=rvalues) - eval_snowpark_pandas_result( - left, - left.to_pandas(), - lambda df: df.join( - right if isinstance(df, pd.DataFrame) else right.to_pandas(), - validate=validate, - ), - ) + msg = "Snowpark pandas merge API doesn't yet support 'validate' parameter" + with pytest.raises(NotImplementedError, match=msg): + left.join(right, validate=validate) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") @pytest.mark.parametrize( "lvalues, rvalues, validate", [ @@ -271,10 +252,10 @@ def test_join_validate(lvalues, rvalues, validate): ([1, 2, 1], [2, 3, 2], "m:1"), # m:m join ], ) -@sql_count_checker(query_count=5) +@sql_count_checker(query_count=0) def test_join_validate_negative(lvalues, rvalues, validate): left = pd.DataFrame({"A": [1, 1, 2]}, index=lvalues) right = pd.DataFrame({"B": [1, 4, 2]}, index=rvalues) - # TODO: SNOW-863059 expect MergeError instead of SnowparkSqlException - with pytest.raises(SnowparkSQLException, match="Merge keys are not unique"): + msg = "Snowpark pandas merge API doesn't yet support 'validate' parameter" + with pytest.raises(NotImplementedError, match=msg): left.join(right, validate=validate) diff --git a/tests/integ/modin/frame/test_mask.py b/tests/integ/modin/frame/test_mask.py index 3b513a00b78..1fc2d725b73 100644 --- a/tests/integ/modin/frame/test_mask.py +++ b/tests/integ/modin/frame/test_mask.py @@ -11,7 +11,6 @@ import pytest import snowflake.snowpark.modin.plugin # 
noqa: F401 -from tests.integ.conftest import running_on_public_ci from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64, @@ -425,21 +424,17 @@ def test_dataframe_mask_cond_is_none_negative(test_data): ) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") -@sql_count_checker(query_count=12, fallback_count=1, sproc_count=1) -def test_dataframe_mask_with_fallback(test_data, test_cond, test_others): +@sql_count_checker(query_count=0) +def test_dataframe_mask_not_implemented(test_data, test_cond, test_others): index_data = [["A", "B"], ["C", "D", "E"]] - - mask_test_helper( - [test_data, test_cond, test_others], - [index_data, index_data, index_data], - extra_mask_args={"axis": 1}, - ) + df_data_list = [test_data, test_cond, test_others] + df_data_args = [index_data, index_data, index_data] + snow_dfs = [ + make_snow_dataframe(data, data_args) + for data, data_args in zip(df_data_list, df_data_args) + ] + with pytest.raises(NotImplementedError): + snow_dfs[0].mask(snow_dfs[1], snow_dfs[2], axis=1) @sql_count_checker(query_count=3, join_count=1) diff --git a/tests/integ/modin/frame/test_merge.py b/tests/integ/modin/frame/test_merge.py index 8ebf40a3096..c2ebc7c2de6 100644 --- a/tests/integ/modin/frame/test_merge.py +++ b/tests/integ/modin/frame/test_merge.py @@ -13,7 +13,6 @@ import snowflake.snowpark.modin.plugin # noqa: F401 from snowflake.snowpark.exceptions import SnowparkSQLException -from tests.integ.conftest import running_on_public_ci from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( assert_frame_equal, @@ -1173,12 +1172,6 @@ def test_merge_with_indicator_explicit_name_negative(left_df, right_df): ) 
-@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") @pytest.mark.parametrize( "lvalues, rvalues, validate", # 'one' should also validate as 'many'. If actual join is one-to-one @@ -1197,48 +1190,15 @@ def test_merge_with_indicator_explicit_name_negative(left_df, right_df): ([1, 2, 1], [2, 3, 2], "m:m"), # m:m join ], ) -@sql_count_checker(query_count=12, fallback_count=1, sproc_count=1) +@sql_count_checker(query_count=0) def test_merge_validate(lvalues, rvalues, validate): left = pd.DataFrame({"A": lvalues}) right = pd.DataFrame({"B": rvalues}) - eval_snowpark_pandas_result( - left, - left.to_pandas(), - lambda df: df.merge( - right if isinstance(df, pd.DataFrame) else right.to_pandas(), - left_on="A", - right_on="B", - validate=validate, - ), - ) - - -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -# Single test to pass code coverage in CI -@sql_count_checker(query_count=12, fallback_count=1, sproc_count=1) -def test_merge_validate_for_ci(left_df, right_df): - eval_snowpark_pandas_result( - left_df, - left_df.to_pandas(), - lambda df: df.merge( - right_df if isinstance(df, pd.DataFrame) else right_df.to_pandas(), - left_on="A", - right_on="B", - validate="m:m", - ), - ) + msg = "Snowpark pandas merge API doesn't yet support 'validate' parameter" + with pytest.raises(NotImplementedError, match=msg): + left.merge(right, left_on="A", right_on="B", validate=validate) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") @pytest.mark.parametrize( "lvalues, rvalues, validate", [ @@ 
-1251,10 +1211,10 @@ def test_merge_validate_for_ci(left_df, right_df): ([1, 2, 1], [2, 3, 2], "m:1"), # m:m join ], ) -@sql_count_checker(query_count=5) +@sql_count_checker(query_count=0) def test_merge_validate_negative(lvalues, rvalues, validate): left = pd.DataFrame({"A": lvalues}) right = pd.DataFrame({"B": rvalues}) - # TODO: SNOW-863059 expect MergeError instead of SnowparkSqlException - with pytest.raises(SnowparkSQLException, match="Merge keys are not unique"): + msg = "Snowpark pandas merge API doesn't yet support 'validate' parameter" + with pytest.raises(NotImplementedError, match=msg): left.merge(right, left_on="A", right_on="B", validate=validate) diff --git a/tests/integ/modin/frame/test_nunique.py b/tests/integ/modin/frame/test_nunique.py index 07f97643867..d0cad8ec2ad 100644 --- a/tests/integ/modin/frame/test_nunique.py +++ b/tests/integ/modin/frame/test_nunique.py @@ -107,19 +107,10 @@ def test_dataframe_unique_dropna_negative(): df.nunique(dropna=42) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) @pytest.mark.parametrize("dropna", [True, False]) -@sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) -def test_dataframe_unique_axis1_fallback(dropna): +@sql_count_checker(query_count=0) +def test_dataframe_unique_axis1_not_implemented(dropna): df = pd.DataFrame(TEST_DATA, columns=TEST_LABELS) - native_df = native_pd.DataFrame(TEST_DATA, columns=TEST_LABELS) - - eval_snowpark_pandas_result( - df, - native_df, - lambda df: df.nunique(axis=1, dropna=dropna), - ) + msg = "Snowpark pandas nunique API doesn't yet support axis == 1" + with pytest.raises(NotImplementedError, match=msg): + df.nunique(axis=1, dropna=dropna) diff --git a/tests/integ/modin/frame/test_rename.py b/tests/integ/modin/frame/test_rename.py index ffc584514cd..c3b42b17861 100644 --- a/tests/integ/modin/frame/test_rename.py +++ 
b/tests/integ/modin/frame/test_rename.py @@ -14,7 +14,6 @@ from pandas._testing import assert_index_equal import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.conftest import running_on_public_ci from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import assert_frame_equal, eval_snowpark_pandas_result @@ -36,21 +35,21 @@ def test_rename_signature(self): "errors", } - @pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, - ) @pytest.mark.parametrize("klass", [Series, DataFrame]) - @sql_count_checker(query_count=9, fallback_count=1, sproc_count=1) def test_rename_mi(self, klass): obj = klass( [11, 21, 31], index=MultiIndex.from_tuples([("A", x) for x in ["a", "B", "c"]]), ) - # obj.rename(str.lower) - native_obj = obj.to_pandas() - eval_snowpark_pandas_result(obj, native_obj, lambda x: x.rename(str.lower)) + msg = "Snowpark pandas rename API is not yet supported for multi-index objects" + if klass == DataFrame: + with SqlCounter(query_count=0): + with pytest.raises(NotImplementedError, match=msg): + obj.rename(["A"]) + else: + with SqlCounter(query_count=2): + native_obj = obj.to_pandas() + eval_snowpark_pandas_result(obj, native_obj, lambda x: x.rename("A")) @pytest.fixture(scope="function") def snow_float_frame(self, float_frame): @@ -107,16 +106,13 @@ def test_rename(self, snow_float_frame): assert_index_equal(renamed.index, Index(["bar", "foo"], name="name")) assert renamed.index.name == renamer.index.name - @pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, - ) - @sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) - def test_rename_str_upper_fallback(self): + @sql_count_checker(query_count=0) + def test_rename_str_upper_not_implemented(self): data = {"A": {"foo": 0, 
"bar": 1}} - renamed = DataFrame(data).rename(index=str.upper) - assert_index_equal(renamed.index, Index(["FOO2", "BAR2"])) + df = DataFrame(data) + msg = "Snowpark pandas rename API doesn't yet support callable mapper" + with pytest.raises(NotImplementedError, match=msg): + df.rename(index=str.upper) @pytest.mark.parametrize( "args,kwargs", @@ -137,41 +133,14 @@ def test_rename_chainmap(self, args, kwargs): expected = native_pd.DataFrame({"a": colAData, "b": colBdata}) assert_frame_equal(result, expected, check_dtype=False, check_index_type=False) - @pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, - ) - @pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") - @sql_count_checker(query_count=22, fallback_count=2, sproc_count=2) - def test_rename_multiindex_fallback(self): + @sql_count_checker(query_count=0) + def test_rename_multiindex_with_level(self): tuples_index = [("foo1", "bar1"), ("foo2", "bar2")] tuples_columns = [("fizz1", "buzz1"), ("fizz2", "buzz2")] index = MultiIndex.from_tuples(tuples_index, names=["foo", "bar"]) columns = MultiIndex.from_tuples(tuples_columns, names=["fizz", "buzz"]) df = DataFrame([(0, 0), (1, 1)], index=index, columns=columns) - # - # without specifying level -> across all levels - - renamed = df.rename( - index={"foo1": "foo3", "bar2": "bar3"}, - columns={"fizz1": "fizz3", "buzz2": "buzz3"}, - ) - new_index = MultiIndex.from_tuples( - [("foo3", "bar1"), ("foo2", "bar3")], names=["foo", "bar"] - ) - new_columns = MultiIndex.from_tuples( - [("fizz3", "buzz1"), ("fizz2", "buzz3")], names=["fizz", "buzz"] - ) - assert_index_equal(renamed.index, new_index) - assert_index_equal(renamed.columns, new_columns) - assert renamed.index.names == df.index.names - assert renamed.columns.names == df.columns.names - - # - # with specifying a level (GH13766) - # dict new_columns = MultiIndex.from_tuples( [("fizz3", 
"buzz1"), ("fizz2", "buzz2")], names=["fizz", "buzz"] ) @@ -206,12 +175,24 @@ def test_rename_multiindex_fallback(self): renamed = df.rename(columns=func, level="buzz") assert_index_equal(renamed.columns, new_columns) - # index - new_index = MultiIndex.from_tuples( - [("foo3", "bar1"), ("foo2", "bar2")], names=["foo", "bar"] - ) - renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0) - assert_index_equal(renamed.index, new_index) + @sql_count_checker(query_count=0) + def test_rename_multiindex_not_implemented(self): + tuples_index = [("foo1", "bar1"), ("foo2", "bar2")] + tuples_columns = [("fizz1", "buzz1"), ("fizz2", "buzz2")] + index = MultiIndex.from_tuples(tuples_index, names=["foo", "bar"]) + columns = MultiIndex.from_tuples(tuples_columns, names=["fizz", "buzz"]) + df = DataFrame([(0, 0), (1, 1)], index=index, columns=columns) + # without specifying level -> across all levels + + with pytest.raises(NotImplementedError): + df.rename( + index={"foo1": "foo3", "bar2": "bar3"}, + columns={"fizz1": "fizz3", "buzz2": "buzz3"}, + ) + + # specifying level + with pytest.raises(NotImplementedError): + df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0) @sql_count_checker(query_count=2) def test_rename_nocopy(self, snow_float_frame): @@ -332,51 +313,33 @@ def test_rename_axis_style(self): result = df.rename({"X": "x", "Y": "y"}, axis="index") assert_frame_equal(result, expected, check_dtype=False, check_index_type=False) - @pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, - ) - @pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") - @sql_count_checker(query_count=24, fallback_count=3, sproc_count=3) - def test_rename_axis_style_fallback(self): + @sql_count_checker(query_count=0) + def test_rename_axis_style_not_implemented(self): df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["X", "Y"]) - expected = 
native_pd.DataFrame({"A": [1, 2], "B": [1, 2]}, index=["x", "y"]) - result = df.rename(str.lower, axis=0) - assert_frame_equal(result, expected, check_dtype=False, check_index_type=False) - - result = df.rename(str.lower, axis="index") - assert_frame_equal(result, expected, check_dtype=False, check_index_type=False) - - result = df.rename(mapper=str.lower, axis="index") - assert_frame_equal(result, expected, check_dtype=False, check_index_type=False) + msg = "Snowpark pandas rename API doesn't yet support callable mapper" + with pytest.raises(NotImplementedError, match=msg): + df.rename(str.lower, axis=0) + with pytest.raises(NotImplementedError, match=msg): + df.rename(str.lower, axis="index") + with pytest.raises(NotImplementedError, match=msg): + df.rename(mapper=str.lower, axis="index") - @pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, - ) - @sql_count_checker(query_count=16, fallback_count=2, sproc_count=2) + @sql_count_checker(query_count=0) def test_rename_mapper_multi(self): df = DataFrame({"A": ["a", "b"], "B": ["c", "d"], "C": [1, 2]}).set_index( ["A", "B"] ) - result = df.rename(str.upper) - expected = df.rename(index=str.upper) - assert_frame_equal(result, expected) - - @pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, - ) - @sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) + msg = "Snowpark pandas rename API is not yet supported for multi-index objects" + with pytest.raises(NotImplementedError, match=msg): + df.rename(["X"]) + + @sql_count_checker(query_count=0) def test_rename_positional_named(self): # https://github.com/pandas-dev/pandas/issues/12392 df = DataFrame({"a": [1, 2], "b": [1, 2]}, index=["X", "Y"]) - result = df.rename(index=str.lower, columns=str.upper) - expected = native_pd.DataFrame({"A": [1, 2], 
"B": [1, 2]}, index=["x", "y"]) - assert_frame_equal(result, expected, check_dtype=False, check_index_type=False) + msg = "Snowpark pandas rename API doesn't yet support callable mapper" + with pytest.raises(NotImplementedError, match=msg): + df.rename(index=str.lower, columns=str.upper) @sql_count_checker(query_count=0) def test_rename_axis_style_raises(self): diff --git a/tests/integ/modin/frame/test_sort_values.py b/tests/integ/modin/frame/test_sort_values.py index f7b0be03fc7..896f73cd71d 100644 --- a/tests/integ/modin/frame/test_sort_values.py +++ b/tests/integ/modin/frame/test_sort_values.py @@ -138,11 +138,6 @@ def test_sort_values_by_ascending_length_mismatch_negative(native_df_simple): ) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) @pytest.mark.parametrize( "sort_op", [ @@ -153,20 +148,16 @@ def test_sort_values_by_ascending_length_mismatch_negative(native_df_simple): lambda df: df.sort_values(by=[1, 3], axis=1, ascending=False), ], ) -@sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) +@sql_count_checker(query_count=0) def test_sort_values_axis_1(sort_op): - native_df = native_pd.DataFrame( + df = pd.DataFrame( [[1, 1, 2], [3, 1, 0], [4, 5, 6]], index=[1, 2, 3], columns=list("ABC") ) - snow_df = pd.DataFrame(native_df) - eval_snowpark_pandas_result(snow_df, native_df, sort_op) + msg = "Snowpark pandas sort_values API doesn't yet support axis == 1" + with pytest.raises(NotImplementedError, match=msg): + sort_op(df) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) @pytest.mark.parametrize( "sort_op", [ @@ -179,13 +170,14 @@ def test_sort_values_axis_1(sort_op): lambda df: df.sort_values(by=[1, 3], axis=1, ascending=False, inplace=True), ], ) -@sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) 
+@sql_count_checker(query_count=0) def test_sort_values_axis_1_inplace(sort_op): - native_df = native_pd.DataFrame( + df = pd.DataFrame( [[1, 1, 2], [3, 1, 0], [4, 5, 6]], index=[1, 2, 3], columns=list("ABC") ) - snow_df = pd.DataFrame(native_df) - eval_snowpark_pandas_result(snow_df, native_df, sort_op, inplace=True) + msg = "Snowpark pandas sort_values API doesn't yet support axis == 1" + with pytest.raises(NotImplementedError, match=msg): + sort_op(df) @sql_count_checker(query_count=0) @@ -409,11 +401,6 @@ def test_sort_values_ignore_index(native_df_simple, ascending, ignore_index): ) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) @pytest.mark.parametrize( "op", [ @@ -421,16 +408,12 @@ def test_sort_values_ignore_index(native_df_simple, ascending, ignore_index): lambda df: df.sort_values(by="A", key=lambda x: -x), ], ) -@sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) +@sql_count_checker(query_count=0) def test_sort_values_key(native_df_simple, op): - # The high query count here is a result of a stored procedure fallback - # due to the key being a lambda function after snow_df gets materialized. 
snow_df = pd.DataFrame(native_df_simple) - eval_snowpark_pandas_result( - snow_df, - native_df_simple, - op, - ) + msg = "Snowpark pandas sort_values API doesn't yet support 'key' parameter" + with pytest.raises(NotImplementedError, match=msg): + op(snow_df) @pytest.mark.parametrize("label", VALID_PANDAS_LABELS) diff --git a/tests/integ/modin/frame/test_where.py b/tests/integ/modin/frame/test_where.py index 5626d080e98..33ca2b75f3b 100644 --- a/tests/integ/modin/frame/test_where.py +++ b/tests/integ/modin/frame/test_where.py @@ -11,7 +11,6 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.conftest import running_on_public_ci from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64, @@ -431,21 +430,17 @@ def test_dataframe_where_cond_is_none_negative(test_data): ) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") -@sql_count_checker(query_count=12, fallback_count=1, sproc_count=1) -def test_dataframe_where_with_fallback(test_data, test_cond, test_others): +@sql_count_checker(query_count=0) +def test_dataframe_where_not_implemented(test_data, test_cond, test_others): index_data = [["A", "B"], ["C", "D", "E"]] - - where_test_helper( - [test_data, test_cond, test_others], - [index_data, index_data, index_data], - extra_where_args={"axis": 1}, - ) + df_data_list = [test_data, test_cond, test_others] + df_data_args = [index_data, index_data, index_data] + snow_dfs = [ + make_snow_dataframe(data, data_args) + for data, data_args in zip(df_data_list, df_data_args) + ] + with pytest.raises(NotImplementedError): + snow_dfs[0].where(snow_dfs[1], snow_dfs[2], axis=1) @sql_count_checker(query_count=3, join_count=1) diff --git 
a/tests/integ/modin/series/test_all_any.py b/tests/integ/modin/series/test_all_any.py index 9a6148953cc..114dac2924c 100644 --- a/tests/integ/modin/series/test_all_any.py +++ b/tests/integ/modin/series/test_all_any.py @@ -9,7 +9,6 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.conftest import running_on_public_ci from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import assert_values_equal, eval_snowpark_pandas_result @@ -91,12 +90,6 @@ def test_any_named_index(): ) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") @pytest.mark.parametrize( "data", [ @@ -107,22 +100,14 @@ def test_any_named_index(): ], ) @pytest.mark.parametrize("skipna", [True, False]) -@sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) -def test_all_float_fallback(data, skipna): - eval_snowpark_pandas_result( - pd.Series(data), - native_pd.Series(data), - lambda df: df.all(skipna=skipna), - comparator=assert_values_equal, - ) +@sql_count_checker(query_count=0) +def test_all_float_not_implemented(data, skipna): + series = pd.Series(data) + msg = "Snowpark pandas all API doesn't yet support non-integer/boolean columns" + with pytest.raises(NotImplementedError, match=msg): + series.all(skipna=skipna) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") @pytest.mark.parametrize( "data", [ @@ -133,51 +118,33 @@ def test_all_float_fallback(data, skipna): ], ) @pytest.mark.parametrize("skipna", [True, False]) -@sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) -def test_any_float_fallback(data, skipna): - 
eval_snowpark_pandas_result( - pd.Series(data), - native_pd.Series(data), - lambda df: df.any(skipna=skipna), - comparator=assert_values_equal, - ) +@sql_count_checker(query_count=0) +def test_any_float_not_implemented(data, skipna): + series = pd.Series(data) + msg = "Snowpark pandas any API doesn't yet support non-integer/boolean columns" + with pytest.raises(NotImplementedError, match=msg): + series.any(skipna=skipna) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") @pytest.mark.parametrize( "data", [["", "b", "c"], ["d", "e", "f"]], ) -@sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) -def test_all_str_fallback(data): - eval_snowpark_pandas_result( - pd.Series(data), - native_pd.Series(data), - lambda df: df.all(), - comparator=assert_values_equal, - ) +@sql_count_checker(query_count=0) +def test_all_str_not_implemented(data): + series = pd.Series(data) + msg = "Snowpark pandas all API doesn't yet support non-integer/boolean columns" + with pytest.raises(NotImplementedError, match=msg): + series.all() -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") @pytest.mark.parametrize( "data", [["", "b", "c"], ["d", "e", "f"]], ) -@sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) -def test_any_str_fallback(data): - eval_snowpark_pandas_result( - pd.Series(data), - native_pd.Series(data), - lambda df: df.any(), - comparator=assert_values_equal, - ) +@sql_count_checker(query_count=0) +def test_any_str_not_implemented(data): + series = pd.Series(data) + msg = "Snowpark pandas any API doesn't yet support non-integer/boolean columns" + with 
pytest.raises(NotImplementedError, match=msg): + series.any() diff --git a/tests/integ/modin/series/test_apply.py b/tests/integ/modin/series/test_apply.py index 16c8a62ad96..6613e43e8c0 100644 --- a/tests/integ/modin/series/test_apply.py +++ b/tests/integ/modin/series/test_apply.py @@ -21,7 +21,6 @@ from snowflake.snowpark.functions import udf from snowflake.snowpark.modin.plugin.utils.warning_message import WarningMessage from snowflake.snowpark.types import DoubleType, StringType, VariantType -from tests.integ.conftest import running_on_public_ci from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( ColumnSchema, @@ -374,29 +373,17 @@ def f(x, y, z=1) -> int: ) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") -@sql_count_checker( - query_count=20, fallback_count=2, sproc_count=2, expect_high_count=True -) -def test_apply_args_kwargs_with_snowpark_pandas_object_fallback(): +@sql_count_checker(query_count=0) +def test_apply_args_kwargs_with_snowpark_pandas_object_not_implemented(): def f(x, y=None) -> int: return x + (y.sum() if y is not None else 0) - native_series = native_pd.Series([1, 2, 3]) snow_series = pd.Series([1, 2, 3]) - assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( - snow_series.apply(f, args=(pd.Series([1, 2]),)), - native_series.apply(f, args=(native_pd.Series([1, 2]),)), - ) - assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( - snow_series.apply(f, y=pd.Series([1, 2])), - native_series.apply(f, y=native_pd.Series([1, 2])), - ) + msg = "Snowpark pandas apply API doesn't yet support DataFrame or Series in 'args' or 'kwargs' of 'func'" + with pytest.raises(NotImplementedError, match=msg): + snow_series.apply(f, args=(pd.Series([1, 2]),)) + with pytest.raises(NotImplementedError, 
match=msg): + snow_series.apply(f, y=pd.Series([1, 2])) @pytest.mark.parametrize("func", [str, int, float, bytes, list, dict]) @@ -456,67 +443,39 @@ def test_apply_convert_dtype(caplog): assert "convert_dtype is ignored in Snowflake backend" in caplog.text -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) @pytest.mark.parametrize( "func", [[np.min], {2: np.min, 1: "max"}] # TODO SNOW-864025: enable following after str in df.apply is supported # ["min", "mode", "abs"] ) -@sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) +@sql_count_checker(query_count=0) def test_apply_input_type_str_list_dict(func): - data = [1.0, 2.0, 3.0] - native_series = native_pd.Series(data) - snow_series = pd.Series(data) - eval_snowpark_pandas_result( - snow_series, native_series, lambda x: x.apply(func), check_index=False - ) + snow_series = pd.Series([1.0, 2.0, 3.0]) + msg = "Snowpark pandas apply API only supports callables func" + with pytest.raises(NotImplementedError, match=msg): + snow_series.apply(func) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@sql_count_checker( - query_count=16, fallback_count=2, sproc_count=2, expect_high_count=True -) -def test_map_na_action_ignore(): +@sql_count_checker(query_count=0) +def test_map_na_action_ignore_not_implemented(): snow_series = pd.Series([1, 1.1, "NaN", None], dtype="Float64") - # In native pandas, the last two elements are NaN and pd.NA - assert snow_series.map( - lambda x: x is None, na_action="ignore" - ).to_pandas().to_list() == [False, False, None, None] + msg = "Snowpark pandas map API doesn't yet support na_action == 'ignore'" + with pytest.raises(NotImplementedError, match=msg): + snow_series.map(lambda x: x is None, na_action="ignore") - data = ["cat", "dog", np.nan, "rabbit"] - 
snow_series = pd.Series(data) - native_series = native_pd.Series(data) - eval_snowpark_pandas_result( - snow_series, - native_series, - lambda x: x.map("I am a {}".format, na_action="ignore"), - ) + snow_series = pd.Series(["cat", "dog", np.nan, "rabbit"]) + with pytest.raises(NotImplementedError, match=msg): + snow_series.map("I am a {}".format, na_action="ignore") -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) -def test_map_dict(): +@sql_count_checker(query_count=0) +def test_map_dict_not_implemented(): s = pd.Series(["cat", "dog", np.nan, "rabbit"]) - assert s.map({"cat": "kitten", "dog": "puppy"}).to_pandas().tolist() == [ - "kitten", - "puppy", - None, - None, - ] + msg = "Snowpark pandas map API doesn't yet support non callable 'arg'" + with pytest.raises(NotImplementedError, match=msg): + s.map({"cat": "kitten", "dog": "puppy"}) @sql_count_checker(query_count=8, udf_count=2) diff --git a/tests/integ/modin/series/test_astype.py b/tests/integ/modin/series/test_astype.py index 9deff99e242..ff69e2d4944 100644 --- a/tests/integ/modin/series/test_astype.py +++ b/tests/integ/modin/series/test_astype.py @@ -32,7 +32,6 @@ ) from snowflake.snowpark.modin.plugin.utils.warning_message import WarningMessage from snowflake.snowpark.types import _FractionalType, _IntegralType -from tests.integ.conftest import running_on_public_ci from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( assert_series_equal, @@ -416,12 +415,6 @@ def test_astype_copy(): assert s2 is None -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") @pytest.mark.parametrize( "data, 
expected", [ @@ -435,10 +428,12 @@ def test_astype_copy(): ), ], ) -@sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) -def test_astype_errors_ignore_fallback(data, expected): - s1 = pd.Series(data).astype("datetime64[ns]", errors="ignore") - assert_snowpark_pandas_equal_to_pandas(s1, expected) +@sql_count_checker(query_count=0) +def test_astype_errors_ignore_not_implemented(data, expected): + s1 = pd.Series(data) + msg = "Snowpark pandas astype API doesn't yet support errors == 'ignore'" + with pytest.raises(NotImplementedError, match=msg): + s1.astype("datetime64[ns]", errors="ignore") @sql_count_checker(query_count=0) diff --git a/tests/integ/modin/series/test_rename.py b/tests/integ/modin/series/test_rename.py index ba3187b06cb..7766b5f8800 100644 --- a/tests/integ/modin/series/test_rename.py +++ b/tests/integ/modin/series/test_rename.py @@ -24,23 +24,20 @@ class TestRename: def snow_datetime_series(self, datetime_series): return pd.Series(datetime_series) - @pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, - ) def test_rename(self, snow_datetime_series): ts = snow_datetime_series def renamer(x): return x.strftime("%Y%m%d") - with SqlCounter(query_count=9, fallback_count=1, sproc_count=1): - renamed = ts.rename(renamer) - assert renamed.index[0] == renamer(ts.index[0]) + with SqlCounter(query_count=0): + msg = "Snowpark pandas rename API doesn't yet support callable mapper" + with pytest.raises(NotImplementedError, match=msg): + ts.rename(renamer) # dict - with SqlCounter(query_count=4, join_count=1): + with SqlCounter(query_count=3, join_count=1): + renamed = ts.to_pandas().rename(renamer) rename_dict = dict(zip(ts.index, renamed.index)) renamed2 = ts.rename(rename_dict) # Note: renaming index with dict on Snowflake will use variant as the new data type if rename includes type @@ -101,20 +98,14 @@ def test_rename_axis_supported(self): 
with pytest.raises(ValueError, match="No axis named 5"): ser.rename({}, axis=5) - @pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, - ) - @sql_count_checker(query_count=9, fallback_count=1, sproc_count=1) + @sql_count_checker(query_count=0) def test_rename_inplace(self, snow_datetime_series): def renamer(x): return x.strftime("%Y%m%d") - expected = renamer(snow_datetime_series.index[0]) - - snow_datetime_series.rename(renamer, inplace=True) - assert snow_datetime_series.index[0] == expected + msg = "Snowpark pandas rename API doesn't yet support callable mapper" + with pytest.raises(NotImplementedError, match=msg): + snow_datetime_series.rename(renamer, inplace=True) @sql_count_checker(query_count=0) def test_rename_with_custom_indexer(self): @@ -137,21 +128,13 @@ class MyIndexer: ser.rename(ix, inplace=True) assert ser.name is ix - @pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, - ) - @pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") - @sql_count_checker(query_count=16, fallback_count=2, sproc_count=2) + @sql_count_checker(query_count=0) def test_rename_callable(self): # GH 17407 ser = Series(range(1, 6), index=Index(range(2, 7), name="IntIndex")) - result = ser.rename(str) - expected = ser.rename(lambda i: str(i)) - assert_series_equal(result, expected) - - assert result.name == expected.name + msg = "Snowpark pandas rename API doesn't yet support callable mapper" + with pytest.raises(NotImplementedError, match=msg): + ser.rename(str) @sql_count_checker(query_count=2) def test_rename_none(self): diff --git a/tests/integ/modin/series/test_sort_values.py b/tests/integ/modin/series/test_sort_values.py index 14659f4943e..307d5592341 100644 --- a/tests/integ/modin/series/test_sort_values.py +++ 
b/tests/integ/modin/series/test_sort_values.py @@ -7,7 +7,6 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.conftest import running_on_public_ci from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import eval_snowpark_pandas_result @@ -163,24 +162,13 @@ def test_sort_values_ignore_index(snow_series, ascending, ignore_index): ) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@pytest.mark.skipif(running_on_public_ci(), reason="slow fallback test") -@sql_count_checker(query_count=18, fallback_count=2, sproc_count=2) +@sql_count_checker(query_count=0) def test_sort_values_key(snow_series): - eval_snowpark_pandas_result( - snow_series, - snow_series.to_pandas(), - lambda s: s.sort_values(key=lambda x: x + 5), - ) - eval_snowpark_pandas_result( - snow_series, - snow_series.to_pandas(), - lambda s: s.sort_values(key=lambda x: -x), - ) + msg = "Snowpark pandas sort_values API doesn't yet support 'key' parameter" + with pytest.raises(NotImplementedError, match=msg): + snow_series.sort_values(key=lambda x: x + 5) + with pytest.raises(NotImplementedError, match=msg): + snow_series.sort_values(key=lambda x: -x) @sql_count_checker(query_count=2) diff --git a/tests/integ/modin/strings/test_case_justify.py b/tests/integ/modin/strings/test_case_justify.py index aa8f5000829..da2b6a5804e 100644 --- a/tests/integ/modin/strings/test_case_justify.py +++ b/tests/integ/modin/strings/test_case_justify.py @@ -19,16 +19,9 @@ def test_title(): assert_snowpark_pandas_equal_to_pandas(result, expected) -@pytest.mark.xfail( - reason="SNOW-1336091: Snowpark pandas cannot run in sprocs until modin 0.28.1 is available in conda", - strict=True, - raises=RuntimeError, -) -@sql_count_checker(query_count=8, fallback_count=1, sproc_count=1) -def test_casefold(): - # GH25405 - expected = native_pd.Series(["ss", 
"case", "ssd"]) +@sql_count_checker(query_count=0) +def test_casefold_not_implemented(): s = pd.Series(["ß", "case", "ßd"]) - result = s.str.casefold() - - assert_snowpark_pandas_equal_to_pandas(result, expected) + msg = "Snowpark pandas doesn't yet support casefold method" + with pytest.raises(NotImplementedError, match=msg): + s.str.casefold() diff --git a/tests/unit/modin/test_series_strings.py b/tests/unit/modin/test_series_strings.py index a87e0f8015f..1b6d45c19aa 100644 --- a/tests/unit/modin/test_series_strings.py +++ b/tests/unit/modin/test_series_strings.py @@ -33,7 +33,8 @@ def test_str_cat_no_others(mock_str_register, mock_series): @pytest.mark.parametrize( "func, func_name", [ - (lambda s: s.str.casefold(), "casefold"), + # TODO: SNOW-1347401 cleanup all str methods that fallback + # (lambda s: s.str.casefold(), "casefold"), (lambda s: s.str.cat(["a", "b", "d", "foo"], na_rep="-"), "cat"), (lambda s: s.str.decode("utf-8"), "decode"), (lambda s: s.str.encode("utf-8"), "encode"),