SNOW-1316977: Remove fallbacks and raise not implemented error (#1435)

Please answer these questions before submitting your pull requests. Thanks! 1. What GitHub issue is this PR addressing? Make sure that there is an accompanying issue to your PR. Fixes SNOW-1316977 2. Fill out the following pre-review checklist: - [x] I am adding a new automated test(s) to verify correctness of my new code - [ ] I am adding new logging messages - [ ] I am adding a new telemetry message - [ ] I am adding new credentials - [ ] I am adding a new dependency 3. Please describe how your code solves the related issue. Remove fallbacks for all DataFrame and Series APIs. This list is primarily driven by https://docs.google.com/document/d/1uMwNgLqFhtoAFeEj59XjR3uKQ77QjA84aTmv8Erbvw0/edit#heading=h.qhdbmdhgqdh5. There might still be more fallbacks that are not documented. These will be cleaned up in https://snowflakecomputing.atlassian.net/browse/SNOW-1347394 NOTE: I have kept the existing tests mostly unmodified even though there is redundancy. This can be useful when we implement these APIs. NOTE: This also requires updates to the docs/*supported.rst files. These files are not yet ported from snowpandas, so I will do a follow-up PR to update the docs.
snowflakedb · Apr 30, 2024 · 1309d80 · 1309d80
1 parent c470057
commit 1309d80
Show file tree

Hide file tree

Showing 28 changed files with 561 additions and 1,156 deletions.
diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py
@@ -802,7 +802,9 @@ def dot(self, other):  # noqa: PR01, RT01, D200
         Compute the matrix multiplication between the ``DataFrame`` and `other`.
         """
         # TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions
-        ErrorMessage.not_implemented()  # pragma: no cover
+        ErrorMessage.not_implemented(
+            "Snowpark pandas doesn't yet support 'dot' binary operation"
+        )
 
         if isinstance(other, BasePandasDataset):
             common = self.columns.union(other.index)

diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py
@@ -998,7 +998,9 @@ def dot(self, other):  # noqa: PR01, RT01, D200
         Compute the dot product between the Series and the columns of `other`.
         """
         # TODO: SNOW-1063347: Modin upgrade - modin.pandas.Series functions
-        ErrorMessage.not_implemented()  # pragma: no cover
+        ErrorMessage.not_implemented(
+            "Snowpark pandas doesn't yet support 'dot' binary operation"
+        )
 
         if isinstance(other, BasePandasDataset):
             common = self.index.union(other.index)

diff --git a/src/snowflake/snowpark/modin/plugin/PANDAS_CHANGELOG.md b/src/snowflake/snowpark/modin/plugin/PANDAS_CHANGELOG.md
@@ -7,6 +7,28 @@
 - Fixed DataFrame's `__getitem__` with boolean DataFrame key.
 - Fixed incorrect regex used in `DataFrame/Series.replace`.
 
+### Behavior Changes
+- Raise not implemented error instead of falling back to pandas in the following APIs:
+  - `pd.merge`, `DataFrame.merge` and `DataFrame.join` if given the `validate` parameter.
+  - `pd.to_numeric` if `errors == 'ignore'`.
+  - `pd.to_datetime` if `format` is None or not supported in Snowflake or if `exact`, `infer_datetime_format` parameters are given or `origin == 'julian'` or `errors == 'ignore'`.
+  - `DataFrame/Series.all` if called on non-integer/boolean columns.
+  - `DataFrame/Series.any` if called on non-integer/boolean columns.
+  - `DataFrame/Series.astype` if casting from string to datetime or `errors == 'ignore'`.
+  - `DataFrame/Series.dropna` if `axis == 1`
+  - `DataFrame/Series.mask` if given `axis` or `level` parameters.
+  - `DataFrame/Series.rename` if `mapper` is callable or the DataFrame/Series has MultiIndex.
+  - `DataFrame/Series.sort_values` if given the `key` parameter.
+  - `DataFrame/Series.sort_index` if given the `key` parameter.
+  - `DataFrame.nunique` if `axis == 1`
+  - `DataFrame.apply` if `axis == 0` or `func` is not callable or `result_type` is given or `args` and `kwargs` contain DataFrame or Series.
+  - `Series.apply` if `axis == 0` or `func` is not callable or `result_type` is given.
+  - `Series.applymap` if `na_action == 'ignore'`.
+  - `DataFrame/Series.ffill` if given the `limit` or `downcast` parameter.
+  - `DataFrame/Series.fillna` if given the `limit` or `downcast` parameter.
+  - `dot` binary operation between `DataFrame/Series`.
+  - `xor` binary operation between `DataFrame/Series`.
+
 ## 1.14.0a2 (2024-04-18)
 
 ### Behavior Changes

diff --git a/src/snowflake/snowpark/modin/plugin/_internal/timestamp_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/timestamp_utils.py
@@ -25,6 +25,7 @@
     to_decimal,
 )
 from snowflake.snowpark.modin.plugin._internal.utils import pandas_lit
+from snowflake.snowpark.modin.plugin.utils.error_message import ErrorMessage
 from snowflake.snowpark.modin.plugin.utils.warning_message import WarningMessage
 from snowflake.snowpark.types import (
     BooleanType,
@@ -304,44 +305,48 @@ def generate_timestamp_col(
     return new_col
 
 
-def to_datetime_require_fallback(
+def raise_if_to_datetime_not_supported(
     format: str,
     exact: Union[bool, lib.NoDefault] = lib.no_default,
     infer_datetime_format: Union[lib.NoDefault, bool] = lib.no_default,
     origin: DateTimeOrigin = "unix",
     errors: DateTimeErrorChoices = "raise",
-) -> bool:
+) -> None:
     """
-    check whether to_datetime requires fallback
+    Raise a not implemented error if the to_datetime API is given any unsupported
+    parameter or parameter value.
     Args:
         format: the format argument for to_datetime
         exact: the exact argument for to_datetime
         infer_datetime_format: the infer_datetime_format argument for to_datetime
         origin: the origin argument for to_datetime
         errors: the errors argument for to_datetime
-
-    Returns:
-        True if fallback is required; otherwise False
     """
+    error_message = None
     if format is not None and not is_snowflake_timestamp_format_valid(
         to_snowflake_timestamp_format(format)
     ):
         # if format is not given, Snowflake's auto format detection may be different from pandas behavior
-        return True
-
-    if not exact:
+        error_message = (
+            f"Snowpark pandas to_datetime API doesn't yet support given format {format}"
+        )
+    elif not exact:
         # Snowflake does not allow the format to match anywhere in the target string when exact is False
-        return True
-    if infer_datetime_format != lib.no_default:
+        error_message = "Snowpark pandas to_datetime API doesn't yet support non exact format matching"
+    elif infer_datetime_format != lib.no_default:
         # infer_datetime_format is deprecated since version 2.0.0
-        return True
-    if origin == "julian":
+        error_message = "Snowpark pandas to_datetime API doesn't support 'infer_datetime_format' parameter"
+    elif origin == "julian":
         # default for julian calendar support
-        return True
-    if errors == "ignore":
+        error_message = (
+            "Snowpark pandas to_datetime API doesn't yet support julian calendar"
+        )
+    elif errors == "ignore":
         # ignore requires return the whole original input which is not applicable in Snowflake
-        return True
-    return False
+        error_message = "Snowpark pandas to_datetime API doesn't yet support 'ignore' value for errors parameter"
+
+    if error_message:
+        ErrorMessage.not_implemented(error_message)
 
 
 def convert_dateoffset_to_interval(

diff --git a/src/snowflake/snowpark/modin/plugin/compiler/query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/query_compiler.py
@@ -1476,33 +1476,6 @@ def melt(self, *args, **kwargs):  # noqa: PR02
         """
         return DataFrameDefault.register(pandas.DataFrame.melt)(self, *args, **kwargs)
 
-    @doc_utils.add_refer_to("DataFrame.sort_values")
-    def sort_columns_by_row_values(self, rows, ascending=True, **kwargs):  # noqa: PR02
-        """
-        Reorder the columns based on the lexicographic order of the given rows.
-
-        Parameters
-        ----------
-        rows : label or list of labels
-            The row or rows to sort by.
-        ascending : bool, default: True
-            Sort in ascending order (True) or descending order (False).
-        kind : {"quicksort", "mergesort", "heapsort"}
-        na_position : {"first", "last"}
-        ignore_index : bool
-        key : callable(pandas.Index) -> pandas.Index, optional
-        **kwargs : dict
-            Serves the compatibility purpose. Does not affect the result.
-
-        Returns
-        -------
-        BaseQueryCompiler
-            New QueryCompiler that contains result of the sort.
-        """
-        return DataFrameDefault.register(pandas.DataFrame.sort_values)(
-            self, by=rows, axis=1, ascending=ascending, **kwargs
-        )
-
     # END Abstract map across rows/columns
 
     # Map across rows/columns
@@ -3610,13 +3583,6 @@ def str_cat(self, others, sep=None, na_rep=None, join="left"):
             self, others, sep, na_rep, join
         )
 
-    @doc_utils.doc_str_method(
-        refer_to="casefold",
-        params="",
-    )
-    def str_casefold(self):
-        return StrDefault.register(pandas.Series.str.casefold)(self)
-
     @doc_utils.doc_str_method(refer_to="__getitem__", params="key : object")
     def str___getitem__(self, key):
         return StrDefault.register(pandas.Series.str.__getitem__)(self, key)