From 27ae233bf9e07f2c2671a53dd3e16ba6745cc274 Mon Sep 17 00:00:00 2001
From: Xiaoxiao He
Date: Fri, 12 Jan 2024 13:01:11 -0800
Subject: [PATCH] [SNOW-990542] When converting snowpark dataframe to pandas,
 cast decimal columns to float type (#1201)

* When converting snowpark dataframe to pandas, cast decimal columns to float64

* addressing comments

* changelog

* use to_numeric

* Revert "use to_numeric"

This reverts commit 21441fdcadc881ed5f5d6ade4999e0fca19626a5.

* comment

* try downcast and force float64

---------

Co-authored-by: sfc-gh-sfan
---
 CHANGELOG.md                                |  1 +
 .../snowpark/_internal/server_connection.py | 43 +++++++++++--------
 src/snowflake/snowpark/mock/_connection.py  |  8 ++--
 tests/integ/test_df_to_pandas.py            | 22 ++++++++++
 4 files changed, 52 insertions(+), 22 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cd99b6f9a15..5332448fa79 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,7 @@
 - Earlier timestamp columns without a timezone would be converted to nanosecond epochs and inferred as `LongType()`, but will now correctly be maintained as timestamp values and be inferred as `TimestampType(TimestampTimeZone.NTZ)`.
 - Earlier timestamp columns with a timezone would be inferred as `TimestampType(TimestampTimeZone.NTZ)` and lose timezone information, but will now be correctly inferred as `TimestampType(TimestampTimeZone.LTZ)` with timezone information retained.
 - Set session parameter `PYTHON_SNOWPARK_USE_LOGICAL_TYPE_FOR_CREATE_DATAFRAME` to revert to the old behavior. It is recommended that you update your code soon to align with the correct behavior, as the parameter will be removed in the future.
+- Fixed a bug where `DataFrame.to_pandas` returned decimal columns with a non-zero scale as `object` dtype in `pandas`. Such columns are now cast to float64.
 
 ### Behavior Changes (API Compatible)
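As a standalone illustration of the issue the new CHANGELOG entry describes (a minimal sketch with invented sample values, not code from this patch):

    from decimal import Decimal

    import pandas

    # pandas has no native decimal dtype, so DECIMAL values fetched from the
    # server arrive as Python Decimal objects and the column dtype becomes
    # object (reported as "O").
    s = pandas.Series([Decimal("0.09"), Decimal("1.50")])
    assert s.dtype == object

    # Casting to float64 restores a numeric dtype, trading exact decimal
    # representation for native floating point.
    assert s.astype("float64").dtype == "float64"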
diff --git a/src/snowflake/snowpark/_internal/server_connection.py b/src/snowflake/snowpark/_internal/server_connection.py
index cd15ff0f137..a442d724553 100644
--- a/src/snowflake/snowpark/_internal/server_connection.py
+++ b/src/snowflake/snowpark/_internal/server_connection.py
@@ -441,12 +441,12 @@ def _to_data_or_iter(
         data_or_iter = (
             map(
                 functools.partial(
-                    _fix_pandas_df_integer, results_cursor=results_cursor
+                    _fix_pandas_df_fixed_type, results_cursor=results_cursor
                 ),
                 results_cursor.fetch_pandas_batches(),
             )
             if to_iter
-            else _fix_pandas_df_integer(
+            else _fix_pandas_df_fixed_type(
                 results_cursor.fetch_pandas_all(), results_cursor
             )
         )
@@ -677,7 +677,7 @@ def _get_client_side_session_parameter(self, name: str, default_value: Any) -> A
     )
 
 
-def _fix_pandas_df_integer(
+def _fix_pandas_df_fixed_type(
     pd_df: "pandas.DataFrame", results_cursor: SnowflakeCursor
 ) -> "pandas.DataFrame":
     """The compiler does not make any guarantees about the return types - only that they will be large enough for the result.
@@ -695,24 +695,31 @@
         if (
             FIELD_ID_TO_NAME.get(column_metadata.type_code) == "FIXED"
             and column_metadata.precision is not None
-            and column_metadata.scale == 0
-            and not str(pandas_dtype).startswith("int")
         ):
-            # When scale = 0 and precision values are between 10-20, the integers fit into int64.
-            # If we rely only on pandas.to_numeric, it loses precision value on large integers, therefore
-            # we try to strictly use astype("int64") in this scenario. If the values are too large to
-            # fit in int64, an OverflowError is thrown and we rely on to_numeric to choose and appropriate
-            # floating datatype to represent the number.
-            if column_metadata.precision > 10:
-                try:
-                    pd_df[pandas_col_name] = pd_df[pandas_col_name].astype("int64")
-                except OverflowError:
+            if column_metadata.scale == 0 and not str(pandas_dtype).startswith("int"):
+                # When scale = 0 and precision values are between 10-20, the integers fit into int64.
+                # If we rely only on pandas.to_numeric, it loses precision on large integers; therefore
+                # we try to strictly use astype("int64") in this scenario. If the values are too large to
+                # fit in int64, an OverflowError is thrown and we rely on to_numeric to choose an appropriate
+                # floating datatype to represent the number.
+                if column_metadata.precision > 10:
+                    try:
+                        pd_df[pandas_col_name] = pd_df[pandas_col_name].astype("int64")
+                    except OverflowError:
+                        pd_df[pandas_col_name] = pandas.to_numeric(
+                            pd_df[pandas_col_name], downcast="integer"
+                        )
+                else:
                     pd_df[pandas_col_name] = pandas.to_numeric(
                         pd_df[pandas_col_name], downcast="integer"
                     )
-            else:
-                pd_df[pandas_col_name] = pandas.to_numeric(
-                    pd_df[pandas_col_name], downcast="integer"
-                )
+            elif column_metadata.scale > 0 and not str(pandas_dtype).startswith(
+                "float"
+            ):
+                # For decimal columns, cast to float64 because pandas does not
+                # have a native decimal dtype.
+                pd_df[pandas_col_name] = pandas.to_numeric(pd_df[pandas_col_name], downcast="float")
+                if pd_df[pandas_col_name].dtype == "O":
+                    pd_df[pandas_col_name] = pd_df[pandas_col_name].astype("float64")
 
     return pd_df

diff --git a/src/snowflake/snowpark/mock/_connection.py b/src/snowflake/snowpark/mock/_connection.py
index bc858f0a092..27c54c166c6 100644
--- a/src/snowflake/snowpark/mock/_connection.py
+++ b/src/snowflake/snowpark/mock/_connection.py
@@ -337,12 +337,12 @@ def _to_data_or_iter(
         data_or_iter = (
             map(
                 functools.partial(
-                    _fix_pandas_df_integer, results_cursor=results_cursor
+                    _fix_pandas_df_fixed_type, results_cursor=results_cursor
                 ),
                 results_cursor.fetch_pandas_batches(),
             )
             if to_iter
-            else _fix_pandas_df_integer(
+            else _fix_pandas_df_fixed_type(
                 results_cursor.fetch_pandas_all(), results_cursor
             )
         )
@@ -436,7 +436,7 @@ def execute(
         pandas_df = pandas.DataFrame()
         for col_name in res.columns:
             pandas_df[unquote_if_quoted(col_name)] = res[col_name].tolist()
-        rows = _fix_pandas_df_integer(res)
+        rows = _fix_pandas_df_fixed_type(res)
 
         # the following implementation is just to make DataFrame.to_pandas_batches API workable
         # in snowflake, large data result are split into multiple data chunks
@@ -580,7 +580,7 @@ def get_result_query_id(self, plan: SnowflakePlan, **kwargs) -> str:
     return result_set["sfqid"]
 
 
-def _fix_pandas_df_integer(table_res: TableEmulator) -> "pandas.DataFrame":
+def _fix_pandas_df_fixed_type(table_res: TableEmulator) -> "pandas.DataFrame":
     pd_df = pandas.DataFrame()
     for col_name in table_res.columns:
         col_sf_type = table_res.sf_types[col_name]
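The @@ -695 hunk above carries the whole coercion strategy for FIXED columns. The following standalone sketch restates that strategy outside the diff context; coerce_fixed and the sample values are hypothetical, and the precision/scale parameters stand in for the cursor metadata the real code reads:

    from decimal import Decimal

    import pandas

    def coerce_fixed(col: pandas.Series, precision: int, scale: int) -> pandas.Series:
        if scale == 0 and not str(col.dtype).startswith("int"):
            if precision > 10:
                try:
                    # astype("int64") is exact; to_numeric alone may round large
                    # integers through float64 and lose precision.
                    return col.astype("int64")
                except OverflowError:
                    return pandas.to_numeric(col, downcast="integer")
            return pandas.to_numeric(col, downcast="integer")
        if scale > 0 and not str(col.dtype).startswith("float"):
            col = pandas.to_numeric(col, downcast="float")
            # to_numeric may leave Decimal objects as object dtype, so force
            # float64 as a fallback.
            return col.astype("float64") if col.dtype == "O" else col
        return col

    ints = pandas.Series([2**62], dtype="object")  # fits int64 exactly
    decs = pandas.Series([Decimal("0.0909090909"), Decimal("2.50")])
    assert coerce_fixed(ints, precision=19, scale=0).dtype == "int64"
    assert str(coerce_fixed(decs, precision=12, scale=10).dtype).startswith("float")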
diff --git a/tests/integ/test_df_to_pandas.py b/tests/integ/test_df_to_pandas.py
index 23f861f4c94..6dd1ecb4345 100644
--- a/tests/integ/test_df_to_pandas.py
+++ b/tests/integ/test_df_to_pandas.py
@@ -141,6 +141,28 @@ def test_to_pandas_precision_for_number_38_0(session):
     assert pdf["A"].min() == -9223372036854775808
 
 
+def test_to_pandas_precision_for_non_zero_scale(session):
+    df = session.sql(
+        """
+        SELECT
+            num1,
+            num2,
+            DIV0(num1, num2) AS A,
+            DIV0(CAST(num1 AS INTEGER), CAST(num2 AS INTEGER)) AS B,
+            ROUND(B, 2) AS C
+        FROM (VALUES
+            (1, 11)
+        ) X(num1, num2);
+        """
+    )
+
+    pdf = df.to_pandas()
+
+    assert pdf["A"].dtype == "float64"
+    assert pdf["B"].dtype == "float64"
+    assert pdf["C"].dtype == "float64"
+
+
 def test_to_pandas_non_select(session):
     # `with ... select ...` is also a SELECT statement
     isinstance(session.sql("select 1").to_pandas(), PandasDF)
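For reference, a hypothetical end-to-end check of the new behavior, mirroring the test added above and assuming an already-created Snowpark session:

    # DIV0 produces a DECIMAL result with a non-zero scale; before this patch
    # the column came back as object dtype, now it is cast to float64.
    pdf = session.sql("SELECT DIV0(1, 11) AS A").to_pandas()
    assert pdf["A"].dtype == "float64"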