When converting snowpark dataframe to pandas, cast decimal columns to float64
snowflakedb · Jan 11, 2024 · 77dab49 · 77dab49
1 parent d924d78
commit 77dab49
Show file tree

Hide file tree

Showing 2 changed files with 39 additions and 13 deletions.
diff --git a/src/snowflake/snowpark/_internal/server_connection.py b/src/snowflake/snowpark/_internal/server_connection.py
@@ -695,24 +695,28 @@ def _fix_pandas_df_integer(
         if (
             FIELD_ID_TO_NAME.get(column_metadata.type_code) == "FIXED"
             and column_metadata.precision is not None
-            and column_metadata.scale == 0
             and not str(pandas_dtype).startswith("int")
         ):
-            # When scale = 0 and precision values are between 10-20, the integers fit into int64.
-            # If we rely only on pandas.to_numeric, it loses precision value on large integers, therefore
-            # we try to strictly use astype("int64") in this scenario. If the values are too large to
-            # fit in int64, an OverflowError is thrown and we rely on to_numeric to choose and appropriate
-            # floating datatype to represent the number.
-            if column_metadata.precision > 10:
-                try:
-                    pd_df[pandas_col_name] = pd_df[pandas_col_name].astype("int64")
-                except OverflowError:
+            if column_metadata.scale == 0:
+                # When scale = 0 and precision values are between 10-20, the integers fit into int64.
+                # If we rely only on pandas.to_numeric, it loses precision value on large integers, therefore
+                # we try to strictly use astype("int64") in this scenario. If the values are too large to
+                # fit in int64, an OverflowError is thrown and we rely on to_numeric to choose an appropriate
+                # floating datatype to represent the number.
+                if column_metadata.precision > 10:
+                    try:
+                        pd_df[pandas_col_name] = pd_df[pandas_col_name].astype("int64")
+                    except OverflowError:
+                        pd_df[pandas_col_name] = pandas.to_numeric(
+                            pd_df[pandas_col_name], downcast="integer"
+                        )
+                else:
                     pd_df[pandas_col_name] = pandas.to_numeric(
                         pd_df[pandas_col_name], downcast="integer"
                     )
             else:
-                pd_df[pandas_col_name] = pandas.to_numeric(
-                    pd_df[pandas_col_name], downcast="integer"
-                )
+                # For decimal columns, we want to cast them to float64 because pandas doesn't
+                # recognize the decimal type.
+                pd_df[pandas_col_name] = pd_df[pandas_col_name].astype("float64")
 
     return pd_df
diff --git a/tests/integ/test_df_to_pandas.py b/tests/integ/test_df_to_pandas.py
@@ -141,6 +141,28 @@ def test_to_pandas_precision_for_number_38_0(session):
     assert pdf["A"].min() == -9223372036854775808
 
 
+def test_to_pandas_precision_for_number_38_6_and_others(session):
+    df = session.sql(
+        """
+        SELECT
+            num1,
+            num2,
+            DIV0(num1, num2) AS division,
+            DIV0(CAST(num1 AS INTEGER), CAST(num2 AS INTEGER)) AS division_cast,
+            ROUND(division_cast, 2) as rnd_cast
+        FROM (VALUES
+            (1, 11)
+        ) X(num1, num2);
+        """
+    )
+
+    pdf = df.to_pandas()
+
+    assert pdf["division"].dtype == "float64"
+    assert pdf["division_cast"].dtype == "float64"
+    assert pdf["rnd_cast"].dtype == "float64"
+
+
 def test_to_pandas_non_select(session):
     # `with ... select ...` is also a SELECT statement
     isinstance(session.sql("select 1").to_pandas(), PandasDF)