Skip to content

Commit

Permalink
When converting snowpark dataframe to pandas, cast decimal columns to…
Browse files Browse the repository at this point in the history
… float64
  • Loading branch information
sfc-gh-xhe committed Jan 11, 2024
1 parent d924d78 commit 77dab49
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 13 deletions.
30 changes: 17 additions & 13 deletions src/snowflake/snowpark/_internal/server_connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -695,24 +695,28 @@ def _fix_pandas_df_integer(
if (
FIELD_ID_TO_NAME.get(column_metadata.type_code) == "FIXED"
and column_metadata.precision is not None
and column_metadata.scale == 0
and not str(pandas_dtype).startswith("int")
):
# When scale = 0 and precision values are between 10-20, the integers fit into int64.
# If we rely only on pandas.to_numeric, it loses precision value on large integers, therefore
# we try to strictly use astype("int64") in this scenario. If the values are too large to
# fit in int64, an OverflowError is thrown and we rely on to_numeric to choose an appropriate
# floating datatype to represent the number.
if column_metadata.precision > 10:
try:
pd_df[pandas_col_name] = pd_df[pandas_col_name].astype("int64")
except OverflowError:
if column_metadata.scale == 0:
# When scale = 0 and precision values are between 10-20, the integers fit into int64.
# If we rely only on pandas.to_numeric, it loses precision value on large integers, therefore
# we try to strictly use astype("int64") in this scenario. If the values are too large to
# fit in int64, an OverflowError is thrown and we rely on to_numeric to choose an appropriate
# floating datatype to represent the number.
if column_metadata.precision > 10:
try:
pd_df[pandas_col_name] = pd_df[pandas_col_name].astype("int64")
except OverflowError:
pd_df[pandas_col_name] = pandas.to_numeric(
pd_df[pandas_col_name], downcast="integer"
)
else:
pd_df[pandas_col_name] = pandas.to_numeric(
pd_df[pandas_col_name], downcast="integer"
)
else:
pd_df[pandas_col_name] = pandas.to_numeric(
pd_df[pandas_col_name], downcast="integer"
)
# For decimal columns, we want to cast it into float64 because pandas doesn't
# recognize decimal type.
pd_df[pandas_col_name] = pd_df[pandas_col_name].astype("float64")

return pd_df
22 changes: 22 additions & 0 deletions tests/integ/test_df_to_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,28 @@ def test_to_pandas_precision_for_number_38_0(session):
assert pdf["A"].min() == -9223372036854775808


def test_to_pandas_precision_for_number_38_6_and_others(session):
    """Verify the dtype of computed numeric columns after ``to_pandas()``.

    Runs a single-row query that produces three derived columns (a DIV0 of the
    raw values, a DIV0 of the values cast to INTEGER, and a ROUND of that
    second quotient), converts the result to pandas, and asserts that each of
    those columns has dtype ``float64``.
    """
    query = """
SELECT
num1,
num2,
DIV0(num1, num2) AS division,
DIV0(CAST(num1 AS INTEGER), CAST(num2 AS INTEGER)) AS division_cast,
ROUND(division_cast, 2) as rnd_cast
FROM (VALUES
(1, 11)
) X(num1, num2);
"""
    pandas_df = session.sql(query).to_pandas()

    # Same three checks, in the same order, as individual assertions would make.
    for column_name in ("division", "division_cast", "rnd_cast"):
        assert pandas_df[column_name].dtype == "float64"


def test_to_pandas_non_select(session):
# `with ... select ...` is also a SELECT statement
isinstance(session.sql("select 1").to_pandas(), PandasDF)
Expand Down

0 comments on commit 77dab49

Please sign in to comment.