From 1eb7a17c938a60e17715c993be534211cbf97cf8 Mon Sep 17 00:00:00 2001 From: Adam Ling Date: Fri, 19 Apr 2024 17:39:19 -0700 Subject: [PATCH] SNOW-1284674: fix timestamp data comparison beyond datetime64[ns] range (#1387) --- CHANGELOG.md | 1 + src/snowflake/snowpark/mock/_util.py | 21 ++++++++++++++++++++- tests/integ/scala/test_column_suite.py | 14 ++++++++++---- tests/utils.py | 13 +++++++++++++ 4 files changed, 44 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c4a099d10f7..e70f8f72e03 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -60,6 +60,7 @@ - Fixed a bug where `statement_params` was not passed to query executions that register stored procedures and user defined functions. - Fixed a bug causing `snowflake.snowpark.Session.file.get_stream` to fail for quoted stage locations - Fixed a bug in local testing implementation of to_object, to_array and to_binary to better handle null inputs. +- Fixed a bug in the local testing implementation where timestamp data comparison could not handle years beyond 2262. - Fixed a bug in local testing that `Session.builder.getOrCreate` should return the created mock session. ## 1.14.0 (2024-03-20) diff --git a/src/snowflake/snowpark/mock/_util.py b/src/snowflake/snowpark/mock/_util.py index d45a8cf7691..fe771bbae60 100644 --- a/src/snowflake/snowpark/mock/_util.py +++ b/src/snowflake/snowpark/mock/_util.py @@ -201,6 +201,25 @@ def fix_drift_between_column_sf_type_and_dtype(col: ColumnEmulator): and col.apply(lambda x: x is None).any() ): # non-object dtype converts None to NaN for numeric columns return col + """ + notes for the timestamp object type drift here, ideally datetime64[us] should be used here because: + 1. python doesn't have built-in datetime nanosecond support: + https://github.com/python/cpython/blob/3.12/Lib/_pydatetime.py + + 2. 
numpy datetime64 restrictions, https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-units: + datetime64[ns] supports nanoseconds, the year range is limited to [1678 AD, 2262 AD] + datetime64[us] supports microseconds, the year range is more relaxed [290301 BC, 294241 AD] + + 3. snowflake date range recommendation + according to snowflake https://docs.snowflake.com/en/sql-reference/data-types-datetime#date + the recommended year range is 1582 to 9999 + + however, on Python 3.8 max supported version pandas 2.0.3 + version numpy 1.24.4 does not recognize datetime64[us], + always defaults to unit ns, leading to an out-of-bounds timestamp error. + + based on this information and for simplicity, we can use object for now, then move onto datetime64[us], + then seek solution for nanosecond. + """ sf_type_to_dtype = { ArrayType: object, BinaryType: object, @@ -215,7 +234,7 @@ def fix_drift_between_column_sf_type_and_dtype(col: ColumnEmulator): NullType: object, ShortType: numpy.int8 if not col.sf_type.nullable else "Int8", StringType: object, - TimestampType: "datetime64[ns]", + TimestampType: object, # "datetime64[us]", not working on Python3.8 pandas 2.0.3 + numpy 1.24.4 TimeType: object, VariantType: object, MapType: object, diff --git a/tests/integ/scala/test_column_suite.py b/tests/integ/scala/test_column_suite.py index 64d613bdac5..facd5c2167c 100644 --- a/tests/integ/scala/test_column_suite.py +++ b/tests/integ/scala/test_column_suite.py @@ -130,6 +130,15 @@ def test_gt_and_lt(session): assert test_data1.where(test_data1["NUM"] < 2).collect() == [Row(1, True, "a")] assert test_data1.where(test_data1["NUM"] < lit(2)).collect() == [Row(1, True, "a")] + test_data_datetime = TestData.datetime_primitives2(session) + res = datetime.datetime(2000, 5, 6, 0, 0, 0) + assert test_data_datetime.where( + test_data_datetime["timestamp"] > res + ).collect() == [Row(datetime.datetime(9999, 12, 31, 0, 0, 0, 123456))] + assert test_data_datetime.where( + 
test_data_datetime["timestamp"] < res + ).collect() == [Row(datetime.datetime(1583, 1, 1, 23, 59, 59, 567890))] + @pytest.mark.localtest def test_leq_and_geq(session): @@ -754,11 +763,8 @@ def test_in_expression_2_in_with_subquery(session): Utils.check_answer(df4, [Row(False), Row(True), Row(True)]) +@pytest.mark.localtest def test_in_expression_3_with_all_types(session, local_testing_mode): - # TODO: local testing support to_timestamp_ntz - # stored proc by default uses timestime type according to: - # https://docs.snowflake.com/en/sql-reference/parameters#timestamp-type-mapping - # we keep the test here for future reference schema = StructType( [ StructField("id", LongType()), diff --git a/tests/utils.py b/tests/utils.py index 4bc8e7c475f..84c7623fc22 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -803,6 +803,19 @@ def datetime_primitives1(cls, session: "Session") -> DataFrame: ) return session.create_dataframe(data, schema) + @classmethod + def datetime_primitives2(cls, session: "Session") -> DataFrame: + data = [ + "9999-12-31 00:00:00.123456", + "1583-01-01 23:59:59.56789", + ] + schema = StructType( + [ + StructField("timestamp", TimestampType(TimestampTimeZone.NTZ)), + ] + ) + return session.create_dataframe(data, schema) + @classmethod def time_primitives1(cls, session: "Session") -> DataFrame: # simple string data