diff --git a/CHANGELOG.md b/CHANGELOG.md
index 98feea9cdaa..b48523e8f51 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,24 @@
 
 ## 1.27.0 (TBD)
 
+### Snowpark Python API Updates
+
+#### New Features
+
+- Added support for the following functions in `functions.py`:
+  - `array_reverse`
+  - `divnull`
+  - `map_cat`
+  - `map_contains_key`
+  - `map_keys`
+  - `nullifzero`
+  - `snowflake_cortex_sentiment`
+- Added `Catalog` class to manage Snowflake objects. It can be accessed via `Session.catalog`.
+
+#### Improvements
+
+- Updated README.md to include instructions on how to verify package signatures using `cosign`.
+
 ### Snowpark pandas API Updates
 
 #### New Features
@@ -9,7 +27,34 @@
 - Added support for `Series.str.ljust` and `Series.str.rjust`.
 - Added support for `Series.str.center`.
 - Added support for `Series.str.pad`.
+- Added support for applying the Snowpark Python function `snowflake_cortex_sentiment`.
+- Added support for `DataFrame.map`.
+- Added support for `DataFrame.from_dict` and `DataFrame.from_records`.
+- Added support for mixed case field names in struct type columns.
+- Added support for `SeriesGroupBy.unique`.
+- Added support for `Series.dt.strftime` with the following directives:
+  - %d: Day of the month as a zero-padded decimal number.
+  - %m: Month as a zero-padded decimal number.
+  - %Y: Year with century as a decimal number.
+  - %H: Hour (24-hour clock) as a zero-padded decimal number.
+  - %M: Minute as a zero-padded decimal number.
+  - %S: Second as a zero-padded decimal number.
+  - %f: Microsecond as a decimal number, zero-padded to 6 digits.
+  - %j: Day of the year as a zero-padded decimal number.
+  - %X: Locale’s appropriate time representation.
+  - %%: A literal '%' character.
+- Added support for `Series.between`.
+
+#### Bug Fixes
+
+- Fixed a bug where system functions called through `session.call` had incorrect type conversion.
+
+#### Improvements
+
+- Improved performance of `DataFrame.map`, `Series.apply` and `Series.map` methods by mapping numpy functions to Snowpark functions if possible.
+- Updated integration testing for `session.lineage.trace` to exclude deleted objects.
+- Added documentation for `DataFrame.map`.
+- Improved performance of `DataFrame.apply` by mapping numpy functions to Snowpark functions if possible.
+- Added documentation on the extent of Snowpark pandas interoperability with scikit-learn.
 
 ## 1.26.0 (2024-12-05)
 
diff --git a/README.md b/README.md
index 6bf3563b662..a0514136ebb 100644
--- a/README.md
+++ b/README.md
@@ -160,6 +160,28 @@ pandas_df = df.to_pandas()
 
 Note that the above Snowpark pandas commands will work if Snowpark is installed with the `[modin]` option, the additional `[pandas]` installation is not required.
 
+## Verifying Package Signatures
+
+To ensure the authenticity and integrity of the Python package, follow the steps below to verify the package signature using `cosign`.
+
+**Steps to verify the signature:**
+- Install cosign:
+  - This example uses the Go installation method: [installing-cosign-with-go](https://edu.chainguard.dev/open-source/sigstore/cosign/how-to-install-cosign/#installing-cosign-with-go)
+- Download the package file from a repository such as PyPI:
+  - https://pypi.org/project/snowflake-snowpark-python/#files
+- Download the signature files from the release tag, replacing the version number with the version you are verifying:
+  - https://github.com/snowflakedb/snowpark-python/releases/tag/v1.22.1
+- Verify the signature:
+  ````bash
+  # replace the version number with the version you are verifying
+  ./cosign verify-blob snowflake_snowpark_python-1.22.1-py3-none-any.whl \
+  --certificate snowflake_snowpark_python-1.22.1-py3-none-any.whl.crt \
+  --certificate-identity https://github.com/snowflakedb/snowpark-python/.github/workflows/python-publish.yml@refs/tags/v1.22.1 \
+  --certificate-oidc-issuer https://token.actions.githubusercontent.com \
+  --signature snowflake_snowpark_python-1.22.1-py3-none-any.whl.sig
+  Verified OK
+  ````
+
 ## Contributing
 Please refer to [CONTRIBUTING.md][contributing].
diff --git a/docs/source/modin/index.rst b/docs/source/modin/index.rst
index 99ea3881bd8..68e8ef7c8e9 100644
--- a/docs/source/modin/index.rst
+++ b/docs/source/modin/index.rst
@@ -19,5 +19,6 @@ For your convenience, here is all the :doc:`Supported APIs `
     window
     groupby
     resampling
+    interoperability
     numpy
     performance
\ No newline at end of file
diff --git a/docs/source/modin/interoperability.rst b/docs/source/modin/interoperability.rst
new file mode 100644
index 00000000000..c928eebb117
--- /dev/null
+++ b/docs/source/modin/interoperability.rst
@@ -0,0 +1,153 @@
+===========================================
+Interoperability with third party libraries
+===========================================
+
+Many third party libraries are interoperable with pandas, for example by accepting pandas DataFrame objects as function
+inputs. Below is a non-exhaustive list of third party library use cases with pandas, noting whether each method
+also works with Snowpark pandas.
+
+Snowpark pandas supports the `dataframe interchange protocol `_, which
+some libraries use to interoperate with Snowpark pandas at the same level of support as pandas.
+
+plotly.express
+==============
+
+The following table is structured as follows: The first column contains the name of a method in the ``plotly.express`` module.
+The second column is a flag for whether or not interoperability is guaranteed with Snowpark pandas. For each of these
+operations, we validate that passing in Snowpark pandas dataframes or series as the data inputs behaves equivalently
+to passing in pandas dataframes or series.
+
+.. note::
+    ``Y`` stands for yes, i.e., interoperability is guaranteed with this method, and ``N`` stands for no.
+
+
+.. note::
+    Currently only plotly versions <6.0.0 are supported through the dataframe interchange protocol.
+
++-------------------------+---------------------------------------------+--------------------------------------------+
+| Method name             | Interoperable with Snowpark pandas? 
(Y/N) | Notes for current implementation | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``scatter`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``line`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``area`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``timeline`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``violin`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``bar`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``histogram`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``pie`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``treemap`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``sunburst`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``icicle`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``scatter_matrix`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``funnel`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``density_heatmap`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``boxplot`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ +| ``imshow`` | Y | | ++-------------------------+---------------------------------------------+--------------------------------------------+ + + +scikit-learn +============ + +We break down scikit-learn interoperability by categories of scikit-learn +operations. + +For each category, we provide a table of interoperability with the following +structure: The first column describes a scikit-learn operation that may include +multiple method calls. The second column is a flag for whether or not +interoperability is guaranteed with Snowpark pandas. For each of these methods, +we validate that passing in Snowpark pandas objects behaves equivalently to +passing in pandas objects. + +.. note:: + ``Y`` stands for yes, i.e., interoperability is guaranteed with this method, and ``N`` stands for no. + +.. note:: + While some scikit-learn methods accept Snowpark pandas inputs, their + performance with Snowpark pandas inputs is often much worse than their + performance with native pandas inputs. Generally we recommend converting + Snowpark pandas inputs to pandas with ``to_pandas()`` before passing them + to scikit-learn. 
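+
+As a minimal sketch of this recommendation (assuming a Snowpark pandas DataFrame
+``df`` with hypothetical feature columns ``"x1"`` and ``"x2"`` and a label column
+``"y"``), the conversion could look like:
+
+.. code-block:: python
+
+    from sklearn.linear_model import LogisticRegression
+
+    # Convert the Snowpark pandas DataFrame to native pandas once, locally,
+    # so scikit-learn works on in-memory data instead of issuing many queries.
+    native_df = df.to_pandas()
+
+    model = LogisticRegression()
+    model.fit(native_df[["x1", "x2"]], native_df["y"])
+    predictions = model.predict(native_df[["x1", "x2"]])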
+ + +Classification +-------------- + ++--------------------------------------------+---------------------------------------------+---------------------------------+ +| Operation | Interoperable with Snowpark pandas? (Y/N) | Notes for current implementation| ++--------------------------------------------+---------------------------------------------+---------------------------------+ +| Fitting a ``LinearDiscriminantAnalysis`` | Y | | +| classifier with the ``fit()`` method and | | | +| classifying data with the ``predict()`` | | | +| method. | | | ++--------------------------------------------+---------------------------------------------+---------------------------------+ + + +Regression +---------- + ++--------------------------------------------+---------------------------------------------+---------------------------------+ +| Operation | Interoperable with Snowpark pandas? (Y/N) | Notes for current implementation| ++--------------------------------------------+---------------------------------------------+---------------------------------+ +| Fitting a ``LogisticRegression`` model | Y | | +| with the ``fit()`` method and predicting | | | +| results with the ``predict()`` method. | | | ++--------------------------------------------+---------------------------------------------+---------------------------------+ + +Clustering +---------- + ++--------------------------------------------+---------------------------------------------+---------------------------------+ +| Clustering method | Interoperable with Snowpark pandas? (Y/N) | Notes for current implementation| ++--------------------------------------------+---------------------------------------------+---------------------------------+ +| ``KMeans.fit()`` | Y | | ++--------------------------------------------+---------------------------------------------+---------------------------------+ + + +Dimensionality reduction +------------------------ + ++--------------------------------------------+---------------------------------------------+---------------------------------+ +| Operation | Interoperable with Snowpark pandas? (Y/N) | Notes for current implementation| ++--------------------------------------------+---------------------------------------------+---------------------------------+ +| Getting the principal components of a | Y | | +| numerical dataset with ``PCA.fit()``. | | | ++--------------------------------------------+---------------------------------------------+---------------------------------+ + + +Model selection +------------------------ + ++--------------------------------------------+---------------------------------------------+-----------------------------------------------+ +| Operation | Interoperable with Snowpark pandas? (Y/N) | Notes for current implementation | ++--------------------------------------------+---------------------------------------------+-----------------------------------------------+ +| Choosing parameters for a | Y | ``RandomizedSearchCV`` causes Snowpark pandas | +| ``LogisticRegression`` model with | | to issue many queries. We strongly recommend | +| ``RandomizedSearchCV.fit()``. 
| | converting Snowpark pandas inputs to pandas | +| | | before using ``RandomizedSearchCV`` | ++--------------------------------------------+---------------------------------------------+-----------------------------------------------+ + +Preprocessing +------------- + ++--------------------------------------------+---------------------------------------------+-----------------------------------------------+ +| Operation | Interoperable with Snowpark pandas? (Y/N) | Notes for current implementation | ++--------------------------------------------+---------------------------------------------+-----------------------------------------------+ +| Scaling training data with | Y | | +| ``MaxAbsScaler.fit_transform()``. | | | ++--------------------------------------------+---------------------------------------------+-----------------------------------------------+ diff --git a/docs/source/modin/supported/agg_supp.rst b/docs/source/modin/supported/agg_supp.rst new file mode 100644 index 00000000000..5b3a2c174c0 --- /dev/null +++ b/docs/source/modin/supported/agg_supp.rst @@ -0,0 +1,62 @@ +:orphan: + +Supported Aggregation Functions +==================================== + +This page lists which aggregation functions are supported by ``DataFrame.agg``, +``Series.agg``, ``DataFrameGroupBy.agg``, and ``SeriesGroupBy.agg``. +The following table is structured as follows: The first column contains the aggregation function's name. +The second column is a flag for whether or not the aggregation is supported by ``DataFrame.agg``. The +third column is a flag for whether or not the aggregation is supported by ``Series.agg``. The fourth column +is whether or not the aggregation is supported by ``DataFrameGroupBy.agg``. The fifth column is whether or not +the aggregation is supported by ``SeriesGroupBy.agg``. + +.. note:: + ``Y`` stands for yes (supports distributed implementation), ``N`` stands for no (API simply errors out), + and ``P`` stands for partial (meaning some parameters may not be supported yet). + + Both Python builtin and NumPy functions are supported for ``DataFrameGroupBy.agg`` and ``SeriesGroupBy.agg``. + ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| Aggregation Function | ``DataFrame.agg`` supports? (Y/N/P) | ``Series.agg`` supports? (Y/N/P) | ``DataFrameGroupBy.agg`` supports? (Y/N/P) | ``SeriesGroupBy.agg`` supports? (Y/N/P) | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``count`` | ``Y`` for ``axis=0``. | ``Y`` | ``Y`` | ``Y`` | +| | For ``axis=1``, ``Y`` if index is | | | | +| | not a MultiIndex. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``mean`` | ``Y`` for ``axis=0``. | ``Y`` | ``Y`` | ``Y`` | +| | ``N`` for ``axis=1``. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``min`` | ``Y`` for ``axis=0``. | ``Y`` | ``Y`` | ``Y`` | +| | For ``axis=1``, ``Y`` if index is | | | | +| | not a MultiIndex. 
| | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``max`` | ``Y`` for ``axis=0``. | ``Y`` | ``Y`` | ``Y`` | +| | For ``axis=1``, ``Y`` if index is | | | | +| | not a MultiIndex. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``sum`` | ``Y`` for ``axis=0``. | ``Y`` | ``Y`` | ``Y`` | +| | For ``axis=1``, ``Y`` if index is | | | | +| | not a MultiIndex. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``median`` | ``Y`` for ``axis=0``. | ``Y`` | ``Y`` | ``Y`` | +| | ``N`` for ``axis=1``. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``size`` | ``Y`` for ``axis=0``. | ``Y`` | ``Y`` | ``Y`` | +| | ``N`` for ``axis=1``. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``std`` | ``P`` for ``axis=0`` - only when | ``P`` - only when ``ddof=0`` | ``P`` - only when ``ddof=0`` | ``P`` - only when ``ddof=0`` | +| | ``ddof=0`` or ``ddof=1``. | or ``ddof=1``. | or ``ddof=1``. | or ``ddof=1``. | +| | ``N`` for ``axis=1``. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``var`` | ``P`` for ``axis=0`` - only when | ``P`` - only when ``ddof=0`` | ``P`` - only when ``ddof=0`` | ``P`` - only when ``ddof=0`` | +| | ``ddof=0`` or ``ddof=1``. | or ``ddof=1``. | or ``ddof=1``. | or ``ddof=1``. | +| | ``N`` for ``axis=1``. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``quantile`` | ``P`` for ``axis=0`` - only when | ``P`` - only when ``q`` is the | ``P`` - only when ``q`` is the | ``P`` - only when ``q`` is the | +| | ``q`` is the default value or | default value or a scalar. | default value or a scalar. | default value or a scalar. | +| | a scalar. | | | | +| | ``N`` for ``axis=1``. 
| | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``len`` | ``N`` | ``N`` | ``Y`` | ``Y`` | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ diff --git a/docs/source/modin/supported/dataframe_supported.rst b/docs/source/modin/supported/dataframe_supported.rst index 8f139ec5d36..f86e0f0e772 100644 --- a/docs/source/modin/supported/dataframe_supported.rst +++ b/docs/source/modin/supported/dataframe_supported.rst @@ -65,15 +65,9 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``add_suffix`` | Y | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``agg`` | P | ``margins``, ``observed``, | If ``axis == 0``: ``Y`` when function is one of | -| | | ``sort`` | ``count``, ``mean``, ``min``, ``max``, ``sum``, | -| | | | ``median``, ``size``; ``std`` and ``var`` | -| | | | supported with ``ddof=0`` or ``ddof=1``; | -| | | | ``quantile`` is supported when ``q`` is the | -| | | | default value or a scalar. | -| | | | If ``axis == 1``: ``Y`` when function is | -| | | | ``count``, ``min``, ``max``, or ``sum`` and the | -| | | | index is not a MultiIndex. | +| ``agg`` | P | ``margins``, ``observed``, | Check | +| | | ``sort`` | `Supported Aggregation Functions `_ | +| | | | for a list of supported functions. | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``aggregate`` | P | ``margins``, ``observed``, | See ``agg`` | | | | ``sort`` | | @@ -193,9 +187,9 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``floordiv`` | P | ``level`` | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``from_dict`` | N | | | +| ``from_dict`` | Y | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``from_records`` | N | | | +| ``from_records`` | P | | ``N`` if parameter ``data`` is set to a DataFrame | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``ge`` | P | ``level`` | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ @@ -258,7 +252,7 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``lt`` | P | ``level`` | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``map`` | N | | | +| ``map`` | P | | ``N`` if ``na_action == "ignore"`` | 
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``mask`` | P | | ``N`` if given ``axis`` when ``other`` is a | | | | | ``DataFrame`` or ``level`` parameters; | diff --git a/docs/source/modin/supported/groupby_supported.rst b/docs/source/modin/supported/groupby_supported.rst index 695301bcc1e..dde67fbdc1c 100644 --- a/docs/source/modin/supported/groupby_supported.rst +++ b/docs/source/modin/supported/groupby_supported.rst @@ -30,10 +30,9 @@ Function application +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | GroupBy method | Snowpark implemented? (Y/N/P/D) | Missing parameters | Notes for current implementation | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``agg`` | P | ``axis`` other than 0 is not | ``Y``, support functions are count, mean, min, max,| -| | | implemented. | sum, median, std, size, len, and var | -| | | | (including both Python and NumPy functions) | -| | | | otherwise ``N``. | +| ``agg`` | P | ``axis`` other than 0 is not | Check | +| | | implemented. | `Supported Aggregation Functions `_ | +| | | | for a list of supported functions. | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``aggregate`` | P | ``axis`` other than 0 is not | See ``agg`` | | | | implemented. | | @@ -170,6 +169,8 @@ Computations/descriptive stats +-----------------------------+---------------------------------+----------------------------------------------------+ | ``take`` | N | | +-----------------------------+---------------------------------+----------------------------------------------------+ +| ``unique`` | Y | | ++-----------------------------+---------------------------------+----------------------------------------------------+ | ``value_counts`` | P | ``N`` if ``bins`` is given for SeriesGroupBy | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``var`` | P | See ``std`` | diff --git a/docs/source/modin/supported/series_dt_supported.rst b/docs/source/modin/supported/series_dt_supported.rst index f3d969fe23e..6dfcee5815d 100644 --- a/docs/source/modin/supported/series_dt_supported.rst +++ b/docs/source/modin/supported/series_dt_supported.rst @@ -98,7 +98,9 @@ the method in the left column. +-----------------------------+---------------------------------+----------------------------------------------------+ | ``normalize`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``strftime`` | N | | +| ``strftime`` | P | ``N`` if `date_format` contains directives other | +| | | than (`%d`, `%m`, `%Y`, `%H`, `%M`, `%S`, `%f`, | +| | | `%j`, `%X`, `%%`). | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``round`` | P | ``N`` if `ambiguous` or `nonexistent` are set to a | | | | non-default value. 
| diff --git a/docs/source/modin/supported/series_supported.rst b/docs/source/modin/supported/series_supported.rst index 6521d9ffd39..ec4a6665ca4 100644 --- a/docs/source/modin/supported/series_supported.rst +++ b/docs/source/modin/supported/series_supported.rst @@ -76,12 +76,9 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``add_suffix`` | Y | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``agg`` | P | | ``Y`` when function is one of ``count``, | -| | | | ``mean``, ``min``, ``max``, ``sum``, ``median``, | -| | | | ``size``; ``std`` and ``var`` supported with | -| | | | ``ddof=0`` or ``ddof=1``; ``quantile`` is | -| | | | supported when ``q`` is the default value | -| | | | or a scalar. | +| ``agg`` | P | | Check | +| | | | `Supported Aggregation Functions `_ | +| | | | for a list of supported functions. | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``aggregate`` | P | | See ``agg`` | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ @@ -119,7 +116,7 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``backfill`` | P | | ``N`` if param ``downcast`` is set. | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``between`` | N | | | +| ``between`` | Y | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``between_time`` | N | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ diff --git a/docs/source/snowpark/catalog.rst b/docs/source/snowpark/catalog.rst new file mode 100644 index 00000000000..20c568f6bb4 --- /dev/null +++ b/docs/source/snowpark/catalog.rst @@ -0,0 +1,67 @@ +============= +Catalog +============= +Catalog module for Snowpark. + +.. currentmodule:: snowflake.snowpark.catalog + +.. rubric:: Catalog + +.. 
autosummary:: + :toctree: api/ + + Catalog.databaseExists + Catalog.database_exists + Catalog.dropDatabase + Catalog.dropSchema + Catalog.dropTable + Catalog.dropView + Catalog.drop_database + Catalog.drop_schema + Catalog.drop_table + Catalog.drop_view + Catalog.getCurrentDatabase + Catalog.getCurrentSchema + Catalog.getDatabase + Catalog.getProcedure + Catalog.getSchema + Catalog.getTable + Catalog.getUserDefinedFunction + Catalog.getView + Catalog.get_current_database + Catalog.get_current_schema + Catalog.get_database + Catalog.get_procedure + Catalog.get_schema + Catalog.get_table + Catalog.get_user_defined_function + Catalog.get_view + Catalog.listColumns + Catalog.listDatabases + Catalog.listProcedures + Catalog.listSchemas + Catalog.listTables + Catalog.listUserDefinedFunctions + Catalog.listViews + Catalog.list_columns + Catalog.list_databases + Catalog.list_procedures + Catalog.list_schemas + Catalog.list_tables + Catalog.list_user_defined_functions + Catalog.list_views + Catalog.procedureExists + Catalog.procedure_exists + Catalog.schemaExists + Catalog.schema_exists + Catalog.setCurrentDatabase + Catalog.setCurrentSchema + Catalog.set_current_database + Catalog.set_current_schema + Catalog.tableExists + Catalog.table_exists + Catalog.userDefinedFunctionExists + Catalog.user_defined_function_exists + Catalog.viewExists + Catalog.view_exists + diff --git a/docs/source/snowpark/functions.rst b/docs/source/snowpark/functions.rst index 71e83093565..fcd96f47e19 100644 --- a/docs/source/snowpark/functions.rst +++ b/docs/source/snowpark/functions.rst @@ -35,21 +35,26 @@ Functions array_construct_compact array_contains array_distinct + array_except array_flatten array_generate_range array_insert array_intersection + array_join array_max array_min array_position array_prepend array_remove + array_reverse array_size array_slice array_sort array_to_string + array_union array_unique_agg arrays_overlap + arrays_zip as_array as_binary as_char @@ -143,6 +148,7 @@ Functions desc_nulls_first desc_nulls_last div0 + divnull endswith equal_nan exp @@ -204,6 +210,10 @@ Functions lpad ltrim make_interval + map_cat + map_concat + map_contains_key + map_keys max md5 mean @@ -219,6 +229,7 @@ Functions next_day not_ ntile + nullifzero object_agg object_construct object_construct_keep_null @@ -260,6 +271,7 @@ Functions sinh size skew + snowflake_cortex_sentiment snowflake_cortex_summarize sort_array soundex diff --git a/docs/source/snowpark/index.rst b/docs/source/snowpark/index.rst index ad3ad563e39..ab8125d4058 100644 --- a/docs/source/snowpark/index.rst +++ b/docs/source/snowpark/index.rst @@ -9,9 +9,9 @@ Snowpark APIs column types row - functions - window - grouping + functions + window + grouping table_function table async_job @@ -21,6 +21,7 @@ Snowpark APIs udtf observability files + catalog lineage context exceptions diff --git a/docs/source/snowpark/session.rst b/docs/source/snowpark/session.rst index 21da1f76849..3e9b046a521 100644 --- a/docs/source/snowpark/session.rst +++ b/docs/source/snowpark/session.rst @@ -38,6 +38,7 @@ Snowpark Session Session.append_query_tag Session.call Session.cancel_all + Session.catalog Session.clear_imports Session.clear_packages Session.close diff --git a/recipe/meta.yaml b/recipe/meta.yaml index 467593287a3..b357eda695a 100644 --- a/recipe/meta.yaml +++ b/recipe/meta.yaml @@ -43,6 +43,7 @@ requirements: - protobuf >=3.20,<6 - python-dateutil - tzlocal + - snowflake.core >=1.0.0,<2 test: imports: diff --git a/setup.py b/setup.py index a1be8a8eda7..24f1dbe15ef 
100644 --- a/setup.py +++ b/setup.py @@ -29,6 +29,7 @@ "protobuf>=3.20, <6", # Snowpark IR "python-dateutil", # Snowpark IR "tzlocal", # Snowpark IR + "snowflake.core>=1.0.0, <2", # Catalog ] REQUIRED_PYTHON_VERSION = ">=3.8, <3.12" @@ -199,7 +200,9 @@ def run(self): *DEVELOPMENT_REQUIREMENTS, "scipy", # Snowpark pandas 3rd party library testing "statsmodels", # Snowpark pandas 3rd party library testing - "scikit-learn==1.5.2", # Snowpark pandas scikit-learn tests + "scikit-learn", # Snowpark pandas 3rd party library testing + # plotly version restricted due to foreseen change in query counts in version 6.0.0+ + "plotly<6.0.0", # Snowpark pandas 3rd party library testing ], "localtest": [ "pandas", diff --git a/src/snowflake/snowpark/_internal/analyzer/datatype_mapper.py b/src/snowflake/snowpark/_internal/analyzer/datatype_mapper.py index 21a33c99af8..930898e1180 100644 --- a/src/snowflake/snowpark/_internal/analyzer/datatype_mapper.py +++ b/src/snowflake/snowpark/_internal/analyzer/datatype_mapper.py @@ -63,9 +63,55 @@ def float_nan_inf_to_sql(value: float) -> str: return f"{cast_value} :: FLOAT" -def to_sql(value: Any, datatype: DataType, from_values_statement: bool = False) -> str: - """Convert a value with DataType to a snowflake compatible sql""" +def to_sql_no_cast( + value: Any, + datatype: DataType, +) -> str: + if value is None: + return "NULL" + if isinstance(datatype, VariantType): + # PARSE_JSON returns VARIANT, so no need to append :: VARIANT here explicitly. + return f"PARSE_JSON({str_to_sql(json.dumps(value, cls=PythonObjJSONEncoder))})" + if isinstance(value, str): + if isinstance(datatype, GeographyType): + return f"TO_GEOGRAPHY({str_to_sql(value)})" + if isinstance(datatype, GeometryType): + return f"TO_GEOMETRY({str_to_sql(value)})" + return str_to_sql(value) + if isinstance(value, float) and (math.isnan(value) or math.isinf(value)): + cast_value = float_nan_inf_to_sql(value) + return cast_value[:-9] + if isinstance(value, (list, bytes, bytearray)) and isinstance(datatype, BinaryType): + return str(bytes(value)) + if isinstance(value, (list, tuple, array)) and isinstance(datatype, ArrayType): + return f"PARSE_JSON({str_to_sql(json.dumps(value, cls=PythonObjJSONEncoder))})" + if isinstance(value, dict) and isinstance(datatype, MapType): + return f"PARSE_JSON({str_to_sql(json.dumps(value, cls=PythonObjJSONEncoder))})" + if isinstance(datatype, DateType): + if isinstance(value, int): + # add value as number of days to 1970-01-01 + target_date = date(1970, 1, 1) + timedelta(days=value) + return f"'{target_date.isoformat()}'" + elif isinstance(value, date): + return f"'{value.isoformat()}'" + if isinstance(datatype, TimestampType): + if isinstance(value, (int, datetime)): + if isinstance(value, int): + # add value as microseconds to 1970-01-01 00:00:00.00. 
+ value = datetime(1970, 1, 1, tzinfo=timezone.utc) + timedelta( + microseconds=value + ) + return f"'{value}'" + return f"{value}" + + +def to_sql( + value: Any, + datatype: DataType, + from_values_statement: bool = False, +) -> str: + """Convert a value with DataType to a snowflake compatible sql""" # Handle null values if isinstance( datatype, diff --git a/src/snowflake/snowpark/_internal/ast/utils.py b/src/snowflake/snowpark/_internal/ast/utils.py index ce6f206a2e7..01fcfa26d31 100644 --- a/src/snowflake/snowpark/_internal/ast/utils.py +++ b/src/snowflake/snowpark/_internal/ast/utils.py @@ -380,6 +380,33 @@ def build_sp_table_name( # type: ignore[no-untyped-def] # TODO(SNOW-1491199) # raise ValueError(f"Invalid name type {type(name)} for SpTableName entity.") +def build_function_expr( + builtin_name: str, + args: List[Any], + ignore_null_args: bool = False, +) -> proto.Expr: + """ + Creates AST encoding for the methods in function.py. + Args: + builtin_name: Name of the builtin function to call. + args: Positional arguments to pass to function, in the form of a list. + ignore_null_args: If True, null arguments will be ignored. + Returns: + The AST encoding of the function. + """ + ast = proto.Expr() + args_list = [arg for arg in args if arg is not None] if ignore_null_args else args + build_builtin_fn_apply( + ast, + builtin_name, + *tuple( + snowpark_expression_to_ast(arg) if isinstance(arg, Expression) else arg + for arg in args_list + ), + ) + return ast + + # TODO(SNOW-1491199) - This method is not covered by tests until the end of phase 0. Drop the pragma when it is covered. def build_builtin_fn_apply( ast: proto.Expr, @@ -395,7 +422,6 @@ def build_builtin_fn_apply( builtin_name: Name of the builtin function to call. *args: Positional arguments to pass to function. **kwargs: Keyword arguments to pass to function. 
- """ expr = with_src_position(ast.apply_expr) # type: ignore[arg-type] # TODO(SNOW-1491199) # Argument 1 to "with_src_position" has incompatible type "ApplyExpr"; expected "Expr" _set_fn_name(builtin_name, expr.fn.builtin_fn) # type: ignore[attr-defined] # TODO(SNOW-1491199) # "Expr" has no attribute "fn" diff --git a/src/snowflake/snowpark/_internal/compiler/large_query_breakdown.py b/src/snowflake/snowpark/_internal/compiler/large_query_breakdown.py index eb6bbeded1d..1f1971b9f99 100644 --- a/src/snowflake/snowpark/_internal/compiler/large_query_breakdown.py +++ b/src/snowflake/snowpark/_internal/compiler/large_query_breakdown.py @@ -477,6 +477,14 @@ def _is_relaxed_pipeline_breaker(self, node: LogicalPlan) -> bool: if isinstance(node, SelectStatement): return True + if isinstance(node, SnowflakePlan): + return node.source_plan is not None and self._is_relaxed_pipeline_breaker( + node.source_plan + ) + + if isinstance(node, SelectSnowflakePlan): + return self._is_relaxed_pipeline_breaker(node.snowflake_plan) + return False def _is_node_pipeline_breaker(self, node: LogicalPlan) -> bool: diff --git a/src/snowflake/snowpark/_internal/compiler/query_generator.py b/src/snowflake/snowpark/_internal/compiler/query_generator.py index c78db41ad19..c9e61e6c850 100644 --- a/src/snowflake/snowpark/_internal/compiler/query_generator.py +++ b/src/snowflake/snowpark/_internal/compiler/query_generator.py @@ -6,7 +6,10 @@ from snowflake.snowpark._internal.analyzer.analyzer import Analyzer from snowflake.snowpark._internal.analyzer.expression import Attribute -from snowflake.snowpark._internal.analyzer.select_statement import Selectable +from snowflake.snowpark._internal.analyzer.select_statement import ( + SelectSnowflakePlan, + Selectable, +) from snowflake.snowpark._internal.analyzer.snowflake_plan import ( PlanQueryType, Query, @@ -66,6 +69,16 @@ def __init__( # between the CTE definition is satisfied. self.resolved_with_query_block: Dict[str, Query] = {} + def to_selectable(self, plan: LogicalPlan) -> Selectable: + """Given a LogicalPlan, convert it to a Selectable.""" + if isinstance(plan, Selectable): + return plan + + snowflake_plan = self.resolve(plan) + selectable = SelectSnowflakePlan(snowflake_plan, analyzer=self) + selectable._is_valid_for_replacement = snowflake_plan._is_valid_for_replacement + return selectable + def generate_queries( self, logical_plans: List[LogicalPlan] ) -> Dict[PlanQueryType, List[Query]]: diff --git a/src/snowflake/snowpark/_internal/compiler/utils.py b/src/snowflake/snowpark/_internal/compiler/utils.py index 24f413a1ba9..1521bde1604 100644 --- a/src/snowflake/snowpark/_internal/compiler/utils.py +++ b/src/snowflake/snowpark/_internal/compiler/utils.py @@ -118,14 +118,6 @@ def replace_child( based on the parent node type. 
""" - def to_selectable(plan: LogicalPlan, query_generator: QueryGenerator) -> Selectable: - """Given a LogicalPlan, convert it to a Selectable.""" - if isinstance(plan, Selectable): - return plan - - snowflake_plan = query_generator.resolve(plan) - return SelectSnowflakePlan(snowflake_plan, analyzer=query_generator) - if not parent._is_valid_for_replacement: raise ValueError(f"parent node {parent} is not valid for replacement.") @@ -143,13 +135,13 @@ def to_selectable(plan: LogicalPlan, query_generator: QueryGenerator) -> Selecta replace_child(parent.source_plan, old_child, new_child, query_generator) elif isinstance(parent, SelectStatement): - parent.from_ = to_selectable(new_child, query_generator) + parent.from_ = query_generator.to_selectable(new_child) # once the subquery is updated, set _merge_projection_complexity_with_subquery to False to # disable the projection complexity merge parent._merge_projection_complexity_with_subquery = False elif isinstance(parent, SetStatement): - new_child_as_selectable = to_selectable(new_child, query_generator) + new_child_as_selectable = query_generator.to_selectable(new_child) parent._nodes = [ node if node != old_child else new_child_as_selectable for node in parent._nodes @@ -421,6 +413,9 @@ def get_name(node: Optional[LogicalPlan]) -> str: name = f"{name} :: ({'| '.join(properties)})" score = get_complexity_score(node) + num_ref_ctes = "nil" + if isinstance(node, (SnowflakePlan, Selectable)): + num_ref_ctes = len(node.referenced_ctes) sql_text = "" if isinstance(node, Selectable): sql_text = node.sql_query @@ -429,7 +424,7 @@ def get_name(node: Optional[LogicalPlan]) -> str: sql_size = len(sql_text) sql_preview = sql_text[:50] - return f"{name=}\n{score=}, {sql_size=}\n{sql_preview=}" + return f"{name=}\n{score=}, {num_ref_ctes=}, {sql_size=}\n{sql_preview=}" g = graphviz.Graph(format="png") @@ -439,7 +434,8 @@ def get_name(node: Optional[LogicalPlan]) -> str: next_level = [] for node in curr_level: node_id = hex(id(node)) - g.node(node_id, get_stat(node)) + color = "lightblue" if node._is_valid_for_replacement else "red" + g.node(node_id, get_stat(node), color=color) if isinstance(node, (Selectable, SnowflakePlan)): children = node.children_plan_nodes else: diff --git a/src/snowflake/snowpark/_internal/proto/ast.proto b/src/snowflake/snowpark/_internal/proto/ast.proto index f8e612cf341..09e5aebfd96 100644 --- a/src/snowflake/snowpark/_internal/proto/ast.proto +++ b/src/snowflake/snowpark/_internal/proto/ast.proto @@ -1,4 +1,4 @@ -// N.B. This file is generated by `ir-dsl-c`. DO NOT EDIT! +// N.B. This file is generated by `//Snowpark/ir-dsl-c`. DO NOT EDIT! // Generated from `{git@github.com:snowflakedb/snowflake.git}/Snowpark/ast`. 
syntax = "proto3"; @@ -283,6 +283,15 @@ message SpJoinType { } } +// sp-col-expr.ir:70 +message SpNullOrder { + oneof variant { + bool sp_null_order_default = 1; + bool sp_null_order_nulls_first = 2; + bool sp_null_order_nulls_last = 3; + } +} + // sp-type.ir:88 message SpPivotValue { oneof sealed_value { @@ -673,142 +682,141 @@ message Expr { SpColumnDesc sp_column_desc = 63; SpColumnEqualNan sp_column_equal_nan = 64; SpColumnEqualNull sp_column_equal_null = 65; - SpColumnIn_Dataframe sp_column_in__dataframe = 66; - SpColumnIn_Seq sp_column_in__seq = 67; - SpColumnIsNotNull sp_column_is_not_null = 68; - SpColumnIsNull sp_column_is_null = 69; - SpColumnOver sp_column_over = 70; - SpColumnRef sp_column_ref = 71; - SpColumnSqlExpr sp_column_sql_expr = 72; - SpColumnStringCollate sp_column_string_collate = 73; - SpColumnStringContains sp_column_string_contains = 74; - SpColumnStringEndsWith sp_column_string_ends_with = 75; - SpColumnStringLike sp_column_string_like = 76; - SpColumnStringRegexp sp_column_string_regexp = 77; - SpColumnStringStartsWith sp_column_string_starts_with = 78; - SpColumnStringSubstr sp_column_string_substr = 79; - SpColumnTryCast sp_column_try_cast = 80; - SpColumnWithinGroup sp_column_within_group = 81; - SpCreateDataframe sp_create_dataframe = 82; - SpDataframeAgg sp_dataframe_agg = 83; - SpDataframeAlias sp_dataframe_alias = 84; - SpDataframeAnalyticsComputeLag sp_dataframe_analytics_compute_lag = 85; - SpDataframeAnalyticsComputeLead sp_dataframe_analytics_compute_lead = 86; - SpDataframeAnalyticsCumulativeAgg sp_dataframe_analytics_cumulative_agg = 87; - SpDataframeAnalyticsMovingAgg sp_dataframe_analytics_moving_agg = 88; - SpDataframeAnalyticsTimeSeriesAgg sp_dataframe_analytics_time_series_agg = 89; - SpDataframeApply sp_dataframe_apply = 90; - SpDataframeCacheResult sp_dataframe_cache_result = 91; - SpDataframeCol sp_dataframe_col = 92; - SpDataframeCollect sp_dataframe_collect = 93; - SpDataframeCopyIntoTable sp_dataframe_copy_into_table = 94; - SpDataframeCount sp_dataframe_count = 95; - SpDataframeCreateOrReplaceDynamicTable sp_dataframe_create_or_replace_dynamic_table = 96; - SpDataframeCreateOrReplaceView sp_dataframe_create_or_replace_view = 97; - SpDataframeCrossJoin sp_dataframe_cross_join = 98; - SpDataframeCube sp_dataframe_cube = 99; - SpDataframeDescribe sp_dataframe_describe = 100; - SpDataframeDistinct sp_dataframe_distinct = 101; - SpDataframeDrop sp_dataframe_drop = 102; - SpDataframeDropDuplicates sp_dataframe_drop_duplicates = 103; - SpDataframeExcept sp_dataframe_except = 104; - SpDataframeFilter sp_dataframe_filter = 105; - SpDataframeFirst sp_dataframe_first = 106; - SpDataframeFlatten sp_dataframe_flatten = 107; - SpDataframeGroupBy sp_dataframe_group_by = 108; - SpDataframeGroupByGroupingSets sp_dataframe_group_by_grouping_sets = 109; - SpDataframeGroupBy_Columns sp_dataframe_group_by__columns = 110; - SpDataframeGroupBy_Strings sp_dataframe_group_by__strings = 111; - SpDataframeIntersect sp_dataframe_intersect = 112; - SpDataframeJoin sp_dataframe_join = 113; - SpDataframeJoinTableFunction sp_dataframe_join_table_function = 114; - SpDataframeJoin_Dataframe_JoinExprs sp_dataframe_join__dataframe__join_exprs = 115; - SpDataframeJoin_Dataframe_UsingColumns sp_dataframe_join__dataframe__using_columns = 116; - SpDataframeLimit sp_dataframe_limit = 117; - SpDataframeNaDrop_Python sp_dataframe_na_drop__python = 118; - SpDataframeNaDrop_Scala sp_dataframe_na_drop__scala = 119; - SpDataframeNaFill sp_dataframe_na_fill = 120; - 
SpDataframeNaReplace sp_dataframe_na_replace = 121; - SpDataframeNaturalJoin sp_dataframe_natural_join = 122; - SpDataframePivot sp_dataframe_pivot = 123; - SpDataframeRandomSplit sp_dataframe_random_split = 124; - SpDataframeRef sp_dataframe_ref = 125; - SpDataframeRename sp_dataframe_rename = 126; - SpDataframeRollup sp_dataframe_rollup = 127; - SpDataframeRollup_Columns sp_dataframe_rollup__columns = 128; - SpDataframeRollup_Strings sp_dataframe_rollup__strings = 129; - SpDataframeSample sp_dataframe_sample = 130; - SpDataframeSelect_Columns sp_dataframe_select__columns = 131; - SpDataframeSelect_Exprs sp_dataframe_select__exprs = 132; - SpDataframeShow sp_dataframe_show = 133; - SpDataframeSort sp_dataframe_sort = 134; - SpDataframeStatApproxQuantile sp_dataframe_stat_approx_quantile = 135; - SpDataframeStatCorr sp_dataframe_stat_corr = 136; - SpDataframeStatCov sp_dataframe_stat_cov = 137; - SpDataframeStatCrossTab sp_dataframe_stat_cross_tab = 138; - SpDataframeStatSampleBy sp_dataframe_stat_sample_by = 139; - SpDataframeToDf sp_dataframe_to_df = 140; - SpDataframeToLocalIterator sp_dataframe_to_local_iterator = 141; - SpDataframeToPandas sp_dataframe_to_pandas = 142; - SpDataframeToPandasBatches sp_dataframe_to_pandas_batches = 143; - SpDataframeUnion sp_dataframe_union = 144; - SpDataframeUnionAll sp_dataframe_union_all = 145; - SpDataframeUnionAllByName sp_dataframe_union_all_by_name = 146; - SpDataframeUnionByName sp_dataframe_union_by_name = 147; - SpDataframeUnpivot sp_dataframe_unpivot = 148; - SpDataframeWhere sp_dataframe_where = 149; - SpDataframeWithColumn sp_dataframe_with_column = 150; - SpDataframeWithColumnRenamed sp_dataframe_with_column_renamed = 151; - SpDataframeWithColumns sp_dataframe_with_columns = 152; - SpDataframeWrite sp_dataframe_write = 153; - SpDatatypeVal sp_datatype_val = 154; - SpFlatten sp_flatten = 155; - SpFnRef sp_fn_ref = 156; - SpGenerator sp_generator = 157; - SpGroupingSets sp_grouping_sets = 158; - SpMergeDeleteWhenMatchedClause sp_merge_delete_when_matched_clause = 159; - SpMergeInsertWhenNotMatchedClause sp_merge_insert_when_not_matched_clause = 160; - SpMergeUpdateWhenMatchedClause sp_merge_update_when_matched_clause = 161; - SpRange sp_range = 162; - SpReadAvro sp_read_avro = 163; - SpReadCsv sp_read_csv = 164; - SpReadJson sp_read_json = 165; - SpReadOrc sp_read_orc = 166; - SpReadParquet sp_read_parquet = 167; - SpReadTable sp_read_table = 168; - SpReadXml sp_read_xml = 169; - SpRelationalGroupedDataframeAgg sp_relational_grouped_dataframe_agg = 170; - SpRelationalGroupedDataframeApplyInPandas sp_relational_grouped_dataframe_apply_in_pandas = 171; - SpRelationalGroupedDataframeBuiltin sp_relational_grouped_dataframe_builtin = 172; - SpRelationalGroupedDataframePivot sp_relational_grouped_dataframe_pivot = 173; - SpRelationalGroupedDataframeRef sp_relational_grouped_dataframe_ref = 174; - SpRow sp_row = 175; - SpSessionTableFunction sp_session_table_function = 176; - SpSql sp_sql = 177; - SpTable sp_table = 178; - SpTableDelete sp_table_delete = 179; - SpTableDropTable sp_table_drop_table = 180; - SpTableFnCallAlias sp_table_fn_call_alias = 181; - SpTableFnCallOver sp_table_fn_call_over = 182; - SpTableMerge sp_table_merge = 183; - SpTableSample sp_table_sample = 184; - SpTableUpdate sp_table_update = 185; - SpToSnowparkPandas sp_to_snowpark_pandas = 186; - SpWriteCopyIntoLocation sp_write_copy_into_location = 187; - SpWriteCsv sp_write_csv = 188; - SpWriteJson sp_write_json = 189; - SpWritePandas sp_write_pandas = 190; - SpWriteParquet 
sp_write_parquet = 191; - SpWriteTable sp_write_table = 192; - StoredProcedure stored_procedure = 193; - StringVal string_val = 194; - Sub sub = 195; - TimeVal time_val = 196; - TimestampVal timestamp_val = 197; - TupleVal tuple_val = 198; - Udaf udaf = 199; - Udf udf = 200; - Udtf udtf = 201; + SpColumnIn sp_column_in = 66; + SpColumnIsNotNull sp_column_is_not_null = 67; + SpColumnIsNull sp_column_is_null = 68; + SpColumnOver sp_column_over = 69; + SpColumnRef sp_column_ref = 70; + SpColumnSqlExpr sp_column_sql_expr = 71; + SpColumnStringCollate sp_column_string_collate = 72; + SpColumnStringContains sp_column_string_contains = 73; + SpColumnStringEndsWith sp_column_string_ends_with = 74; + SpColumnStringLike sp_column_string_like = 75; + SpColumnStringRegexp sp_column_string_regexp = 76; + SpColumnStringStartsWith sp_column_string_starts_with = 77; + SpColumnStringSubstr sp_column_string_substr = 78; + SpColumnTryCast sp_column_try_cast = 79; + SpColumnWithinGroup sp_column_within_group = 80; + SpCreateDataframe sp_create_dataframe = 81; + SpDataframeAgg sp_dataframe_agg = 82; + SpDataframeAlias sp_dataframe_alias = 83; + SpDataframeAnalyticsComputeLag sp_dataframe_analytics_compute_lag = 84; + SpDataframeAnalyticsComputeLead sp_dataframe_analytics_compute_lead = 85; + SpDataframeAnalyticsCumulativeAgg sp_dataframe_analytics_cumulative_agg = 86; + SpDataframeAnalyticsMovingAgg sp_dataframe_analytics_moving_agg = 87; + SpDataframeAnalyticsTimeSeriesAgg sp_dataframe_analytics_time_series_agg = 88; + SpDataframeApply sp_dataframe_apply = 89; + SpDataframeCacheResult sp_dataframe_cache_result = 90; + SpDataframeCol sp_dataframe_col = 91; + SpDataframeCollect sp_dataframe_collect = 92; + SpDataframeCopyIntoTable sp_dataframe_copy_into_table = 93; + SpDataframeCount sp_dataframe_count = 94; + SpDataframeCreateOrReplaceDynamicTable sp_dataframe_create_or_replace_dynamic_table = 95; + SpDataframeCreateOrReplaceView sp_dataframe_create_or_replace_view = 96; + SpDataframeCrossJoin sp_dataframe_cross_join = 97; + SpDataframeCube sp_dataframe_cube = 98; + SpDataframeDescribe sp_dataframe_describe = 99; + SpDataframeDistinct sp_dataframe_distinct = 100; + SpDataframeDrop sp_dataframe_drop = 101; + SpDataframeDropDuplicates sp_dataframe_drop_duplicates = 102; + SpDataframeExcept sp_dataframe_except = 103; + SpDataframeFilter sp_dataframe_filter = 104; + SpDataframeFirst sp_dataframe_first = 105; + SpDataframeFlatten sp_dataframe_flatten = 106; + SpDataframeGroupBy sp_dataframe_group_by = 107; + SpDataframeGroupByGroupingSets sp_dataframe_group_by_grouping_sets = 108; + SpDataframeGroupBy_Columns sp_dataframe_group_by__columns = 109; + SpDataframeGroupBy_Strings sp_dataframe_group_by__strings = 110; + SpDataframeIntersect sp_dataframe_intersect = 111; + SpDataframeJoin sp_dataframe_join = 112; + SpDataframeJoinTableFunction sp_dataframe_join_table_function = 113; + SpDataframeJoin_Dataframe_JoinExprs sp_dataframe_join__dataframe__join_exprs = 114; + SpDataframeJoin_Dataframe_UsingColumns sp_dataframe_join__dataframe__using_columns = 115; + SpDataframeLimit sp_dataframe_limit = 116; + SpDataframeNaDrop_Python sp_dataframe_na_drop__python = 117; + SpDataframeNaDrop_Scala sp_dataframe_na_drop__scala = 118; + SpDataframeNaFill sp_dataframe_na_fill = 119; + SpDataframeNaReplace sp_dataframe_na_replace = 120; + SpDataframeNaturalJoin sp_dataframe_natural_join = 121; + SpDataframePivot sp_dataframe_pivot = 122; + SpDataframeRandomSplit sp_dataframe_random_split = 123; + SpDataframeRef sp_dataframe_ref = 124; 
+ SpDataframeRename sp_dataframe_rename = 125; + SpDataframeRollup sp_dataframe_rollup = 126; + SpDataframeRollup_Columns sp_dataframe_rollup__columns = 127; + SpDataframeRollup_Strings sp_dataframe_rollup__strings = 128; + SpDataframeSample sp_dataframe_sample = 129; + SpDataframeSelect_Columns sp_dataframe_select__columns = 130; + SpDataframeSelect_Exprs sp_dataframe_select__exprs = 131; + SpDataframeShow sp_dataframe_show = 132; + SpDataframeSort sp_dataframe_sort = 133; + SpDataframeStatApproxQuantile sp_dataframe_stat_approx_quantile = 134; + SpDataframeStatCorr sp_dataframe_stat_corr = 135; + SpDataframeStatCov sp_dataframe_stat_cov = 136; + SpDataframeStatCrossTab sp_dataframe_stat_cross_tab = 137; + SpDataframeStatSampleBy sp_dataframe_stat_sample_by = 138; + SpDataframeToDf sp_dataframe_to_df = 139; + SpDataframeToLocalIterator sp_dataframe_to_local_iterator = 140; + SpDataframeToPandas sp_dataframe_to_pandas = 141; + SpDataframeToPandasBatches sp_dataframe_to_pandas_batches = 142; + SpDataframeUnion sp_dataframe_union = 143; + SpDataframeUnionAll sp_dataframe_union_all = 144; + SpDataframeUnionAllByName sp_dataframe_union_all_by_name = 145; + SpDataframeUnionByName sp_dataframe_union_by_name = 146; + SpDataframeUnpivot sp_dataframe_unpivot = 147; + SpDataframeWhere sp_dataframe_where = 148; + SpDataframeWithColumn sp_dataframe_with_column = 149; + SpDataframeWithColumnRenamed sp_dataframe_with_column_renamed = 150; + SpDataframeWithColumns sp_dataframe_with_columns = 151; + SpDataframeWrite sp_dataframe_write = 152; + SpDatatypeVal sp_datatype_val = 153; + SpFlatten sp_flatten = 154; + SpFnRef sp_fn_ref = 155; + SpGenerator sp_generator = 156; + SpGroupingSets sp_grouping_sets = 157; + SpMergeDeleteWhenMatchedClause sp_merge_delete_when_matched_clause = 158; + SpMergeInsertWhenNotMatchedClause sp_merge_insert_when_not_matched_clause = 159; + SpMergeUpdateWhenMatchedClause sp_merge_update_when_matched_clause = 160; + SpRange sp_range = 161; + SpReadAvro sp_read_avro = 162; + SpReadCsv sp_read_csv = 163; + SpReadJson sp_read_json = 164; + SpReadOrc sp_read_orc = 165; + SpReadParquet sp_read_parquet = 166; + SpReadTable sp_read_table = 167; + SpReadXml sp_read_xml = 168; + SpRelationalGroupedDataframeAgg sp_relational_grouped_dataframe_agg = 169; + SpRelationalGroupedDataframeApplyInPandas sp_relational_grouped_dataframe_apply_in_pandas = 170; + SpRelationalGroupedDataframeBuiltin sp_relational_grouped_dataframe_builtin = 171; + SpRelationalGroupedDataframePivot sp_relational_grouped_dataframe_pivot = 172; + SpRelationalGroupedDataframeRef sp_relational_grouped_dataframe_ref = 173; + SpRow sp_row = 174; + SpSessionTableFunction sp_session_table_function = 175; + SpSql sp_sql = 176; + SpTable sp_table = 177; + SpTableDelete sp_table_delete = 178; + SpTableDropTable sp_table_drop_table = 179; + SpTableFnCallAlias sp_table_fn_call_alias = 180; + SpTableFnCallOver sp_table_fn_call_over = 181; + SpTableMerge sp_table_merge = 182; + SpTableSample sp_table_sample = 183; + SpTableUpdate sp_table_update = 184; + SpToSnowparkPandas sp_to_snowpark_pandas = 185; + SpWriteCopyIntoLocation sp_write_copy_into_location = 186; + SpWriteCsv sp_write_csv = 187; + SpWriteJson sp_write_json = 188; + SpWritePandas sp_write_pandas = 189; + SpWriteParquet sp_write_parquet = 190; + SpWriteTable sp_write_table = 191; + StoredProcedure stored_procedure = 192; + StringVal string_val = 193; + Sub sub = 194; + TimeVal time_val = 195; + TimestampVal timestamp_val = 196; + TupleVal tuple_val = 197; + Udaf udaf = 
198; + Udf udf = 199; + Udtf udtf = 200; } } @@ -951,152 +959,151 @@ message HasSrcPosition { SpColumnDesc sp_column_desc = 67; SpColumnEqualNan sp_column_equal_nan = 68; SpColumnEqualNull sp_column_equal_null = 69; - SpColumnIn_Dataframe sp_column_in__dataframe = 70; - SpColumnIn_Seq sp_column_in__seq = 71; - SpColumnIsNotNull sp_column_is_not_null = 72; - SpColumnIsNull sp_column_is_null = 73; - SpColumnOver sp_column_over = 74; - SpColumnRef sp_column_ref = 75; - SpColumnSqlExpr sp_column_sql_expr = 76; - SpColumnStringCollate sp_column_string_collate = 77; - SpColumnStringContains sp_column_string_contains = 78; - SpColumnStringEndsWith sp_column_string_ends_with = 79; - SpColumnStringLike sp_column_string_like = 80; - SpColumnStringRegexp sp_column_string_regexp = 81; - SpColumnStringStartsWith sp_column_string_starts_with = 82; - SpColumnStringSubstr sp_column_string_substr = 83; - SpColumnTryCast sp_column_try_cast = 84; - SpColumnWithinGroup sp_column_within_group = 85; - SpCreateDataframe sp_create_dataframe = 86; - SpDataframeAgg sp_dataframe_agg = 87; - SpDataframeAlias sp_dataframe_alias = 88; - SpDataframeAnalyticsComputeLag sp_dataframe_analytics_compute_lag = 89; - SpDataframeAnalyticsComputeLead sp_dataframe_analytics_compute_lead = 90; - SpDataframeAnalyticsCumulativeAgg sp_dataframe_analytics_cumulative_agg = 91; - SpDataframeAnalyticsMovingAgg sp_dataframe_analytics_moving_agg = 92; - SpDataframeAnalyticsTimeSeriesAgg sp_dataframe_analytics_time_series_agg = 93; - SpDataframeApply sp_dataframe_apply = 94; - SpDataframeCacheResult sp_dataframe_cache_result = 95; - SpDataframeCol sp_dataframe_col = 96; - SpDataframeCollect sp_dataframe_collect = 97; - SpDataframeCopyIntoTable sp_dataframe_copy_into_table = 98; - SpDataframeCount sp_dataframe_count = 99; - SpDataframeCreateOrReplaceDynamicTable sp_dataframe_create_or_replace_dynamic_table = 100; - SpDataframeCreateOrReplaceView sp_dataframe_create_or_replace_view = 101; - SpDataframeCrossJoin sp_dataframe_cross_join = 102; - SpDataframeCube sp_dataframe_cube = 103; - SpDataframeDescribe sp_dataframe_describe = 104; - SpDataframeDistinct sp_dataframe_distinct = 105; - SpDataframeDrop sp_dataframe_drop = 106; - SpDataframeDropDuplicates sp_dataframe_drop_duplicates = 107; - SpDataframeExcept sp_dataframe_except = 108; - SpDataframeFilter sp_dataframe_filter = 109; - SpDataframeFirst sp_dataframe_first = 110; - SpDataframeFlatten sp_dataframe_flatten = 111; - SpDataframeGroupBy sp_dataframe_group_by = 112; - SpDataframeGroupByGroupingSets sp_dataframe_group_by_grouping_sets = 113; - SpDataframeGroupBy_Columns sp_dataframe_group_by__columns = 114; - SpDataframeGroupBy_Strings sp_dataframe_group_by__strings = 115; - SpDataframeIntersect sp_dataframe_intersect = 116; - SpDataframeJoin sp_dataframe_join = 117; - SpDataframeJoinTableFunction sp_dataframe_join_table_function = 118; - SpDataframeJoin_Dataframe_JoinExprs sp_dataframe_join__dataframe__join_exprs = 119; - SpDataframeJoin_Dataframe_UsingColumns sp_dataframe_join__dataframe__using_columns = 120; - SpDataframeLimit sp_dataframe_limit = 121; - SpDataframeNaDrop_Python sp_dataframe_na_drop__python = 122; - SpDataframeNaDrop_Scala sp_dataframe_na_drop__scala = 123; - SpDataframeNaFill sp_dataframe_na_fill = 124; - SpDataframeNaReplace sp_dataframe_na_replace = 125; - SpDataframeNaturalJoin sp_dataframe_natural_join = 126; - SpDataframePivot sp_dataframe_pivot = 127; - SpDataframeRandomSplit sp_dataframe_random_split = 128; - SpDataframeReaderInit sp_dataframe_reader_init = 
129; - SpDataframeReaderOption sp_dataframe_reader_option = 130; - SpDataframeReaderOptions sp_dataframe_reader_options = 131; - SpDataframeReaderSchema sp_dataframe_reader_schema = 132; - SpDataframeReaderWithMetadata sp_dataframe_reader_with_metadata = 133; - SpDataframeRef sp_dataframe_ref = 134; - SpDataframeRename sp_dataframe_rename = 135; - SpDataframeRollup sp_dataframe_rollup = 136; - SpDataframeRollup_Columns sp_dataframe_rollup__columns = 137; - SpDataframeRollup_Strings sp_dataframe_rollup__strings = 138; - SpDataframeSample sp_dataframe_sample = 139; - SpDataframeSelect_Columns sp_dataframe_select__columns = 140; - SpDataframeSelect_Exprs sp_dataframe_select__exprs = 141; - SpDataframeShow sp_dataframe_show = 142; - SpDataframeSort sp_dataframe_sort = 143; - SpDataframeStatApproxQuantile sp_dataframe_stat_approx_quantile = 144; - SpDataframeStatCorr sp_dataframe_stat_corr = 145; - SpDataframeStatCov sp_dataframe_stat_cov = 146; - SpDataframeStatCrossTab sp_dataframe_stat_cross_tab = 147; - SpDataframeStatSampleBy sp_dataframe_stat_sample_by = 148; - SpDataframeToDf sp_dataframe_to_df = 149; - SpDataframeToLocalIterator sp_dataframe_to_local_iterator = 150; - SpDataframeToPandas sp_dataframe_to_pandas = 151; - SpDataframeToPandasBatches sp_dataframe_to_pandas_batches = 152; - SpDataframeUnion sp_dataframe_union = 153; - SpDataframeUnionAll sp_dataframe_union_all = 154; - SpDataframeUnionAllByName sp_dataframe_union_all_by_name = 155; - SpDataframeUnionByName sp_dataframe_union_by_name = 156; - SpDataframeUnpivot sp_dataframe_unpivot = 157; - SpDataframeWhere sp_dataframe_where = 158; - SpDataframeWithColumn sp_dataframe_with_column = 159; - SpDataframeWithColumnRenamed sp_dataframe_with_column_renamed = 160; - SpDataframeWithColumns sp_dataframe_with_columns = 161; - SpDataframeWrite sp_dataframe_write = 162; - SpDatatypeVal sp_datatype_val = 163; - SpFlatten sp_flatten = 164; - SpFnRef sp_fn_ref = 165; - SpGenerator sp_generator = 166; - SpGroupingSets sp_grouping_sets = 167; - SpMergeDeleteWhenMatchedClause sp_merge_delete_when_matched_clause = 168; - SpMergeInsertWhenNotMatchedClause sp_merge_insert_when_not_matched_clause = 169; - SpMergeUpdateWhenMatchedClause sp_merge_update_when_matched_clause = 170; - SpRange sp_range = 171; - SpReadAvro sp_read_avro = 172; - SpReadCsv sp_read_csv = 173; - SpReadJson sp_read_json = 174; - SpReadOrc sp_read_orc = 175; - SpReadParquet sp_read_parquet = 176; - SpReadTable sp_read_table = 177; - SpReadXml sp_read_xml = 178; - SpRelationalGroupedDataframeAgg sp_relational_grouped_dataframe_agg = 179; - SpRelationalGroupedDataframeApplyInPandas sp_relational_grouped_dataframe_apply_in_pandas = 180; - SpRelationalGroupedDataframeBuiltin sp_relational_grouped_dataframe_builtin = 181; - SpRelationalGroupedDataframePivot sp_relational_grouped_dataframe_pivot = 182; - SpRelationalGroupedDataframeRef sp_relational_grouped_dataframe_ref = 183; - SpRow sp_row = 184; - SpSessionTableFunction sp_session_table_function = 185; - SpSql sp_sql = 186; - SpTable sp_table = 187; - SpTableDelete sp_table_delete = 188; - SpTableDropTable sp_table_drop_table = 189; - SpTableFnCallAlias sp_table_fn_call_alias = 190; - SpTableFnCallOver sp_table_fn_call_over = 191; - SpTableMerge sp_table_merge = 192; - SpTableSample sp_table_sample = 193; - SpTableUpdate sp_table_update = 194; - SpToSnowparkPandas sp_to_snowpark_pandas = 195; - SpWindowSpecEmpty sp_window_spec_empty = 196; - SpWindowSpecOrderBy sp_window_spec_order_by = 197; - SpWindowSpecPartitionBy 
sp_window_spec_partition_by = 198; - SpWindowSpecRangeBetween sp_window_spec_range_between = 199; - SpWindowSpecRowsBetween sp_window_spec_rows_between = 200; - SpWriteCopyIntoLocation sp_write_copy_into_location = 201; - SpWriteCsv sp_write_csv = 202; - SpWriteJson sp_write_json = 203; - SpWritePandas sp_write_pandas = 204; - SpWriteParquet sp_write_parquet = 205; - SpWriteTable sp_write_table = 206; - StoredProcedure stored_procedure = 207; - StringVal string_val = 208; - Sub sub = 209; - TimeVal time_val = 210; - TimestampVal timestamp_val = 211; - TupleVal tuple_val = 212; - Udaf udaf = 213; - Udf udf = 214; - Udtf udtf = 215; + SpColumnIn sp_column_in = 70; + SpColumnIsNotNull sp_column_is_not_null = 71; + SpColumnIsNull sp_column_is_null = 72; + SpColumnOver sp_column_over = 73; + SpColumnRef sp_column_ref = 74; + SpColumnSqlExpr sp_column_sql_expr = 75; + SpColumnStringCollate sp_column_string_collate = 76; + SpColumnStringContains sp_column_string_contains = 77; + SpColumnStringEndsWith sp_column_string_ends_with = 78; + SpColumnStringLike sp_column_string_like = 79; + SpColumnStringRegexp sp_column_string_regexp = 80; + SpColumnStringStartsWith sp_column_string_starts_with = 81; + SpColumnStringSubstr sp_column_string_substr = 82; + SpColumnTryCast sp_column_try_cast = 83; + SpColumnWithinGroup sp_column_within_group = 84; + SpCreateDataframe sp_create_dataframe = 85; + SpDataframeAgg sp_dataframe_agg = 86; + SpDataframeAlias sp_dataframe_alias = 87; + SpDataframeAnalyticsComputeLag sp_dataframe_analytics_compute_lag = 88; + SpDataframeAnalyticsComputeLead sp_dataframe_analytics_compute_lead = 89; + SpDataframeAnalyticsCumulativeAgg sp_dataframe_analytics_cumulative_agg = 90; + SpDataframeAnalyticsMovingAgg sp_dataframe_analytics_moving_agg = 91; + SpDataframeAnalyticsTimeSeriesAgg sp_dataframe_analytics_time_series_agg = 92; + SpDataframeApply sp_dataframe_apply = 93; + SpDataframeCacheResult sp_dataframe_cache_result = 94; + SpDataframeCol sp_dataframe_col = 95; + SpDataframeCollect sp_dataframe_collect = 96; + SpDataframeCopyIntoTable sp_dataframe_copy_into_table = 97; + SpDataframeCount sp_dataframe_count = 98; + SpDataframeCreateOrReplaceDynamicTable sp_dataframe_create_or_replace_dynamic_table = 99; + SpDataframeCreateOrReplaceView sp_dataframe_create_or_replace_view = 100; + SpDataframeCrossJoin sp_dataframe_cross_join = 101; + SpDataframeCube sp_dataframe_cube = 102; + SpDataframeDescribe sp_dataframe_describe = 103; + SpDataframeDistinct sp_dataframe_distinct = 104; + SpDataframeDrop sp_dataframe_drop = 105; + SpDataframeDropDuplicates sp_dataframe_drop_duplicates = 106; + SpDataframeExcept sp_dataframe_except = 107; + SpDataframeFilter sp_dataframe_filter = 108; + SpDataframeFirst sp_dataframe_first = 109; + SpDataframeFlatten sp_dataframe_flatten = 110; + SpDataframeGroupBy sp_dataframe_group_by = 111; + SpDataframeGroupByGroupingSets sp_dataframe_group_by_grouping_sets = 112; + SpDataframeGroupBy_Columns sp_dataframe_group_by__columns = 113; + SpDataframeGroupBy_Strings sp_dataframe_group_by__strings = 114; + SpDataframeIntersect sp_dataframe_intersect = 115; + SpDataframeJoin sp_dataframe_join = 116; + SpDataframeJoinTableFunction sp_dataframe_join_table_function = 117; + SpDataframeJoin_Dataframe_JoinExprs sp_dataframe_join__dataframe__join_exprs = 118; + SpDataframeJoin_Dataframe_UsingColumns sp_dataframe_join__dataframe__using_columns = 119; + SpDataframeLimit sp_dataframe_limit = 120; + SpDataframeNaDrop_Python sp_dataframe_na_drop__python = 121; + 
SpDataframeNaDrop_Scala sp_dataframe_na_drop__scala = 122; + SpDataframeNaFill sp_dataframe_na_fill = 123; + SpDataframeNaReplace sp_dataframe_na_replace = 124; + SpDataframeNaturalJoin sp_dataframe_natural_join = 125; + SpDataframePivot sp_dataframe_pivot = 126; + SpDataframeRandomSplit sp_dataframe_random_split = 127; + SpDataframeReaderInit sp_dataframe_reader_init = 128; + SpDataframeReaderOption sp_dataframe_reader_option = 129; + SpDataframeReaderOptions sp_dataframe_reader_options = 130; + SpDataframeReaderSchema sp_dataframe_reader_schema = 131; + SpDataframeReaderWithMetadata sp_dataframe_reader_with_metadata = 132; + SpDataframeRef sp_dataframe_ref = 133; + SpDataframeRename sp_dataframe_rename = 134; + SpDataframeRollup sp_dataframe_rollup = 135; + SpDataframeRollup_Columns sp_dataframe_rollup__columns = 136; + SpDataframeRollup_Strings sp_dataframe_rollup__strings = 137; + SpDataframeSample sp_dataframe_sample = 138; + SpDataframeSelect_Columns sp_dataframe_select__columns = 139; + SpDataframeSelect_Exprs sp_dataframe_select__exprs = 140; + SpDataframeShow sp_dataframe_show = 141; + SpDataframeSort sp_dataframe_sort = 142; + SpDataframeStatApproxQuantile sp_dataframe_stat_approx_quantile = 143; + SpDataframeStatCorr sp_dataframe_stat_corr = 144; + SpDataframeStatCov sp_dataframe_stat_cov = 145; + SpDataframeStatCrossTab sp_dataframe_stat_cross_tab = 146; + SpDataframeStatSampleBy sp_dataframe_stat_sample_by = 147; + SpDataframeToDf sp_dataframe_to_df = 148; + SpDataframeToLocalIterator sp_dataframe_to_local_iterator = 149; + SpDataframeToPandas sp_dataframe_to_pandas = 150; + SpDataframeToPandasBatches sp_dataframe_to_pandas_batches = 151; + SpDataframeUnion sp_dataframe_union = 152; + SpDataframeUnionAll sp_dataframe_union_all = 153; + SpDataframeUnionAllByName sp_dataframe_union_all_by_name = 154; + SpDataframeUnionByName sp_dataframe_union_by_name = 155; + SpDataframeUnpivot sp_dataframe_unpivot = 156; + SpDataframeWhere sp_dataframe_where = 157; + SpDataframeWithColumn sp_dataframe_with_column = 158; + SpDataframeWithColumnRenamed sp_dataframe_with_column_renamed = 159; + SpDataframeWithColumns sp_dataframe_with_columns = 160; + SpDataframeWrite sp_dataframe_write = 161; + SpDatatypeVal sp_datatype_val = 162; + SpFlatten sp_flatten = 163; + SpFnRef sp_fn_ref = 164; + SpGenerator sp_generator = 165; + SpGroupingSets sp_grouping_sets = 166; + SpMergeDeleteWhenMatchedClause sp_merge_delete_when_matched_clause = 167; + SpMergeInsertWhenNotMatchedClause sp_merge_insert_when_not_matched_clause = 168; + SpMergeUpdateWhenMatchedClause sp_merge_update_when_matched_clause = 169; + SpRange sp_range = 170; + SpReadAvro sp_read_avro = 171; + SpReadCsv sp_read_csv = 172; + SpReadJson sp_read_json = 173; + SpReadOrc sp_read_orc = 174; + SpReadParquet sp_read_parquet = 175; + SpReadTable sp_read_table = 176; + SpReadXml sp_read_xml = 177; + SpRelationalGroupedDataframeAgg sp_relational_grouped_dataframe_agg = 178; + SpRelationalGroupedDataframeApplyInPandas sp_relational_grouped_dataframe_apply_in_pandas = 179; + SpRelationalGroupedDataframeBuiltin sp_relational_grouped_dataframe_builtin = 180; + SpRelationalGroupedDataframePivot sp_relational_grouped_dataframe_pivot = 181; + SpRelationalGroupedDataframeRef sp_relational_grouped_dataframe_ref = 182; + SpRow sp_row = 183; + SpSessionTableFunction sp_session_table_function = 184; + SpSql sp_sql = 185; + SpTable sp_table = 186; + SpTableDelete sp_table_delete = 187; + SpTableDropTable sp_table_drop_table = 188; + SpTableFnCallAlias 
sp_table_fn_call_alias = 189; + SpTableFnCallOver sp_table_fn_call_over = 190; + SpTableMerge sp_table_merge = 191; + SpTableSample sp_table_sample = 192; + SpTableUpdate sp_table_update = 193; + SpToSnowparkPandas sp_to_snowpark_pandas = 194; + SpWindowSpecEmpty sp_window_spec_empty = 195; + SpWindowSpecOrderBy sp_window_spec_order_by = 196; + SpWindowSpecPartitionBy sp_window_spec_partition_by = 197; + SpWindowSpecRangeBetween sp_window_spec_range_between = 198; + SpWindowSpecRowsBetween sp_window_spec_rows_between = 199; + SpWriteCopyIntoLocation sp_write_copy_into_location = 200; + SpWriteCsv sp_write_csv = 201; + SpWriteJson sp_write_json = 202; + SpWritePandas sp_write_pandas = 203; + SpWriteParquet sp_write_parquet = 204; + SpWriteTable sp_write_table = 205; + StoredProcedure stored_procedure = 206; + StringVal string_val = 207; + Sub sub = 208; + TimeVal time_val = 209; + TimestampVal timestamp_val = 210; + TupleVal tuple_val = 211; + Udaf udaf = 212; + Udf udf = 213; + Udtf udtf = 214; } } @@ -1326,7 +1333,7 @@ message SpColumnApply_String { // sp-col-expr.ir:49 message SpColumnAsc { Expr col = 1; - google.protobuf.BoolValue nulls_first = 2; + SpNullOrder null_order = 2; SrcPosition src = 3; } @@ -1354,17 +1361,17 @@ message SpColumnCast { // sp-col-expr.ir:66 message SpColumnDesc { Expr col = 1; - google.protobuf.BoolValue nulls_first = 2; + SpNullOrder null_order = 2; SrcPosition src = 3; } -// sp-col-expr.ir:70 +// sp-col-expr.ir:72 message SpColumnEqualNan { Expr col = 1; SrcPosition src = 2; } -// sp-col-expr.ir:72 +// sp-col-expr.ir:74 message SpColumnEqualNull { Expr lhs = 1; Expr rhs = 2; @@ -1392,50 +1399,42 @@ message SpColumnFn { SpColumnCast sp_column_cast = 6; SpColumnDesc sp_column_desc = 7; SpColumnEqualNan sp_column_equal_nan = 8; - SpColumnIn_Dataframe sp_column_in__dataframe = 9; - SpColumnIn_Seq sp_column_in__seq = 10; - SpColumnIsNotNull sp_column_is_not_null = 11; - SpColumnIsNull sp_column_is_null = 12; - SpColumnOver sp_column_over = 13; - SpColumnStringCollate sp_column_string_collate = 14; - SpColumnStringContains sp_column_string_contains = 15; - SpColumnStringEndsWith sp_column_string_ends_with = 16; - SpColumnStringLike sp_column_string_like = 17; - SpColumnStringRegexp sp_column_string_regexp = 18; - SpColumnStringStartsWith sp_column_string_starts_with = 19; - SpColumnStringSubstr sp_column_string_substr = 20; - SpColumnTryCast sp_column_try_cast = 21; - SpColumnWithinGroup sp_column_within_group = 22; + SpColumnIn sp_column_in = 9; + SpColumnIsNotNull sp_column_is_not_null = 10; + SpColumnIsNull sp_column_is_null = 11; + SpColumnOver sp_column_over = 12; + SpColumnStringCollate sp_column_string_collate = 13; + SpColumnStringContains sp_column_string_contains = 14; + SpColumnStringEndsWith sp_column_string_ends_with = 15; + SpColumnStringLike sp_column_string_like = 16; + SpColumnStringRegexp sp_column_string_regexp = 17; + SpColumnStringStartsWith sp_column_string_starts_with = 18; + SpColumnStringSubstr sp_column_string_substr = 19; + SpColumnTryCast sp_column_try_cast = 20; + SpColumnWithinGroup sp_column_within_group = 21; } } -// sp-col-expr.ir:77 -message SpColumnIn_Dataframe { - Expr col = 1; - SpDataframeExpr df = 2; - SrcPosition src = 3; -} - -// sp-col-expr.ir:81 -message SpColumnIn_Seq { +// sp-col-expr.ir:79 +message SpColumnIn { Expr col = 1; SrcPosition src = 2; repeated Expr values = 3; } -// sp-col-expr.ir:85 +// sp-col-expr.ir:83 message SpColumnIsNotNull { Expr col = 1; SrcPosition src = 2; } -// sp-col-expr.ir:87 +// 
sp-col-expr.ir:85 message SpColumnIsNull { Expr col = 1; SrcPosition src = 2; } -// sp-col-expr.ir:89 +// sp-col-expr.ir:87 message SpColumnOver { Expr col = 1; SrcPosition src = 2; @@ -1455,35 +1454,35 @@ message SpColumnSqlExpr { SrcPosition src = 3; } -// sp-col-expr.ir:119 +// sp-col-expr.ir:117 message SpColumnStringCollate { Expr col = 1; Expr collation_spec = 2; SrcPosition src = 3; } -// sp-col-expr.ir:123 +// sp-col-expr.ir:121 message SpColumnStringContains { Expr col = 1; Expr pattern = 2; SrcPosition src = 3; } -// sp-col-expr.ir:110 +// sp-col-expr.ir:108 message SpColumnStringEndsWith { Expr col = 1; SrcPosition src = 2; Expr suffix = 3; } -// sp-col-expr.ir:97 +// sp-col-expr.ir:95 message SpColumnStringLike { Expr col = 1; Expr pattern = 2; SrcPosition src = 3; } -// sp-col-expr.ir:101 +// sp-col-expr.ir:99 message SpColumnStringRegexp { Expr col = 1; Expr parameters = 2; @@ -1491,14 +1490,14 @@ message SpColumnStringRegexp { SrcPosition src = 4; } -// sp-col-expr.ir:106 +// sp-col-expr.ir:104 message SpColumnStringStartsWith { Expr col = 1; Expr prefix = 2; SrcPosition src = 3; } -// sp-col-expr.ir:114 +// sp-col-expr.ir:112 message SpColumnStringSubstr { Expr col = 1; Expr len = 2; @@ -1513,7 +1512,7 @@ message SpColumnTryCast { SpDataType to = 3; } -// sp-col-expr.ir:93 +// sp-col-expr.ir:91 message SpColumnWithinGroup { Expr col = 1; ExprArgList cols = 2; @@ -1541,7 +1540,7 @@ message SpDataframeAlias { SrcPosition src = 3; } -// sp-df-expr.ir:470 +// sp-df-expr.ir:471 message SpDataframeAnalyticsComputeLag { repeated Expr cols = 1; SpDataframeExpr df = 2; @@ -1552,7 +1551,7 @@ message SpDataframeAnalyticsComputeLag { SrcPosition src = 7; } -// sp-df-expr.ir:479 +// sp-df-expr.ir:480 message SpDataframeAnalyticsComputeLead { repeated Expr cols = 1; SpDataframeExpr df = 2; @@ -1563,7 +1562,7 @@ message SpDataframeAnalyticsComputeLead { SrcPosition src = 7; } -// sp-df-expr.ir:461 +// sp-df-expr.ir:462 message SpDataframeAnalyticsCumulativeAgg { repeated Tuple_String_List_String aggs = 1; SpDataframeExpr df = 2; @@ -1574,7 +1573,7 @@ message SpDataframeAnalyticsCumulativeAgg { SrcPosition src = 7; } -// sp-df-expr.ir:452 +// sp-df-expr.ir:453 message SpDataframeAnalyticsMovingAgg { repeated Tuple_String_List_String aggs = 1; SpDataframeExpr df = 2; @@ -1585,7 +1584,7 @@ message SpDataframeAnalyticsMovingAgg { repeated int64 window_sizes = 7; } -// sp-df-expr.ir:488 +// sp-df-expr.ir:489 message SpDataframeAnalyticsTimeSeriesAgg { repeated Tuple_String_List_String aggs = 1; SpDataframeExpr df = 2; @@ -1851,7 +1850,7 @@ message SpDataframeGroupBy { SrcPosition src = 3; } -// sp-df-expr.ir:428 +// sp-df-expr.ir:429 message SpDataframeGroupByGroupingSets { SpDataframeExpr df = 1; repeated SpGroupingSets grouping_sets = 2; @@ -1983,7 +1982,7 @@ message SpDataframePivot { SpPivotValue values = 5; } -// sp-df-expr.ir:330 +// sp-df-expr.ir:331 message SpDataframeRandomSplit { SpDataframeExpr df = 1; google.protobuf.Int64Value seed = 2; @@ -2042,7 +2041,7 @@ message SpDataframeRef { SrcPosition src = 2; } -// sp-df-expr.ir:337 +// sp-df-expr.ir:338 message SpDataframeRename { Expr col_or_mapper = 1; SpDataframeExpr df = 2; @@ -2050,14 +2049,14 @@ message SpDataframeRename { SrcPosition src = 4; } -// sp-df-expr.ir:355 +// sp-df-expr.ir:356 message SpDataframeRollup { ExprArgList cols = 1; SpDataframeExpr df = 2; SrcPosition src = 3; } -// sp-df-expr.ir:343 +// sp-df-expr.ir:344 message SpDataframeRollup_Columns { repeated SpColumnExpr cols = 1; SpDataframeExpr df = 2; @@ -2065,7 
+2064,7 @@ message SpDataframeRollup_Columns { bool variadic = 4; } -// sp-df-expr.ir:349 +// sp-df-expr.ir:350 message SpDataframeRollup_Strings { repeated string cols = 1; SpDataframeExpr df = 2; @@ -2073,7 +2072,7 @@ message SpDataframeRollup_Strings { bool variadic = 4; } -// sp-df-expr.ir:360 +// sp-df-expr.ir:361 message SpDataframeSample { SpDataframeExpr df = 1; google.protobuf.Int64Value num = 2; @@ -2081,7 +2080,7 @@ message SpDataframeSample { SrcPosition src = 4; } -// sp-df-expr.ir:366 +// sp-df-expr.ir:367 message SpDataframeSelect_Columns { repeated Expr cols = 1; SpDataframeExpr df = 2; @@ -2089,7 +2088,7 @@ message SpDataframeSelect_Columns { bool variadic = 4; } -// sp-df-expr.ir:372 +// sp-df-expr.ir:373 message SpDataframeSelect_Exprs { SpDataframeExpr df = 1; repeated string exprs = 2; @@ -2103,7 +2102,7 @@ message SpDataframeShow { SrcPosition src = 2; } -// sp-df-expr.ir:378 +// sp-df-expr.ir:379 message SpDataframeSort { Expr ascending = 1; repeated Expr cols = 2; @@ -2195,28 +2194,28 @@ message SpDataframeType { repeated SpType tys = 2; } -// sp-df-expr.ir:385 +// sp-df-expr.ir:386 message SpDataframeUnion { SpDataframeExpr df = 1; SpDataframeExpr other = 2; SrcPosition src = 3; } -// sp-df-expr.ir:390 +// sp-df-expr.ir:391 message SpDataframeUnionAll { SpDataframeExpr df = 1; SpDataframeExpr other = 2; SrcPosition src = 3; } -// sp-df-expr.ir:395 +// sp-df-expr.ir:396 message SpDataframeUnionAllByName { SpDataframeExpr df = 1; SpDataframeExpr other = 2; SrcPosition src = 3; } -// sp-df-expr.ir:400 +// sp-df-expr.ir:401 message SpDataframeUnionByName { SpDataframeExpr df = 1; SpDataframeExpr other = 2; @@ -2227,19 +2226,20 @@ message SpDataframeUnionByName { message SpDataframeUnpivot { repeated Expr column_list = 1; SpDataframeExpr df = 2; - string name_column = 3; - SrcPosition src = 4; - string value_column = 5; + bool include_nulls = 3; + string name_column = 4; + SrcPosition src = 5; + string value_column = 6; } -// sp-df-expr.ir:405 +// sp-df-expr.ir:406 message SpDataframeWhere { SpColumnExpr condition = 1; SpDataframeExpr df = 2; SrcPosition src = 3; } -// sp-df-expr.ir:410 +// sp-df-expr.ir:411 message SpDataframeWithColumn { Expr col = 1; string col_name = 2; @@ -2247,7 +2247,7 @@ message SpDataframeWithColumn { SrcPosition src = 4; } -// sp-df-expr.ir:416 +// sp-df-expr.ir:417 message SpDataframeWithColumnRenamed { Expr col = 1; SpDataframeExpr df = 2; @@ -2255,7 +2255,7 @@ message SpDataframeWithColumnRenamed { SrcPosition src = 4; } -// sp-df-expr.ir:422 +// sp-df-expr.ir:423 message SpDataframeWithColumns { repeated string col_names = 1; SpDataframeExpr df = 2; @@ -2318,7 +2318,7 @@ message SpGroupedDataframeType { repeated string outer_columns = 2; } -// sp-df-expr.ir:434 +// sp-df-expr.ir:435 message SpGroupingSets { ExprArgList sets = 1; SrcPosition src = 2; @@ -2332,13 +2332,13 @@ message SpMatchedClause { } } -// sp-df-expr.ir:505 +// sp-df-expr.ir:506 message SpMergeDeleteWhenMatchedClause { Expr condition = 1; SrcPosition src = 2; } -// sp-df-expr.ir:509 +// sp-df-expr.ir:510 message SpMergeInsertWhenNotMatchedClause { Expr condition = 1; List_Expr insert_keys = 2; @@ -2346,7 +2346,7 @@ message SpMergeInsertWhenNotMatchedClause { SrcPosition src = 4; } -// sp-df-expr.ir:500 +// sp-df-expr.ir:501 message SpMergeUpdateWhenMatchedClause { Expr condition = 1; SrcPosition src = 2; @@ -2492,7 +2492,7 @@ message SpTable { SpTableVariant variant = 4; } -// sp-df-expr.ir:515 +// sp-df-expr.ir:516 message SpTableDelete { bool block = 1; Expr condition = 
2; @@ -2502,7 +2502,7 @@ message SpTableDelete { repeated Tuple_String_String statement_params = 6; } -// sp-df-expr.ir:523 +// sp-df-expr.ir:524 message SpTableDropTable { VarId id = 1; SrcPosition src = 2; @@ -2523,7 +2523,7 @@ message SpTableFnCallOver { SrcPosition src = 4; } -// sp-df-expr.ir:527 +// sp-df-expr.ir:528 message SpTableMerge { bool block = 1; repeated SpMatchedClause clauses = 2; @@ -2534,7 +2534,7 @@ message SpTableMerge { repeated Tuple_String_String statement_params = 7; } -// sp-df-expr.ir:536 +// sp-df-expr.ir:537 message SpTableSample { SpDataframeExpr df = 1; google.protobuf.Int64Value num = 2; @@ -2544,7 +2544,7 @@ message SpTableSample { SrcPosition src = 6; } -// sp-df-expr.ir:544 +// sp-df-expr.ir:545 message SpTableUpdate { repeated Tuple_String_Expr assignments = 1; bool block = 2; @@ -2555,7 +2555,7 @@ message SpTableUpdate { repeated Tuple_String_String statement_params = 7; } -// sp-df-expr.ir:438 +// sp-df-expr.ir:439 message SpToSnowparkPandas { List_String columns = 1; SpDataframeExpr df = 2; diff --git a/src/snowflake/snowpark/_internal/type_utils.py b/src/snowflake/snowpark/_internal/type_utils.py index 55fe27c9f8f..3d1095132ab 100644 --- a/src/snowflake/snowpark/_internal/type_utils.py +++ b/src/snowflake/snowpark/_internal/type_utils.py @@ -30,6 +30,7 @@ get_origin, ) +import snowflake.snowpark.context as context import snowflake.snowpark.types # type: ignore from snowflake.connector.constants import FIELD_ID_TO_NAME from snowflake.connector.cursor import ResultMetadata @@ -157,9 +158,12 @@ def convert_metadata_to_sp_type( return StructType( [ StructField( - quote_name(field.name, keep_case=True), + field.name + if context._should_use_structured_type_semantics + else quote_name(field.name, keep_case=True), convert_metadata_to_sp_type(field, max_string_size), nullable=field.is_nullable, + _is_column=False, ) for field in metadata.fields ], diff --git a/src/snowflake/snowpark/_internal/udf_utils.py b/src/snowflake/snowpark/_internal/udf_utils.py index 96c423588df..f8168621059 100644 --- a/src/snowflake/snowpark/_internal/udf_utils.py +++ b/src/snowflake/snowpark/_internal/udf_utils.py @@ -29,7 +29,7 @@ import snowflake.snowpark from snowflake.connector.options import installed_pandas, pandas from snowflake.snowpark._internal import code_generation, type_utils -from snowflake.snowpark._internal.analyzer.datatype_mapper import to_sql +from snowflake.snowpark._internal.analyzer.datatype_mapper import to_sql, to_sql_no_cast from snowflake.snowpark._internal.telemetry import TelemetryField from snowflake.snowpark._internal.type_utils import ( NoneType, @@ -1481,6 +1481,8 @@ def generate_call_python_sp_sql( for arg in args: if isinstance(arg, snowflake.snowpark.Column): sql_args.append(session._analyzer.analyze(arg._expression, {})) + elif "system$" in sproc_name.lower(): + sql_args.append(to_sql_no_cast(arg, infer_type(arg))) else: sql_args.append(to_sql(arg, infer_type(arg))) return f"CALL {sproc_name}({', '.join(sql_args)})" diff --git a/src/snowflake/snowpark/catalog.py b/src/snowflake/snowpark/catalog.py new file mode 100644 index 00000000000..895bb16e2db --- /dev/null +++ b/src/snowflake/snowpark/catalog.py @@ -0,0 +1,700 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. 
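# --- Illustrative aside (not part of the diff) -----------------------------
# A minimal sketch of what the udf_utils change above does at the user level,
# assuming an existing Snowpark `session`. For stored procedures whose name
# contains "system$", call arguments are now rendered with to_sql_no_cast
# instead of to_sql, so the generated CALL passes the literal through without
# an explicit type cast. SYSTEM$WAIT is used here only as a familiar example
# of such a procedure.
session.call("system$wait", 5)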
+# + +import re +from typing import List, Optional, Union + +from snowflake.core import Root # type: ignore +from snowflake.core.database import Database # type: ignore +from snowflake.core.exceptions import NotFoundError +from snowflake.core.procedure import Procedure +from snowflake.core.schema import Schema # type: ignore +from snowflake.core.table import Table, TableColumn +from snowflake.core.user_defined_function import UserDefinedFunction +from snowflake.core.view import View + + +import snowflake.snowpark +from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type +from snowflake.snowpark.functions import lit, parse_json +from snowflake.snowpark.types import DataType + + +class Catalog: + """The Catalog class provides methods to interact with and manage the Snowflake objects. + It allows users to list, get, and drop various database objects such as databases, schemas, tables, + views, functions, etc. + """ + + def __init__(self, session: "snowflake.snowpark.session.Session") -> None: # type: ignore + self._session = session + self._root = Root(session) + self._python_regex_udf = None + + def _parse_database( + self, + database: Optional[Union[str, Database]], + model_obj: Optional[ + Union[str, Schema, Table, View, Procedure, UserDefinedFunction] + ] = None, + ) -> str: + if isinstance(model_obj, (Schema, Table, View, Procedure, UserDefinedFunction)): + db_name = model_obj.database_name + assert db_name is not None # pyright + return db_name + + if isinstance(database, str): + return database + if isinstance(database, Database): + return database.name + if database is None: + current_database = self._session.get_current_database() + if current_database is None: + raise ValueError( + "No database detected. Please provide database to proceed." + ) + return current_database + raise ValueError( + f"Unexpected type. Expected str or Database, got '{type(database)}'" + ) + + def _parse_schema( + self, + schema: Optional[Union[str, Schema]], + model_obj: Optional[ + Union[str, Table, View, Procedure, UserDefinedFunction] + ] = None, + ) -> str: + if isinstance(model_obj, (Table, View, Procedure, UserDefinedFunction)): + schema_name = model_obj.schema_name + assert schema_name is not None # pyright + return schema_name + + if isinstance(schema, str): + return schema + if isinstance(schema, Schema): + return schema.name + if schema is None: + current_schema = self._session.get_current_schema() + if current_schema is None: + raise ValueError( + "No schema detected. Please provide schema to proceed." + ) + return current_schema + raise ValueError( + f"Unexpected type. 
Expected str or Schema, got '{type(schema)}'" + ) + + def _parse_function_or_procedure( + self, + fn: Union[str, Procedure, UserDefinedFunction], + arg_types: Optional[List[DataType]], + ) -> str: + if isinstance(fn, str): + if arg_types is None: + raise ValueError( + "arg_types must be provided when function/procedure is a string" + ) + arg_types_str = ", ".join( + [convert_sp_to_sf_type(arg_type) for arg_type in arg_types] + ) + return f"{fn}({arg_types_str})" + + arg_types_str = ", ".join(arg.datatype for arg in fn.arguments) + return f"{fn.name}({arg_types_str})" + + def _initialize_regex_udf(self) -> None: + with self._session._lock: + if self._python_regex_udf is not None: + return + + def python_regex_filter(pattern: str, input: str) -> bool: + return bool(re.match(pattern, input)) + + self._python_regex_udf = self._session.udf.register(python_regex_filter) + + def _list_objects( + self, + *, + object_name: str, + object_class, + database: Optional[Union[str, Database]] = None, + schema: Optional[Union[str, Schema]] = None, + pattern: Optional[str] = None, + like: Optional[str] = None, + ): + db_name = self._parse_database(database) + schema_name = self._parse_schema(schema) + + like_str = f"LIKE '{like}'" if like else "" + + df = self._session.sql( + f"SHOW AS RESOURCE {object_name} {like_str} IN {db_name}.{schema_name} -- catalog api" + ) + if pattern: + # initialize udf + self._initialize_regex_udf() + assert self._python_regex_udf is not None # pyright + + # The result of SHOW AS RESOURCE query is a json string which contains + # key 'name' to store the name of the object. We parse json for the returned + # result and apply the filter on name. + df = df.filter( + self._python_regex_udf( + lit(pattern), parse_json('"As Resource"')["name"] + ) + ) + + return list(map(lambda row: object_class.from_json(row[0]), df.collect())) + + # List methods + def list_databases( + self, + *, + pattern: Optional[str] = None, + like: Optional[str] = None, + ) -> List[Database]: + """List databases in the current session. + + Args: + pattern: the python regex pattern of name to match. Defaults to None. + like: the sql style pattern for name to match. Default to None. + """ + iter = self._root.databases.iter(like=like) + if pattern: + iter = filter(lambda x: re.match(pattern, x.name), iter) + + return list(iter) + + def list_schemas( + self, + *, + database: Optional[Union[str, Database]] = None, + pattern: Optional[str] = None, + like: Optional[str] = None, + ) -> List[Schema]: + """List schemas in the current session. If database is provided, list schemas in the + database, otherwise list schemas in the current database. + + Args: + database: database name or ``Database`` object. Defaults to None. + pattern: the python regex pattern of name to match. Defaults to None. + like: the sql style pattern for name to match. Default to None. + """ + db_name = self._parse_database(database) + iter = self._root.databases[db_name].schemas.iter(like=like) + if pattern: + iter = filter(lambda x: re.match(pattern, x.name), iter) + return list(iter) + + def list_tables( + self, + *, + database: Optional[Union[str, Database]] = None, + schema: Optional[Union[str, Schema]] = None, + pattern: Optional[str] = None, + like: Optional[str] = None, + ) -> List[Table]: + """List tables in the current session. If database or schema are provided, list tables + in the given database or schema, otherwise list tables in the current database/schema. + + Args: + database: database name or ``Database`` object. Defaults to None. 
+ schema: schema name or ``Schema`` object. Defaults to None. + pattern: the python regex pattern of name to match. Defaults to None. + like: the sql style pattern for name to match. Default to None. + """ + return self._list_objects( + object_name="TABLES", + object_class=Table, + database=database, + schema=schema, + pattern=pattern, + like=like, + ) + + def list_views( + self, + *, + database: Optional[Union[str, Database]] = None, + schema: Optional[Union[str, Schema]] = None, + pattern: Optional[str] = None, + like: Optional[str] = None, + ) -> List[View]: + """List views in the current session. If database or schema are provided, list views + in the given database or schema, otherwise list views in the current database/schema. + + Args: + database: database name or ``Database`` object. Defaults to None. + schema: schema name or ``Schema`` object. Defaults to None. + pattern: the python regex pattern of name to match. Defaults to None. + like: the sql style pattern for name to match. Default to None. + """ + return self._list_objects( + object_name="VIEWS", + object_class=View, + database=database, + schema=schema, + pattern=pattern, + like=like, + ) + + def list_columns( + self, + table_name: Union[str, Table], + *, + database: Optional[Union[str, Database]] = None, + schema: Optional[Union[str, Schema]] = None, + ) -> List[TableColumn]: + """List columns in the given table. + + Args: + table_name: table name. + database: database name or ``Database`` object. Defaults to None. + schema: schema name or ``Schema`` object. Defaults to None. + """ + if isinstance(table_name, str): + table = self.get_table(table_name, database=database, schema=schema) + else: + table = table_name + cols = table.columns + assert cols is not None + return cols + + def list_procedures( + self, + *, + database: Optional[Union[str, Database]] = None, + schema: Optional[Union[str, Schema]] = None, + pattern: Optional[str] = None, + like: Optional[str] = None, + ) -> List[Procedure]: + """List of procedures in the given database and schema. If database or schema are not + provided, list procedures in the current database and schema. + + Args: + database: database name or ``Database`` object. Defaults to None. + schema: schema name or ``Schema`` object. Defaults to None. + pattern: the python regex pattern of name to match. Defaults to None. + like: the sql style pattern for name to match. Default to None. + """ + return self._list_objects( + object_name="PROCEDURES", + object_class=Procedure, + database=database, + schema=schema, + pattern=pattern, + like=like, + ) + + def list_user_defined_functions( + self, + *, + database: Optional[Union[str, Database]] = None, + schema: Optional[Union[str, Schema]] = None, + pattern: Optional[str] = None, + like: Optional[str] = None, + ) -> List[UserDefinedFunction]: + """List of user defined functions in the given database and schema. If database or schema + are not provided, list user defined functions in the current database and schema. + Args: + database: database name or ``Database`` object. Defaults to None. + schema: schema name or ``Schema`` object. Defaults to None. + pattern: the python regex pattern of name to match. Defaults to None. + like: the sql style pattern for name to match. Default to None. 
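# --- Illustrative aside (not part of the diff) -----------------------------
# A small usage sketch of the Catalog list_* methods above, assuming an
# existing Snowpark `session`; the database/schema names and the regex are
# hypothetical. `like` is a SQL-style pattern evaluated server side, while
# `pattern` is a Python regular expression applied to the object names on
# the client (via the helper UDF for the SHOW AS RESOURCE based listings).
from snowflake.snowpark.catalog import Catalog

catalog = Catalog(session)
dbs = catalog.list_databases(like="SALES%")              # SQL LIKE pattern
tables = catalog.list_tables(
    database="MY_DB", schema="PUBLIC", pattern=r"^FACT_.*"
)                                                        # Python regex filter
print([t.name for t in tables])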
+ """ + return self._list_objects( + object_name="USER FUNCTIONS", + object_class=UserDefinedFunction, + database=database, + schema=schema, + pattern=pattern, + like=like, + ) + + # get methods + def get_current_database(self) -> Optional[str]: + """Get the current database.""" + return self._session.get_current_database() + + def get_current_schema(self) -> Optional[str]: + """Get the current schema.""" + return self._session.get_current_schema() + + def get_database(self, database: str) -> Database: + """Name of the database to get""" + return self._root.databases[database].fetch() + + def get_schema( + self, schema: str, *, database: Optional[Union[str, Database]] = None + ) -> Schema: + """Name of the schema to get.""" + db_name = self._parse_database(database) + return self._root.databases[db_name].schemas[schema].fetch() + + def get_table( + self, + table_name: str, + *, + database: Optional[Union[str, Database]] = None, + schema: Optional[Union[str, Schema]] = None, + ) -> Table: + """Get the table by name in given database and schema. If database or schema are not + provided, get the table in the current database and schema. + + Args: + table_name: name of the table. + database: database name or ``Database`` object. Defaults to None. + schema: schema name or ``Schema`` object. Defaults to None. + """ + db_name = self._parse_database(database) + schema_name = self._parse_schema(schema) + return ( + self._root.databases[db_name] + .schemas[schema_name] + .tables[table_name] + .fetch() + ) + + def get_view( + self, + view_name: str, + *, + database: Optional[Union[str, Database]] = None, + schema: Optional[Union[str, Schema]] = None, + ) -> View: + """Get the view by name in given database and schema. If database or schema are not + provided, get the view in the current database and schema. + + Args: + view_name: name of the view. + database: database name or ``Database`` object. Defaults to None. + schema: schema name or ``Schema`` object. Defaults to None. + """ + db_name = self._parse_database(database) + schema_name = self._parse_schema(schema) + return ( + self._root.databases[db_name].schemas[schema_name].views[view_name].fetch() + ) + + def get_procedure( + self, + procedure_name: str, + arg_types: List[DataType], + *, + database: Optional[Union[str, Database]] = None, + schema: Optional[Union[str, Schema]] = None, + ) -> Procedure: + """Get the procedure by name and argument types in given database and schema. If database or + schema are not provided, get the procedure in the current database and schema. + + Args: + procedure_name: name of the procedure. + arg_types: list of argument types to uniquely identify the procedure. + database: database name or ``Database`` object. Defaults to None. + schema: schema name or ``Schema`` object. Defaults to None. + """ + db_name = self._parse_database(database) + schema_name = self._parse_schema(schema) + procedure_id = self._parse_function_or_procedure(procedure_name, arg_types) + return ( + self._root.databases[db_name] + .schemas[schema_name] + .procedures[procedure_id] + .fetch() + ) + + def get_user_defined_function( + self, + udf_name: str, + arg_types: List[DataType], + *, + database: Optional[Union[str, Database]] = None, + schema: Optional[Union[str, Schema]] = None, + ) -> UserDefinedFunction: + """Get the user defined function by name and argument types in given database and schema. + If database or schema are not provided, get the user defined function in the current + database and schema. 
+ + Args: + udf_name: name of the user defined function. + arg_types: list of argument types to uniquely identify the user defined function. + database: database name or ``Database`` object. Defaults to None. + schema: schema name or ``Schema`` object. Defaults to None. + """ + db_name = self._parse_database(database) + schema_name = self._parse_schema(schema) + function_id = self._parse_function_or_procedure(udf_name, arg_types) + return ( + self._root.databases[db_name] + .schemas[schema_name] + .user_defined_functions[function_id] + .fetch() + ) + + # set methods + def set_current_database(self, database: Union[str, Database]) -> None: + """Set the current default database for the session. + + Args: + database: database name or ``Database`` object. + """ + db_name = self._parse_database(database) + self._session.use_database(db_name) + + def set_current_schema(self, schema: Union[str, Schema]) -> None: + """Set the current default schema for the session. + + Args: + schema: schema name or ``Schema`` object. + """ + schema_name = self._parse_schema(schema) + self._session.use_schema(schema_name) + + # exists methods + def database_exists(self, database: Union[str, Database]) -> bool: + """Check if the given database exists. + + Args: + database: database name or ``Database`` object. + """ + db_name = self._parse_database(database) + try: + self._root.databases[db_name].fetch() + return True + except NotFoundError: + return False + + def schema_exists( + self, + schema: Union[str, Schema], + *, + database: Optional[Union[str, Database]] = None, + ) -> bool: + """Check if the given schema exists in the given database. If database is not provided, + check if the schema exists in the current database. + + Args: + schema: schema name or ``Schema`` object. + database: database name or ``Database`` object. Defaults to None. + """ + db_name = self._parse_database(database, schema) + schema_name = self._parse_schema(schema) + try: + self._root.databases[db_name].schemas[schema_name].fetch() + return True + except NotFoundError: + return False + + def table_exists( + self, + table: Union[str, Table], + *, + database: Optional[Union[str, Database]] = None, + schema: Optional[Union[str, Schema]] = None, + ) -> bool: + """Check if the given table exists in the given database and schema. If database or schema + are not provided, check if the table exists in the current database and schema. + + Args: + table: table name or ``Table`` object. + database: database name or ``Database`` object. Defaults to None. + schema: schema name or ``Schema`` object. Defaults to None. + """ + db_name = self._parse_database(database, table) + schema_name = self._parse_schema(schema, table) + table_name = table if isinstance(table, str) else table.name + try: + self._root.databases[db_name].schemas[schema_name].tables[ + table_name + ].fetch() + return True + except NotFoundError: + return False + + def view_exists( + self, + view: Union[str, View], + *, + database: Optional[Union[str, Database]] = None, + schema: Optional[Union[str, Schema]] = None, + ) -> bool: + """Check if the given view exists in the given database and schema. If database or schema + are not provided, check if the view exists in the current database and schema. + + Args: + view: view name or ``View`` object. + database: database name or ``Database`` object. Defaults to None. + schema: schema name or ``Schema`` object. Defaults to None. 
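# --- Illustrative aside (not part of the diff) -----------------------------
# Hedged sketch of the get_* and *_exists helpers above, assuming an existing
# Snowpark `session`; every object name is hypothetical. Procedures and UDFs
# are identified by their name together with the list of argument types.
from snowflake.snowpark.catalog import Catalog
from snowflake.snowpark.types import IntegerType, StringType

catalog = Catalog(session)
catalog.set_current_database("MY_DB")
if catalog.table_exists("ORDERS", schema="PUBLIC"):
    orders = catalog.get_table("ORDERS", schema="PUBLIC")
    print([col.name for col in catalog.list_columns(orders)])

proc = catalog.get_procedure(
    "MY_PROC", [StringType(), IntegerType()], schema="PUBLIC"
)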
+ """ + db_name = self._parse_database(database, view) + schema_name = self._parse_schema(schema, view) + view_name = view if isinstance(view, str) else view.name + try: + self._root.databases[db_name].schemas[schema_name].views[view_name].fetch() + return True + except NotFoundError: + return False + + def procedure_exists( + self, + procedure: Union[str, Procedure], + arg_types: Optional[List[DataType]] = None, + *, + database: Optional[Union[str, Database]] = None, + schema: Optional[Union[str, Schema]] = None, + ) -> bool: + """Check if the given procedure exists in the given database and schema. If database or + schema are not provided, check if the procedure exists in the current database and schema. + + Args: + procedure: procedure name or ``Procedure`` object. + arg_types: list of argument types to uniquely identify the procedure. Defaults to None. + database: database name or ``Database`` object. Defaults to None. + schema: schema name or ``Schema`` object. Defaults to None. + """ + db_name = self._parse_database(database, procedure) + schema_name = self._parse_schema(schema, procedure) + procedure_id = self._parse_function_or_procedure(procedure, arg_types) + + try: + self._root.databases[db_name].schemas[schema_name].procedures[ + procedure_id + ].fetch() + return True + except NotFoundError: + return False + + def user_defined_function_exists( + self, + udf: Union[str, UserDefinedFunction], + arg_types: Optional[List[DataType]] = None, + *, + database: Optional[Union[str, Database]] = None, + schema: Optional[Union[str, Schema]] = None, + ) -> bool: + """Check if the given user defined function exists in the given database and schema. If + database or schema are not provided, check if the user defined function exists in the + current database and schema. + + Args: + udf: user defined function name or ``UserDefinedFunction`` object. + arg_types: list of argument types to uniquely identify the user defined function. + Defaults to None. + database: database name or ``Database`` object. Defaults to None. + schema: schema name or ``Schema`` object. Defaults to None. + """ + db_name = self._parse_database(database, udf) + schema_name = self._parse_schema(schema, udf) + function_id = self._parse_function_or_procedure(udf, arg_types) + + try: + self._root.databases[db_name].schemas[schema_name].user_defined_functions[ + function_id + ].fetch() + return True + except NotFoundError: + return False + + # drop methods + def drop_database(self, database: Union[str, Database]) -> None: + """Drop the given database. + + Args: + database: database name or ``Database`` object. + """ + db_name = self._parse_database(database) + self._root.databases[db_name].drop() + + def drop_schema( + self, + schema: Union[str, Schema], + *, + database: Optional[Union[str, Database]] = None, + ) -> None: + """Drop the given schema in the given database. If database is not provided, drop the + schema in the current database. + + Args: + schema: schema name or ``Schema`` object. + database: database name or ``Database`` object. Defaults to None. + """ + db_name = self._parse_database(database, schema) + schema_name = self._parse_schema(schema) + self._root.databases[db_name].schemas[schema_name].drop() + + def drop_table( + self, + table: Union[str, Table], + *, + database: Optional[Union[str, Database]] = None, + schema: Optional[Union[str, Schema]] = None, + ) -> None: + """Drop the given table in the given database and schema. 
If database or schema are not + provided, drop the table in the current database and schema. + + Args: + table: table name or ``Table`` object. + database: database name or ``Database`` object. Defaults to None. + schema: schema name or ``Schema`` object. Defaults to None. + """ + db_name = self._parse_database(database, table) + schema_name = self._parse_schema(schema, table) + table_name = table if isinstance(table, str) else table.name + + self._root.databases[db_name].schemas[schema_name].tables[table_name].drop() + + def drop_view( + self, + view: Union[str, View], + *, + database: Optional[Union[str, Database]] = None, + schema: Optional[Union[str, Schema]] = None, + ) -> None: + """Drop the given view in the given database and schema. If database or schema are not + provided, drop the view in the current database and schema. + + Args: + view: view name or ``View`` object. + database: database name or ``Database`` object. Defaults to None. + schema: schema name or ``Schema`` object. Defaults to None. + """ + db_name = self._parse_database(database, view) + schema_name = self._parse_schema(schema, view) + view_name = view if isinstance(view, str) else view.name + + self._root.databases[db_name].schemas[schema_name].views[view_name].drop() + + # aliases + listDatabases = list_databases + listSchemas = list_schemas + listTables = list_tables + listViews = list_views + listColumns = list_columns + listProcedures = list_procedures + listUserDefinedFunctions = list_user_defined_functions + + getCurrentDatabase = get_current_database + getCurrentSchema = get_current_schema + getDatabase = get_database + getSchema = get_schema + getTable = get_table + getView = get_view + getProcedure = get_procedure + getUserDefinedFunction = get_user_defined_function + + setCurrentDatabase = set_current_database + setCurrentSchema = set_current_schema + + databaseExists = database_exists + schemaExists = schema_exists + tableExists = table_exists + viewExists = view_exists + procedureExists = procedure_exists + userDefinedFunctionExists = user_defined_function_exists + + dropDatabase = drop_database + dropSchema = drop_schema + dropTable = drop_table + dropView = drop_view diff --git a/src/snowflake/snowpark/column.py b/src/snowflake/snowpark/column.py index 0b46f7d101b..7e9588d9d5d 100644 --- a/src/snowflake/snowpark/column.py +++ b/src/snowflake/snowpark/column.py @@ -91,6 +91,9 @@ StringType, TimestampTimeZone, TimestampType, + ArrayType, + MapType, + StructType, ) from snowflake.snowpark.window import Window, WindowSpec @@ -644,7 +647,7 @@ def in_( ast = None if _emit_ast: ast = proto.Expr() - proto_ast = ast.sp_column_in__seq + proto_ast = ast.sp_column_in proto_ast.col.CopyFrom(self._ast) return Column(Literal(False), _ast=ast, _emit_ast=_emit_ast) @@ -699,7 +702,7 @@ def validate_value(value_expr: Expression): ast = None if _emit_ast: ast = proto.Expr() - proto_ast = ast.sp_column_in__seq + proto_ast = ast.sp_column_in proto_ast.col.CopyFrom(self._ast) for val in vals: val_ast = proto_ast.values.add() @@ -917,6 +920,9 @@ def _cast( if isinstance(to, str): to = type_string_to_type_object(to) + if isinstance(to, (ArrayType, MapType, StructType)): + to = to._as_nested() + if self._ast is None: _emit_ast = False @@ -952,6 +958,7 @@ def desc(self, _emit_ast: bool = True) -> "Column": expr = proto.Expr() ast = with_src_position(expr.sp_column_desc) ast.col.CopyFrom(self._ast) + ast.null_order.sp_null_order_default = True return Column( SortOrder(self._expression, Descending()), _ast=expr, 
_emit_ast=_emit_ast ) @@ -965,7 +972,7 @@ def desc_nulls_first(self, _emit_ast: bool = True) -> "Column": expr = proto.Expr() ast = with_src_position(expr.sp_column_desc) ast.col.CopyFrom(self._ast) - ast.nulls_first.value = True + ast.null_order.sp_null_order_nulls_first = True return Column( SortOrder(self._expression, Descending(), NullsFirst()), _ast=expr, @@ -981,7 +988,7 @@ def desc_nulls_last(self, _emit_ast: bool = True) -> "Column": expr = proto.Expr() ast = with_src_position(expr.sp_column_desc) ast.col.CopyFrom(self._ast) - ast.nulls_first.value = False + ast.null_order.sp_null_order_nulls_last = True return Column( SortOrder(self._expression, Descending(), NullsLast()), _ast=expr, @@ -996,6 +1003,7 @@ def asc(self, _emit_ast: bool = True) -> "Column": expr = proto.Expr() ast = with_src_position(expr.sp_column_asc) ast.col.CopyFrom(self._ast) + ast.null_order.sp_null_order_default = True return Column( SortOrder(self._expression, Ascending()), _ast=expr, _emit_ast=_emit_ast ) @@ -1009,7 +1017,7 @@ def asc_nulls_first(self, _emit_ast: bool = True) -> "Column": expr = proto.Expr() ast = with_src_position(expr.sp_column_asc) ast.col.CopyFrom(self._ast) - ast.nulls_first.value = True + ast.null_order.sp_null_order_nulls_first = True return Column( SortOrder(self._expression, Ascending(), NullsFirst()), _ast=expr, @@ -1025,7 +1033,7 @@ def asc_nulls_last(self, _emit_ast: bool = True) -> "Column": expr = proto.Expr() ast = with_src_position(expr.sp_column_asc) ast.col.CopyFrom(self._ast) - ast.nulls_first.value = False + ast.null_order.sp_null_order_nulls_last = True return Column( SortOrder(self._expression, Ascending(), NullsLast()), _ast=expr, diff --git a/src/snowflake/snowpark/context.py b/src/snowflake/snowpark/context.py index c8f6888c5bd..8bc86f928a1 100644 --- a/src/snowflake/snowpark/context.py +++ b/src/snowflake/snowpark/context.py @@ -21,6 +21,10 @@ _should_continue_registration: Optional[Callable[..., bool]] = None +# Global flag that determines if structured type semantics should be used +_should_use_structured_type_semantics = False + + def get_active_session() -> "snowflake.snowpark.Session": """Returns the current active Snowpark session. diff --git a/src/snowflake/snowpark/dataframe.py b/src/snowflake/snowpark/dataframe.py index 8f41909284a..9bf175a18e4 100644 --- a/src/snowflake/snowpark/dataframe.py +++ b/src/snowflake/snowpark/dataframe.py @@ -2447,6 +2447,7 @@ def unpivot( self._set_ast_ref(ast.df) ast.value_column = value_column ast.name_column = name_column + ast.include_nulls = include_nulls for c in column_list: build_expr_from_snowpark_column_or_col_name(ast.column_list.add(), c) diff --git a/src/snowflake/snowpark/functions.py b/src/snowflake/snowpark/functions.py index 29f2b40a7ad..c4b3dc0c789 100644 --- a/src/snowflake/snowpark/functions.py +++ b/src/snowflake/snowpark/functions.py @@ -194,6 +194,7 @@ set_builtin_fn_alias, snowpark_expression_to_ast, with_src_position, + build_function_expr, ) from snowflake.snowpark._internal.type_utils import ( ColumnOrLiteral, @@ -1309,7 +1310,13 @@ def approx_percentile( """ c = _to_col_if_str(col, "approx_percentile") - return builtin("approx_percentile", _emit_ast=_emit_ast)(c, lit(percentile)) + # Build AST here to prevent `percentile` from being recorded as a literal instead of float. 
+ ast = ( + build_function_expr("approx_percentile", [c, percentile]) if _emit_ast else None + ) + return builtin("approx_percentile", _ast=ast, _emit_ast=False)( + c, lit(percentile, _emit_ast=False) + ) @publicapi @@ -1366,8 +1373,14 @@ def approx_percentile_estimate( """ c = _to_col_if_str(state, "approx_percentile_estimate") - return builtin("approx_percentile_estimate", _emit_ast=_emit_ast)( - c, lit(percentile) + # Build AST here to prevent `percentile` from being recorded as a literal instead of float. + ast = ( + build_function_expr("approx_percentile_estimate", [c, percentile]) + if _emit_ast + else None + ) + return builtin("approx_percentile_estimate", _ast=ast, _emit_ast=False)( + c, lit(percentile, _emit_ast=False) ) @@ -1948,7 +1961,13 @@ def to_decimal( [Row(ANS=Decimal('12.00')), Row(ANS=Decimal('11.30')), Row(ANS=Decimal('-90.12'))] """ c = _to_col_if_str(e, "to_decimal") - return builtin("to_decimal", _emit_ast=_emit_ast)(c, lit(precision), lit(scale)) + # Build AST here to prevent `precision` and `scale` from being recorded as a literal instead of int. + ast = ( + build_function_expr("to_decimal", [c, precision, scale]) if _emit_ast else None + ) + return builtin("to_decimal", _ast=ast, _emit_ast=False)( + c, lit(precision, _emit_ast=False), lit(scale, _emit_ast=False) + ) @publicapi @@ -2001,17 +2020,61 @@ def div0( >>> df.select(div0(df["a"], 1).alias("divided_by_one"), div0(df["a"], 0).alias("divided_by_zero")).collect() [Row(DIVIDED_BY_ONE=Decimal('1.000000'), DIVIDED_BY_ZERO=Decimal('0.000000'))] """ + # Build AST here to prevent `dividend` and `divisor` from being recorded as a literal instead of int/float. + ast = build_function_expr("div0", [dividend, divisor]) if _emit_ast else None dividend_col = ( - lit(dividend) + lit(dividend, _emit_ast=False) if isinstance(dividend, (int, float)) else _to_col_if_str(dividend, "div0") ) divisor_col = ( - lit(divisor) + lit(divisor, _emit_ast=False) if isinstance(divisor, (int, float)) else _to_col_if_str(divisor, "div0") ) - return builtin("div0", _emit_ast=_emit_ast)(dividend_col, divisor_col) + return builtin("div0", _ast=ast, _emit_ast=False)(dividend_col, divisor_col) + + +@publicapi +def divnull( + dividend: Union[ColumnOrName, int, float], + divisor: Union[ColumnOrName, int, float], + _emit_ast: bool = True, +) -> Column: + """Performs division like the division operator (/), + but returns NULL when the divisor is 0 (rather then reporting error). + + Example:: + + >>> df = session.create_dataframe([1], schema=["a"]) + >>> df.select(divnull(df["a"], 1).alias("divided_by_one"), divnull(df["a"], 0).alias("divided_by_zero")).collect() + [Row(DIVIDED_BY_ONE=Decimal('1.000000'), DIVIDED_BY_ZERO=None)] + """ + # TODO SNOW-1864578: find a way to encode divnull. + dividend_col = ( + lit(dividend, _emit_ast=False) + if isinstance(dividend, (int, float)) + else _to_col_if_str(dividend, "divnull") + ) + divisor_col = ( + lit(divisor, _emit_ast=False) + if isinstance(divisor, (int, float)) + else _to_col_if_str(divisor, "divnull") + ) + return dividend_col / nullifzero(divisor_col, _emit_ast=False) + + +@publicapi +def nullifzero(e: ColumnOrName, _emit_ast: bool = True) -> Column: + """Returns NULL if the argument evaluates to 0; otherwise, returns the argument. 
+ + Example:: + >>> df = session.create_dataframe([0, 1], schema=["a"]) + >>> df.select(nullifzero(df["a"]).alias("result")).collect() + [Row(RESULT=None), Row(RESULT=1)] + """ + c = _to_col_if_str(e, "nullifzero") + return builtin("nullifzero", _emit_ast=_emit_ast)(c) @publicapi @@ -2540,7 +2603,9 @@ def lpad( """ c = _to_col_if_str(e, "lpad") p = _to_col_if_str(pad, "lpad") - return builtin("lpad", _emit_ast=_emit_ast)( + # Build AST here to prevent `len` from being recorded as a literal instead of int. + ast = build_function_expr("lpad", [c, len, p]) if _emit_ast else None + return builtin("lpad", _ast=ast, _emit_ast=False)( c, len if isinstance(len, Column) else lit(len), p ) @@ -2599,7 +2664,9 @@ def rpad( """ c = _to_col_if_str(e, "rpad") p = _to_col_if_str(pad, "rpad") - return builtin("rpad", _emit_ast=_emit_ast)( + # Build AST here to prevent `len` from being recorded as a literal instead of int. + ast = build_function_expr("rpad", [c, len, p]) if _emit_ast else None + return builtin("rpad", _ast=ast, _emit_ast=False)( c, len if isinstance(len, Column) else lit(len), p ) @@ -2651,7 +2718,9 @@ def repeat(s: ColumnOrName, n: Union[Column, int], _emit_ast: bool = True) -> Co """ c = _to_col_if_str(s, "repeat") - return builtin("repeat", _emit_ast=_emit_ast)( + # Build AST here to prevent `n` from being recorded as a literal instead of int. + ast = build_function_expr("repeat", [c, n]) if _emit_ast else None + return builtin("repeat", _ast=ast, _emit_ast=False)( c, n if isinstance(n, Column) else lit(n) ) @@ -2817,7 +2886,7 @@ def flatten_col_list(obj): name = c._expression.name name = name[1:] if name.startswith('"') else name name = name[:-1] if name.endswith('"') else name - new_cols.append(lit(name)) + new_cols.append(lit(name, _emit_ast=False)) # next insert field value c = _to_col_if_str(c, "struct") if isinstance(c, Column) and isinstance(c._expression, Alias): @@ -2846,17 +2915,19 @@ def log( >>> df.select(log(10, df["a"]).cast(IntegerType()).alias("log")).collect() [Row(LOG=0), Row(LOG=1)] """ + # Build AST here to prevent `base` and `x` from being recorded as a literal instead of int/float. + ast = build_function_expr("log", [base, x]) if _emit_ast else None b = ( - lit(base, _emit_ast=_emit_ast) + lit(base, _emit_ast=False) if isinstance(base, (int, float)) else _to_col_if_str(base, "log") ) arg = ( - lit(x, _emit_ast=_emit_ast) + lit(x, _emit_ast=False) if isinstance(x, (int, float)) else _to_col_if_str(x, "log") ) - return builtin("log", _emit_ast=_emit_ast)(b, arg) + return builtin("log", _ast=ast, _emit_ast=False)(b, arg) # Create base 2 and base 10 wrappers for use with the Modin log2 and log10 functions @@ -2887,17 +2958,19 @@ def pow( ------------ """ + # Build AST here to prevent `left` and `right` from being recorded as a literal instead of int/float. + ast = build_function_expr("pow", [left, right]) if _emit_ast else None number = ( - lit(left, _emit_ast=_emit_ast) + lit(left, _emit_ast=False) if isinstance(left, (int, float)) else _to_col_if_str(left, "pow") ) power = ( - lit(right, _emit_ast=_emit_ast) + lit(right, _emit_ast=False) if isinstance(right, (int, float)) else _to_col_if_str(right, "pow") ) - return builtin("pow", _emit_ast=_emit_ast)(number, power) + return builtin("pow", _ast=ast, _emit_ast=False)(number, power) @publicapi @@ -2920,6 +2993,8 @@ def round( """ c = _to_col_if_str(e, "round") + # Build AST here to prevent `scale` from being recorded as a literal instead of int/float. 
+ ast = build_function_expr("round", [c, scale]) if _emit_ast else None scale_col = ( lit(scale, _emit_ast=False) if isinstance(scale, (int, float)) @@ -2931,7 +3006,7 @@ def round( ast = proto.Expr() build_builtin_fn_apply(ast, "round", e, scale) - col = builtin("round", _emit_ast=False)(c, scale_col) + col = builtin("round", _ast=ast, _emit_ast=False)(c, scale_col) col._ast = ast return col @@ -3047,11 +3122,17 @@ def substring( [Row(SUBSTRING("S", 2)='bc'), Row(SUBSTRING("S", 2)='ef')] """ s = _to_col_if_str(str, "substring") - p = pos if isinstance(pos, Column) else lit(pos, _emit_ast=_emit_ast) + # Build AST here to prevent `pos` and `len` from being recorded as a literal instead of int/None. + ast = ( + build_function_expr("substring", [s, pos, len], ignore_null_args=True) + if _emit_ast + else None + ) + p = pos if isinstance(pos, Column) else lit(pos, _emit_ast=False) if len is None: - return builtin("substring", _emit_ast=_emit_ast)(s, p) - length = len if isinstance(len, Column) else lit(len, _emit_ast=_emit_ast) - return builtin("substring", _emit_ast=_emit_ast)(s, p, length) + return builtin("substring", _emit_ast=False)(s, p) + length = len if isinstance(len, Column) else lit(len, _emit_ast=False) + return builtin("substring", _ast=ast, _emit_ast=False)(s, p, length) @publicapi @@ -3142,11 +3223,19 @@ def regexp_count( """ sql_func_name = "regexp_count" sub = _to_col_if_str(subject, sql_func_name) - pat = pattern if isinstance(pattern, Column) else lit(pattern) - pos = position if isinstance(position, Column) else lit(position) - params = [lit(p) for p in parameters] - return builtin(sql_func_name, _emit_ast=_emit_ast)(sub, pat, pos, *params) + # Build AST here to prevent `pattern` and `position` from being recorded as a literal instead of int/str. + ast = ( + build_function_expr(sql_func_name, [sub, pattern, position] + list(parameters)) + if _emit_ast + else None + ) + + pat = pattern if isinstance(pattern, Column) else lit(pattern, _emit_ast=False) + pos = position if isinstance(position, Column) else lit(position, _emit_ast=False) + + params = [lit(p, _emit_ast=False) for p in parameters] + return builtin(sql_func_name, _ast=ast, _emit_ast=False)(sub, pat, pos, *params) @publicapi @@ -3231,13 +3320,36 @@ def regexp_replace( """ sql_func_name = "regexp_replace" sub = _to_col_if_str(subject, sql_func_name) - pat = pattern if isinstance(pattern, Column) else lit(pattern) - rep = replacement if isinstance(replacement, Column) else lit(replacement) - pos = position if isinstance(position, Column) else lit(position) - occ = occurrences if isinstance(occurrences, Column) else lit(occurrences) + # Build AST here to prevent `pattern`, `replacement`, `position`, `occurrences` from being recorded as a literal + # instead of int/str. 
+ ast = ( + build_function_expr( + sql_func_name, + [sub, pattern, replacement, position, occurrences] + list(parameters), + ) + if _emit_ast + else None + ) - params = [p if isinstance(p, Column) else lit(p) for p in parameters] - return builtin(sql_func_name, _emit_ast=_emit_ast)(sub, pat, rep, pos, occ, *params) + pat = pattern if isinstance(pattern, Column) else lit(pattern, _emit_ast=False) + rep = ( + replacement + if isinstance(replacement, Column) + else lit(replacement, _emit_ast=False) + ) + pos = position if isinstance(position, Column) else lit(position, _emit_ast=False) + occ = ( + occurrences + if isinstance(occurrences, Column) + else lit(occurrences, _emit_ast=False) + ) + + params = [ + p if isinstance(p, Column) else lit(p, _emit_ast=False) for p in parameters + ] + return builtin(sql_func_name, _ast=ast, _emit_ast=False)( + sub, pat, rep, pos, occ, *params + ) @publicapi @@ -3265,9 +3377,15 @@ def replace( """ sql_func_name = "replace" sub = _to_col_if_str(subject, sql_func_name) + # Build AST here to prevent `pattern` and `replacement` from being recorded as a literal instead of str. + ast = ( + build_function_expr(sql_func_name, [sub, pattern, replacement]) + if _emit_ast + else None + ) pat = lit(pattern, _emit_ast=_emit_ast) rep = lit(replacement, _emit_ast=_emit_ast) - return builtin(sql_func_name, _emit_ast=_emit_ast)(sub, pat, rep) + return builtin(sql_func_name, _ast=ast, _emit_ast=False)(sub, pat, rep) @publicapi @@ -3305,12 +3423,22 @@ def charindex( """ t = _to_col_if_str(target_expr, "charindex") s = _to_col_if_str(source_expr, "charindex") + # Build AST here to prevent `position` from being recorded as a literal instead of int/None. + ast = ( + build_function_expr("char_index", [t, s, position], ignore_null_args=True) + if _emit_ast + else None + ) return ( - builtin("charindex", _emit_ast=_emit_ast)( - t, s, position if isinstance(position, Column) else lit(position) + builtin("charindex", _ast=ast, _emit_ast=False)( + t, + s, + position + if isinstance(position, Column) + else lit(position, _emit_ast=False), ) if position is not None - else builtin("charindex", _emit_ast=_emit_ast)(t, s) + else builtin("charindex", _emit_ast=False)(t, s) ) @@ -3554,10 +3682,12 @@ def insert( """ b = _to_col_if_str(base_expr, "insert") i = _to_col_if_str(insert_expr, "insert") - return builtin("insert", _emit_ast=_emit_ast)( + # Build AST here to prevent `position` and `length` from being recorded as a literal instead of int. + ast = build_function_expr("insert", [b, position, length, i]) if _emit_ast else None + return builtin("insert", _ast=ast, _emit_ast=False)( b, - position if isinstance(position, Column) else lit(position), - length if isinstance(length, Column) else lit(length), + position if isinstance(position, Column) else lit(position, _emit_ast=False), + length if isinstance(length, Column) else lit(length, _emit_ast=False), i, ) @@ -3581,8 +3711,10 @@ def left( """ s = _to_col_if_str(str_expr, "left") - return builtin("left", _emit_ast=_emit_ast)( - s, length if isinstance(length, Column) else lit(length) + # Build AST here to prevent `length` from being recorded as a literal instead of int. 
+ ast = build_function_expr("left", [s, length]) if _emit_ast else None + return builtin("left", _ast=ast, _emit_ast=False)( + s, length if isinstance(length, Column) else lit(length, _emit_ast=False) ) @@ -3605,8 +3737,10 @@ def right( """ s = _to_col_if_str(str_expr, "right") - return builtin("right", _emit_ast=_emit_ast)( - s, length if isinstance(length, Column) else lit(length) + # Build AST here to prevent `length` from being recorded as a literal instead of int. + ast = build_function_expr("right", [s, length]) if _emit_ast else None + return builtin("right", _ast=ast, _emit_ast=False)( + s, length if isinstance(length, Column) else lit(length, _emit_ast=False) ) @@ -3655,12 +3789,14 @@ def to_char( """ c = _to_col_if_str(c, "to_char") + # Build AST here to prevent `format` from being recorded as a literal instead of str/None. + ast = build_function_expr("to_char", [c, format]) if _emit_ast else None return ( - builtin("to_char", _emit_ast=_emit_ast)( - c, format if isinstance(format, Column) else lit(format) + builtin("to_char", _ast=ast, _emit_ast=False)( + c, format if isinstance(format, Column) else lit(format, _emit_ast=False) ) if format is not None - else builtin("to_char", _emit_ast=_emit_ast)(c) + else builtin("to_char", _emit_ast=False)(c) ) @@ -4427,7 +4563,7 @@ def array_intersection( def array_except( source_array: ColumnOrName, array_of_elements_to_exclude: ColumnOrName, - allow_duplicates=True, + allow_duplicates: bool = True, _emit_ast: bool = True, ) -> Column: """Returns a new ARRAY that contains the elements from one input ARRAY that are not in another input ARRAY. @@ -4547,7 +4683,11 @@ def array_except( if _emit_ast: ast = proto.Expr() build_builtin_fn_apply( - ast, "array_except", source_array, array_of_elements_to_exclude + ast, + "array_except", + source_array, + array_of_elements_to_exclude, + allow_duplicates, ) array1 = _to_col_if_str(source_array, "array_except") @@ -4645,6 +4785,32 @@ def array_flatten(array: ColumnOrName, _emit_ast: bool = True) -> Column: return builtin("array_flatten", _emit_ast=_emit_ast)(array) +@publicapi +def array_reverse(col: ColumnOrName, _emit_ast: bool = True) -> Column: + """Returns an array with the elements of the input array in reverse order. + + Args: + col: The source array. + + Example:: + >>> df = session.sql("select [1, 2, 3, 4] :: ARRAY(INT) as A") + >>> df.select(array_reverse("A")).show() + -------------------------- + |"ARRAY_REVERSE(""A"")" | + -------------------------- + |[ | + | 4, | + | 3, | + | 2, | + | 1 | + |] | + -------------------------- + + """ + array = _to_col_if_str(col, "array_reverse") + return builtin("array_reverse", _emit_ast=_emit_ast)(array) + + @publicapi def array_sort( array: ColumnOrName, @@ -4733,10 +4899,16 @@ def array_sort( - :func:`~snowflake.snowpark.functions.sort_array` which is an alias of :meth:`~snowflake.snowpark.functions.array_sort`. """ array = _to_col_if_str(array, "array_sort") - return builtin("array_sort", _emit_ast=_emit_ast)( + # Build AST here to prevent `sort_ascending` and `nulls_first` from being recorded as a literal instead of bool. 
+ ast = ( + build_function_expr("array_sort", [array, sort_ascending, nulls_first]) + if _emit_ast + else None + ) + return builtin("array_sort", _ast=ast, _emit_ast=False)( array, - lit(sort_ascending, _emit_ast=_emit_ast), - lit(nulls_first, _emit_ast=_emit_ast), + lit(sort_ascending, _emit_ast=False), + lit(nulls_first, _emit_ast=False), ) @@ -5005,7 +5177,7 @@ def date_add( # Convert the input to a column if it is a string col = _to_col_if_str(col, "date_add") num_of_days = ( - lit(num_of_days) + lit(num_of_days, _emit_ast=False) if isinstance(num_of_days, int) else _to_col_if_str(num_of_days, "date_add") ) @@ -5048,12 +5220,12 @@ def date_sub( # Convert the input parameters to the appropriate type col = _to_col_if_str(col, "date_sub") num_of_days = ( - lit(num_of_days) + lit(num_of_days, _emit_ast=False) if isinstance(num_of_days, int) else _to_col_if_str(num_of_days, "date_sub") ) # Return the date column with the number of days subtracted - ans = dateadd("day", -1 * num_of_days, col) + ans = dateadd("day", -1 * num_of_days, col, _emit_ast=False) ans._ast = ast return ans @@ -5389,11 +5561,13 @@ def dayofyear(e: ColumnOrName, _emit_ast: bool = True) -> Column: return builtin("dayofyear", _emit_ast=_emit_ast)(c) +@publicapi def window( time_column: ColumnOrName, window_duration: str, slide_duration: Optional[str] = None, start_time: Optional[str] = None, + _emit_ast: bool = True, ) -> Column: """ Converts a time column into a window object with start and end times. Window start times are @@ -5482,13 +5656,13 @@ def window( "snowflake.snowpark.functions.window does not support slide_duration parameter yet." ) - epoch = lit("1970-01-01 00:00:00").cast( + epoch = lit("1970-01-01 00:00:00", _emit_ast=False).cast( TimestampType(timezone=TimestampTimeZone.NTZ) ) time = _to_col_if_str(time_column, "window") window_duration, window_unit = parse_duration_string(window_duration) - window_duration = lit(window_duration) + window_duration = lit(window_duration, _emit_ast=False) window_unit = f"{window_unit}s" base = epoch @@ -5496,14 +5670,14 @@ def window( start_duration, start_unit = parse_duration_string(start_time) base += make_interval(**{f"{start_unit}s": start_duration}) - window = floor(datediff(window_unit, base, time) / window_duration) - window_start = dateadd(window_unit, window * window_duration, base) + window = floor(datediff(window_unit, base, time, _emit_ast=False) / window_duration) + window_start = dateadd(window_unit, window * window_duration, base, _emit_ast=False) return object_construct_keep_null( - lit("start"), + lit("start", _emit_ast=False), window_start, - lit("end"), - dateadd(window_unit, window_duration, window_start), - ).alias("window") + lit("end", _emit_ast=False), + dateadd(window_unit, window_duration, window_start, _emit_ast=False), + ).alias("window", _emit_ast=False) @publicapi @@ -5803,7 +5977,7 @@ def _timestamp_from_parts_internal( return y, m, d, h, min_, s, ns elif tz is not None: # We need to fill in nanoseconds as 0 to make the sql function work - return y, m, d, h, min_, s, lit(0), tz + return y, m, d, h, min_, s, lit(0, _emit_ast=False), tz else: return y, m, d, h, min_, s else: @@ -6089,7 +6263,9 @@ def timestamp_tz_from_parts( elif nanoseconds is not None: ans = builtin(func_name, _emit_ast=False)(y, m, d, h, min_, s, ns) elif timezone is not None: - ans = builtin(func_name, _emit_ast=False)(y, m, d, h, min_, s, lit(0), tz) + ans = builtin(func_name, _emit_ast=False)( + y, m, d, h, min_, s, lit(0, _emit_ast=False), tz + ) else: ans = 
builtin(func_name, _emit_ast=False)(y, m, d, h, min_, s) @@ -6407,7 +6583,11 @@ def array_remove( build_builtin_fn_apply(ast, "array_remove", array, element) a = _to_col_if_str(array, "array_remove") - e = lit(element).cast("VARIANT") if isinstance(element, str) else element + e = ( + lit(element, _emit_ast=False).cast("VARIANT", _emit_ast=False) + if isinstance(element, str) + else element + ) ans = builtin("array_remove", _emit_ast=False)(a, e) ans._ast = ast return ans @@ -6779,6 +6959,88 @@ def array_unique_agg(col: ColumnOrName, _emit_ast: bool = True) -> Column: return _call_function("array_unique_agg", True, c, _emit_ast=_emit_ast) +@publicapi +def map_cat(col1: ColumnOrName, col2: ColumnOrName, _emit_ast: bool = True): + """Returns the concatenatation of two MAPs. + + Args: + col1: The source map + col2: The map to be appended to col1 + + Example:: + >>> df = session.sql("select {'k1': 'v1'} :: MAP(STRING,STRING) as A, {'k2': 'v2'} :: MAP(STRING,STRING) as B") + >>> df.select(map_cat("A", "B")).show() + --------------------------- + |"MAP_CAT(""A"", ""B"")" | + --------------------------- + |{ | + | "k1": "v1", | + | "k2": "v2" | + |} | + --------------------------- + + """ + m1 = _to_col_if_str(col1, "map_cat") + m2 = _to_col_if_str(col2, "map_cat") + return builtin("map_cat", _emit_ast=_emit_ast)(m1, m2) + + +@publicapi +def map_contains_key(value: ColumnOrLiteral, col: ColumnOrName, _emit_ast: bool = True): + """Determines whether the specified MAP contains the specified key. + + Args: + value: The key to find. + col: The map to be searched. + + Example 1:: + >>> df = session.sql("select {'k1': 'v1'} :: MAP(STRING,STRING) as M, 'k1' as V") + >>> df.select(map_contains_key(col("V"), "M")).show() + ------------------------------------ + |"MAP_CONTAINS_KEY(""V"", ""M"")" | + ------------------------------------ + |True | + ------------------------------------ + + + Example 2:: + >>> df = session.sql("select {'k1': 'v1'} :: MAP(STRING,STRING) as M") + >>> df.select(map_contains_key("k1", "M")).show() + ----------------------------------- + |"MAP_CONTAINS_KEY('K1', ""M"")" | + ----------------------------------- + |True | + ----------------------------------- + + """ + m = _to_col_if_str(col, "map_contains") + return builtin("map_contains_key", _emit_ast=_emit_ast)(value, m) + + +@publicapi +def map_keys(col: ColumnOrName, _emit_ast: bool = True): + """Returns the keys in a MAP. + + Args: + col: The input map. + + Example 1:: + >>> df = session.sql("select {'k1': 'v1', 'k2': 'v2'} :: MAP(STRING,STRING) as M") + >>> df.select(map_keys("M")).show() + --------------------- + |"MAP_KEYS(""M"")" | + --------------------- + |[ | + | "k1", | + | "k2" | + |] | + --------------------- + + """ + m = _to_col_if_str(col, "map_keys") + return builtin("map_keys", _emit_ast=_emit_ast)(m) + + @publicapi def size(col: ColumnOrName, _emit_ast: bool = True) -> Column: """Returns the size of the input ARRAY, OBJECT or MAP. 
Returns NULL if the @@ -7154,7 +7416,7 @@ def asc(c: ColumnOrName, _emit_ast: bool = True) -> Column: build_builtin_fn_apply(ast, "asc", c) c = _to_col_if_str(c, "asc") - ans = c.asc() + ans = c.asc(_emit_ast=False) ans._ast = ast return ans @@ -7177,7 +7439,7 @@ def asc_nulls_first(c: ColumnOrName, _emit_ast: bool = True) -> Column: build_builtin_fn_apply(ast, "asc_nulls_first", c) c = _to_col_if_str(c, "asc_nulls_first") - ans = c.asc_nulls_first() + ans = c.asc_nulls_first(_emit_ast=False) ans._ast = ast return ans @@ -7223,7 +7485,7 @@ def desc(c: ColumnOrName, _emit_ast: bool = True) -> Column: build_builtin_fn_apply(ast, "desc", c) c = _to_col_if_str(c, "desc") - ans = c.desc() + ans = c.desc(_emit_ast=False) ans._ast = ast return ans @@ -7247,7 +7509,7 @@ def desc_nulls_first(c: ColumnOrName, _emit_ast: bool = True) -> Column: build_builtin_fn_apply(ast, "desc_nulls_first", c) c = _to_col_if_str(c, "desc_nulls_first") - ans = c.desc_nulls_first() + ans = c.desc_nulls_first(_emit_ast=False) ans._ast = ast return ans @@ -7269,7 +7531,7 @@ def desc_nulls_last(c: ColumnOrName, _emit_ast: bool = True) -> Column: ast = proto.Expr() build_builtin_fn_apply(ast, "desc_nulls_last", c) c = _to_col_if_str(c, "desc_nulls_last") - ans = c.desc_nulls_last() + ans = c.desc_nulls_last(_emit_ast=False) ans._ast = ast return ans @@ -7394,7 +7656,7 @@ def cast( build_builtin_fn_apply(ast, "cast", column, to) c = _to_col_if_str(column, "cast") - ans = c.cast(to) + ans = c.cast(to, _emit_ast=False) ans._ast = ast return ans @@ -7430,7 +7692,7 @@ def try_cast( build_builtin_fn_apply(ast, "try_cast", column, to) c = _to_col_if_str(column, "try_cast") - ans = c.try_cast(to) + ans = c.try_cast(to, _emit_ast=False) ans._ast = ast return ans @@ -7446,9 +7708,11 @@ def _as_decimal_or_number( if scale and not precision: raise ValueError("Cannot define scale without precision") if precision and scale: - return builtin(cast_type, _emit_ast=False)(c, lit(precision), lit(scale)) + return builtin(cast_type, _emit_ast=False)( + c, lit(precision, _emit_ast=False), lit(scale, _emit_ast=False) + ) elif precision: - return builtin(cast_type, _emit_ast=False)(c, lit(precision)) + return builtin(cast_type, _emit_ast=False)(c, lit(precision, _emit_ast=False)) else: return builtin(cast_type, _emit_ast=False)(c) @@ -9647,7 +9911,10 @@ def table_function(function_name: str, _emit_ast: bool = True) -> Callable: @publicapi def call_function( - function_name: str, *args: ColumnOrLiteral, _emit_ast: bool = True + function_name: str, + *args: ColumnOrLiteral, + _ast: proto.Expr = None, + _emit_ast: bool = True, ) -> Column: """Invokes a Snowflake `system-defined function `_ (built-in function) with the specified name and arguments. @@ -9670,11 +9937,13 @@ def call_function( """ - return _call_function(function_name, False, *args, _emit_ast=_emit_ast) + return _call_function(function_name, False, *args, _ast=_ast, _emit_ast=_emit_ast) @publicapi -def function(function_name: str, _emit_ast: bool = True) -> Callable: +def function( + function_name: str, _ast: proto.Expr = None, _emit_ast: bool = True +) -> Callable: """ Function object to invoke a Snowflake `system-defined function `_ (built-in function). Use this to invoke any built-in functions not explicitly listed in this object. 
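As a usage note for the MAP helpers added earlier in this file (`map_cat`, `map_contains_key`, `map_keys`), here is a short sketch assuming an active Snowpark `session`; the data mirrors the docstring examples above:

```python
from snowflake.snowpark.functions import map_cat, map_contains_key, map_keys

# Two single-entry MAP columns, as in the docstring examples.
df = session.sql(
    "select {'k1': 'v1'} :: MAP(STRING,STRING) as A, "
    "{'k2': 'v2'} :: MAP(STRING,STRING) as B"
)

merged = df.select(map_cat("A", "B").alias("MERGED"))     # {"k1": "v1", "k2": "v2"}
merged.select(map_keys("MERGED")).show()                  # ["k1", "k2"]
merged.select(map_contains_key("k1", "MERGED")).show()    # True
```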
@@ -9703,7 +9972,9 @@ def function(function_name: str, _emit_ast: bool = True) -> Callable:
     ----------------

    """
-    return lambda *args: call_function(function_name, *args, _emit_ast=_emit_ast)
+    return lambda *args: call_function(
+        function_name, *args, _ast=_ast, _emit_ast=_emit_ast
+    )


 def _call_function(
@@ -9979,6 +10250,9 @@ def sproc(
 sort_array = array_sort
 map_from_arrays = arrays_to_object
 signum = sign
+array_join = array_to_string
+array_union = array_cat
+map_concat = map_cat


 @publicapi
@@ -10159,3 +10433,25 @@ def snowflake_cortex_summarize(text: ColumnOrLiteralStr):
     sql_func_name = "snowflake.cortex.summarize"
     text_col = _to_col_if_lit(text, sql_func_name)
     return builtin(sql_func_name)(text_col)
+
+
+def snowflake_cortex_sentiment(text: ColumnOrLiteralStr):
+    """
+    Calculates a sentiment score for the given English-language text.
+
+    Args:
+        text: A string containing the English text for which a sentiment score should be calculated.
+    Returns:
+        A floating-point number from -1 to 1 (inclusive) indicating the level of negative or positive sentiment in the
+        text. Values around 0 indicate neutral sentiment.
+
+    Example::
+
+        >>> content = "A very very bad review!"
+        >>> df = session.create_dataframe([[content]], schema=["content"])
+        >>> result = df.select(snowflake_cortex_sentiment(content)).collect()[0][0]
+        >>> assert -1 <= result <= 0
+    """
+    sql_func_name = "snowflake.cortex.sentiment"
+    text_col = _to_col_if_lit(text, sql_func_name)
+    return builtin(sql_func_name)(text_col)
diff --git a/src/snowflake/snowpark/modin/plugin/_internal/aggregation_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/aggregation_utils.py
index 693a0f318ef..72978fc797c 100644
--- a/src/snowflake/snowpark/modin/plugin/_internal/aggregation_utils.py
+++ b/src/snowflake/snowpark/modin/plugin/_internal/aggregation_utils.py
@@ -101,6 +101,9 @@ def _array_agg_keepna(
     # parse_json(lit("null")) per [2].
     # [1] https://stackoverflow.com/a/77422662
     # [2] https://github.com/snowflakedb/snowflake-connector-python/issues/1388#issuecomment-1371091831
+
+    # HOWEVER it appears that this workaround only works for integer values.
+    # See details in SNOW-1859090.
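For context on the comment added above: the workaround it refers to keeps SQL NULLs through `ARRAY_AGG` by wrapping every value in a one-element array and flattening the aggregated result afterwards (plain `ARRAY_AGG` drops NULLs). A standalone Snowpark sketch of that shape, assuming an active `session`; the column name is illustrative:

```python
from snowflake.snowpark.functions import array_agg, array_construct, array_flatten, col

df = session.create_dataframe([[1], [None], [3]], schema=["a"])

# Wrapping each value in a single-element array preserves the NULL row,
# and ARRAY_FLATTEN unwraps the nesting after aggregation.
kept = df.select(array_flatten(array_agg(array_construct(col("a")))).alias("VALS"))
kept.show()  # [1, null, 3] -- subject to the integer-only caveat noted above
```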
return array_flatten( array_agg( array_construct( diff --git a/src/snowflake/snowpark/modin/plugin/_internal/apply_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/apply_utils.py index 187f9d26c59..bbb8c9ad020 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/apply_utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/apply_utils.py @@ -26,31 +26,7 @@ pandas_lit, is_compatible_snowpark_types, ) -from snowflake.snowpark.functions import ( - builtin, - col, - dense_rank, - ln, - log, - _log2, - _log10, - sin, - snowflake_cortex_summarize, - udf, - to_variant, - when, - udtf, - exp, - cos, - tan, - sinh, - cosh, - tanh, - ceil, - floor, - trunc, - sqrt, -) +from snowflake.snowpark import functions as sp_func from snowflake.snowpark.modin.plugin._internal.frame import InternalFrame from snowflake.snowpark.modin.plugin._internal.ordered_dataframe import ( OrderedDataFrame, @@ -95,22 +71,23 @@ cloudpickle.register_pickle_by_value(sys.modules[__name__]) SUPPORTED_SNOWPARK_PYTHON_FUNCTIONS_IN_APPLY = { - exp, - ln, - log, - _log2, - _log10, - sin, - cos, - tan, - sinh, - cosh, - tanh, - ceil, - floor, - trunc, - sqrt, - snowflake_cortex_summarize, + sp_func.exp, + sp_func.ln, + sp_func.log, + sp_func._log2, + sp_func._log10, + sp_func.sin, + sp_func.cos, + sp_func.tan, + sp_func.sinh, + sp_func.cosh, + sp_func.tanh, + sp_func.ceil, + sp_func.floor, + sp_func.trunc, + sp_func.sqrt, + sp_func.snowflake_cortex_summarize, + sp_func.snowflake_cortex_sentiment, } @@ -285,7 +262,7 @@ def end_partition(self, df): # type: ignore[no-untyped-def] # pragma: no cover ApplyFunc.end_partition._sf_vectorized_input = native_pd.DataFrame # type: ignore[attr-defined] packages = list(session.get_packages().values()) + udf_packages - func_udtf = udtf( + func_udtf = sp_func.udtf( ApplyFunc, output_schema=PandasDataFrameType( [LongType(), StringType(), VariantType()], @@ -707,7 +684,7 @@ def end_partition(self, df: native_pd.DataFrame): # type: ignore[no-untyped-def excluded=existing_identifiers, wrap_double_underscore=False, ) - return udtf( + return sp_func.udtf( ApplyFunc, output_schema=PandasDataFrameType( [StringType(), IntegerType(), VariantType(), IntegerType(), IntegerType()], @@ -781,7 +758,7 @@ def apply_func(x): # type: ignore[no-untyped-def] # pragma: no cover def apply_func(x): # type: ignore[no-untyped-def] # pragma: no cover return x.apply(func, args=args, **kwargs) - func_udf = udf( + func_udf = sp_func.udf( apply_func, return_type=PandasSeriesType(return_type), input_types=[PandasSeriesType(input_type)], @@ -1185,12 +1162,12 @@ def groupby_apply_pivot_result_to_final_ordered_dataframe( # in GROUP_KEY_APPEARANCE_ORDER) and assign the # label i to all rows that came from func(group_i). 
[ - col(original_row_position_snowflake_quoted_identifier).as_( + sp_func.col(original_row_position_snowflake_quoted_identifier).as_( new_index_identifier ) if sort_method is GroupbyApplySortMethod.ORIGINAL_ROW_ORDER else ( - dense_rank().over( + sp_func.dense_rank().over( Window.order_by( *( SnowparkColumn(col).asc_nulls_last() @@ -1211,9 +1188,11 @@ def groupby_apply_pivot_result_to_final_ordered_dataframe( ), *[ ( - col(old_quoted_identifier).as_(quoted_identifier) + sp_func.col(old_quoted_identifier).as_(quoted_identifier) if return_variant - else col(old_quoted_identifier).cast(return_type).as_(quoted_identifier) + else sp_func.col(old_quoted_identifier) + .cast(return_type) + .as_(quoted_identifier) ) for old_quoted_identifier, quoted_identifier in zip( data_column_snowflake_quoted_identifiers @@ -1398,7 +1377,7 @@ def groupby_apply_sort_method( # Need to wrap column name in IDENTIFIER, or else bool agg function # will treat the name as a string literal is_transform: bool = not ordered_dataframe_before_sort.agg( - builtin("boolor_agg")( + sp_func.builtin("boolor_agg")( SnowparkColumn(original_row_position_quoted_identifier) == -1 ).as_("is_transform") ).collect()[0][0] @@ -1473,7 +1452,7 @@ def make_condition(key: Any) -> SnowparkColumn: # Cast one of the values in the comparison to variant so that we # we can compare types that are otherwise not comparable in # Snowflake, like timestamp and int. - return col.equal_null(to_variant(pandas_lit(key))) + return col.equal_null(sp_func.to_variant(pandas_lit(key))) # If any of the values we are mapping to have types that are # incompatible with the current column's type, we have to cast the new @@ -1496,7 +1475,7 @@ def make_condition(key: Any) -> SnowparkColumn: def make_result(value: Any) -> SnowparkColumn: value_expression = pandas_lit(value) return ( - to_variant(value_expression) + sp_func.to_variant(value_expression) if should_cast_result_to_variant else value_expression ) @@ -1508,7 +1487,7 @@ def make_result(value: Any) -> SnowparkColumn: make_condition(key_and_value[0]), make_result(key_and_value[1]) ), itertools.islice(map_items, 1, None), - when(make_condition(first_key), make_result(first_value)), + sp_func.when(make_condition(first_key), make_result(first_value)), ) if isinstance(mapping, defaultdict): case_expression = case_expression.otherwise( diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 38ecd99b80a..c9171d9369c 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -372,6 +372,9 @@ from snowflake.snowpark.modin.plugin.utils.error_message import ErrorMessage from snowflake.snowpark.modin.plugin.utils.warning_message import WarningMessage from snowflake.snowpark.modin.utils import MODIN_UNNAMED_SERIES_LABEL +from snowflake.snowpark.modin.plugin.utils.numpy_to_pandas import ( + NUMPY_UNIVERSAL_FUNCTION_TO_SNOWFLAKE_FUNCTION, +) from snowflake.snowpark.session import Session from snowflake.snowpark.types import ( ArrayType, @@ -8432,6 +8435,30 @@ def apply( ) return self._apply_snowpark_python_function_to_columns(func, kwargs) + # TODO SNOW-1739034: remove 'no cover' when apply tests are enabled in CI + sf_func = NUMPY_UNIVERSAL_FUNCTION_TO_SNOWFLAKE_FUNCTION.get( + func + ) # pragma: no cover + if sf_func is not None: # pragma: no cover + return 
self._apply_snowpark_python_function_to_columns(sf_func, kwargs) + + if get_snowflake_agg_func(func, {}, axis) is not None: # pragma: no cover + # np.std and np.var 'ddof' parameter defaults to 0 but + # df.std and df.var 'ddof' parameter defaults to 1. + # Set it here explicitly to 0 if not provided. + if func in (np.std, np.var) and "ddof" not in kwargs: + kwargs["ddof"] = 0 + # np.median return NaN if any value is NaN while df.median skips NaN values. + # Set 'skipna' to false to match behavior. + if func == np.median: + kwargs["skipna"] = False + qc = self.agg(func, axis, None, kwargs) + if axis == 1: + # agg method populates series name with aggregation function name but + # in apply we need unnamed series. + qc = qc.set_columns([MODIN_UNNAMED_SERIES_LABEL]) + return qc + if axis == 0: frame = self._modin_frame @@ -8755,6 +8782,18 @@ def applymap( f"Snowpark pandas applymap API doesn't yet support Snowpark Python function `{func.__name__}` with args = '{args}'." ) return self._apply_snowpark_python_function_to_columns(func, kwargs) + + # TODO SNOW-1739034: remove pragma no cover when apply tests are enabled in CI + # Check if the function is a known numpy function that can be translated to + # Snowflake function. + sf_func = NUMPY_UNIVERSAL_FUNCTION_TO_SNOWFLAKE_FUNCTION.get(func) + if sf_func is not None: # pragma: no cover + return self._apply_snowpark_python_function_to_columns(sf_func, kwargs) + + if func in (np.sum, np.min, np.max): # pragma: no cover + # Aggregate functions applied element-wise to columns are no-op. + return self + # Currently, NULL values are always passed into the udtf even if strict=True, # which is a bug on the server side SNOW-880105. # The fix will not land soon, so we are going to raise not implemented error for now. @@ -18123,7 +18162,7 @@ def dt_total_seconds(self, include_index: bool = False) -> "SnowflakeQueryCompil ) ) - def dt_strftime(self, date_format: str) -> None: + def dt_strftime(self, date_format: str) -> "SnowflakeQueryCompiler": """ Format underlying date-time data using specified format. @@ -18133,8 +18172,102 @@ def dt_strftime(self, date_format: str) -> None: Returns: New QueryCompiler containing formatted date-time values. 
""" - ErrorMessage.not_implemented( - "Snowpark pandas doesn't yet support the method 'Series.dt.strftime'" + + def strftime_func(column: SnowparkColumn) -> SnowparkColumn: + directive_to_function_map: dict[str, Callable] = { + "d": ( + # Day of the month as a zero-padded decimal number + lambda column: lpad( + dayofmonth(column), pandas_lit(2), pandas_lit("0") + ) + ), + "m": ( + # Month as a zero-padded decimal number + lambda column: lpad(month(column), pandas_lit(2), pandas_lit("0")) + ), + "Y": ( + # Year with century as a decimal number + lambda column: lpad(year(column), pandas_lit(4), pandas_lit("0")) + ), + "H": ( + # Hour (24-hour clock) as a zero-padded decimal number + lambda column: lpad(hour(column), pandas_lit(2), pandas_lit("0")) + ), + "M": ( + # Minute as a zero-padded decimal number + lambda column: lpad(minute(column), pandas_lit(2), pandas_lit("0")) + ), + "S": ( + # Second as a zero-padded decimal number + lambda column: lpad(second(column), pandas_lit(2), pandas_lit("0")) + ), + "f": ( + # Microsecond as a decimal number, zero-padded to 6 digits + lambda column: lpad( + floor(date_part("ns", column) / 1000), + pandas_lit(6), + pandas_lit("0"), + ) + ), + "j": ( + # Day of the year as a zero-padded decimal number + lambda column: lpad( + dayofyear(column), pandas_lit(3), pandas_lit("0") + ) + ), + "X": ( + # Locale’s appropriate time representation + lambda column: trunc(to_time(column), pandas_lit("second")) + ), + "%": ( + # A literal '%' character + lambda column: pandas_lit("%") + ), + } + + parts = re.split("%.", date_format) + directive_first = False + if parts[0] == "": + parts = parts[1:] + directive_first = True + if parts[-1] == "": + parts = parts[:-1] + directives = re.findall("%.", date_format) + cols = [] + for i in range(min(len(parts), len(directives))): + directive_function = directive_to_function_map.get(directives[i][1:]) + if not directive_function: + raise ErrorMessage.not_implemented( + f"Snowpark pandas 'Series.dt.strftime' method does not yet support the directive '%{directives[i][1:]}'" + ) + + if directive_first: + cols.append(directive_function(column)) + cols.append(pandas_lit(parts[i])) + else: + cols.append(pandas_lit(parts[i])) + cols.append(directive_function(column)) + + if len(parts) > len(directives): + cols.append(pandas_lit(parts[-1])) + if len(parts) < len(directives): + directive_function = directive_to_function_map.get(directives[-1][1:]) + if not directive_function: + raise ErrorMessage.not_implemented( + f"Snowpark pandas 'Series.dt.strftime' method does not yet support the directive '%{directives[-1][1:]}'" + ) + cols.append(directive_function(column)) + + if len(cols) == 1: + return iff(column.is_null(), pandas_lit(None), cols[0]) + else: + return iff(column.is_null(), pandas_lit(None), concat(*cols)) + + return SnowflakeQueryCompiler( + self._modin_frame.apply_snowpark_function_to_columns( + strftime_func, + include_index=False, + ) ) def topn( @@ -19406,3 +19539,146 @@ def timedelta_property( include_index=include_index, ) ) + + def groupby_unique( + self, + by: Any, + axis: int, + groupby_kwargs: dict, + agg_args: Sequence, + agg_kwargs: dict, + numeric_only: bool, + is_series_groupby: bool, + drop: bool = False, + ) -> "SnowflakeQueryCompiler": + """ + Aggregate unique values for each group into a list. + + + Parameters + ---------- + by : Any + Index level name(s) or column label(s) to group by. + axis: int + The axis along which to group data. 
This parameter must be 0, but
+            we keep it to match the interface of Modin's BaseQueryCompiler.
+        groupby_kwargs: dict
+            The keyword arguments to groupby().
+        agg_args: Sequence
+            Positional arguments to the unique() aggregation function. This
+            parameter must be empty because unique() does not take positional
+            arguments, but we keep the parameter to match the interface of
+            Modin's BaseQueryCompiler.
+        agg_kwargs: dict
+            Keyword arguments to the unique() aggregation function. This
+            parameter must be empty because unique() does not take keyword
+            arguments, but we keep the parameter to match the interface of
+            Modin's BaseQueryCompiler.
+        numeric_only: bool
+            This parameter is meaningless as unique() does not take a
+            numeric_only parameter, but we keep the parameter to match the
+            interface of Modin's BaseQueryCompiler.
+        is_series_groupby: bool
+            Whether this method is called via SeriesGroupBy as opposed to
+            DataFrameGroupBy. This parameter should always be true, but we keep
+            it to match the interface of Modin's BaseQueryCompiler.
+        drop: bool, default False
+            Whether the `by` columns are internal to this dataframe.
+
+        Returns
+        -------
+        A new SnowflakeQueryCompiler with the unique values of the singular
+        data column for each group.
+        """
+        assert axis == 0, "Internal error. SeriesGroupBy.unique() axis should be 0."
+        assert len(agg_args) == 0, (
+            "Internal error. SeriesGroupBy.unique() does not take "
+            + "aggregation arguments."
+        )
+        assert len(agg_kwargs) == 0, (
+            "Internal error. SeriesGroupBy.unique() does not take "
+            + "aggregation arguments."
+        )
+        assert (
+            is_series_groupby is True
+        ), "Internal error. Only SeriesGroupBy has a unique() method."
+
+        compiler = SnowflakeQueryCompiler(
+            self._modin_frame.ensure_row_position_column()
+        )
+        by_list = extract_groupby_column_pandas_labels(
+            compiler, by, groupby_kwargs.get("level", None)
+        )
+        by_snowflake_quoted_identifiers_list = []
+        for (
+            entry
+        ) in compiler._modin_frame.get_snowflake_quoted_identifiers_group_by_pandas_labels(
+            by_list
+        ):
+            assert len(entry) == 1, (
+                "Internal error. Each grouping label should correspond to a "
+                + "single Snowpark column."
+            )
+            by_snowflake_quoted_identifiers_list.append(entry[0])
+
+        # There is no built-in Snowflake function to aggregate the unique values
+        # of a column into an array while preserving a certain order. We implement
+        # the aggregation in the following steps:
+        # 1) Project a new column representing the row position of each row
+        #    within each combination of group + data column value.
+        # 2) Filter the result to the rows where the new column is equal to 1,
+        #    i.e. get the row where each data column value appears for the first
+        #    time within each group.
+        # 3) Project away the extra rank column.
+        # 4) Group according to `groupby_kwargs` and for each group, aggregate
+        #    the (singular) remaining data column into a list ordered by the
+        #    original row order.
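The four steps described in the comment above translate directly into Snowpark operations; a simplified, standalone sketch follows (assuming an active `session`; the table, with grouping column `g`, data column `v`, and row-position column `pos`, is illustrative only):

```python
from snowflake.snowpark import Window
from snowflake.snowpark.functions import array_agg, col, rank

df = session.create_dataframe(
    [["a", 10, 0], ["a", 20, 1], ["a", 10, 2], ["b", 30, 3]], schema=["g", "v", "pos"]
)

# 1) Rank rows within each (group, value) pair by original row position.
ranked = df.with_column(
    "r", rank().over(Window.partition_by("g", "v").order_by("pos"))
)
# 2) Keep only the first appearance of each value within its group.
first_seen = ranked.filter(col("r") == 1)
# 3) + 4) Drop the helper column and aggregate the surviving values per group,
#         ordered by the original row position.
uniques = first_seen.group_by("g").agg(
    array_agg("v").within_group("pos").alias("UNIQUE_VALS")
)
uniques.show()  # g=a -> [10, 20], g=b -> [30]
```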
+ frame_with_rank = compiler._modin_frame.append_column( + "_rank_column", + rank().over( + Window.partition_by( + *by_snowflake_quoted_identifiers_list, + *( + identifier + for identifier in ( + compiler._modin_frame.data_column_snowflake_quoted_identifiers + ) + if identifier not in by_snowflake_quoted_identifiers_list + ), + ).order_by( + compiler._modin_frame.row_position_snowflake_quoted_identifier + ) + ), + ) + return ( + SnowflakeQueryCompiler( + frame_with_rank.filter( + col(frame_with_rank.data_column_snowflake_quoted_identifiers[-1]) + == 1 + ) + ) + .take_2d_positional( + index=slice(None), + columns=( + list( + range( + len( + frame_with_rank.data_column_snowflake_quoted_identifiers + ) + - 1 + ) + ) + ), + ) + .groupby_agg( + by=by, + agg_func="array_agg", + axis=axis, + groupby_kwargs=groupby_kwargs, + agg_args=agg_args, + agg_kwargs=agg_kwargs, + numeric_only=numeric_only, + is_series_groupby=is_series_groupby, + drop=drop, + ) + ) diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/__init__.py b/src/snowflake/snowpark/modin/plugin/docstrings/__init__.py index fa2b9fb28d3..780b4d85750 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/__init__.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/__init__.py @@ -6,6 +6,7 @@ from snowflake.snowpark.modin.plugin.docstrings.base import BasePandasDataset from snowflake.snowpark.modin.plugin.docstrings.dataframe import DataFrame +from snowflake.snowpark.modin.plugin.docstrings.datetime_index import DatetimeIndex from snowflake.snowpark.modin.plugin.docstrings.general import * # noqa: F401,F403 from snowflake.snowpark.modin.plugin.docstrings.groupby import ( DataFrameGroupBy, @@ -33,4 +34,5 @@ "SeriesGroupBy", "StringMethods", "Index", + "DatetimeIndex", ] diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py index f9b798020ec..da4eef039b6 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py @@ -1686,12 +1686,151 @@ def floordiv(): @classmethod def from_dict(): """ - Construct ``DataFrame`` from dict of array-like or dicts. + Construct DataFrame from dict of array-like or dicts. + + Creates DataFrame object from dictionary by columns or by index allowing dtype specification. + + Parameters + ---------- + data : dict + Of the form {field : array-like} or {field : dict}. + orient : {‘columns’, ‘index’, ‘tight’}, default ‘columns’ + The “orientation” of the data. If the keys of the passed dict should be the columns of the resulting DataFrame, pass ‘columns’ (default). Otherwise if the keys should be rows, pass ‘index’. If ‘tight’, assume a dict with keys [‘index’, ‘columns’, ‘data’, ‘index_names’, ‘column_names’]. + + Added in version 1.4.0: ‘tight’ as an allowed value for the orient argument + + dtype : dtype, default None + Data type to force after DataFrame construction, otherwise infer. + columns : list, default None + Column labels to use when orient='index'. Raises a ValueError if used with orient='columns' or orient='tight'. + + Returns + ------- + DataFrame + + See also + -------- + DataFrame.from_records + DataFrame from structured ndarray, sequence of tuples or dicts, or DataFrame. + DataFrame + DataFrame object creation using constructor. + DataFrame.to_dict + Convert the DataFrame to a dictionary. 
+ + Examples + -------- + + By default the keys of the dict become the DataFrame columns: + + >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']} + >>> pd.DataFrame.from_dict(data) + col_1 col_2 + 0 3 a + 1 2 b + 2 1 c + 3 0 d + + Specify orient='index' to create the DataFrame using dictionary keys as rows: + + >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']} + >>> pd.DataFrame.from_dict(data, orient='index') + 0 1 2 3 + row_1 3 2 1 0 + row_2 a b c d + + When using the ‘index’ orientation, the column names can be specified manually: + + >>> pd.DataFrame.from_dict(data, orient='index', + ... columns=['A', 'B', 'C', 'D']) + A B C D + row_1 3 2 1 0 + row_2 a b c d + + Specify orient='tight' to create the DataFrame using a ‘tight’ format: + + >>> data = {'index': [('a', 'b'), ('a', 'c')], + ... 'columns': [('x', 1), ('y', 2)], + ... 'data': [[1, 3], [2, 4]], + ... 'index_names': ['n1', 'n2'], + ... 'column_names': ['z1', 'z2']} + >>> pd.DataFrame.from_dict(data, orient='tight') # doctest: +NORMALIZE_WHITESPACE + z1 x y + z2 1 2 + n1 n2 + a b 1 3 + c 2 4 """ def from_records(): """ - Convert structured or record ndarray to ``DataFrame``. + Convert structured or record ndarray to DataFrame. + + Creates a DataFrame object from a structured ndarray, sequence of tuples or dicts, or DataFrame. + + Parameters + ---------- + data : structured ndarray, sequence of tuples or dicts, or DataFrame + Structured input data. + + Deprecated since version 2.1.0: Passing a DataFrame is deprecated. + + index : str, list of fields, array-like + Field of array to use as the index, alternately a specific set of input labels to use. + exclude : sequence, default None + Columns or fields to exclude. + columns : sequence, default None + Column names to use. If the passed data do not have names associated with them, this argument provides names for the columns. Otherwise this argument indicates the order of the columns in the result (any names not found in the data will become all-NA columns). + coerce_float : bool, default False + Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal) to floating point, useful for SQL result sets. + nrows : int, default None + Number of rows to read if data is an iterator. + + Returns + ------- + DataFrame + + See also + -------- + DataFrame.from_dict + DataFrame from dict of array-like or dicts. + DataFrame + DataFrame object creation using constructor. + + Examples + -------- + Data can be provided as a structured ndarray: + + >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')], + ... dtype=[('col_1', 'i4'), ('col_2', 'U1')]) + >>> pd.DataFrame.from_records(data) + col_1 col_2 + 0 3 a + 1 2 b + 2 1 c + 3 0 d + + Data can be provided as a list of dicts: + + >>> data = [{'col_1': 3, 'col_2': 'a'}, + ... {'col_1': 2, 'col_2': 'b'}, + ... {'col_1': 1, 'col_2': 'c'}, + ... {'col_1': 0, 'col_2': 'd'}] + >>> pd.DataFrame.from_records(data) + col_1 col_2 + 0 3 a + 1 2 b + 2 1 c + 3 0 d + + Data can be provided as a list of tuples with corresponding columns: + + >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')] + >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2']) + col_1 col_2 + 0 3 a + 1 2 b + 2 1 c + 3 0 d """ def ge(): @@ -4679,7 +4818,77 @@ def value_counts(): def map(): """ - Apply a function to the `DataFrame` elementwise. + Apply a function to a Dataframe elementwise. + + Added in version 2.1.0: DataFrame.applymap was deprecated and renamed to DataFrame.map. 
+ + This method applies a function that accepts and returns a scalar to every element of a DataFrame. + + Parameters + ---------- + func : callable + Python function, returns a single value from a single value. + na_action : {None, ‘ignore’}, default None + If ‘ignore’, propagate NaN values, without passing them to func. + **kwargs + Additional keyword arguments to pass as keywords arguments to func. + + Returns + ------- + DataFrame + Transformed DataFrame. + + See also + -------- + DataFrame.apply + Apply a function along input axis of DataFrame. + DataFrame.replace + Replace values given in to_replace with value. + Series.map + Apply a function elementwise on a Series. + + Examples + -------- + >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]]) + >>> df + 0 1 + 0 1.000 2.120 + 1 3.356 4.567 + + >>> df.map(lambda x: len(str(x))) + 0 1 + 0 3 4 + 1 5 5 + + Like Series.map, NA values can be ignored: + + >>> df_copy = df.copy() + >>> df_copy.iloc[0, 0] = pd.NA + >>> df_copy.map(lambda x: len(str(x)), na_action='ignore') # doctest: +SKIP + 0 1 + 0 NaN 4 + 1 5.0 5 + + It is also possible to use map with functions that are not lambda functions: + + >>> df.map(round, ndigits=1) + 0 1 + 0 1.0 2.1 + 1 3.4 4.6 + + Note that a vectorized version of func often exists, which will be much faster. You could square each number elementwise. + + >>> df.map(lambda x: x**2) + 0 1 + 0 1.000000 4.494400 + 1 11.262736 20.857489 + + But it’s better to avoid map in that case. + + >>> df ** 2 + 0 1 + 0 1.000000 4.494400 + 1 11.262736 20.857489 """ def mask(): diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/datetime_index.py b/src/snowflake/snowpark/modin/plugin/docstrings/datetime_index.py new file mode 100644 index 00000000000..3edd8723c9f --- /dev/null +++ b/src/snowflake/snowpark/modin/plugin/docstrings/datetime_index.py @@ -0,0 +1,1365 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. +# + +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +# Code in this file may constitute partial or total reimplementation, or modification of +# existing code originally distributed by the Modin project, under the Apache License, +# Version 2.0. + +"""This module contains DatetimeIndex docstrings that override modin's docstrings.""" + +from __future__ import annotations + +from .index import Index + + +class DatetimeIndex(Index): + def __new__(): + """ + Create new instance of DatetimeIndex. This overrides behavior of Index.__new__. + + Parameters + ---------- + data : array-like (1-dimensional), pandas.Index, modin.pandas.Series, optional + Datetime-like data to construct index with. + freq : str or pandas offset object, optional + One of pandas date offset strings or corresponding objects. 
The string + 'infer' can be passed in order to set the frequency of the index as the + inferred frequency upon creation. + tz : pytz.timezone or dateutil.tz.tzfile or datetime.tzinfo or str + Set the Timezone of the data. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. + closed : {'left', 'right'}, optional + Set whether to include `start` and `end` that are on the + boundary. The default includes boundary points on either end. + ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + When clocks moved backward due to DST, ambiguous times may arise. + For example in Central European Time (UTC+01), when going from 03:00 + DST to 02:00 non-DST, 02:30:00 local time occurs both at 00:30:00 UTC + and at 01:30:00 UTC. In such a situation, the `ambiguous` parameter + dictates how ambiguous times should be handled. + + - 'infer' will attempt to infer fall dst-transition hours based on + order + - bool-ndarray where True signifies a DST time, False signifies a + non-DST time (note that this flag is only applicable for ambiguous + times) + - 'NaT' will return NaT where there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous times. + dayfirst : bool, default False + If True, parse dates in `data` with the day first order. + yearfirst : bool, default False + If True parse dates in `data` with the year first order. + dtype : numpy.dtype or DatetimeTZDtype or str, default None + Note that the only NumPy dtype allowed is `datetime64[ns]`. + copy : bool, default False + Make a copy of input ndarray. + name : label, default None + Name to be stored in the index. + query_compiler : SnowflakeQueryCompiler, optional + A query compiler object to create the ``Index`` from. + + Returns: + New instance of DatetimeIndex. + """ + + def __init__() -> None: + """ + Immutable ndarray-like of datetime64 data. + + Parameters + ---------- + data : array-like (1-dimensional), pandas.Index, modin.pandas.Series, optional + Datetime-like data to construct index with. + freq : str or pandas offset object, optional + One of pandas date offset strings or corresponding objects. The string + 'infer' can be passed in order to set the frequency of the index as the + inferred frequency upon creation. + tz : pytz.timezone or dateutil.tz.tzfile or datetime.tzinfo or str + Set the Timezone of the data. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. + closed : {'left', 'right'}, optional + Set whether to include `start` and `end` that are on the + boundary. The default includes boundary points on either end. + ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + When clocks moved backward due to DST, ambiguous times may arise. + For example in Central European Time (UTC+01), when going from 03:00 + DST to 02:00 non-DST, 02:30:00 local time occurs both at 00:30:00 UTC + and at 01:30:00 UTC. In such a situation, the `ambiguous` parameter + dictates how ambiguous times should be handled. + + - 'infer' will attempt to infer fall dst-transition hours based on + order + - bool-ndarray where True signifies a DST time, False signifies a + non-DST time (note that this flag is only applicable for ambiguous + times) + - 'NaT' will return NaT where there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous times. + dayfirst : bool, default False + If True, parse dates in `data` with the day first order. 
+ yearfirst : bool, default False + If True parse dates in `data` with the year first order. + dtype : numpy.dtype or DatetimeTZDtype or str, default None + Note that the only NumPy dtype allowed is `datetime64[ns]`. + copy : bool, default False + Make a copy of input ndarray. + name : label, default None + Name to be stored in the index. + query_compiler : SnowflakeQueryCompiler, optional + A query compiler object to create the ``Index`` from. + + Examples + -------- + >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"], tz="America/Los_Angeles") + >>> idx + DatetimeIndex(['2020-01-01 02:00:00-08:00', '2020-02-01 03:00:00-08:00'], dtype='datetime64[ns, UTC-08:00]', freq=None) + """ + + def _dt_property(): + """ + Get the datetime property. + + Parameters + ---------- + property_name : str + The name of the datetime property. + + Returns + ------- + Index + The datetime property. + """ + + @property + def year(): + """ + The year of the datetime. + + Returns + ------- + An Index with the year of the datetime. + + Examples + -------- + >>> idx = pd.date_range("2000-01-01", periods=3, freq="YE") + >>> idx + DatetimeIndex(['2000-12-31', '2001-12-31', '2002-12-31'], dtype='datetime64[ns]', freq=None) + >>> idx.year + Index([2000, 2001, 2002], dtype='int64') + """ + + @property + def month(): + """ + The month as January=1, December=12. + + Returns + ------- + An Index with the month of the datetime. + + Examples + -------- + >>> idx = pd.date_range("2000-01-01", periods=3, freq="ME") + >>> idx + DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31'], dtype='datetime64[ns]', freq=None) + >>> idx.month + Index([1, 2, 3], dtype='int64') + """ + + @property + def day(): + """ + The day of the datetime. + + Returns + ------- + An Index with the day of the datetime. + + Examples + -------- + >>> idx = pd.date_range("2000-01-01", periods=3, freq="D") + >>> idx + DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'], dtype='datetime64[ns]', freq=None) + >>> idx.day + Index([1, 2, 3], dtype='int64') + """ + + @property + def hour(): + """ + The hours of the datetime. + + Returns + ------- + An Index with the hours of the datetime. + + Examples + -------- + >>> idx = pd.date_range("2000-01-01", periods=3, freq="h") + >>> idx + DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:00:00', + '2000-01-01 02:00:00'], + dtype='datetime64[ns]', freq=None) + >>> idx.hour + Index([0, 1, 2], dtype='int64') + """ + + @property + def minute(): + """ + The minutes of the datetime. + + Returns + ------- + An Index with the minutes of the datetime. + + Examples + -------- + >>> idx = pd.date_range("2000-01-01", periods=3, freq="min") + >>> idx + DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00', + '2000-01-01 00:02:00'], + dtype='datetime64[ns]', freq=None) + >>> idx.minute + Index([0, 1, 2], dtype='int64') + """ + + @property + def second(): + """ + The seconds of the datetime. + + Returns + ------- + An Index with the seconds of the datetime. + + Examples + -------- + >>> idx = pd.date_range("2000-01-01", periods=3, freq="s") + >>> idx + DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:00:01', + '2000-01-01 00:00:02'], + dtype='datetime64[ns]', freq=None) + >>> idx.second + Index([0, 1, 2], dtype='int64') + """ + + @property + def microsecond(): + """ + The microseconds of the datetime. + + Returns + ------- + An Index with the microseconds of the datetime. 
+ + Examples + -------- + >>> idx = pd.date_range("2000-01-01", periods=3, freq="us") + >>> idx + DatetimeIndex([ '2000-01-01 00:00:00', '2000-01-01 00:00:00.000001', + '2000-01-01 00:00:00.000002'], + dtype='datetime64[ns]', freq=None) + >>> idx.microsecond + Index([0, 1, 2], dtype='int64') + """ + + @property + def nanosecond(): + """ + The nanoseconds of the datetime. + + Returns + ------- + An Index with the nanoseconds of the datetime. + + Examples + -------- + >>> idx = pd.date_range("2000-01-01", periods=3, freq="ns") + >>> idx + DatetimeIndex([ '2000-01-01 00:00:00', + '2000-01-01 00:00:00.000000001', + '2000-01-01 00:00:00.000000002'], + dtype='datetime64[ns]', freq=None) + >>> idx.nanosecond + Index([0, 1, 2], dtype='int64') + """ + + @property + def date(): + """ + Returns the date part of Timestamps without time and timezone information. + + Returns + ------- + Returns an Index with the date part of Timestamps. Note this is different + from native pandas which returns a python array. + + Examples + -------- + >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", + ... "2/1/2020 11:00:00+00:00"]) + >>> idx.date + Index([2020-01-01, 2020-02-01], dtype='object') + """ + + @property + def dayofweek(): + """ + The day of the week with Monday=0, Sunday=6. + + Return the day of the week. It is assumed the week starts on + Monday, which is denoted by 0 and ends on Sunday which is denoted + by 6. This method is available on both Series with datetime + values (using the `dt` accessor) or DatetimeIndex. + + Returns + ------- + An Index Containing integers indicating the day number. + + Examples + -------- + >>> idx = pd.date_range('2016-12-31', '2017-01-08', freq='D') + >>> idx.dayofweek + Index([5, 6, 0, 1, 2, 3, 4, 5, 6], dtype='int64') + """ + + day_of_week = dayofweek + weekday = dayofweek + + @property + def dayofyear(): + """ + The ordinal day of the year. + + Returns + ------- + An Index Containing integers indicating the ordinal day of the year. + + Examples + -------- + >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", + ... "2/1/2020 11:00:00+00:00"]) + >>> idx.dayofyear + Index([1, 32], dtype='int64') + """ + + day_of_year = dayofyear + + @property + def quarter(): + """ + The quarter of the date. + + Returns + ------- + An Index Containing quarter of the date. + + Examples + -------- + >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", + ... "2/1/2020 11:00:00+00:00"]) + >>> idx.quarter + Index([1, 1], dtype='int64') + """ + + @property + def is_month_start(): + """ + Indicates whether the date is the first day of the month. + + Returns + ------- + An Index with boolean values. Note this is different from native pandas which + returns a python array. + + See Also + -------- + is_month_end : Similar property indicating the last day of the month. + + Examples + -------- + >>> idx = pd.date_range("2018-02-27", periods=3) + >>> idx.is_month_start + Index([False, False, True], dtype='bool') + """ + + @property + def is_month_end(): + """ + Indicates whether the date is the last day of the month. + + Returns + ------- + An Index with boolean values. Note this is different from native pandas which + returns a python array. + + See Also + -------- + is_month_start : Similar property indicating the first day of the month. 
+ + Examples + -------- + + >>> idx = pd.date_range("2018-02-27", periods=3) + >>> idx.is_month_end + Index([False, True, False], dtype='bool') + """ + + @property + def is_quarter_start(): + """ + Indicator for whether the date is the first day of a quarter. + + Returns + ------- + An Index with boolean values. Note this is different from native pandas which + returns a python array. + + See Also + -------- + is_quarter_end : Similar property indicating the last day of the quarter. + + Examples + -------- + >>> idx = pd.date_range('2017-03-30', periods=4) + >>> idx + DatetimeIndex(['2017-03-30', '2017-03-31', '2017-04-01', '2017-04-02'], dtype='datetime64[ns]', freq=None) + + >>> idx.is_quarter_start + Index([False, False, True, False], dtype='bool') + """ + + @property + def is_quarter_end(): + """ + Indicator for whether the date is the last day of a quarter. + + Returns + ------- + An Index with boolean values. Note this is different from native pandas which + returns a python array. + + See Also + -------- + is_quarter_start: Similar property indicating the first day of the quarter. + + Examples + -------- + >>> idx = pd.date_range('2017-03-30', periods=4) + >>> idx + DatetimeIndex(['2017-03-30', '2017-03-31', '2017-04-01', '2017-04-02'], dtype='datetime64[ns]', freq=None) + + >>> idx.is_quarter_end + Index([False, True, False, False], dtype='bool') + """ + + @property + def is_year_start(): + """ + Indicate whether the date is the first day of a year. + + Returns + ------- + An Index with boolean values. Note this is different from native pandas which + returns a python array. + + See Also + -------- + is_year_end : Similar property indicating the last day of the year. + + Examples + -------- + >>> idx = pd.date_range("2017-12-30", periods=3) + >>> idx + DatetimeIndex(['2017-12-30', '2017-12-31', '2018-01-01'], dtype='datetime64[ns]', freq=None) + + >>> idx.is_year_start + Index([False, False, True], dtype='bool') + """ + + @property + def is_year_end(): + """ + Indicate whether the date is the last day of the year. + + Returns + ------- + An Index with boolean values. Note this is different from native pandas which + returns a python array. + + See Also + -------- + is_year_start : Similar property indicating the start of the year. + + Examples + -------- + >>> idx = pd.date_range("2017-12-30", periods=3) + >>> idx + DatetimeIndex(['2017-12-30', '2017-12-31', '2018-01-01'], dtype='datetime64[ns]', freq=None) + + >>> idx.is_year_end + Index([False, True, False], dtype='bool') + """ + + @property + def is_leap_year(): + """ + Boolean indicator if the date belongs to a leap year. + + A leap year is a year, which has 366 days (instead of 365) including + 29th of February as an intercalary day. + Leap years are years which are multiples of four except for years + divisible by 100 but not by 400. + + Returns + ------- + An Index with boolean values. Note this is different from native pandas which + returns a python array. + + Examples + -------- + >>> idx = pd.date_range("2012-01-01", "2015-01-01", freq="YE") + >>> idx + DatetimeIndex(['2012-12-31', '2013-12-31', '2014-12-31'], dtype='datetime64[ns]', freq=None) + >>> idx.is_leap_year + Index([True, False, False], dtype='bool') + """ + + @property + def time(): + """ + Returns the time part of the Timestamps. + + Returns + ------- + An Index with the time part of the Timestamps. + + Examples + -------- + >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", + ... 
"2/1/2020 11:00:00+00:00"]) + >>> idx.time + Index([10:00:00, 11:00:00], dtype='object') + """ + + @property + def timetz(): + """ + Returns the time part of the Timestamps with timezone. + + Returns + ------- + An Index with the time part with timezone of the Timestamps. + + Examples + -------- + >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", + ... "2/1/2020 11:00:00+00:00"]) + >>> idx.timetz # doctest: +SKIP + Index(["10:00:00+00:00", "11:00:00+00:00"], dtype='object') + """ + + @property + def tz(): + """ + Return the timezone. + + Returns + ------- + datetime.tzinfo, pytz.tzinfo.BaseTZInfo, dateutil.tz.tz.tzfile, or None + Returns None when the array is tz-naive. + + Examples + -------- + >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", + ... "2/1/2020 11:00:00+00:00"]) + >>> idx.tz # doctest: +SKIP + datetime.timezone.utc + """ + + @property + def freq(): + """ + Return the frequency object if it's set, otherwise None. + + Examples + -------- + >>> idx = pd.date_range("2000-01-01", periods=3, freq="YE") + >>> idx.freq # doctest: +SKIP + + """ + + @property + def freqstr(): + """ + Return the frequency object as a string if it's set, otherwise None. + + Examples + -------- + >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00"], freq="D") + >>> idx.freqstr # doctest: +SKIP + 'D' + + The frequency can be inferred if there are more than 2 points: + + >>> idx = pd.DatetimeIndex(["2018-01-01", "2018-01-03", "2018-01-05"], + ... freq="infer") + >>> idx.freqstr # doctest: +SKIP + '2D' + """ + + @property + def inferred_freq(): + """ + Tries to return a string representing a frequency generated by infer_freq. + + Returns None if it can't autodetect the frequency. + + Examples + -------- + >>> idx = pd.DatetimeIndex(["2018-01-01", "2018-01-03", "2018-01-05"]) + >>> idx.inferred_freq # doctest: +SKIP + '2D' + """ + + def indexer_at_time(): + """ + Return index locations of values at particular time of day. + + Parameters + ---------- + time : datetime.time or str + Time passed in either as object (datetime.time) or as string in + appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p", + "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p"). + + Returns + ------- + np.ndarray[np.intp] + + See Also + -------- + indexer_between_time : Get index locations of values between particular + times of day. + DataFrame.at_time : Select values at particular time of day. + + Examples + -------- + >>> idx = pd.DatetimeIndex(["1/1/2020 10:00", "2/1/2020 11:00", + ... "3/1/2020 10:00"]) + >>> idx.indexer_at_time("10:00") # doctest: +SKIP + array([0, 2]) + """ + + def indexer_between_time(): + """ + Return index locations of values between particular times of day. + + Parameters + ---------- + start_time, end_time : datetime.time, str + Time passed either as object (datetime.time) or as string in + appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p", + "%H:%M:%S", "%H%M%S", "%I:%M:%S%p","%I%M%S%p"). + include_start : bool, default True + include_end : bool, default True + + Returns + ------- + np.ndarray[np.intp] + + See Also + -------- + indexer_at_time : Get index locations of values at particular time of day. + DataFrame.between_time : Select values between particular times of day. 
+ + Examples + -------- + >>> idx = pd.date_range("2023-01-01", periods=4, freq="h") + >>> idx + DatetimeIndex(['2023-01-01 00:00:00', '2023-01-01 01:00:00', + '2023-01-01 02:00:00', '2023-01-01 03:00:00'], + dtype='datetime64[ns]', freq=None) + >>> idx.indexer_between_time("00:00", "2:00", include_end=False) # doctest: +SKIP + array([0, 1]) + """ + + def normalize(): + """ + Convert times to midnight. + + The time component of the date-time is converted to midnight i.e. + 00:00:00. This is useful in cases, when the time does not matter. + Length is unaltered. The timezones are unaffected. + + This method is available on Series with datetime values under + the ``.dt`` accessor, and directly on Datetime Array/Index. + + Returns + ------- + DatetimeArray, DatetimeIndex or Series + The same type as the original data. Series will have the same + name and index. DatetimeIndex will have the same name. + + See Also + -------- + floor : Floor the datetimes to the specified freq. + ceil : Ceil the datetimes to the specified freq. + round : Round the datetimes to the specified freq. + + Examples + -------- + >>> idx = pd.date_range(start='2014-08-01 10:00', freq='h', + ... periods=3, tz='Asia/Calcutta') # doctest: +SKIP + >>> idx # doctest: +SKIP + DatetimeIndex(['2014-08-01 10:00:00+05:30', + '2014-08-01 11:00:00+05:30', + '2014-08-01 12:00:00+05:30'], + dtype='datetime64[ns, Asia/Calcutta]', freq=None) + >>> idx.normalize() # doctest: +SKIP + DatetimeIndex(['2014-08-01 00:00:00+05:30', + '2014-08-01 00:00:00+05:30', + '2014-08-01 00:00:00+05:30'], + dtype='datetime64[ns, Asia/Calcutta]', freq=None) + """ + + def strftime(): + """ + Convert to Index using specified date_format. + + Return an Index of formatted strings specified by date_format, which + supports the same string format as the python standard library. Details + of the string format can be found in `python string format + doc <%(URL)s>`__. + + Formats supported by the C `strftime` API but not by the python string format + doc (such as `"%%R"`, `"%%r"`) are not officially supported and should be + preferably replaced with their supported equivalents (such as `"%%H:%%M"`, + `"%%I:%%M:%%S %%p"`). + + Note that `PeriodIndex` support additional directives, detailed in + `Period.strftime`. + + Parameters + ---------- + date_format : str + Date format string (e.g. "%%Y-%%m-%%d"). + + Returns + ------- + ndarray[object] + NumPy ndarray of formatted strings. + + See Also + -------- + to_datetime : Convert the given argument to datetime. + DatetimeIndex.normalize : Return DatetimeIndex with times to midnight. + DatetimeIndex.round : Round the DatetimeIndex to the specified freq. + DatetimeIndex.floor : Floor the DatetimeIndex to the specified freq. + Timestamp.strftime : Format a single Timestamp. + Period.strftime : Format a single Period. + + Examples + -------- + >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"), + ... periods=3, freq=None) + >>> rng.strftime('%%B %%d, %%Y, %%r') # doctest: +SKIP + Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM', + 'March 10, 2018, 09:00:02 AM'], + dtype='object') + """ + + def snap(): + """ + Snap time stamps to nearest occurring frequency. + + Returns + ------- + DatetimeIndex + + Examples + -------- + >>> idx = pd.DatetimeIndex(['2023-01-01', '2023-01-02', + ... 
'2023-02-01', '2023-02-02']) + >>> idx + DatetimeIndex(['2023-01-01', '2023-01-02', '2023-02-01', '2023-02-02'], dtype='datetime64[ns]', freq=None) + >>> idx.snap('MS') # doctest: +SKIP + DatetimeIndex(['2023-01-01', '2023-01-01', '2023-02-01', '2023-02-01'], dtype='datetime64[ns]', freq=None) + """ + + def tz_convert(): + """ + Convert tz-aware Datetime Array/Index from one time zone to another. + + Parameters + ---------- + tz : str, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None + Time zone for time. Corresponding timestamps would be converted + to this time zone of the Datetime Array/Index. A `tz` of None will + convert to UTC and remove the timezone information. + + Returns + ------- + Array or Index + + Raises + ------ + TypeError + If Datetime Array/Index is tz-naive. + + See Also + -------- + DatetimeIndex.tz : A timezone that has a variable offset from UTC. + DatetimeIndex.tz_localize : Localize tz-naive DatetimeIndex to a + given time zone, or remove timezone from a tz-aware DatetimeIndex. + + Examples + -------- + With the `tz` parameter, we can change the DatetimeIndex + to other time zones: + + >>> dti = pd.date_range(start='2014-08-01 09:00', + ... freq='h', periods=3, tz='Europe/Berlin') # doctest: +SKIP + + >>> dti # doctest: +SKIP + DatetimeIndex(['2014-08-01 09:00:00+02:00', + '2014-08-01 10:00:00+02:00', + '2014-08-01 11:00:00+02:00'], + dtype='datetime64[ns, Europe/Berlin]', freq=None) + + >>> dti.tz_convert('US/Central') # doctest: +SKIP + DatetimeIndex(['2014-08-01 02:00:00-05:00', + '2014-08-01 03:00:00-05:00', + '2014-08-01 04:00:00-05:00'], + dtype='datetime64[ns, US/Central]', freq='h') + + With the ``tz=None``, we can remove the timezone (after converting + to UTC if necessary): + + >>> dti = pd.date_range(start='2014-08-01 09:00', freq='h', + ... periods=3, tz='Europe/Berlin') # doctest: +SKIP + + >>> dti # doctest: +SKIP + DatetimeIndex(['2014-08-01 09:00:00+02:00', + '2014-08-01 10:00:00+02:00', + '2014-08-01 11:00:00+02:00'], + dtype='datetime64[ns, Europe/Berlin]', freq=None) + + >>> dti.tz_convert(None) # doctest: +SKIP + DatetimeIndex(['2014-08-01 07:00:00', + '2014-08-01 08:00:00', + '2014-08-01 09:00:00'], + dtype='datetime64[ns]', freq='h') + """ + + def tz_localize(): + """ + Localize tz-naive Datetime Array/Index to tz-aware Datetime Array/Index. + + This method takes a time zone (tz) naive Datetime Array/Index object + and makes this time zone aware. It does not move the time to another + time zone. + + This method can also be used to do the inverse -- to create a time + zone unaware object from an aware object. To that end, pass `tz=None`. + + Parameters + ---------- + tz : str, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None + Time zone to convert timestamps to. Passing ``None`` will + remove the time zone information preserving local time. + ambiguous : 'infer', 'NaT', bool array, default 'raise' + When clocks moved backward due to DST, ambiguous times may arise. + For example in Central European Time (UTC+01), when going from + 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at + 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the + `ambiguous` parameter dictates how ambiguous times should be + handled. 
+ + - 'infer' will attempt to infer fall dst-transition hours based on + order + - bool-ndarray where True signifies a DST time, False signifies a + non-DST time (note that this flag is only applicable for + ambiguous times) + - 'NaT' will return NaT where there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous + times. + + nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ +default 'raise' + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. + + - 'shift_forward' will shift the nonexistent time forward to the + closest existing time + - 'shift_backward' will shift the nonexistent time backward to the + closest existing time + - 'NaT' will return NaT where there are nonexistent times + - timedelta objects will shift nonexistent times by the timedelta + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times. + + Returns + ------- + Same type as self + Array/Index converted to the specified time zone. + + Raises + ------ + TypeError + If the Datetime Array/Index is tz-aware and tz is not None. + + See Also + -------- + DatetimeIndex.tz_convert : Convert tz-aware DatetimeIndex from + one time zone to another. + + Examples + -------- + >>> tz_naive = pd.date_range('2018-03-01 09:00', periods=3) + >>> tz_naive + DatetimeIndex(['2018-03-01 09:00:00', '2018-03-02 09:00:00', + '2018-03-03 09:00:00'], + dtype='datetime64[ns]', freq=None) + + Localize DatetimeIndex in US/Eastern time zone: + + >>> tz_aware = tz_naive.tz_localize(tz='US/Eastern') + >>> tz_aware + DatetimeIndex(['2018-03-01 09:00:00-05:00', '2018-03-02 09:00:00-05:00', + '2018-03-03 09:00:00-05:00'], + dtype='datetime64[ns, UTC-05:00]', freq=None) + + With the ``tz=None``, we can remove the time zone information + while keeping the local time (not converted to UTC): + + >>> tz_aware.tz_localize(None) + DatetimeIndex(['2018-03-01 09:00:00', '2018-03-02 09:00:00', + '2018-03-03 09:00:00'], + dtype='datetime64[ns]', freq=None) + """ + + def round(): + """ + Perform round operation on the data to the specified `freq`. + + Parameters + ---------- + freq : str or Offset + The frequency level to {op} the index to. Must be a fixed + frequency like 'S' (second) not 'ME' (month end). See + frequency aliases for a list of possible `freq` values. + ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + This parameter is only supported for 'raise'. + Only relevant for DatetimeIndex: + + - 'infer' will attempt to infer fall dst-transition hours based on + order + - bool-ndarray where True signifies a DST time, False designates + a non-DST time (note that this flag is only applicable for + ambiguous times) + - 'NaT' will return NaT where there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous + times. + + nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, default 'raise' + This parameter is only supported for 'raise'. + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. + + - 'shift_forward' will shift the nonexistent time forward to the + closest existing time + - 'shift_backward' will shift the nonexistent time backward to the + closest existing time + - 'NaT' will return NaT where there are nonexistent times + - timedelta objects will shift nonexistent times by the timedelta + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times. + + Returns + ------- + DatetimeIndex with round values. 
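Because time zones are not yet supported in `pd.date_range` (see the SNOW-1660843 TODOs later in this diff), the `tz_convert` doctests above are skipped. A hedged sketch of the documented `tz_localize`/`tz_convert` round trip, starting from a naive index and assuming an active Snowpark session:

````python
# Illustrative sketch only; assumes an active Snowpark session.
import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401

naive = pd.date_range("2018-03-01 09:00", periods=3)
# tz_localize attaches a time zone without shifting the wall-clock time.
aware = naive.tz_localize("US/Eastern")
# tz_convert re-expresses the same instants in another zone (wall-clock shifts).
central = aware.tz_convert("US/Central")
# Passing None drops the time zone again while keeping the local wall-clock time.
local_again = aware.tz_localize(None)
````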
+ + Raises + ------ + ValueError if the `freq` cannot be converted. + + Examples + -------- + **DatetimeIndex** + + >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') + >>> rng + DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', + '2018-01-01 12:01:00'], + dtype='datetime64[ns]', freq=None) + + >>> rng.round('h') + DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', + '2018-01-01 12:00:00'], + dtype='datetime64[ns]', freq=None) + """ + + def floor(): + """ + Perform floor operation on the data to the specified `freq`. + + Parameters + ---------- + freq : str or Offset + The frequency level to {op} the index to. Must be a fixed + frequency like 'S' (second) not 'ME' (month end). See + frequency aliases for a list of possible `freq` values. + ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + This parameter is only supported for 'raise'. + Only relevant for DatetimeIndex: + + - 'infer' will attempt to infer fall dst-transition hours based on + order + - bool-ndarray where True signifies a DST time, False designates + a non-DST time (note that this flag is only applicable for + ambiguous times) + - 'NaT' will return NaT where there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous + times. + + nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, default 'raise' + This parameter is only supported for 'raise'. + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. + + - 'shift_forward' will shift the nonexistent time forward to the + closest existing time + - 'shift_backward' will shift the nonexistent time backward to the + closest existing time + - 'NaT' will return NaT where there are nonexistent times + - timedelta objects will shift nonexistent times by the timedelta + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times. + + Returns + ------- + DatetimeIndex with floor values. + + Raises + ------ + ValueError if the `freq` cannot be converted. + + Examples + -------- + **DatetimeIndex** + + >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') + >>> rng + DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', + '2018-01-01 12:01:00'], + dtype='datetime64[ns]', freq=None) + + >>> rng.floor('h') + DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00', + '2018-01-01 12:00:00'], + dtype='datetime64[ns]', freq=None) + """ + + def ceil(): + """ + Perform ceil operation on the data to the specified `freq`. + + Parameters + ---------- + freq : str or Offset + The frequency level to {op} the index to. Must be a fixed + frequency like 'S' (second) not 'ME' (month end). See + frequency aliases for a list of possible `freq` values. + ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + This parameter is only supported for 'raise'. + Only relevant for DatetimeIndex: + + - 'infer' will attempt to infer fall dst-transition hours based on + order + - bool-ndarray where True signifies a DST time, False designates + a non-DST time (note that this flag is only applicable for + ambiguous times) + - 'NaT' will return NaT where there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous + times. + + nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, default 'raise' + This parameter is only supported for 'raise'. + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. 
+ + - 'shift_forward' will shift the nonexistent time forward to the + closest existing time + - 'shift_backward' will shift the nonexistent time backward to the + closest existing time + - 'NaT' will return NaT where there are nonexistent times + - timedelta objects will shift nonexistent times by the timedelta + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times. + + Returns + ------- + DatetimeIndex with ceil values. + + Raises + ------ + ValueError if the `freq` cannot be converted. + + Examples + -------- + **DatetimeIndex** + + >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') + >>> rng + DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', + '2018-01-01 12:01:00'], + dtype='datetime64[ns]', freq=None) + + >>> rng.ceil('h') + DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', + '2018-01-01 13:00:00'], + dtype='datetime64[ns]', freq=None) + """ + + def month_name(): + """ + Return the month names with specified locale. + + Parameters + ---------- + locale : str, optional + Locale determining the language in which to return the month name. + Default is English locale (``'en_US.utf8'``). Use the command + ``locale -a`` on your terminal on Unix systems to find your locale + language code. + + Returns + ------- + Index of month names. + + Examples + -------- + >>> idx = pd.date_range(start='2018-01', freq='ME', periods=3) + >>> idx + DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], dtype='datetime64[ns]', freq=None) + >>> idx.month_name() + Index(['January', 'February', 'March'], dtype='object') + + Using the ``locale`` parameter you can set a different locale language, + for example: ``idx.month_name(locale='pt_BR.utf8')`` will return month + names in Brazilian Portuguese language. + + >>> idx = pd.date_range(start='2018-01', freq='ME', periods=3) + >>> idx + DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], dtype='datetime64[ns]', freq=None) + >>> idx.month_name(locale='pt_BR.utf8') # doctest: +SKIP + Index(['Janeiro', 'Fevereiro', 'Março'], dtype='object') + """ + + def day_name(): + """ + Return the day names with specified locale. + + Parameters + ---------- + locale : str, optional + Locale determining the language in which to return the day name. + Default is English locale (``'en_US.utf8'``). Use the command + ``locale -a`` on your terminal on Unix systems to find your locale + language code. + + Returns + ------- + Index of day names. + + Examples + -------- + >>> idx = pd.date_range(start='2018-01-01', freq='D', periods=3) + >>> idx + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], dtype='datetime64[ns]', freq=None) + >>> idx.day_name() + Index(['Monday', 'Tuesday', 'Wednesday'], dtype='object') + + Using the ``locale`` parameter you can set a different locale language, + for example: ``idx.day_name(locale='pt_BR.utf8')`` will return day + names in Brazilian Portuguese language. + + >>> idx = pd.date_range(start='2018-01-01', freq='D', periods=3) + >>> idx + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], dtype='datetime64[ns]', freq=None) + >>> idx.day_name(locale='pt_BR.utf8') # doctest: +SKIP + Index(['Segunda', 'Terça', 'Quarta'], dtype='object') + """ + + def as_unit(): + """ + Convert to a dtype with the given unit resolution. 
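The `round`, `floor`, and `ceil` docstrings above differ only in the direction the timestamps are snapped to the requested frequency. A short sketch putting the three side by side on the same data as the documented examples, assuming an active Snowpark session:

````python
# Illustrative sketch only; assumes an active Snowpark session.
import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401

rng = pd.date_range("2018-01-01 11:59:00", periods=3, freq="min")
print(rng.round("h"))  # 11:59 -> 12:00, 12:00 -> 12:00, 12:01 -> 12:00
print(rng.floor("h"))  # 11:59 -> 11:00, 12:00 -> 12:00, 12:01 -> 12:00
print(rng.ceil("h"))   # 11:59 -> 12:00, 12:00 -> 12:00, 12:01 -> 13:00
````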
+ + Parameters + ---------- + unit : {'s', 'ms', 'us', 'ns'} + + Returns + ------- + same type as self + + Examples + -------- + >>> idx = pd.DatetimeIndex(['2020-01-02 01:02:03.004005006']) + >>> idx + DatetimeIndex(['2020-01-02 01:02:03.004005006'], dtype='datetime64[ns]', freq=None) + >>> idx.as_unit('s') # doctest: +SKIP + DatetimeIndex(['2020-01-02 01:02:03'], dtype='datetime64[s]', freq=None) + """ + + def to_period(): + """ + Cast to PeriodArray/PeriodIndex at a particular frequency. + + Converts DatetimeArray/Index to PeriodArray/PeriodIndex. + + Parameters + ---------- + freq : str or Period, optional + One of pandas' period aliases or a Period object. + Will be inferred by default. + + Returns + ------- + PeriodArray/PeriodIndex + + Raises + ------ + ValueError + When converting a DatetimeArray/Index with non-regular values, + so that a frequency cannot be inferred. + + See Also + -------- + PeriodIndex: Immutable ndarray holding ordinal values. + DatetimeIndex.to_pydatetime: Return DatetimeIndex as object. + + Examples + -------- + >>> df = pd.DataFrame({"y": [1, 2, 3]}, + ... index=pd.to_datetime(["2000-03-31 00:00:00", + ... "2000-05-31 00:00:00", + ... "2000-08-31 00:00:00"])) + >>> df.index.to_period("M") # doctest: +SKIP + PeriodIndex(['2000-03', '2000-05', '2000-08'], + dtype='period[M]') + + Infer the daily frequency + + >>> idx = pd.date_range("2017-01-01", periods=2) + >>> idx.to_period() # doctest: +SKIP + PeriodIndex(['2017-01-01', '2017-01-02'], dtype='period[D]') + """ + + def to_pydatetime(): + """ + Return a ndarray of ``datetime.datetime`` objects. + + Returns + ------- + numpy.ndarray + + Examples + -------- + >>> idx = pd.date_range('2018-02-27', periods=3) + >>> idx.to_pydatetime() # doctest: +SKIP + array([datetime.datetime(2018, 2, 27, 0, 0), + datetime.datetime(2018, 2, 28, 0, 0), + datetime.datetime(2018, 3, 1, 0, 0)], dtype=object) + """ + + def mean(): + """ + Return the mean value of the Array. + + Parameters + ---------- + skipna : bool, default True + Whether to ignore any NaT elements. + axis : int, optional, default 0 + The axis to calculate the mean over. + This parameter is ignored - 0 is the only valid axis. + + Returns + ------- + scalar Timestamp + + See Also + -------- + numpy.ndarray.mean : Returns the average of array elements along a given axis. + Series.mean : Return the mean value in a Series. + + Notes + ----- + mean is only defined for Datetime and Timedelta dtypes, not for Period. + + Examples + -------- + >>> idx = pd.date_range('2001-01-01 00:00', periods=3) + >>> idx + DatetimeIndex(['2001-01-01', '2001-01-02', '2001-01-03'], dtype='datetime64[ns]', freq=None) + >>> idx.mean() + Timestamp('2001-01-02 00:00:00') + """ + + def std(): + """ + Return sample standard deviation over requested axis. + + Normalized by `N-1` by default. This can be changed using ``ddof``. + + Parameters + ---------- + axis : int, optional + The axis to calculate the standard deviation over. + This parameter is ignored - 0 is the only valid axis. + ddof : int, default 1 + Degrees of Freedom. The divisor used in calculations is `N - ddof`, + where `N` represents the number of elements. + This parameter is not yet supported. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is ``NA``, the result + will be ``NA``. + + Returns + ------- + Timedelta + + See Also + -------- + numpy.ndarray.std : Returns the standard deviation of the array elements + along given axis. + Series.std : Return sample standard deviation over requested axis. 
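Unlike the datetime accessors earlier in this file, the `mean` and `std` reductions documented above return scalar `Timestamp`/`Timedelta` values rather than an Index. A minimal sketch reusing the data from the documented examples, assuming an active Snowpark session:

````python
# Illustrative sketch only; assumes an active Snowpark session.
import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401

idx = pd.date_range("2001-01-01", periods=3)
print(idx.mean())  # Timestamp('2001-01-02 00:00:00')
print(idx.std())   # Timedelta('1 days 00:00:00')
````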
+ + Examples + -------- + For :class:`pandas.DatetimeIndex`: + + >>> idx = pd.date_range('2001-01-01 00:00', periods=3) + >>> idx + DatetimeIndex(['2001-01-01', '2001-01-02', '2001-01-03'], dtype='datetime64[ns]', freq=None) + >>> idx.std() + Timedelta('1 days 00:00:00') + """ diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py b/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py index b9869db1403..bdf9f8ea2d3 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py @@ -2564,6 +2564,45 @@ def size(): pass def unique(): + """ + Return unique values for each group. + + Return unique values for each of the grouped values. Returned in + order of appearance. Hash table-based unique, therefore does NOT sort. + + Returns + ------- + Series + Unique values for each of the grouped values. + + See Also + -------- + Series.unique : Return unique values of Series object. + + Examples + -------- + >>> df = pd.DataFrame([('Chihuahua', 'dog', 6.1), + ... ('Beagle', 'dog', 15.2), + ... ('Chihuahua', 'dog', 6.9), + ... ('Persian', 'cat', 9.2), + ... ('Chihuahua', 'dog', 7), + ... ('Persian', 'cat', 8.8)], + ... columns=['breed', 'animal', 'height_in']) + >>> df # doctest: +NORMALIZE_WHITESPACE + breed animal height_in + 0 Chihuahua dog 6.1 + 1 Beagle dog 15.2 + 2 Chihuahua dog 6.9 + 3 Persian cat 9.2 + 4 Chihuahua dog 7.0 + 5 Persian cat 8.8 + >>> ser = df.groupby('animal')['breed'].unique() + >>> ser + animal + cat [Persian] + dog [Chihuahua, Beagle] + Name: breed, dtype: object + """ pass def apply(): diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/series.py b/src/snowflake/snowpark/modin/plugin/docstrings/series.py index 460b01fc1d4..701b6016524 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/series.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/series.py @@ -728,6 +728,71 @@ def autocorr(): def between(): """ Return boolean Series equivalent to left <= series <= right. + + This function returns a boolean vector containing `True` wherever the + corresponding Series element is between the boundary values `left` and + `right`. NA values are treated as `False`. + + Parameters + ---------- + left : scalar or list-like + Left boundary. + right : scalar or list-like + Right boundary. + inclusive : {"both", "neither", "left", "right"} + Include boundaries. Whether to set each bound as closed or open. + + Returns + ------- + Series + Series representing whether each element is between left and + right (inclusive). + + See Also + -------- + Series.gt : Greater than of series and other. + Series.lt : Less than of series and other. + + Notes + ----- + This function is equivalent to ``(left <= ser) & (ser <= right)`` + + Examples + -------- + >>> s = pd.Series([2, 0, 4, 8, np.nan]) + + Boundary values are included by default: + + >>> s.between(1, 4) + 0 True + 1 False + 2 True + 3 False + 4 None + dtype: object + + Note that to for consistency with Snowflake SQL rules, comparisons with `None`/`np.nan` + will return `None`. Call `astype(bool)` on the result to coerce `None` to `False`. 
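The note just above about Snowflake SQL comparison rules is the main behavioural difference from native pandas `Series.between`. A hedged sketch of the suggested `astype(bool)` coercion (assuming an active Snowpark session; `bool_mask` is just a local name used for illustration):

````python
# Illustrative sketch only; assumes an active Snowpark session.
import numpy as np
import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401

s = pd.Series([2, 0, 4, 8, np.nan])

# Under Snowflake SQL comparison rules the NaN row yields None, so the raw
# result has dtype object rather than bool (see the example above).
mask = s.between(1, 4)

# Coerce None to False when a plain boolean mask is needed, as the note suggests.
bool_mask = mask.astype(bool)
print(s[bool_mask])  # keeps the rows holding 2.0 and 4.0
````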
+ + With `inclusive` set to ``"neither"`` boundary values are excluded: + + >>> s.between(1, 4, inclusive="neither") + 0 True + 1 False + 2 False + 3 False + 4 None + dtype: object + + `left` and `right` can be any scalar value: + + >>> s = pd.Series(['Alice', 'Bob', 'Carol', 'Eve']) + >>> s.between('Anna', 'Daniel') + 0 False + 1 True + 2 True + 3 False + dtype: bool """ def bfill(): diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py b/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py index c6048234071..df89889c26d 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py @@ -454,7 +454,7 @@ def ljust(): For Series.str.center: >>> ser = pd.Series(['dog', 'bird', 'mouse']) - >>> ser.str.center(8, fillchar='.') # doctest: +SKIP + >>> ser.str.center(8, fillchar='.') 0 ..dog... 1 ..bird.. 2 .mouse.. @@ -500,7 +500,7 @@ def rjust(): For Series.str.center: >>> ser = pd.Series(['dog', 'bird', 'mouse']) - >>> ser.str.center(8, fillchar='.') # doctest: +SKIP + >>> ser.str.center(8, fillchar='.') 0 ..dog... 1 ..bird.. 2 .mouse.. @@ -2218,7 +2218,49 @@ def normalize(): pass def strftime(): - pass + """ + Convert to Index using specified date_format. + + Return an Index of formatted strings specified by date_format, which supports the same string format as the python standard library. Details of the string format can be found in python string format doc. + + Formats supported by the C strftime API but not by the python string format doc (such as “%R”, “%r”) are not officially supported and should be preferably replaced with their supported equivalents (such as “%H:%M”, “%I:%M:%S %p”). + + Note that PeriodIndex support additional directives, detailed in Period.strftime. + + Parameters + ---------- + date_format : str + Date format string (e.g. “%Y-%m-%d”). + + Returns + ------- + ndarray[object] + NumPy ndarray of formatted strings. + + See also + -------- + to_datetime + Convert the given argument to datetime. + DatetimeIndex.normalize + Return DatetimeIndex with times to midnight. + DatetimeIndex.round + Round the DatetimeIndex to the specified freq. + DatetimeIndex.floor + Floor the DatetimeIndex to the specified freq. + Timestamp.strftime + Format a single Timestamp. + Period.strftime + Format a single Period. + + Examples + -------- + >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"), + ... 
periods=3, freq='s') + >>> rng.strftime('%B %d, %Y, %r') # doctest: +SKIP + Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM', + 'March 10, 2018, 09:00:02 AM'], + dtype='object') + """ def round(): """ diff --git a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py index 7ba3eeabd26..57dab1b246d 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py @@ -137,9 +137,15 @@ def decorator(base_method: Any): # Avoid overwriting builtin `map` by accident @register_dataframe_accessor("map") -@dataframe_not_implemented() -def _map(self, func, na_action: str | None = None, **kwargs) -> DataFrame: - pass # pragma: no cover +def _map(self, func: PythonFuncType, na_action: str | None = None, **kwargs): + # TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions + if not callable(func): + raise TypeError(f"{func} is not callable") # pragma: no cover + return self.__constructor__( + query_compiler=self._query_compiler.applymap( + func, na_action=na_action, **kwargs + ) + ) @register_dataframe_not_implemented() @@ -406,17 +412,27 @@ def __rdivmod__(self, other): # The from_dict and from_records accessors are class methods and cannot be overridden via the # extensions module, as they need to be foisted onto the namespace directly because they are not # routed through getattr. To this end, we manually set DataFrame.from_dict to our new method. -@dataframe_not_implemented() +@classmethod def from_dict( cls, data, orient="columns", dtype=None, columns=None ): # pragma: no cover # noqa: PR01, RT01, D200 - pass # pragma: no cover + """ + Construct ``DataFrame`` from dict of array-like or dicts. + """ + return DataFrame( + native_pd.DataFrame.from_dict( + data=data, + orient=orient, + dtype=dtype, + columns=columns, + ) + ) DataFrame.from_dict = from_dict -@dataframe_not_implemented() +@classmethod def from_records( cls, data, @@ -426,7 +442,23 @@ def from_records( coerce_float=False, nrows=None, ): # pragma: no cover # noqa: PR01, RT01, D200 - pass # pragma: no cover + """ + Convert structured or record ndarray to ``DataFrame``. + """ + if isinstance(data, DataFrame): + ErrorMessage.not_implemented( + "Snowpark pandas 'DataFrame.from_records' method does not yet support 'data' parameter of type 'DataFrame'" + ) + return DataFrame( + native_pd.DataFrame.from_records( + data=data, + index=index, + exclude=exclude, + columns=columns, + coerce_float=coerce_float, + nrows=nrows, + ) + ) DataFrame.from_records = from_records @@ -804,14 +836,12 @@ def apply( # Snowpark pandas uses a separate QC method, while modin directly calls map. @register_dataframe_accessor("applymap") def applymap(self, func: PythonFuncType, na_action: str | None = None, **kwargs): - # TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions - if not callable(func): - raise TypeError(f"{func} is not callable") - return self.__constructor__( - query_compiler=self._query_compiler.applymap( - func, na_action=na_action, **kwargs - ) + warnings.warn( + "DataFrame.applymap has been deprecated. 
Use DataFrame.map instead.", + FutureWarning, + stacklevel=2, ) + return self.map(func, na_action=na_action, **kwargs) # We need to override _get_columns to satisfy diff --git a/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py b/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py index 96f004d3cce..574cf4fc897 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py @@ -50,6 +50,10 @@ datetime_index_not_implemented, ) from snowflake.snowpark.modin.plugin.utils.warning_message import WarningMessage +from snowflake.snowpark.modin.utils import ( + _inherit_docstrings, + doc_replace_dataframe_with_link, +) _CONSTRUCTOR_DEFAULTS = { "freq": lib.no_default, @@ -65,6 +69,9 @@ } +@_inherit_docstrings( + native_pd.DatetimeIndex, modify_doc=doc_replace_dataframe_with_link +) class DatetimeIndex(Index): # Equivalent index type in native pandas @@ -85,54 +92,6 @@ def __new__( name: Hashable | None = _CONSTRUCTOR_DEFAULTS["name"], query_compiler: SnowflakeQueryCompiler = None, ) -> DatetimeIndex: - """ - Create new instance of DatetimeIndex. This overrides behavior of Index.__new__. - - Parameters - ---------- - data : array-like (1-dimensional), pandas.Index, modin.pandas.Series, optional - Datetime-like data to construct index with. - freq : str or pandas offset object, optional - One of pandas date offset strings or corresponding objects. The string - 'infer' can be passed in order to set the frequency of the index as the - inferred frequency upon creation. - tz : pytz.timezone or dateutil.tz.tzfile or datetime.tzinfo or str - Set the Timezone of the data. - normalize : bool, default False - Normalize start/end dates to midnight before generating date range. - closed : {'left', 'right'}, optional - Set whether to include `start` and `end` that are on the - boundary. The default includes boundary points on either end. - ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' - When clocks moved backward due to DST, ambiguous times may arise. - For example in Central European Time (UTC+01), when going from 03:00 - DST to 02:00 non-DST, 02:30:00 local time occurs both at 00:30:00 UTC - and at 01:30:00 UTC. In such a situation, the `ambiguous` parameter - dictates how ambiguous times should be handled. - - - 'infer' will attempt to infer fall dst-transition hours based on - order - - bool-ndarray where True signifies a DST time, False signifies a - non-DST time (note that this flag is only applicable for ambiguous - times) - - 'NaT' will return NaT where there are ambiguous times - - 'raise' will raise an AmbiguousTimeError if there are ambiguous times. - dayfirst : bool, default False - If True, parse dates in `data` with the day first order. - yearfirst : bool, default False - If True parse dates in `data` with the year first order. - dtype : numpy.dtype or DatetimeTZDtype or str, default None - Note that the only NumPy dtype allowed is `datetime64[ns]`. - copy : bool, default False - Make a copy of input ndarray. - name : label, default None - Name to be stored in the index. - query_compiler : SnowflakeQueryCompiler, optional - A query compiler object to create the ``Index`` from. - - Returns: - New instance of DatetimeIndex. - """ if query_compiler: # Raise error if underlying type is not a TimestampType. 
if not query_compiler.is_datetime64_any_dtype(idx=0, is_index=True): @@ -178,74 +137,11 @@ def __init__( name: Hashable | None = _CONSTRUCTOR_DEFAULTS["name"], query_compiler: SnowflakeQueryCompiler = None, ) -> None: - """ - Immutable ndarray-like of datetime64 data. - - Parameters - ---------- - data : array-like (1-dimensional), pandas.Index, modin.pandas.Series, optional - Datetime-like data to construct index with. - freq : str or pandas offset object, optional - One of pandas date offset strings or corresponding objects. The string - 'infer' can be passed in order to set the frequency of the index as the - inferred frequency upon creation. - tz : pytz.timezone or dateutil.tz.tzfile or datetime.tzinfo or str - Set the Timezone of the data. - normalize : bool, default False - Normalize start/end dates to midnight before generating date range. - closed : {'left', 'right'}, optional - Set whether to include `start` and `end` that are on the - boundary. The default includes boundary points on either end. - ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' - When clocks moved backward due to DST, ambiguous times may arise. - For example in Central European Time (UTC+01), when going from 03:00 - DST to 02:00 non-DST, 02:30:00 local time occurs both at 00:30:00 UTC - and at 01:30:00 UTC. In such a situation, the `ambiguous` parameter - dictates how ambiguous times should be handled. - - - 'infer' will attempt to infer fall dst-transition hours based on - order - - bool-ndarray where True signifies a DST time, False signifies a - non-DST time (note that this flag is only applicable for ambiguous - times) - - 'NaT' will return NaT where there are ambiguous times - - 'raise' will raise an AmbiguousTimeError if there are ambiguous times. - dayfirst : bool, default False - If True, parse dates in `data` with the day first order. - yearfirst : bool, default False - If True parse dates in `data` with the year first order. - dtype : numpy.dtype or DatetimeTZDtype or str, default None - Note that the only NumPy dtype allowed is `datetime64[ns]`. - copy : bool, default False - Make a copy of input ndarray. - name : label, default None - Name to be stored in the index. - query_compiler : SnowflakeQueryCompiler, optional - A query compiler object to create the ``Index`` from. - - Examples - -------- - >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"], tz="America/Los_Angeles") - >>> idx - DatetimeIndex(['2020-01-01 02:00:00-08:00', '2020-02-01 03:00:00-08:00'], dtype='datetime64[ns, UTC-08:00]', freq=None) - """ # DatetimeIndex is already initialized in __new__ method. We keep this method # only for docstring generation. + pass # pragma: no cover def _dt_property(self, property_name: str) -> Index: - """ - Get the datetime property. - - Parameters - ---------- - property_name : str - The name of the datetime property. - - Returns - ------- - Index - The datetime property. - """ if property_name in ( "date", "time", @@ -268,206 +164,42 @@ def _dt_property(self, property_name: str) -> Index: @property def year(self) -> Index: - """ - The year of the datetime. - - Returns - ------- - An Index with the year of the datetime. 
- - Examples - -------- - >>> idx = pd.date_range("2000-01-01", periods=3, freq="YE") - >>> idx - DatetimeIndex(['2000-12-31', '2001-12-31', '2002-12-31'], dtype='datetime64[ns]', freq=None) - >>> idx.year - Index([2000, 2001, 2002], dtype='int64') - """ return self._dt_property("year") @property def month(self) -> Index: - """ - The month as January=1, December=12. - - Returns - ------- - An Index with the month of the datetime. - - Examples - -------- - >>> idx = pd.date_range("2000-01-01", periods=3, freq="ME") - >>> idx - DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31'], dtype='datetime64[ns]', freq=None) - >>> idx.month - Index([1, 2, 3], dtype='int64') - """ return self._dt_property("month") @property def day(self) -> Index: - """ - The day of the datetime. - - Returns - ------- - An Index with the day of the datetime. - - Examples - -------- - >>> idx = pd.date_range("2000-01-01", periods=3, freq="D") - >>> idx - DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'], dtype='datetime64[ns]', freq=None) - >>> idx.day - Index([1, 2, 3], dtype='int64') - """ return self._dt_property("day") @property def hour(self) -> Index: - """ - The hours of the datetime. - - Returns - ------- - An Index with the hours of the datetime. - - Examples - -------- - >>> idx = pd.date_range("2000-01-01", periods=3, freq="h") - >>> idx - DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:00:00', - '2000-01-01 02:00:00'], - dtype='datetime64[ns]', freq=None) - >>> idx.hour - Index([0, 1, 2], dtype='int64') - """ return self._dt_property("hour") @property def minute(self) -> Index: - """ - The minutes of the datetime. - - Returns - ------- - An Index with the minutes of the datetime. - - Examples - -------- - >>> idx = pd.date_range("2000-01-01", periods=3, freq="min") - >>> idx - DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00', - '2000-01-01 00:02:00'], - dtype='datetime64[ns]', freq=None) - >>> idx.minute - Index([0, 1, 2], dtype='int64') - """ return self._dt_property("minute") @property def second(self) -> Index: - """ - The seconds of the datetime. - - Returns - ------- - An Index with the seconds of the datetime. - - Examples - -------- - >>> idx = pd.date_range("2000-01-01", periods=3, freq="s") - >>> idx - DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:00:01', - '2000-01-01 00:00:02'], - dtype='datetime64[ns]', freq=None) - >>> idx.second - Index([0, 1, 2], dtype='int64') - """ return self._dt_property("second") @property def microsecond(self) -> Index: - """ - The microseconds of the datetime. - - Returns - ------- - An Index with the microseconds of the datetime. - - Examples - -------- - >>> idx = pd.date_range("2000-01-01", periods=3, freq="us") - >>> idx - DatetimeIndex([ '2000-01-01 00:00:00', '2000-01-01 00:00:00.000001', - '2000-01-01 00:00:00.000002'], - dtype='datetime64[ns]', freq=None) - >>> idx.microsecond - Index([0, 1, 2], dtype='int64') - """ return self._dt_property("microsecond") @property def nanosecond(self) -> Index: - """ - The nanoseconds of the datetime. - - Returns - ------- - An Index with the nanoseconds of the datetime. 
- - Examples - -------- - >>> idx = pd.date_range("2000-01-01", periods=3, freq="ns") - >>> idx - DatetimeIndex([ '2000-01-01 00:00:00', - '2000-01-01 00:00:00.000000001', - '2000-01-01 00:00:00.000000002'], - dtype='datetime64[ns]', freq=None) - >>> idx.nanosecond - Index([0, 1, 2], dtype='int64') - """ return self._dt_property("nanosecond") @property def date(self) -> Index: - """ - Returns the date part of Timestamps without time and timezone information. - - Returns - ------- - Returns an Index with the date part of Timestamps. Note this is different - from native pandas which returns a python array. - - Examples - -------- - >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", - ... "2/1/2020 11:00:00+00:00"]) - >>> idx.date - Index([2020-01-01, 2020-02-01], dtype='object') - """ return self._dt_property("date") @property def dayofweek(self) -> Index: - """ - The day of the week with Monday=0, Sunday=6. - - Return the day of the week. It is assumed the week starts on - Monday, which is denoted by 0 and ends on Sunday which is denoted - by 6. This method is available on both Series with datetime - values (using the `dt` accessor) or DatetimeIndex. - - Returns - ------- - An Index Containing integers indicating the day number. - - Examples - -------- - >>> idx = pd.date_range('2016-12-31', '2017-01-08', freq='D') - >>> idx.dayofweek - Index([5, 6, 0, 1, 2, 3, 4, 5, 6], dtype='int64') - """ return self._dt_property("dayofweek") day_of_week = dayofweek @@ -475,553 +207,95 @@ def dayofweek(self) -> Index: @property def dayofyear(self) -> Index: - """ - The ordinal day of the year. - - Returns - ------- - An Index Containing integers indicating the ordinal day of the year. - - Examples - -------- - >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", - ... "2/1/2020 11:00:00+00:00"]) - >>> idx.dayofyear - Index([1, 32], dtype='int64') - """ return self._dt_property("dayofyear") day_of_year = dayofyear @property def quarter(self) -> Index: - """ - The quarter of the date. - - Returns - ------- - An Index Containing quarter of the date. - - Examples - -------- - >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", - ... "2/1/2020 11:00:00+00:00"]) - >>> idx.quarter - Index([1, 1], dtype='int64') - """ return self._dt_property("quarter") @property def is_month_start(self) -> Index: - """ - Indicates whether the date is the first day of the month. - - Returns - ------- - An Index with boolean values. Note this is different from native pandas which - returns a python array. - - See Also - -------- - is_month_end : Similar property indicating the last day of the month. - - Examples - -------- - >>> idx = pd.date_range("2018-02-27", periods=3) - >>> idx.is_month_start - Index([False, False, True], dtype='bool') - """ return self._dt_property("is_month_start") @property def is_month_end(self) -> Index: - """ - Indicates whether the date is the last day of the month. - - Returns - ------- - An Index with boolean values. Note this is different from native pandas which - returns a python array. - - See Also - -------- - is_month_start : Similar property indicating the first day of the month. - - Examples - -------- - - >>> idx = pd.date_range("2018-02-27", periods=3) - >>> idx.is_month_end - Index([False, True, False], dtype='bool') - """ return self._dt_property("is_month_end") @property def is_quarter_start(self) -> Index: - """ - Indicator for whether the date is the first day of a quarter. - - Returns - ------- - An Index with boolean values. 
Note this is different from native pandas which - returns a python array. - - See Also - -------- - is_quarter_end : Similar property indicating the last day of the quarter. - - Examples - -------- - >>> idx = pd.date_range('2017-03-30', periods=4) - >>> idx - DatetimeIndex(['2017-03-30', '2017-03-31', '2017-04-01', '2017-04-02'], dtype='datetime64[ns]', freq=None) - - >>> idx.is_quarter_start - Index([False, False, True, False], dtype='bool') - """ return self._dt_property("is_quarter_start") @property def is_quarter_end(self) -> Index: - """ - Indicator for whether the date is the last day of a quarter. - - Returns - ------- - An Index with boolean values. Note this is different from native pandas which - returns a python array. - - See Also - -------- - is_quarter_start: Similar property indicating the first day of the quarter. - - Examples - -------- - >>> idx = pd.date_range('2017-03-30', periods=4) - >>> idx - DatetimeIndex(['2017-03-30', '2017-03-31', '2017-04-01', '2017-04-02'], dtype='datetime64[ns]', freq=None) - - >>> idx.is_quarter_end - Index([False, True, False, False], dtype='bool') - """ return self._dt_property("is_quarter_end") @property def is_year_start(self) -> Index: - """ - Indicate whether the date is the first day of a year. - - Returns - ------- - An Index with boolean values. Note this is different from native pandas which - returns a python array. - - See Also - -------- - is_year_end : Similar property indicating the last day of the year. - - Examples - -------- - >>> idx = pd.date_range("2017-12-30", periods=3) - >>> idx - DatetimeIndex(['2017-12-30', '2017-12-31', '2018-01-01'], dtype='datetime64[ns]', freq=None) - - >>> idx.is_year_start - Index([False, False, True], dtype='bool') - """ return self._dt_property("is_year_start") @property def is_year_end(self) -> Index: - """ - Indicate whether the date is the last day of the year. - - Returns - ------- - An Index with boolean values. Note this is different from native pandas which - returns a python array. - - See Also - -------- - is_year_start : Similar property indicating the start of the year. - - Examples - -------- - >>> idx = pd.date_range("2017-12-30", periods=3) - >>> idx - DatetimeIndex(['2017-12-30', '2017-12-31', '2018-01-01'], dtype='datetime64[ns]', freq=None) - - >>> idx.is_year_end - Index([False, True, False], dtype='bool') - """ return self._dt_property("is_year_end") @property def is_leap_year(self) -> Index: - """ - Boolean indicator if the date belongs to a leap year. - - A leap year is a year, which has 366 days (instead of 365) including - 29th of February as an intercalary day. - Leap years are years which are multiples of four except for years - divisible by 100 but not by 400. - - Returns - ------- - An Index with boolean values. Note this is different from native pandas which - returns a python array. - - Examples - -------- - >>> idx = pd.date_range("2012-01-01", "2015-01-01", freq="YE") - >>> idx - DatetimeIndex(['2012-12-31', '2013-12-31', '2014-12-31'], dtype='datetime64[ns]', freq=None) - >>> idx.is_leap_year - Index([True, False, False], dtype='bool') - """ return self._dt_property("is_leap_year") @property def time(self) -> Index: - """ - Returns the time part of the Timestamps. - - Returns - ------- - An Index with the time part of the Timestamps. - - Examples - -------- - >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", - ... 
"2/1/2020 11:00:00+00:00"]) - >>> idx.time - Index([10:00:00, 11:00:00], dtype='object') - """ return self._dt_property("time") @datetime_index_not_implemented() @property def timetz(self) -> Index: - """ - Returns the time part of the Timestamps with timezone. - - Returns - ------- - An Index with the time part with timezone of the Timestamps. - - Examples - -------- - >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", - ... "2/1/2020 11:00:00+00:00"]) - >>> idx.timetz # doctest: +SKIP - Index(["10:00:00+00:00", "11:00:00+00:00"], dtype='object') - """ + pass # pragma: no cover @datetime_index_not_implemented() @property def tz(self) -> tzinfo | None: - """ - Return the timezone. - - Returns - ------- - datetime.tzinfo, pytz.tzinfo.BaseTZInfo, dateutil.tz.tz.tzfile, or None - Returns None when the array is tz-naive. - - Examples - -------- - >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", - ... "2/1/2020 11:00:00+00:00"]) - >>> idx.tz # doctest: +SKIP - datetime.timezone.utc - """ + pass # pragma: no cover @datetime_index_not_implemented() @property def freq(self) -> str | None: - """ - Return the frequency object if it's set, otherwise None. - - Examples - -------- - >>> idx = pd.date_range("2000-01-01", periods=3, freq="YE") - >>> idx.freq # doctest: +SKIP - - """ + pass # pragma: no cover @datetime_index_not_implemented() @property def freqstr(self) -> str | None: - """ - Return the frequency object as a string if it's set, otherwise None. - - Examples - -------- - >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00"], freq="D") - >>> idx.freqstr # doctest: +SKIP - 'D' - - The frequency can be inferred if there are more than 2 points: - - >>> idx = pd.DatetimeIndex(["2018-01-01", "2018-01-03", "2018-01-05"], - ... freq="infer") - >>> idx.freqstr # doctest: +SKIP - '2D' - """ + pass # pragma: no cover @datetime_index_not_implemented() @property def inferred_freq(self) -> str | None: - """ - Tries to return a string representing a frequency generated by infer_freq. - - Returns None if it can't autodetect the frequency. - - Examples - -------- - >>> idx = pd.DatetimeIndex(["2018-01-01", "2018-01-03", "2018-01-05"]) - >>> idx.inferred_freq # doctest: +SKIP - '2D' - """ + pass # pragma: no cover @datetime_index_not_implemented() def indexer_at_time(self, time, asof: bool = False) -> np.ndarray[np.intp]: - """ - Return index locations of values at particular time of day. - - Parameters - ---------- - time : datetime.time or str - Time passed in either as object (datetime.time) or as string in - appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p", - "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p"). - - Returns - ------- - np.ndarray[np.intp] - - See Also - -------- - indexer_between_time : Get index locations of values between particular - times of day. - DataFrame.at_time : Select values at particular time of day. - - Examples - -------- - >>> idx = pd.DatetimeIndex(["1/1/2020 10:00", "2/1/2020 11:00", - ... "3/1/2020 10:00"]) - >>> idx.indexer_at_time("10:00") # doctest: +SKIP - array([0, 2]) - """ + pass # pragma: no cover @datetime_index_not_implemented() def indexer_between_time( self, start_time, end_time, include_start: bool = True, include_end: bool = True ) -> np.ndarray[np.intp]: - """ - Return index locations of values between particular times of day. 
- - Parameters - ---------- - start_time, end_time : datetime.time, str - Time passed either as object (datetime.time) or as string in - appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p", - "%H:%M:%S", "%H%M%S", "%I:%M:%S%p","%I%M%S%p"). - include_start : bool, default True - include_end : bool, default True - - Returns - ------- - np.ndarray[np.intp] - - See Also - -------- - indexer_at_time : Get index locations of values at particular time of day. - DataFrame.between_time : Select values between particular times of day. - - Examples - -------- - >>> idx = pd.date_range("2023-01-01", periods=4, freq="h") - >>> idx - DatetimeIndex(['2023-01-01 00:00:00', '2023-01-01 01:00:00', - '2023-01-01 02:00:00', '2023-01-01 03:00:00'], - dtype='datetime64[ns]', freq=None) - >>> idx.indexer_between_time("00:00", "2:00", include_end=False) # doctest: +SKIP - array([0, 1]) - """ + pass # pragma: no cover def normalize(self) -> DatetimeIndex: - """ - Convert times to midnight. - - The time component of the date-time is converted to midnight i.e. - 00:00:00. This is useful in cases, when the time does not matter. - Length is unaltered. The timezones are unaffected. - - This method is available on Series with datetime values under - the ``.dt`` accessor, and directly on Datetime Array/Index. - - Returns - ------- - DatetimeArray, DatetimeIndex or Series - The same type as the original data. Series will have the same - name and index. DatetimeIndex will have the same name. - - See Also - -------- - floor : Floor the datetimes to the specified freq. - ceil : Ceil the datetimes to the specified freq. - round : Round the datetimes to the specified freq. - - Examples - -------- - >>> idx = pd.date_range(start='2014-08-01 10:00', freq='h', - ... periods=3, tz='Asia/Calcutta') # doctest: +SKIP - >>> idx # doctest: +SKIP - DatetimeIndex(['2014-08-01 10:00:00+05:30', - '2014-08-01 11:00:00+05:30', - '2014-08-01 12:00:00+05:30'], - dtype='datetime64[ns, Asia/Calcutta]', freq=None) - >>> idx.normalize() # doctest: +SKIP - DatetimeIndex(['2014-08-01 00:00:00+05:30', - '2014-08-01 00:00:00+05:30', - '2014-08-01 00:00:00+05:30'], - dtype='datetime64[ns, Asia/Calcutta]', freq=None) - """ return DatetimeIndex( query_compiler=self._query_compiler.dt_normalize(include_index=True) ) @datetime_index_not_implemented() def strftime(self, date_format: str) -> np.ndarray[np.object_]: - """ - Convert to Index using specified date_format. - - Return an Index of formatted strings specified by date_format, which - supports the same string format as the python standard library. Details - of the string format can be found in `python string format - doc <%(URL)s>`__. - - Formats supported by the C `strftime` API but not by the python string format - doc (such as `"%%R"`, `"%%r"`) are not officially supported and should be - preferably replaced with their supported equivalents (such as `"%%H:%%M"`, - `"%%I:%%M:%%S %%p"`). - - Note that `PeriodIndex` support additional directives, detailed in - `Period.strftime`. - - Parameters - ---------- - date_format : str - Date format string (e.g. "%%Y-%%m-%%d"). - - Returns - ------- - ndarray[object] - NumPy ndarray of formatted strings. - - See Also - -------- - to_datetime : Convert the given argument to datetime. - DatetimeIndex.normalize : Return DatetimeIndex with times to midnight. - DatetimeIndex.round : Round the DatetimeIndex to the specified freq. - DatetimeIndex.floor : Floor the DatetimeIndex to the specified freq. - Timestamp.strftime : Format a single Timestamp. 
- Period.strftime : Format a single Period. - - Examples - -------- - >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"), - ... periods=3, freq=None) - >>> rng.strftime('%%B %%d, %%Y, %%r') # doctest: +SKIP - Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM', - 'March 10, 2018, 09:00:02 AM'], - dtype='object') - """ + pass # pragma: no cover @datetime_index_not_implemented() def snap(self, freq: Frequency = "S") -> DatetimeIndex: - """ - Snap time stamps to nearest occurring frequency. - - Returns - ------- - DatetimeIndex - - Examples - -------- - >>> idx = pd.DatetimeIndex(['2023-01-01', '2023-01-02', - ... '2023-02-01', '2023-02-02']) - >>> idx - DatetimeIndex(['2023-01-01', '2023-01-02', '2023-02-01', '2023-02-02'], dtype='datetime64[ns]', freq=None) - >>> idx.snap('MS') # doctest: +SKIP - DatetimeIndex(['2023-01-01', '2023-01-01', '2023-02-01', '2023-02-01'], dtype='datetime64[ns]', freq=None) - """ + pass # pragma: no cover def tz_convert(self, tz) -> DatetimeIndex: - """ - Convert tz-aware Datetime Array/Index from one time zone to another. - - Parameters - ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None - Time zone for time. Corresponding timestamps would be converted - to this time zone of the Datetime Array/Index. A `tz` of None will - convert to UTC and remove the timezone information. - - Returns - ------- - Array or Index - - Raises - ------ - TypeError - If Datetime Array/Index is tz-naive. - - See Also - -------- - DatetimeIndex.tz : A timezone that has a variable offset from UTC. - DatetimeIndex.tz_localize : Localize tz-naive DatetimeIndex to a - given time zone, or remove timezone from a tz-aware DatetimeIndex. - - Examples - -------- - With the `tz` parameter, we can change the DatetimeIndex - to other time zones: - - >>> dti = pd.date_range(start='2014-08-01 09:00', - ... freq='h', periods=3, tz='Europe/Berlin') # doctest: +SKIP - - >>> dti # doctest: +SKIP - DatetimeIndex(['2014-08-01 09:00:00+02:00', - '2014-08-01 10:00:00+02:00', - '2014-08-01 11:00:00+02:00'], - dtype='datetime64[ns, Europe/Berlin]', freq=None) - - >>> dti.tz_convert('US/Central') # doctest: +SKIP - DatetimeIndex(['2014-08-01 02:00:00-05:00', - '2014-08-01 03:00:00-05:00', - '2014-08-01 04:00:00-05:00'], - dtype='datetime64[ns, US/Central]', freq='h') - - With the ``tz=None``, we can remove the timezone (after converting - to UTC if necessary): - - >>> dti = pd.date_range(start='2014-08-01 09:00', freq='h', - ... periods=3, tz='Europe/Berlin') # doctest: +SKIP - - >>> dti # doctest: +SKIP - DatetimeIndex(['2014-08-01 09:00:00+02:00', - '2014-08-01 10:00:00+02:00', - '2014-08-01 11:00:00+02:00'], - dtype='datetime64[ns, Europe/Berlin]', freq=None) - - >>> dti.tz_convert(None) # doctest: +SKIP - DatetimeIndex(['2014-08-01 07:00:00', - '2014-08-01 08:00:00', - '2014-08-01 09:00:00'], - dtype='datetime64[ns]', freq='h') - """ # TODO (SNOW-1660843): Support tz in pd.date_range and unskip the doctests. return DatetimeIndex( query_compiler=self._query_compiler.dt_tz_convert( @@ -1036,91 +310,6 @@ def tz_localize( ambiguous: TimeAmbiguous = "raise", nonexistent: TimeNonexistent = "raise", ) -> DatetimeIndex: - """ - Localize tz-naive Datetime Array/Index to tz-aware Datetime Array/Index. - - This method takes a time zone (tz) naive Datetime Array/Index object - and makes this time zone aware. It does not move the time to another - time zone. 
- - This method can also be used to do the inverse -- to create a time - zone unaware object from an aware object. To that end, pass `tz=None`. - - Parameters - ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None - Time zone to convert timestamps to. Passing ``None`` will - remove the time zone information preserving local time. - ambiguous : 'infer', 'NaT', bool array, default 'raise' - When clocks moved backward due to DST, ambiguous times may arise. - For example in Central European Time (UTC+01), when going from - 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at - 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the - `ambiguous` parameter dictates how ambiguous times should be - handled. - - - 'infer' will attempt to infer fall dst-transition hours based on - order - - bool-ndarray where True signifies a DST time, False signifies a - non-DST time (note that this flag is only applicable for - ambiguous times) - - 'NaT' will return NaT where there are ambiguous times - - 'raise' will raise an AmbiguousTimeError if there are ambiguous - times. - - nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ -default 'raise' - A nonexistent time does not exist in a particular timezone - where clocks moved forward due to DST. - - - 'shift_forward' will shift the nonexistent time forward to the - closest existing time - - 'shift_backward' will shift the nonexistent time backward to the - closest existing time - - 'NaT' will return NaT where there are nonexistent times - - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times. - - Returns - ------- - Same type as self - Array/Index converted to the specified time zone. - - Raises - ------ - TypeError - If the Datetime Array/Index is tz-aware and tz is not None. - - See Also - -------- - DatetimeIndex.tz_convert : Convert tz-aware DatetimeIndex from - one time zone to another. - - Examples - -------- - >>> tz_naive = pd.date_range('2018-03-01 09:00', periods=3) - >>> tz_naive - DatetimeIndex(['2018-03-01 09:00:00', '2018-03-02 09:00:00', - '2018-03-03 09:00:00'], - dtype='datetime64[ns]', freq=None) - - Localize DatetimeIndex in US/Eastern time zone: - - >>> tz_aware = tz_naive.tz_localize(tz='US/Eastern') - >>> tz_aware - DatetimeIndex(['2018-03-01 09:00:00-05:00', '2018-03-02 09:00:00-05:00', - '2018-03-03 09:00:00-05:00'], - dtype='datetime64[ns, UTC-05:00]', freq=None) - - With the ``tz=None``, we can remove the time zone information - while keeping the local time (not converted to UTC): - - >>> tz_aware.tz_localize(None) - DatetimeIndex(['2018-03-01 09:00:00', '2018-03-02 09:00:00', - '2018-03-03 09:00:00'], - dtype='datetime64[ns]', freq=None) - """ # TODO (SNOW-1660843): Support tz in pd.date_range and unskip the doctests. return DatetimeIndex( query_compiler=self._query_compiler.dt_tz_localize( @@ -1134,65 +323,6 @@ def tz_localize( def round( self, freq: Frequency, ambiguous: str = "raise", nonexistent: str = "raise" ) -> DatetimeIndex: - """ - Perform round operation on the data to the specified `freq`. - - Parameters - ---------- - freq : str or Offset - The frequency level to {op} the index to. Must be a fixed - frequency like 'S' (second) not 'ME' (month end). See - frequency aliases for a list of possible `freq` values. - ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' - This parameter is only supported for 'raise'. 
- Only relevant for DatetimeIndex: - - - 'infer' will attempt to infer fall dst-transition hours based on - order - - bool-ndarray where True signifies a DST time, False designates - a non-DST time (note that this flag is only applicable for - ambiguous times) - - 'NaT' will return NaT where there are ambiguous times - - 'raise' will raise an AmbiguousTimeError if there are ambiguous - times. - - nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, default 'raise' - This parameter is only supported for 'raise'. - A nonexistent time does not exist in a particular timezone - where clocks moved forward due to DST. - - - 'shift_forward' will shift the nonexistent time forward to the - closest existing time - - 'shift_backward' will shift the nonexistent time backward to the - closest existing time - - 'NaT' will return NaT where there are nonexistent times - - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times. - - Returns - ------- - DatetimeIndex with round values. - - Raises - ------ - ValueError if the `freq` cannot be converted. - - Examples - -------- - **DatetimeIndex** - - >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') - >>> rng - DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', - '2018-01-01 12:01:00'], - dtype='datetime64[ns]', freq=None) - - >>> rng.round('h') - DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', - '2018-01-01 12:00:00'], - dtype='datetime64[ns]', freq=None) - """ return DatetimeIndex( query_compiler=self._query_compiler.dt_round( freq, ambiguous, nonexistent, include_index=True @@ -1202,65 +332,6 @@ def round( def floor( self, freq: Frequency, ambiguous: str = "raise", nonexistent: str = "raise" ) -> DatetimeIndex: - """ - Perform floor operation on the data to the specified `freq`. - - Parameters - ---------- - freq : str or Offset - The frequency level to {op} the index to. Must be a fixed - frequency like 'S' (second) not 'ME' (month end). See - frequency aliases for a list of possible `freq` values. - ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' - This parameter is only supported for 'raise'. - Only relevant for DatetimeIndex: - - - 'infer' will attempt to infer fall dst-transition hours based on - order - - bool-ndarray where True signifies a DST time, False designates - a non-DST time (note that this flag is only applicable for - ambiguous times) - - 'NaT' will return NaT where there are ambiguous times - - 'raise' will raise an AmbiguousTimeError if there are ambiguous - times. - - nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, default 'raise' - This parameter is only supported for 'raise'. - A nonexistent time does not exist in a particular timezone - where clocks moved forward due to DST. - - - 'shift_forward' will shift the nonexistent time forward to the - closest existing time - - 'shift_backward' will shift the nonexistent time backward to the - closest existing time - - 'NaT' will return NaT where there are nonexistent times - - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times. - - Returns - ------- - DatetimeIndex with floor values. - - Raises - ------ - ValueError if the `freq` cannot be converted. 
- - Examples - -------- - **DatetimeIndex** - - >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') - >>> rng - DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', - '2018-01-01 12:01:00'], - dtype='datetime64[ns]', freq=None) - - >>> rng.floor('h') - DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00', - '2018-01-01 12:00:00'], - dtype='datetime64[ns]', freq=None) - """ return DatetimeIndex( query_compiler=self._query_compiler.dt_floor( freq, ambiguous, nonexistent, include_index=True @@ -1270,65 +341,6 @@ def floor( def ceil( self, freq: Frequency, ambiguous: str = "raise", nonexistent: str = "raise" ) -> DatetimeIndex: - """ - Perform ceil operation on the data to the specified `freq`. - - Parameters - ---------- - freq : str or Offset - The frequency level to {op} the index to. Must be a fixed - frequency like 'S' (second) not 'ME' (month end). See - frequency aliases for a list of possible `freq` values. - ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' - This parameter is only supported for 'raise'. - Only relevant for DatetimeIndex: - - - 'infer' will attempt to infer fall dst-transition hours based on - order - - bool-ndarray where True signifies a DST time, False designates - a non-DST time (note that this flag is only applicable for - ambiguous times) - - 'NaT' will return NaT where there are ambiguous times - - 'raise' will raise an AmbiguousTimeError if there are ambiguous - times. - - nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, default 'raise' - This parameter is only supported for 'raise'. - A nonexistent time does not exist in a particular timezone - where clocks moved forward due to DST. - - - 'shift_forward' will shift the nonexistent time forward to the - closest existing time - - 'shift_backward' will shift the nonexistent time backward to the - closest existing time - - 'NaT' will return NaT where there are nonexistent times - - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times. - - Returns - ------- - DatetimeIndex with ceil values. - - Raises - ------ - ValueError if the `freq` cannot be converted. - - Examples - -------- - **DatetimeIndex** - - >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') - >>> rng - DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', - '2018-01-01 12:01:00'], - dtype='datetime64[ns]', freq=None) - - >>> rng.ceil('h') - DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', - '2018-01-01 13:00:00'], - dtype='datetime64[ns]', freq=None) - """ return DatetimeIndex( query_compiler=self._query_compiler.dt_ceil( freq, ambiguous, nonexistent, include_index=True @@ -1336,39 +348,6 @@ def ceil( ) def month_name(self, locale: str = None) -> Index: - """ - Return the month names with specified locale. - - Parameters - ---------- - locale : str, optional - Locale determining the language in which to return the month name. - Default is English locale (``'en_US.utf8'``). Use the command - ``locale -a`` on your terminal on Unix systems to find your locale - language code. - - Returns - ------- - Index of month names. 
- - Examples - -------- - >>> idx = pd.date_range(start='2018-01', freq='ME', periods=3) - >>> idx - DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], dtype='datetime64[ns]', freq=None) - >>> idx.month_name() - Index(['January', 'February', 'March'], dtype='object') - - Using the ``locale`` parameter you can set a different locale language, - for example: ``idx.month_name(locale='pt_BR.utf8')`` will return month - names in Brazilian Portuguese language. - - >>> idx = pd.date_range(start='2018-01', freq='ME', periods=3) - >>> idx - DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], dtype='datetime64[ns]', freq=None) - >>> idx.month_name(locale='pt_BR.utf8') # doctest: +SKIP - Index(['Janeiro', 'Fevereiro', 'Março'], dtype='object') - """ return Index( query_compiler=self._query_compiler.dt_month_name( locale=locale, include_index=True @@ -1376,39 +355,6 @@ def month_name(self, locale: str = None) -> Index: ) def day_name(self, locale: str = None) -> Index: - """ - Return the day names with specified locale. - - Parameters - ---------- - locale : str, optional - Locale determining the language in which to return the day name. - Default is English locale (``'en_US.utf8'``). Use the command - ``locale -a`` on your terminal on Unix systems to find your locale - language code. - - Returns - ------- - Index of day names. - - Examples - -------- - >>> idx = pd.date_range(start='2018-01-01', freq='D', periods=3) - >>> idx - DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], dtype='datetime64[ns]', freq=None) - >>> idx.day_name() - Index(['Monday', 'Tuesday', 'Wednesday'], dtype='object') - - Using the ``locale`` parameter you can set a different locale language, - for example: ``idx.day_name(locale='pt_BR.utf8')`` will return day - names in Brazilian Portuguese language. - - >>> idx = pd.date_range(start='2018-01-01', freq='D', periods=3) - >>> idx - DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], dtype='datetime64[ns]', freq=None) - >>> idx.day_name(locale='pt_BR.utf8') # doctest: +SKIP - Index(['Segunda', 'Terça', 'Quarta'], dtype='object') - """ return Index( query_compiler=self._query_compiler.dt_day_name( locale=locale, include_index=True @@ -1417,124 +363,19 @@ def day_name(self, locale: str = None) -> Index: @datetime_index_not_implemented() def as_unit(self, unit: str) -> DatetimeIndex: - """ - Convert to a dtype with the given unit resolution. - - Parameters - ---------- - unit : {'s', 'ms', 'us', 'ns'} - - Returns - ------- - same type as self - - Examples - -------- - >>> idx = pd.DatetimeIndex(['2020-01-02 01:02:03.004005006']) - >>> idx - DatetimeIndex(['2020-01-02 01:02:03.004005006'], dtype='datetime64[ns]', freq=None) - >>> idx.as_unit('s') # doctest: +SKIP - DatetimeIndex(['2020-01-02 01:02:03'], dtype='datetime64[s]', freq=None) - """ + pass # pragma: no cover @datetime_index_not_implemented() def to_period(self, freq=None) -> Index: - """ - Cast to PeriodArray/PeriodIndex at a particular frequency. - - Converts DatetimeArray/Index to PeriodArray/PeriodIndex. - - Parameters - ---------- - freq : str or Period, optional - One of pandas' period aliases or a Period object. - Will be inferred by default. - - Returns - ------- - PeriodArray/PeriodIndex - - Raises - ------ - ValueError - When converting a DatetimeArray/Index with non-regular values, - so that a frequency cannot be inferred. - - See Also - -------- - PeriodIndex: Immutable ndarray holding ordinal values. - DatetimeIndex.to_pydatetime: Return DatetimeIndex as object. 
- - Examples - -------- - >>> df = pd.DataFrame({"y": [1, 2, 3]}, - ... index=pd.to_datetime(["2000-03-31 00:00:00", - ... "2000-05-31 00:00:00", - ... "2000-08-31 00:00:00"])) - >>> df.index.to_period("M") # doctest: +SKIP - PeriodIndex(['2000-03', '2000-05', '2000-08'], - dtype='period[M]') - - Infer the daily frequency - - >>> idx = pd.date_range("2017-01-01", periods=2) - >>> idx.to_period() # doctest: +SKIP - PeriodIndex(['2017-01-01', '2017-01-02'], dtype='period[D]') - """ + pass # pragma: no cover @datetime_index_not_implemented() def to_pydatetime(self) -> np.ndarray: - """ - Return a ndarray of ``datetime.datetime`` objects. - - Returns - ------- - numpy.ndarray - - Examples - -------- - >>> idx = pd.date_range('2018-02-27', periods=3) - >>> idx.to_pydatetime() # doctest: +SKIP - array([datetime.datetime(2018, 2, 27, 0, 0), - datetime.datetime(2018, 2, 28, 0, 0), - datetime.datetime(2018, 3, 1, 0, 0)], dtype=object) - """ + pass # pragma: no cover def mean( self, *, skipna: bool = True, axis: AxisInt | None = 0 ) -> native_pd.Timestamp: - """ - Return the mean value of the Array. - - Parameters - ---------- - skipna : bool, default True - Whether to ignore any NaT elements. - axis : int, optional, default 0 - The axis to calculate the mean over. - This parameter is ignored - 0 is the only valid axis. - - Returns - ------- - scalar Timestamp - - See Also - -------- - numpy.ndarray.mean : Returns the average of array elements along a given axis. - Series.mean : Return the mean value in a Series. - - Notes - ----- - mean is only defined for Datetime and Timedelta dtypes, not for Period. - - Examples - -------- - >>> idx = pd.date_range('2001-01-01 00:00', periods=3) - >>> idx - DatetimeIndex(['2001-01-01', '2001-01-02', '2001-01-03'], dtype='datetime64[ns]', freq=None) - >>> idx.mean() - Timestamp('2001-01-02 00:00:00') - """ # Need to convert timestamp to int value (nanoseconds) before aggregating. # TODO: SNOW-1625233 When `tz` is supported, add a `tz` parameter to `to_datetime` for correct timezone result. if axis not in [None, 0]: @@ -1552,44 +393,6 @@ def std( skipna: bool = True, **kwargs, ) -> timedelta: - """ - Return sample standard deviation over requested axis. - - Normalized by `N-1` by default. This can be changed using ``ddof``. - - Parameters - ---------- - axis : int, optional - The axis to calculate the standard deviation over. - This parameter is ignored - 0 is the only valid axis. - ddof : int, default 1 - Degrees of Freedom. The divisor used in calculations is `N - ddof`, - where `N` represents the number of elements. - This parameter is not yet supported. - skipna : bool, default True - Exclude NA/null values. If an entire row/column is ``NA``, the result - will be ``NA``. - - Returns - ------- - Timedelta - - See Also - -------- - numpy.ndarray.std : Returns the standard deviation of the array elements - along given axis. - Series.std : Return sample standard deviation over requested axis. - - Examples - -------- - For :class:`pandas.DatetimeIndex`: - - >>> idx = pd.date_range('2001-01-01 00:00', periods=3) - >>> idx - DatetimeIndex(['2001-01-01', '2001-01-02', '2001-01-03'], dtype='datetime64[ns]', freq=None) - >>> idx.std() - Timedelta('1 days 00:00:00') - """ if axis not in [None, 0]: raise ValueError( f"axis={axis} is not supported, this parameter is ignored. 0 is the only valid axis." 
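Editorial note, not part of the diff: the `datetime_index.py` hunks above keep the native implementations of `normalize`, `tz_convert`, `tz_localize`, `round`, `floor`, `ceil`, `month_name`, `day_name`, `mean` and `std` while stripping the copied pandas docstrings. A hedged sketch of how those retained methods are exercised through Snowpark pandas, assuming an active Snowpark session and the usual modin plugin import:

```python
# Minimal sketch; assumes a Snowpark session is already configured.
import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401  # registers the Snowflake backend

idx = pd.date_range("2018-01-01 11:59:00", periods=3, freq="min")
idx.normalize()   # times set to 00:00:00 (backed by dt_normalize)
idx.round("h")    # dt_round with the requested frequency
idx.floor("h")    # dt_floor
idx.ceil("h")     # dt_ceil
idx.day_name()    # Index of weekday names (dt_day_name)
idx.mean()        # a single pandas Timestamp
idx.std()         # a Timedelta
```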
diff --git a/src/snowflake/snowpark/modin/plugin/extensions/groupby_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/groupby_overrides.py
index fabbd2b23ef..add8e432df1 100644
--- a/src/snowflake/snowpark/modin/plugin/extensions/groupby_overrides.py
+++ b/src/snowflake/snowpark/modin/plugin/extensions/groupby_overrides.py
@@ -1514,8 +1514,10 @@ def nsmallest(self, n=5, keep="first"):
         ErrorMessage.method_not_implemented_error(name="nsmallest", class_="GroupBy")
 
     def unique(self):
-        # TODO: SNOW-1063350: Modin upgrade - modin.pandas.groupby.SeriesGroupBy functions
-        ErrorMessage.method_not_implemented_error(name="unique", class_="GroupBy")
+        return self._wrap_aggregation(
+            type(self._query_compiler).groupby_unique,
+            numeric_only=False,
+        )
 
     def size(self):
         # TODO: Remove this once SNOW-1478924 is fixed
diff --git a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py
index ce9266d0e1f..cc54a5c4723 100644
--- a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py
+++ b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py
@@ -149,11 +149,6 @@ def autocorr(self, lag=1):  # noqa: PR01, RT01, D200
     pass  # pragma: no cover
 
 
-@register_series_not_implemented()
-def between(self, left, right, inclusive: str = "both"):  # noqa: PR01, RT01, D200
-    pass  # pragma: no cover
-
-
 @register_series_not_implemented()
 def corr(self, other, method="pearson", min_periods=None):  # noqa: PR01, RT01, D200
     pass  # pragma: no cover
diff --git a/src/snowflake/snowpark/modin/plugin/utils/numpy_to_pandas.py b/src/snowflake/snowpark/modin/plugin/utils/numpy_to_pandas.py
index ce8c02b7692..026a8ad46cb 100644
--- a/src/snowflake/snowpark/modin/plugin/utils/numpy_to_pandas.py
+++ b/src/snowflake/snowpark/modin/plugin/utils/numpy_to_pandas.py
@@ -6,6 +6,7 @@
 import modin.pandas as pd
 from modin.pandas.base import BasePandasDataset
 from modin.pandas.utils import is_scalar
+import numpy as np
 
 from snowflake.snowpark import functions as sp_func
 from snowflake.snowpark.modin.plugin.utils.warning_message import WarningMessage
@@ -288,3 +289,49 @@ def map_to_bools(inputs: Any) -> Any:
         sp_func.trunc
     ),  # df.truncate not supported in snowpandas yet
 }
+
+
+# Map from a numpy universal (element-wise) function to the corresponding Snowflake function.
+# This is used to map numpy functions to built-in SQL functions when a numpy function is
+# passed to the apply and map methods. For example: df.apply(np.)
+# Using native SQL functions instead of creating a UDF/UDTF provides significantly
+# better performance.
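Editorial note, not part of the diff: the `NUMPY_UNIVERSAL_FUNCTION_TO_SNOWFLAKE_FUNCTION` table that the hunk continues with below is consulted when one of the listed numpy ufuncs is passed to `apply`/`map`, letting the plugin substitute the corresponding SQL builtin instead of compiling a Python UDF. A hedged usage sketch, with illustrative column names and data:

```python
# Hedged sketch; assumes an active Snowpark pandas session. With the mapping in
# place, these calls should be pushed down as SQL builtins (SQRT, LN, FLOOR)
# rather than executed through a UDF.
import numpy as np
import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401

ser = pd.Series([1.0, 4.0, 9.0], name="a")
ser.map(np.sqrt)    # rewritten to SQRT(a)
ser.apply(np.log)   # rewritten to LN(a)

df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
df.map(np.floor)    # element-wise FLOOR via the same lookup table
```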
+NUMPY_UNIVERSAL_FUNCTION_TO_SNOWFLAKE_FUNCTION = { + # Math operations + np.absolute: sp_func.abs, + np.sign: sp_func.sign, + np.negative: sp_func.negate, + np.positive: lambda col: col, + np.sqrt: sp_func.sqrt, + np.square: lambda col: sp_func.builtin("square")(col), + np.cbrt: lambda col: sp_func.builtin("cbrt")(col), + np.reciprocal: lambda col: 1 / col, + np.exp: sp_func.exp, + np.exp2: lambda col: sp_func.pow(2, col), + np.expm1: lambda col: sp_func.exp(col) - 1, + np.log: sp_func.ln, + np.log2: sp_func._log2, + np.log10: sp_func._log10, + np.log1p: lambda col: sp_func.ln(col + 1), + # Trigonometric functions + np.sin: sp_func.sin, + np.cos: sp_func.cos, + np.tan: sp_func.tan, + np.sinh: sp_func.sinh, + np.cosh: sp_func.cosh, + np.tanh: sp_func.tanh, + np.arcsin: lambda col: sp_func.builtin("asin")(col), + np.arccos: lambda col: sp_func.builtin("acos")(col), + np.arctan: lambda col: sp_func.builtin("atan")(col), + np.arctan2: lambda col: sp_func.builtin("atan2")(col), + np.arcsinh: lambda col: sp_func.builtin("asinh")(col), + np.arccosh: lambda col: sp_func.builtin("acosh")(col), + np.arctanh: lambda col: sp_func.builtin("atanh")(col), + np.degrees: lambda col: sp_func.builtin("degrees")(col), + np.radians: lambda col: sp_func.builtin("radians")(col), + # Floating functions + np.ceil: sp_func.ceil, + np.floor: sp_func.floor, + np.trunc: sp_func.trunc, + np.isnan: sp_func.is_null, +} diff --git a/src/snowflake/snowpark/session.py b/src/snowflake/snowpark/session.py index 6aac9832260..e6648528bd5 100644 --- a/src/snowflake/snowpark/session.py +++ b/src/snowflake/snowpark/session.py @@ -134,6 +134,7 @@ zip_file_or_directory_to_stream, ) from snowflake.snowpark.async_job import AsyncJob +from snowflake.snowpark.catalog import Catalog from snowflake.snowpark.column import Column from snowflake.snowpark.context import ( _is_execution_environment_sandboxed_for_client, @@ -656,6 +657,7 @@ def __init__( self._runtime_version_from_requirement: str = None self._temp_table_auto_cleaner: TempTableAutoCleaner = TempTableAutoCleaner(self) self._sp_profiler = StoredProcedureProfiler(session=self) + self._catalog = None self._ast_batch = AstBatch(self) @@ -735,6 +737,19 @@ def get_active_session(cls) -> Optional["Session"]: getActiveSession = get_active_session + @property + @experimental(version="1.27.0") + def catalog(self) -> Catalog: + """Returns the catalog object.""" + if self._catalog is None: + if isinstance(self._conn, MockServerConnection): + self._conn.log_not_supported_error( + external_feature_name="Session.catalog", + raise_error=NotImplementedError, + ) + self._catalog = Catalog(self) + return self._catalog + def close(self) -> None: """Close this session.""" if is_in_stored_procedure(): diff --git a/src/snowflake/snowpark/types.py b/src/snowflake/snowpark/types.py index 06bcc8969b5..333fc580f60 100644 --- a/src/snowflake/snowpark/types.py +++ b/src/snowflake/snowpark/types.py @@ -16,6 +16,7 @@ # Use correct version from here: from snowflake.snowpark._internal.utils import installed_pandas, pandas, quote_name +import snowflake.snowpark.context as context # TODO: connector installed_pandas is broken. If pyarrow is not installed, but pandas is this function returns the wrong answer. # The core issue is that in the connector detection of both pandas/arrow are mixed, which is wrong. 
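Editorial note on the experimental `Session.catalog` property added in the `session.py` hunk above: it is constructed lazily, cached on the session, and raises `NotImplementedError` under the local-testing `MockServerConnection`. A minimal, hedged access sketch (connection parameters are placeholders):

```python
# Hedged sketch only; substitute real connection parameters before running.
from snowflake.snowpark import Session

connection_parameters = {
    "account": "<account>",
    "user": "<user>",
    "password": "<password>",
}
session = Session.builder.configs(connection_parameters).create()
catalog = session.catalog  # lazily builds and caches a Catalog instance
```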
@@ -341,6 +342,14 @@ def __init__( def __repr__(self) -> str: return f"ArrayType({repr(self.element_type) if self.element_type else ''})" + def _as_nested(self) -> "ArrayType": + if not context._should_use_structured_type_semantics: + return self + element_type = self.element_type + if isinstance(element_type, (ArrayType, MapType, StructType)): + element_type = element_type._as_nested() + return ArrayType(element_type, self.structured) + def is_primitive(self): return False @@ -391,6 +400,14 @@ def __repr__(self) -> str: def is_primitive(self): return False + def _as_nested(self) -> "MapType": + if not context._should_use_structured_type_semantics: + return self + value_type = self.value_type + if isinstance(value_type, (ArrayType, MapType, StructType)): + value_type = value_type._as_nested() + return MapType(self.key_type, value_type, self.structured) + @classmethod def from_json(cls, json_dict: Dict[str, Any]) -> "MapType": return MapType( @@ -552,29 +569,46 @@ def __init__( column_identifier: Union[ColumnIdentifier, str], datatype: DataType, nullable: bool = True, + _is_column: bool = True, ) -> None: - self.column_identifier = ( - ColumnIdentifier(column_identifier) - if isinstance(column_identifier, str) - else column_identifier - ) + self.name = column_identifier + self._is_column = _is_column self.datatype = datatype self.nullable = nullable @property def name(self) -> str: - """Returns the column name.""" - return self.column_identifier.name + if self._is_column or not context._should_use_structured_type_semantics: + return self.column_identifier.name + else: + return self._name @name.setter - def name(self, n: str) -> None: - self.column_identifier = ColumnIdentifier(n) + def name(self, n: Union[ColumnIdentifier, str]) -> None: + if isinstance(n, ColumnIdentifier): + self._name = n.name + self.column_identifier = n + else: + self._name = n + self.column_identifier = ColumnIdentifier(n) + + def _as_nested(self) -> "StructField": + if not context._should_use_structured_type_semantics: + return self + datatype = self.datatype + if isinstance(datatype, (ArrayType, MapType, StructType)): + datatype = datatype._as_nested() + # Nested StructFields do not follow column naming conventions + return StructField(self._name, datatype, self.nullable, _is_column=False) def __repr__(self) -> str: return f"StructField({self.name!r}, {repr(self.datatype)}, nullable={self.nullable})" def __eq__(self, other): - return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ + return isinstance(other, self.__class__) and ( + (self.name, self._is_column, self.datatype, self.nullable) + == (other.name, other._is_column, other.datatype, other.nullable) + ) @classmethod def from_json(cls, json_dict: Dict[str, Any]) -> "StructField": @@ -620,9 +654,9 @@ def __init__( self, fields: Optional[List["StructField"]] = None, structured=False ) -> None: self.structured = structured - if fields is None: - fields = [] - self.fields = fields + self.fields = [] + for field in fields or []: + self.add(field) def add( self, @@ -630,20 +664,31 @@ def add( datatype: Optional[DataType] = None, nullable: Optional[bool] = True, ) -> "StructType": - if isinstance(field, StructField): - self.fields.append(field) - elif isinstance(field, (str, ColumnIdentifier)): + if isinstance(field, (str, ColumnIdentifier)): if datatype is None: raise ValueError( "When field argument is str or ColumnIdentifier, datatype must not be None." 
) - self.fields.append(StructField(field, datatype, nullable)) - else: + field = StructField(field, datatype, nullable) + elif not isinstance(field, StructField): raise ValueError( f"field argument must be one of str, ColumnIdentifier or StructField. Got: '{type(field)}'" ) + + # Nested data does not follow the same schema conventions as top level fields. + if isinstance(field.datatype, (ArrayType, MapType, StructType)): + field.datatype = field.datatype._as_nested() + + self.fields.append(field) return self + def _as_nested(self) -> "StructType": + if not context._should_use_structured_type_semantics: + return self + return StructType( + [field._as_nested() for field in self.fields], self.structured + ) + @classmethod def _from_attributes(cls, attributes: list) -> "StructType": return cls([StructField(a.name, a.datatype, a.nullable) for a in attributes]) diff --git a/tests/ast/data/DataFrame.unpivot.test b/tests/ast/data/DataFrame.unpivot.test index 3ff1ba8523c..717897beb84 100644 --- a/tests/ast/data/DataFrame.unpivot.test +++ b/tests/ast/data/DataFrame.unpivot.test @@ -10,7 +10,7 @@ df = df.unpivot("sales", "month", ["jan", "feb"]) df = session.create_dataframe([(1, "electronics", 100, 200), (2, "clothes", 100, 300)], schema=["empid", "dept", "jan", "feb"]) -df = df.unpivot("sales", "month", ["jan", "feb"]) +df = df.unpivot("sales", "month", ["jan", "feb"], False) ## EXPECTED ENCODED AST diff --git a/tests/ast/data/Dataframe.to_snowpark_pandas.test b/tests/ast/data/Dataframe.to_snowpark_pandas.test index 7ec7ae0151e..6ab3ab9bf4a 100644 --- a/tests/ast/data/Dataframe.to_snowpark_pandas.test +++ b/tests/ast/data/Dataframe.to_snowpark_pandas.test @@ -35,7 +35,10 @@ body { } } src { + end_column: 41 + end_line: 25 file: "SRC_POSITION_TEST_MODE" + start_column: 13 start_line: 25 } variant { @@ -64,7 +67,10 @@ body { } } src { + end_column: 52 + end_line: 27 file: "SRC_POSITION_TEST_MODE" + start_column: 29 start_line: 27 } } @@ -93,7 +99,10 @@ body { list: "A" } src { + end_column: 65 + end_line: 29 file: "SRC_POSITION_TEST_MODE" + start_column: 29 start_line: 29 } } @@ -123,7 +132,10 @@ body { } } src { + end_column: 70 + end_line: 31 file: "SRC_POSITION_TEST_MODE" + start_column: 29 start_line: 31 } } @@ -156,7 +168,10 @@ body { list: "A" } src { + end_column: 87 + end_line: 33 file: "SRC_POSITION_TEST_MODE" + start_column: 29 start_line: 33 } } @@ -183,5 +198,5 @@ client_language { } client_version { major: 1 - minor: 25 + minor: 26 } diff --git a/tests/ast/data/col_asc.test b/tests/ast/data/col_asc.test index d938cd948bf..90a8d4612e1 100644 --- a/tests/ast/data/col_asc.test +++ b/tests/ast/data/col_asc.test @@ -90,6 +90,9 @@ body { } } } + null_order { + sp_null_order_default: true + } src { end_column: 37 end_line: 29 @@ -163,8 +166,8 @@ body { } } } - nulls_first { - value: true + null_order { + sp_null_order_nulls_first: true } src { end_column: 49 @@ -239,7 +242,8 @@ body { } } } - nulls_first { + null_order { + sp_null_order_nulls_last: true } src { end_column: 48 diff --git a/tests/ast/data/col_desc.test b/tests/ast/data/col_desc.test index 4156c2d0c0b..97d30879d23 100644 --- a/tests/ast/data/col_desc.test +++ b/tests/ast/data/col_desc.test @@ -88,6 +88,9 @@ body { } } } + null_order { + sp_null_order_default: true + } src { end_column: 38 end_line: 27 @@ -161,8 +164,8 @@ body { } } } - nulls_first { - value: true + null_order { + sp_null_order_nulls_first: true } src { end_column: 50 @@ -237,7 +240,8 @@ body { } } } - nulls_first { + null_order { + sp_null_order_nulls_last: true } 
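Editorial aside on the `types.py` hunks above, before the AST test fixtures continue: when structured type semantics are enabled, `StructType.add` now routes nested `ArrayType`/`MapType`/`StructType` datatypes through `_as_nested()`, so inner `StructField` names are kept as plain names rather than normalized column identifiers. A hedged sketch of the public-API surface this touches (field names are illustrative):

```python
# Hedged sketch using only public constructors; whether the inner field name is
# normalized depends on the structured-type-semantics setting discussed above.
from snowflake.snowpark.types import IntegerType, StringType, StructField, StructType

inner = StructType([StructField("camelCaseName", StringType())])
outer = StructType().add("id", IntegerType()).add("payload", inner)
print(outer)
```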
src { end_column: 49 diff --git a/tests/ast/data/col_in_.test b/tests/ast/data/col_in_.test index 4d274a9a283..9fee6e2f51d 100644 --- a/tests/ast/data/col_in_.test +++ b/tests/ast/data/col_in_.test @@ -55,7 +55,7 @@ body { expr { sp_dataframe_select__columns { cols { - sp_column_in__seq { + sp_column_in { col { apply_expr { fn { @@ -157,7 +157,7 @@ body { expr { sp_dataframe_select__columns { cols { - sp_column_in__seq { + sp_column_in { col { apply_expr { fn { @@ -230,7 +230,7 @@ body { expr { sp_dataframe_select__columns { cols { - sp_column_in__seq { + sp_column_in { col { apply_expr { fn { diff --git a/tests/ast/data/functions.test b/tests/ast/data/functions.test index 9d47f6ffe29..39562f02ade 100644 --- a/tests/ast/data/functions.test +++ b/tests/ast/data/functions.test @@ -712,11 +712,11 @@ df55 = df.select(variance(col("A"))) df56 = df.select(var_pop(col("A"))) -df57 = df.select(approx_percentile(col("A"), lit(0.6)), approx_percentile(col("B"), lit(0.0))) +df57 = df.select(approx_percentile(col("A"), 0.6), approx_percentile(col("B"), 0.0)) df58 = df.select(approx_percentile_accumulate(col("A"))) -df59 = df.select(approx_percentile_estimate(col("A"), lit(0.3))) +df59 = df.select(approx_percentile_estimate(col("A"), 0.3)) df60 = df.select(approx_percentile_combine(col("A"))) @@ -744,11 +744,11 @@ df71 = df.select(seq4(12)) df72 = df.select(seq8(324)) -df73 = df.select(to_decimal(col("A"), lit(10), lit(3)), to_decimal(col("B"), lit(12), lit(3))) +df73 = df.select(to_decimal(col("A"), 10, 3), to_decimal(col("B"), 12, 3)) df74 = df.select(to_double(col("A"), None), to_double("A", None), to_double("A", "999.9"), to_double(col("A"), col("B"))) -df75 = df.select(div0(lit(0), lit(1)), div0(lit(1.2), lit(9.3)), div0(lit(10), lit(89.2)), div0(col("A"), lit(1)), div0(lit(0.2), col("A")), div0(lit(0.3), col("B"))) +df75 = df.select(div0(0, 1), div0(1.2, 9.3), div0(10, 89.2), div0("A", 1), div0(0.2, "A"), div0(0.3, col("B"))) df76 = df.select(sqrt(col("A"))) @@ -804,15 +804,15 @@ df101 = df.select(length(col("A"))) df102 = df.select(lower(col("A"))) -df103 = df.select(lpad(col("A"), col("B"), col("B")), lpad(col("A"), lit(100), col("B")), lpad(col("A"), col("B"), col("B"))) +df103 = df.select(lpad(col("A"), col("B"), col("B")), lpad(col("A"), 100, col("B")), lpad(col("A"), col("B"), col("B"))) df104 = df.select(ltrim(col("A")), ltrim(col("A")), ltrim(col("A"), col("B")), ltrim(col("A"), lit("B"))) -df105 = df.select(rpad(col("A"), col("B"), col("B")), rpad(col("A"), lit(100), col("B")), rpad(col("A"), col("B"), col("B"))) +df105 = df.select(rpad(col("A"), col("B"), col("B")), rpad(col("A"), 100, col("B")), rpad(col("A"), col("B"), col("B"))) df106 = df.select(rtrim(col("A")), rtrim(col("A")), rtrim(col("A"), col("B")), rtrim(col("A"), lit("B"))) -df107 = df.select(repeat(col("A"), lit(1)), repeat(col("A"), lit(20)), repeat(col("B"), col("A"))) +df107 = df.select(repeat(col("A"), 1), repeat(col("A"), 20), repeat(col("B"), col("A"))) df108 = df.select(reverse(col("A"))) @@ -826,9 +826,9 @@ df112 = df111.select(strtok_to_array(col("A")), strtok_to_array(col("A")), strto df113 = df111.select(struct("A", col("A"), col("B"))) -df114 = df111.select(log(col("A"), lit(10)), log(col("B"), lit(4.3)), log(col("A"), col("B"))) +df114 = df111.select(log("A", 10), log(col("B"), 4.3), log("A", "B")) -df115 = df111.select(pow(col("A"), lit(10)), pow(col("B"), lit(4.3)), pow(col("A"), col("B"))) +df115 = df111.select(pow("A", 10), pow(col("B"), 4.3), pow("A", "B")) df116 = df111.select(round("A", 0), round("A", 
0), round(col("B"), 4.7)) @@ -836,19 +836,19 @@ df117 = df111.select(sign(col("A"))) df118 = df111.select(split(col("A"), col("B")), split(col("A"), lit("asfdg"))) -df119 = df111.select(substring(col("A"), col("A"), col("A")), substring(col("A"), lit(0), lit(10)), substring(col("A"), lit(20), col("B"))) +df119 = df111.select(substring(col("A"), col("A"), col("A")), substring(col("A"), 0, 10), substring(col("A"), 20, col("B"))) df120 = df111.select(substring_index("A", "abc", 3), substring_index(col("A"), col("B"), 2)) -df121 = df111.select(regexp_count(col("A"), lit("B"), col("A")), regexp_count(col("A"), col("B"), col("C"), lit(1), lit(2), lit("test"))) +df121 = df111.select(regexp_count(col("A"), "B", col("A")), regexp_count(col("A"), col("B"), col("C"), 1, 2, "test")) df122 = df111.select(regexp_extract("A", "B", 2)) -df123 = df111.select(regexp_replace(col("A"), lit("B"), lit(""), lit(1), lit(0)), regexp_replace(col("A"), col("B"), col("C"), col("D"), col("E"), col("F"), lit(1), lit("sgh"), lit(99.9))) +df123 = df111.select(regexp_replace(col("A"), "B", "", 1, 0), regexp_replace(col("A"), col("B"), col("C"), col("D"), col("E"), col("F"), 1, "sgh", 99.9)) -df124 = df111.select(replace(col("A"), lit(""), lit("")), replace(col("A"), lit("B"), lit("ahsgj"))) +df124 = df111.select(replace(col("A"), "", ""), replace(col("A"), "B", "ahsgj")) -df125 = df111.select(charindex(col("A"), col("B")), charindex(col("A"), col("B")), charindex(col("A"), col("B"), lit(20)), charindex(col("A"), col("B"), col("C"))) +df125 = df111.select(char_index(col("A"), col("B"), 20), char_index(col("A"), col("B"), col("C"))) df126 = df111.select(collate(col("A"), "sp-upper")) @@ -866,15 +866,15 @@ df132 = df111.select(startswith(col("A"), col("B"))) df133 = df111.select(endswith(col("A"), col("B"))) -df134 = df111.select(insert(col("A"), col("B"), col("C"), col("D")), insert(col("A"), lit(12), lit(13), col("D"))) +df134 = df111.select(insert(col("A"), col("B"), col("C"), col("D")), insert(col("A"), 12, 13, col("D"))) -df135 = df111.select(left(col("A"), col("B")), left(col("A"), lit(10))) +df135 = df111.select(left(col("A"), col("B")), left(col("A"), 10)) -df136 = df111.select(right(col("A"), col("B")), right(col("A"), lit(10))) +df136 = df111.select(right(col("A"), col("B")), right(col("A"), 10)) df137 = df111.select(char(col("A"))) -df138 = df111.select(to_char(col("A")), to_char(col("B")), to_char(col("A"), lit("bcd"))) +df138 = df111.select(to_char(col("A"), "bcd")) df139 = df111.select(date_format(col("A"), col("B")), date_format("A", "YYYY")) @@ -934,7 +934,7 @@ df175 = df111.select(array_distinct(col("A"))) df176 = df111.select(array_intersection(col("A"), col("B")), array_intersection(col("A"), col("B")), array_intersection(col("A"), col("B"))) -df177 = df111.select(array_except("A", "B"), array_except("A", col("B")), array_except("B", "A")) +df177 = df111.select(array_except("A", "B", True), array_except("A", col("B"), True), array_except("B", "A", False)) df178 = df111.select(array_min(col("A"))) @@ -942,7 +942,7 @@ df179 = df111.select(array_max(col("A"))) df180 = df111.select(array_flatten(col("A"))) -df181 = df111.select(array_sort(col("A"), lit(True), lit(False)), array_sort(col("A"), lit(True), lit(False)), array_sort(col("A"), lit(False), lit(True))) +df181 = df111.select(array_sort(col("A"), True, False), array_sort(col("A"), True, False), array_sort(col("A"), False, True)) df182 = df111.select(arrays_to_object(col("A"), col("B")), arrays_to_object(col("A"), col("A"))) @@ -7606,28 +7606,7 @@ body { 
} } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - float64_val { - src { - end_column: 52 - end_line: 141 - file: "SRC_POSITION_TEST_MODE" - start_column: 25 - start_line: 141 - } - v: 0.6 - } - } + float64_val { src { end_column: 52 end_line: 141 @@ -7635,6 +7614,7 @@ body { start_column: 25 start_line: 141 } + v: 0.6 } } src { @@ -7690,27 +7670,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - float64_val { - src { - end_column: 86 - end_line: 141 - file: "SRC_POSITION_TEST_MODE" - start_column: 54 - start_line: 141 - } - } - } + float64_val { src { end_column: 86 end_line: 141 @@ -7885,28 +7845,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - float64_val { - src { - end_column: 61 - end_line: 145 - file: "SRC_POSITION_TEST_MODE" - start_column: 25 - start_line: 145 - } - v: 0.3 - } - } + float64_val { src { end_column: 61 end_line: 145 @@ -7914,6 +7853,7 @@ body { start_column: 25 start_line: 145 } + v: 0.3 } } src { @@ -9492,28 +9432,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 47 - end_line: 173 - file: "SRC_POSITION_TEST_MODE" - start_column: 25 - start_line: 173 - } - v: 10 - } - } + int64_val { src { end_column: 47 end_line: 173 @@ -9521,31 +9440,11 @@ body { start_column: 25 start_line: 173 } + v: 10 } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 47 - end_line: 173 - file: "SRC_POSITION_TEST_MODE" - start_column: 25 - start_line: 173 - } - v: 3 - } - } + int64_val { src { end_column: 47 end_line: 173 @@ -9553,6 +9452,7 @@ body { start_column: 25 start_line: 173 } + v: 3 } } src { @@ -9608,28 +9508,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 76 - end_line: 173 - file: "SRC_POSITION_TEST_MODE" - start_column: 49 - start_line: 173 - } - v: 12 - } - } + int64_val { src { end_column: 76 end_line: 173 @@ -9637,31 +9516,11 @@ body { start_column: 49 start_line: 173 } + v: 12 } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 76 - end_line: 173 - file: "SRC_POSITION_TEST_MODE" - start_column: 49 - start_line: 173 - } - v: 3 - } - } + int64_val { src { end_column: 76 end_line: 173 @@ -9669,6 +9528,7 @@ body { start_column: 49 start_line: 173 } + v: 3 } } src { @@ -9986,27 +9846,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 35 - end_line: 177 - file: "SRC_POSITION_TEST_MODE" - start_column: 25 - start_line: 177 - } - } - } + int64_val { src { end_column: 35 end_line: 177 @@ -10017,28 +9857,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 35 - end_line: 177 - file: "SRC_POSITION_TEST_MODE" - start_column: 25 - start_line: 177 - } - v: 1 - } - } + int64_val { src { end_column: 35 end_line: 177 @@ -10046,6 +9865,7 @@ body { start_column: 25 start_line: 177 } + 
v: 1 } } src { @@ -10069,28 +9889,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - float64_val { - src { - end_column: 51 - end_line: 177 - file: "SRC_POSITION_TEST_MODE" - start_column: 37 - start_line: 177 - } - v: 1.2 - } - } + float64_val { src { end_column: 51 end_line: 177 @@ -10098,31 +9897,11 @@ body { start_column: 37 start_line: 177 } + v: 1.2 } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - float64_val { - src { - end_column: 51 - end_line: 177 - file: "SRC_POSITION_TEST_MODE" - start_column: 37 - start_line: 177 - } - v: 9.3 - } - } + float64_val { src { end_column: 51 end_line: 177 @@ -10130,6 +9909,7 @@ body { start_column: 37 start_line: 177 } + v: 9.3 } } src { @@ -10153,28 +9933,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 67 - end_line: 177 - file: "SRC_POSITION_TEST_MODE" - start_column: 53 - start_line: 177 - } - v: 10 - } - } + int64_val { src { end_column: 67 end_line: 177 @@ -10182,31 +9941,11 @@ body { start_column: 53 start_line: 177 } + v: 10 } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - float64_val { - src { - end_column: 67 - end_line: 177 - file: "SRC_POSITION_TEST_MODE" - start_column: 53 - start_line: 177 - } - v: 89.2 - } - } + float64_val { src { end_column: 67 end_line: 177 @@ -10214,6 +9953,7 @@ body { start_column: 53 start_line: 177 } + v: 89.2 } } src { @@ -10237,28 +9977,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "col" - } - } - } - } - pos_args { - string_val { - src { - end_column: 81 - end_line: 177 - file: "SRC_POSITION_TEST_MODE" - start_column: 69 - start_line: 177 - } - v: "A" - } - } + string_val { src { end_column: 81 end_line: 177 @@ -10266,31 +9985,11 @@ body { start_column: 69 start_line: 177 } + v: "A" } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 81 - end_line: 177 - file: "SRC_POSITION_TEST_MODE" - start_column: 69 - start_line: 177 - } - v: 1 - } - } + int64_val { src { end_column: 81 end_line: 177 @@ -10298,6 +9997,7 @@ body { start_column: 69 start_line: 177 } + v: 1 } } src { @@ -10321,28 +10021,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - float64_val { - src { - end_column: 97 - end_line: 177 - file: "SRC_POSITION_TEST_MODE" - start_column: 83 - start_line: 177 - } - v: 0.2 - } - } + float64_val { src { end_column: 97 end_line: 177 @@ -10350,31 +10029,11 @@ body { start_column: 83 start_line: 177 } + v: 0.2 } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "col" - } - } - } - } - pos_args { - string_val { - src { - end_column: 97 - end_line: 177 - file: "SRC_POSITION_TEST_MODE" - start_column: 83 - start_line: 177 - } - v: "A" - } - } + string_val { src { end_column: 97 end_line: 177 @@ -10382,6 +10041,7 @@ body { start_column: 83 start_line: 177 } + v: "A" } } src { @@ -10405,28 +10065,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - float64_val { - src { - end_column: 118 - end_line: 177 - file: 
"SRC_POSITION_TEST_MODE" - start_column: 99 - start_line: 177 - } - v: 0.3 - } - } + float64_val { src { end_column: 118 end_line: 177 @@ -10434,6 +10073,7 @@ body { start_column: 99 start_line: 177 } + v: 0.3 } } pos_args { @@ -13399,28 +13039,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 71 - end_line: 233 - file: "SRC_POSITION_TEST_MODE" - start_column: 52 - start_line: 233 - } - v: 100 - } - } + int64_val { src { end_column: 71 end_line: 233 @@ -13428,6 +13047,7 @@ body { start_column: 52 start_line: 233 } + v: 100 } } pos_args { @@ -14079,28 +13699,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 71 - end_line: 237 - file: "SRC_POSITION_TEST_MODE" - start_column: 52 - start_line: 237 - } - v: 100 - } - } + int64_val { src { end_column: 71 end_line: 237 @@ -14108,6 +13707,7 @@ body { start_column: 52 start_line: 237 } + v: 100 } } pos_args { @@ -14643,28 +14243,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 40 - end_line: 241 - file: "SRC_POSITION_TEST_MODE" - start_column: 26 - start_line: 241 - } - v: 1 - } - } + int64_val { src { end_column: 40 end_line: 241 @@ -14672,6 +14251,7 @@ body { start_column: 26 start_line: 241 } + v: 1 } } src { @@ -14727,28 +14307,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 62 - end_line: 241 - file: "SRC_POSITION_TEST_MODE" - start_column: 42 - start_line: 241 - } - v: 20 - } - } + int64_val { src { end_column: 62 end_line: 241 @@ -14756,6 +14315,7 @@ body { start_column: 42 start_line: 241 } + v: 20 } } src { @@ -15869,28 +15429,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "col" - } - } - } - } - pos_args { - string_val { - src { - end_column: 38 - end_line: 255 - file: "SRC_POSITION_TEST_MODE" - start_column: 26 - start_line: 255 - } - v: "A" - } - } + string_val { src { end_column: 38 end_line: 255 @@ -15898,31 +15437,11 @@ body { start_column: 26 start_line: 255 } + v: "A" } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 38 - end_line: 255 - file: "SRC_POSITION_TEST_MODE" - start_column: 26 - start_line: 255 - } - v: 10 - } - } + int64_val { src { end_column: 38 end_line: 255 @@ -15930,6 +15449,7 @@ body { start_column: 26 start_line: 255 } + v: 10 } } src { @@ -15985,28 +15505,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - float64_val { - src { - end_column: 58 - end_line: 255 - file: "SRC_POSITION_TEST_MODE" - start_column: 40 - start_line: 255 - } - v: 4.3 - } - } + float64_val { src { end_column: 58 end_line: 255 @@ -16014,6 +15513,7 @@ body { start_column: 40 start_line: 255 } + v: 4.3 } } src { @@ -16037,28 +15537,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "col" - } - } - } - } - pos_args { - string_val { - src { - end_column: 73 - end_line: 255 - file: "SRC_POSITION_TEST_MODE" - start_column: 60 - start_line: 255 - } - v: "A" - } - } + string_val { src { 
end_column: 73 end_line: 255 @@ -16066,31 +15545,11 @@ body { start_column: 60 start_line: 255 } + v: "A" } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "col" - } - } - } - } - pos_args { - string_val { - src { - end_column: 73 - end_line: 255 - file: "SRC_POSITION_TEST_MODE" - start_column: 60 - start_line: 255 - } - v: "B" - } - } + string_val { src { end_column: 73 end_line: 255 @@ -16098,6 +15557,7 @@ body { start_column: 60 start_line: 255 } + v: "B" } } src { @@ -16151,28 +15611,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "col" - } - } - } - } - pos_args { - string_val { - src { - end_column: 38 - end_line: 257 - file: "SRC_POSITION_TEST_MODE" - start_column: 26 - start_line: 257 - } - v: "A" - } - } + string_val { src { end_column: 38 end_line: 257 @@ -16180,31 +15619,11 @@ body { start_column: 26 start_line: 257 } + v: "A" } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 38 - end_line: 257 - file: "SRC_POSITION_TEST_MODE" - start_column: 26 - start_line: 257 - } - v: 10 - } - } + int64_val { src { end_column: 38 end_line: 257 @@ -16212,6 +15631,7 @@ body { start_column: 26 start_line: 257 } + v: 10 } } src { @@ -16267,28 +15687,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - float64_val { - src { - end_column: 58 - end_line: 257 - file: "SRC_POSITION_TEST_MODE" - start_column: 40 - start_line: 257 - } - v: 4.3 - } - } + float64_val { src { end_column: 58 end_line: 257 @@ -16296,6 +15695,7 @@ body { start_column: 40 start_line: 257 } + v: 4.3 } } src { @@ -16319,28 +15719,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "col" - } - } - } - } - pos_args { - string_val { - src { - end_column: 73 - end_line: 257 - file: "SRC_POSITION_TEST_MODE" - start_column: 60 - start_line: 257 - } - v: "A" - } - } + string_val { src { end_column: 73 end_line: 257 @@ -16348,31 +15727,11 @@ body { start_column: 60 start_line: 257 } + v: "A" } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "col" - } - } - } - } - pos_args { - string_val { - src { - end_column: 73 - end_line: 257 - file: "SRC_POSITION_TEST_MODE" - start_column: 60 - start_line: 257 - } - v: "B" - } - } + string_val { src { end_column: 73 end_line: 257 @@ -16380,6 +15739,7 @@ body { start_column: 60 start_line: 257 } + v: "B" } } src { @@ -17041,27 +16401,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 88 - end_line: 265 - file: "SRC_POSITION_TEST_MODE" - start_column: 62 - start_line: 265 - } - } - } + int64_val { src { end_column: 88 end_line: 265 @@ -17072,28 +16412,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 88 - end_line: 265 - file: "SRC_POSITION_TEST_MODE" - start_column: 62 - start_line: 265 - } - v: 10 - } - } + int64_val { src { end_column: 88 end_line: 265 @@ -17101,6 +16420,7 @@ body { start_column: 62 start_line: 265 } + v: 10 } } src { @@ -17156,28 +16476,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - 
end_column: 123 - end_line: 265 - file: "SRC_POSITION_TEST_MODE" - start_column: 90 - start_line: 265 - } - v: 20 - } - } + int64_val { src { end_column: 123 end_line: 265 @@ -17185,6 +16484,7 @@ body { start_column: 90 start_line: 265 } + v: 20 } } pos_args { @@ -17484,28 +16784,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - string_val { - src { - end_column: 58 - end_line: 269 - file: "SRC_POSITION_TEST_MODE" - start_column: 26 - start_line: 269 - } - v: "B" - } - } + string_val { src { end_column: 58 end_line: 269 @@ -17513,6 +16792,7 @@ body { start_column: 26 start_line: 269 } + v: "B" } } pos_args { @@ -17664,28 +16944,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 116 - end_line: 269 - file: "SRC_POSITION_TEST_MODE" - start_column: 60 - start_line: 269 - } - v: 1 - } - } + int64_val { src { end_column: 116 end_line: 269 @@ -17693,31 +16952,11 @@ body { start_column: 60 start_line: 269 } + v: 1 } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 116 - end_line: 269 - file: "SRC_POSITION_TEST_MODE" - start_column: 60 - start_line: 269 - } - v: 2 - } - } + int64_val { src { end_column: 116 end_line: 269 @@ -17725,31 +16964,11 @@ body { start_column: 60 start_line: 269 } + v: 2 } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - string_val { - src { - end_column: 116 - end_line: 269 - file: "SRC_POSITION_TEST_MODE" - start_column: 60 - start_line: 269 - } - v: "test" - } - } + string_val { src { end_column: 116 end_line: 269 @@ -17757,6 +16976,7 @@ body { start_column: 60 start_line: 269 } + v: "test" } } src { @@ -17928,28 +17148,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - string_val { - src { - end_column: 60 - end_line: 273 - file: "SRC_POSITION_TEST_MODE" - start_column: 26 - start_line: 273 - } - v: "B" - } - } + string_val { src { end_column: 60 end_line: 273 @@ -17957,30 +17156,11 @@ body { start_column: 26 start_line: 273 } + v: "B" } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - string_val { - src { - end_column: 60 - end_line: 273 - file: "SRC_POSITION_TEST_MODE" - start_column: 26 - start_line: 273 - } - } - } + string_val { src { end_column: 60 end_line: 273 @@ -17991,28 +17171,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 60 - end_line: 273 - file: "SRC_POSITION_TEST_MODE" - start_column: 26 - start_line: 273 - } - v: 1 - } - } + int64_val { src { end_column: 60 end_line: 273 @@ -18020,30 +17179,11 @@ body { start_column: 26 start_line: 273 } + v: 1 } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 60 - end_line: 273 - file: "SRC_POSITION_TEST_MODE" - start_column: 26 - start_line: 273 - } - } - } + int64_val { src { end_column: 60 end_line: 273 @@ -18266,28 +17406,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val 
{ - src { - end_column: 152 - end_line: 273 - file: "SRC_POSITION_TEST_MODE" - start_column: 62 - start_line: 273 - } - v: 1 - } - } + int64_val { src { end_column: 152 end_line: 273 @@ -18295,31 +17414,11 @@ body { start_column: 62 start_line: 273 } + v: 1 } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - string_val { - src { - end_column: 152 - end_line: 273 - file: "SRC_POSITION_TEST_MODE" - start_column: 62 - start_line: 273 - } - v: "sgh" - } - } + string_val { src { end_column: 152 end_line: 273 @@ -18327,31 +17426,11 @@ body { start_column: 62 start_line: 273 } + v: "sgh" } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - float64_val { - src { - end_column: 152 - end_line: 273 - file: "SRC_POSITION_TEST_MODE" - start_column: 62 - start_line: 273 - } - v: 99.9 - } - } + float64_val { src { end_column: 152 end_line: 273 @@ -18359,6 +17438,7 @@ body { start_column: 62 start_line: 273 } + v: 99.9 } } src { @@ -18444,27 +17524,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - string_val { - src { - end_column: 51 - end_line: 275 - file: "SRC_POSITION_TEST_MODE" - start_column: 26 - start_line: 275 - } - } - } + string_val { src { end_column: 51 end_line: 275 @@ -18475,27 +17535,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - string_val { - src { - end_column: 51 - end_line: 275 - file: "SRC_POSITION_TEST_MODE" - start_column: 26 - start_line: 275 - } - } - } + string_val { src { end_column: 51 end_line: 275 @@ -18558,28 +17598,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - string_val { - src { - end_column: 79 - end_line: 275 - file: "SRC_POSITION_TEST_MODE" - start_column: 53 - start_line: 275 - } - v: "B" - } - } + string_val { src { end_column: 79 end_line: 275 @@ -18587,31 +17606,11 @@ body { start_column: 53 start_line: 275 } + v: "B" } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - string_val { - src { - end_column: 79 - end_line: 275 - file: "SRC_POSITION_TEST_MODE" - start_column: 53 - start_line: 275 - } - v: "ahsgj" - } - } + string_val { src { end_column: 79 end_line: 275 @@ -18619,6 +17618,7 @@ body { start_column: 53 start_line: 275 } + v: "ahsgj" } } src { @@ -18666,175 +17666,7 @@ body { builtin_fn { name { fn_name_flat { - name: "charindex" - } - } - } - } - pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "col" - } - } - } - } - pos_args { - string_val { - src { - end_column: 44 - end_line: 277 - file: "SRC_POSITION_TEST_MODE" - start_column: 36 - start_line: 277 - } - v: "A" - } - } - src { - end_column: 44 - end_line: 277 - file: "SRC_POSITION_TEST_MODE" - start_column: 36 - start_line: 277 - } - } - } - pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "col" - } - } - } - } - pos_args { - string_val { - src { - end_column: 54 - end_line: 277 - file: "SRC_POSITION_TEST_MODE" - start_column: 46 - start_line: 277 - } - v: "B" - } - } - src { - end_column: 54 - end_line: 277 - file: "SRC_POSITION_TEST_MODE" - start_column: 46 - start_line: 277 - } - } - } - src { - end_column: 55 - end_line: 277 - file: "SRC_POSITION_TEST_MODE" - 
start_column: 26 - start_line: 277 - } - } - } - cols { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "charindex" - } - } - } - } - pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "col" - } - } - } - } - pos_args { - string_val { - src { - end_column: 82 - end_line: 277 - file: "SRC_POSITION_TEST_MODE" - start_column: 57 - start_line: 277 - } - v: "A" - } - } - src { - end_column: 82 - end_line: 277 - file: "SRC_POSITION_TEST_MODE" - start_column: 57 - start_line: 277 - } - } - } - pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "col" - } - } - } - } - pos_args { - string_val { - src { - end_column: 82 - end_line: 277 - file: "SRC_POSITION_TEST_MODE" - start_column: 57 - start_line: 277 - } - v: "B" - } - } - src { - end_column: 82 - end_line: 277 - file: "SRC_POSITION_TEST_MODE" - start_column: 57 - start_line: 277 - } - } - } - src { - end_column: 82 - end_line: 277 - file: "SRC_POSITION_TEST_MODE" - start_column: 57 - start_line: 277 - } - } - } - cols { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "charindex" + name: "char_index" } } } @@ -18883,39 +17715,7 @@ body { } } pos_args { - string_val { - src { - end_column: 107 - end_line: 277 - file: "SRC_POSITION_TEST_MODE" - start_column: 84 - start_line: 277 - } - v: "B" - } - } - src { - end_column: 107 - end_line: 277 - file: "SRC_POSITION_TEST_MODE" - start_column: 84 - start_line: 277 - } - } - } - pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { + string_val { src { end_column: 107 end_line: 277 @@ -18923,7 +17723,7 @@ body { start_column: 84 start_line: 277 } - v: 20 + v: "B" } } src { @@ -18935,6 +17735,18 @@ body { } } } + pos_args { + int64_val { + src { + end_column: 107 + end_line: 277 + file: "SRC_POSITION_TEST_MODE" + start_column: 84 + start_line: 277 + } + v: 20 + } + } src { end_column: 107 end_line: 277 @@ -18950,7 +17762,7 @@ body { builtin_fn { name { fn_name_flat { - name: "charindex" + name: "char_index" } } } @@ -20374,28 +19186,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 98 - end_line: 295 - file: "SRC_POSITION_TEST_MODE" - start_column: 69 - start_line: 295 - } - v: 12 - } - } + int64_val { src { end_column: 98 end_line: 295 @@ -20403,31 +19194,11 @@ body { start_column: 69 start_line: 295 } + v: 12 } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 98 - end_line: 295 - file: "SRC_POSITION_TEST_MODE" - start_column: 69 - start_line: 295 - } - v: 13 - } - } + int64_val { src { end_column: 98 end_line: 295 @@ -20435,6 +19206,7 @@ body { start_column: 69 start_line: 295 } + v: 13 } } pos_args { @@ -20636,28 +19408,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - end_column: 65 - end_line: 297 - file: "SRC_POSITION_TEST_MODE" - start_column: 47 - start_line: 297 - } - v: 10 - } - } + int64_val { src { end_column: 65 end_line: 297 @@ -20665,6 +19416,7 @@ body { start_column: 47 start_line: 297 } + v: 10 } } src { @@ -20834,28 +19586,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - int64_val { - src { - 
end_column: 67 - end_line: 299 - file: "SRC_POSITION_TEST_MODE" - start_column: 48 - start_line: 299 - } - v: 10 - } - } + int64_val { src { end_column: 67 end_line: 299 @@ -20863,6 +19594,7 @@ body { start_column: 48 start_line: 299 } + v: 10 } } src { @@ -20892,159 +19624,25 @@ body { } } symbol { - value: "df136" - } - uid: 137 - var_id { - bitfield1: 137 - } - } -} -body { - assign { - expr { - sp_dataframe_select__columns { - cols { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "char" - } - } - } - } - pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "col" - } - } - } - } - pos_args { - string_val { - src { - end_column: 35 - end_line: 301 - file: "SRC_POSITION_TEST_MODE" - start_column: 26 - start_line: 301 - } - v: "A" - } - } - src { - end_column: 35 - end_line: 301 - file: "SRC_POSITION_TEST_MODE" - start_column: 26 - start_line: 301 - } - } - } - src { - end_column: 35 - end_line: 301 - file: "SRC_POSITION_TEST_MODE" - start_column: 26 - start_line: 301 - } - } - } - df { - sp_dataframe_ref { - id { - bitfield1: 112 - } - } - } - src { - end_column: 36 - end_line: 301 - file: "SRC_POSITION_TEST_MODE" - start_column: 16 - start_line: 301 - } - variadic: true - } - } - symbol { - value: "df137" - } - uid: 138 - var_id { - bitfield1: 138 - } - } -} -body { - assign { - expr { - sp_dataframe_select__columns { - cols { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "to_char" - } - } - } - } - pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "col" - } - } - } - } - pos_args { - string_val { - src { - end_column: 38 - end_line: 303 - file: "SRC_POSITION_TEST_MODE" - start_column: 26 - start_line: 303 - } - v: "A" - } - } - src { - end_column: 38 - end_line: 303 - file: "SRC_POSITION_TEST_MODE" - start_column: 26 - start_line: 303 - } - } - } - src { - end_column: 38 - end_line: 303 - file: "SRC_POSITION_TEST_MODE" - start_column: 26 - start_line: 303 - } - } - } + value: "df136" + } + uid: 137 + var_id { + bitfield1: 137 + } + } +} +body { + assign { + expr { + sp_dataframe_select__columns { cols { apply_expr { fn { builtin_fn { name { fn_name_flat { - name: "to_char" + name: "char" } } } @@ -21063,33 +19661,63 @@ body { pos_args { string_val { src { - end_column: 56 - end_line: 303 + end_column: 35 + end_line: 301 file: "SRC_POSITION_TEST_MODE" - start_column: 48 - start_line: 303 + start_column: 26 + start_line: 301 } - v: "B" + v: "A" } } src { - end_column: 56 - end_line: 303 + end_column: 35 + end_line: 301 file: "SRC_POSITION_TEST_MODE" - start_column: 48 - start_line: 303 + start_column: 26 + start_line: 301 } } } src { - end_column: 63 - end_line: 303 + end_column: 35 + end_line: 301 file: "SRC_POSITION_TEST_MODE" - start_column: 40 - start_line: 303 + start_column: 26 + start_line: 301 } } } + df { + sp_dataframe_ref { + id { + bitfield1: 112 + } + } + } + src { + end_column: 36 + end_line: 301 + file: "SRC_POSITION_TEST_MODE" + start_column: 16 + start_line: 301 + } + variadic: true + } + } + symbol { + value: "df137" + } + uid: 138 + var_id { + bitfield1: 138 + } + } +} +body { + assign { + expr { + sp_dataframe_select__columns { cols { apply_expr { fn { @@ -21134,28 +19762,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - string_val { - src { - end_column: 84 - end_line: 303 - file: "SRC_POSITION_TEST_MODE" - start_column: 65 - start_line: 303 - } - v: "bcd" - } - } + string_val 
{ src { end_column: 84 end_line: 303 @@ -21163,6 +19770,7 @@ body { start_column: 65 start_line: 303 } + v: "bcd" } } src { @@ -25571,6 +24179,18 @@ body { v: "B" } } + pos_args { + bool_val { + src { + end_column: 48 + end_line: 363 + file: "SRC_POSITION_TEST_MODE" + start_column: 26 + start_line: 363 + } + v: true + } + } src { end_column: 48 end_line: 363 @@ -25635,6 +24255,18 @@ body { } } } + pos_args { + bool_val { + src { + end_column: 83 + end_line: 363 + file: "SRC_POSITION_TEST_MODE" + start_column: 50 + start_line: 363 + } + v: true + } + } src { end_column: 83 end_line: 363 @@ -25679,6 +24311,17 @@ body { v: "A" } } + pos_args { + bool_val { + src { + end_column: 114 + end_line: 363 + file: "SRC_POSITION_TEST_MODE" + start_column: 85 + start_line: 363 + } + } + } src { end_column: 114 end_line: 363 @@ -26008,28 +24651,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - bool_val { - src { - end_column: 41 - end_line: 371 - file: "SRC_POSITION_TEST_MODE" - start_column: 26 - start_line: 371 - } - v: true - } - } + bool_val { src { end_column: 41 end_line: 371 @@ -26037,30 +24659,11 @@ body { start_column: 26 start_line: 371 } + v: true } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - bool_val { - src { - end_column: 41 - end_line: 371 - file: "SRC_POSITION_TEST_MODE" - start_column: 26 - start_line: 371 - } - } - } + bool_val { src { end_column: 41 end_line: 371 @@ -26123,28 +24726,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - bool_val { - src { - end_column: 64 - end_line: 371 - file: "SRC_POSITION_TEST_MODE" - start_column: 43 - start_line: 371 - } - v: true - } - } + bool_val { src { end_column: 64 end_line: 371 @@ -26152,30 +24734,11 @@ body { start_column: 43 start_line: 371 } + v: true } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - bool_val { - src { - end_column: 64 - end_line: 371 - file: "SRC_POSITION_TEST_MODE" - start_column: 43 - start_line: 371 - } - } - } + bool_val { src { end_column: 64 end_line: 371 @@ -26238,27 +24801,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - bool_val { - src { - end_column: 99 - end_line: 371 - file: "SRC_POSITION_TEST_MODE" - start_column: 66 - start_line: 371 - } - } - } + bool_val { src { end_column: 99 end_line: 371 @@ -26269,28 +24812,7 @@ body { } } pos_args { - apply_expr { - fn { - builtin_fn { - name { - fn_name_flat { - name: "lit" - } - } - } - } - pos_args { - bool_val { - src { - end_column: 99 - end_line: 371 - file: "SRC_POSITION_TEST_MODE" - start_column: 66 - start_line: 371 - } - v: true - } - } + bool_val { src { end_column: 99 end_line: 371 @@ -26298,6 +24820,7 @@ body { start_column: 66 start_line: 371 } + v: true } } src { diff --git a/tests/ast/data/windows.test b/tests/ast/data/windows.test index 272559c87ad..be96f9fff2d 100644 --- a/tests/ast/data/windows.test +++ b/tests/ast/data/windows.test @@ -557,6 +557,9 @@ body { } } } + null_order { + sp_null_order_default: true + } src { end_column: 51 end_line: 35 diff --git a/tests/integ/modin/frame/test_apply.py b/tests/integ/modin/frame/test_apply.py index 03adbde4e3a..c2082c6c762 100644 --- a/tests/integ/modin/frame/test_apply.py +++ 
b/tests/integ/modin/frame/test_apply.py @@ -96,7 +96,8 @@ def test_axis_1_basic_types_without_type_hints(data, func, return_type): # this test processes functions without type hints and invokes the UDTF solution. native_df = native_pd.DataFrame(data, columns=["A", "b"]) snow_df = pd.DataFrame(data, columns=["A", "b"]) - with SqlCounter(query_count=5): + # np.min is mapped to sql builtin function. + with SqlCounter(query_count=1 if func == np.min else 5): eval_snowpark_pandas_result(snow_df, native_df, lambda x: x.apply(func, axis=1)) @@ -612,14 +613,14 @@ def test_apply_bug_1650918(data, apply_func): TRANSFORM_TEST_MAP = [ [[[0, 1, 2], [1, 2, 3]], lambda x: x + 1, 16], - [[[0, 1, 2], [1, 2, 3]], np.exp, 16], + [[[0, 1, 2], [1, 2, 3]], np.exp, 1], [[[0, 1, 2], [1, 2, 3]], "exp", None], [[["Leonhard", "Jianzhun"]], lambda x: x + " is awesome!!", 11], - [[[1.3, 2.5]], np.sqrt, 11], + [[[1.3, 2.5]], np.sqrt, 1], [[[1.3, 2.5]], "sqrt", None], - [[[1.3, 2.5]], np.log, 11], + [[[1.3, 2.5]], np.log, 1], [[[1.3, 2.5]], "log", None], - [[[1.3, 2.5]], np.square, 11], + [[[1.3, 2.5]], np.square, 1], [[[1.3, 2.5]], "square", None], [[[1.5, float("nan")]], lambda x: np.sqrt(x), 11], ] @@ -961,3 +962,27 @@ def func(row): native_ans = native_df.apply(func.func, axis=1) assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(snow_ans, native_ans) + + +@pytest.mark.parametrize( + "func", [np.sum, np.min, np.max, np.mean, np.median, np.std, np.var] +) +@sql_count_checker(query_count=1) +def test_apply_numpy_aggregate_functions(func): + native_df = native_pd.DataFrame( + {"A": [1, 2, 3, 4, 5], "B": [7, -20, 4.0, 7.0, None]} + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result(snow_df, native_df, lambda x: x.apply(func)) + + +@pytest.mark.parametrize( + "func", [np.square, np.sin, np.sinh, np.sqrt, np.exp, np.log, np.log1p, np.absolute] +) +@sql_count_checker(query_count=1) +def test_apply_numpy_universal_functions(func): + native_df = native_pd.DataFrame( + {"A": [1, 2, 3, 4, 5], "B": [7, 20, 4.0, 7.0, None]} + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result(snow_df, native_df, lambda x: x.apply(func)) diff --git a/tests/integ/modin/frame/test_apply_axis_0.py b/tests/integ/modin/frame/test_apply_axis_0.py index a6160b1a548..248b92ad367 100644 --- a/tests/integ/modin/frame/test_apply_axis_0.py +++ b/tests/integ/modin/frame/test_apply_axis_0.py @@ -88,11 +88,12 @@ def test_axis_0_basic_types_without_type_hints(data, func, return_type): # this test processes functions without type hints and invokes the UDTF solution. native_df = native_pd.DataFrame(data, columns=["A", "b"]) snow_df = pd.DataFrame(data, columns=["A", "b"]) + # np.min is mapped to builtin function so no UDTF is required. with SqlCounter( - query_count=11, - join_count=2, - udtf_count=2, - high_count_expected=True, + query_count=1 if func == np.min else 11, + join_count=0 if func == np.min else 2, + udtf_count=0 if func == np.min else 2, + high_count_expected=func != np.min, high_count_reason="SNOW-1650644 & SNOW-1345395: Avoid extra caching and repeatedly creating same temp function", ): eval_snowpark_pandas_result(snow_df, native_df, lambda x: x.apply(func, axis=0)) diff --git a/tests/integ/modin/frame/test_applymap.py b/tests/integ/modin/frame/test_applymap.py index 517b5ce12e8..4486aea7a33 100644 --- a/tests/integ/modin/frame/test_applymap.py +++ b/tests/integ/modin/frame/test_applymap.py @@ -28,25 +28,33 @@ ) +@pytest.fixture(params=["applymap", "map"]) +def method(request): + """ + method name to test. 
+ """ + return request.param + + @pytest.mark.parametrize("data,func,return_type", BASIC_DATA_FUNC_RETURN_TYPE_MAP) @sql_count_checker(query_count=7, udf_count=1) -def test_applymap_basic_without_type_hints(data, func, return_type): +def test_applymap_basic_without_type_hints(data, func, return_type, method): frame_data = {0: data, 1: data} native_df = native_pd.DataFrame(frame_data) snow_df = pd.DataFrame(frame_data) - eval_snowpark_pandas_result(snow_df, native_df, lambda x: x.applymap(func)) + eval_snowpark_pandas_result(snow_df, native_df, lambda x: getattr(x, method)(func)) @pytest.mark.parametrize("data,func,return_type", BASIC_DATA_FUNC_RETURN_TYPE_MAP) @sql_count_checker(query_count=7, udf_count=1) -def test_applymap_basic_with_type_hints(data, func, return_type): +def test_applymap_basic_with_type_hints(data, func, return_type, method): func_with_type_hint = create_func_with_return_type_hint(func, return_type) frame_data = {0: data, 1: data} native_df = native_pd.DataFrame(frame_data) snow_df = pd.DataFrame(frame_data) eval_snowpark_pandas_result( - snow_df, native_df, lambda x: x.applymap(func_with_type_hint) + snow_df, native_df, lambda x: getattr(x, method)(func_with_type_hint) ) @@ -107,32 +115,32 @@ def test_applymap_numpy(func): native_df = native_pd.DataFrame(data) snow_df = pd.DataFrame(data) - with SqlCounter(query_count=7, udf_count=1): + with SqlCounter(query_count=1): eval_snowpark_pandas_result(snow_df, native_df, lambda x: x.applymap(func)) @sql_count_checker(query_count=0) -def test_applymap_na_action_ignore(): +def test_applymap_na_action_ignore(method): snow_df = pd.DataFrame([1, 1.1, "NaN", None], dtype="Float64") msg = "Snowpark pandas applymap API doesn't yet support na_action == 'ignore'" with pytest.raises(NotImplementedError, match=msg): - snow_df.applymap(lambda x: x is None, na_action="ignore") + getattr(snow_df, method)(lambda x: x is None, na_action="ignore") data = ["cat", "dog", np.nan, "rabbit"] snow_df = pd.DataFrame(data) with pytest.raises(NotImplementedError, match=msg): - snow_df.applymap("I am a {}".format, na_action="ignore") + getattr(snow_df, method)("I am a {}".format, na_action="ignore") @pytest.mark.parametrize("invalid_input", ["min", [np.min], {"a": np.max}]) @sql_count_checker(query_count=0) -def test_applymap_invalid_input(invalid_input): +def test_applymap_invalid_input(invalid_input, method): snow_df = pd.DataFrame([1]) native_df = native_pd.DataFrame([1]) eval_snowpark_pandas_result( snow_df, native_df, - lambda x: x.applymap(invalid_input), + lambda x: getattr(x, method)(invalid_input), expect_exception=True, expect_exception_match="is not callable", assert_exception_equal=False, diff --git a/tests/integ/modin/frame/test_from_dict.py b/tests/integ/modin/frame/test_from_dict.py new file mode 100644 index 00000000000..e305a1f2389 --- /dev/null +++ b/tests/integ/modin/frame/test_from_dict.py @@ -0,0 +1,83 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. 
+# +import modin.pandas as pd +import pandas as native_pd + +from tests.integ.modin.utils import assert_frame_equal +from tests.integ.utils.sql_counter import sql_count_checker + + +@sql_count_checker(query_count=1) +def test_from_dict_basic(): + data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]} + + assert_frame_equal( + pd.DataFrame.from_dict(data), + native_pd.DataFrame.from_dict(data), + check_dtype=False, + ) + + +@sql_count_checker(query_count=1) +def test_from_dict_nested_dict(): + data = {"col_1": {"a": 1, "b": 2}, "col_2": {"c": 3, "d": 4}} + + assert_frame_equal( + pd.DataFrame.from_dict(data), + native_pd.DataFrame.from_dict(data), + check_dtype=False, + ) + + +@sql_count_checker(query_count=1) +def test_from_dict_orient_index(): + data = {"row_1": [3, 2, 1, 0], "row_2": ["a", "b", "c", "d"]} + + assert_frame_equal( + pd.DataFrame.from_dict(data, orient="index"), + native_pd.DataFrame.from_dict(data, orient="index"), + check_dtype=False, + ) + + +@sql_count_checker(query_count=1) +def test_from_dict_orient_index_columns(): + data = {"row_1": [3, 2, 1, 0], "row_2": ["a", "b", "c", "d"]} + + assert_frame_equal( + pd.DataFrame.from_dict(data, orient="index", columns=["A", "B", "C", "D"]), + native_pd.DataFrame.from_dict( + data, orient="index", columns=["A", "B", "C", "D"] + ), + check_dtype=False, + ) + + +@sql_count_checker(query_count=1) +def test_from_dict_orient_tight(): + data = { + "index": [("a", "b"), ("a", "c")], + "columns": [("x", 1), ("y", 2)], + "data": [[1, 3], [2, 4]], + "index_names": ["n1", "n2"], + "column_names": ["z1", "z2"], + } + + assert_frame_equal( + pd.DataFrame.from_dict(data, orient="tight"), + native_pd.DataFrame.from_dict(data, orient="tight"), + check_dtype=False, + ) + + +@sql_count_checker(query_count=7) +def test_from_dict_series_values(): + # TODO(SNOW-1857349): Provide a lazy implementation for this case + data = {i: pd.Series(range(1)) for i in range(2)} + + assert_frame_equal( + pd.DataFrame.from_dict(data), + native_pd.DataFrame.from_dict(data), + check_dtype=False, + ) diff --git a/tests/integ/modin/frame/test_from_records.py b/tests/integ/modin/frame/test_from_records.py new file mode 100644 index 00000000000..89703c708c2 --- /dev/null +++ b/tests/integ/modin/frame/test_from_records.py @@ -0,0 +1,61 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
+# +import modin.pandas as pd +import pandas as native_pd +import numpy as np +import pytest + +from tests.integ.modin.utils import assert_frame_equal +from tests.integ.utils.sql_counter import sql_count_checker + + +@sql_count_checker(query_count=1) +def test_from_records_structured_ndarray(): + data = np.array( + [(3, "a"), (2, "b"), (1, "c"), (0, "d")], + dtype=[("col_1", "i4"), ("col_2", "U1")], + ) + assert_frame_equal( + pd.DataFrame.from_records(data), + native_pd.DataFrame.from_records(data), + check_dtype=False, + ) + + +@sql_count_checker(query_count=1) +def test_from_records_list_of_dicts(): + data = [ + {"col_1": 3, "col_2": "a"}, + {"col_1": 2, "col_2": "b"}, + {"col_1": 1, "col_2": "c"}, + {"col_1": 0, "col_2": "d"}, + ] + + assert_frame_equal( + pd.DataFrame.from_records(data), + native_pd.DataFrame.from_records(data), + check_dtype=False, + ) + + +@sql_count_checker(query_count=1) +def test_from_records_list_of_records(): + data = [(3, "a"), (2, "b"), (1, "c"), (0, "d")] + + assert_frame_equal( + pd.DataFrame.from_records(data), + native_pd.DataFrame.from_records(data), + check_dtype=False, + ) + + +@sql_count_checker(query_count=0) +def test_from_records_neg(): + data = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + + with pytest.raises( + NotImplementedError, + match="Snowpark pandas 'DataFrame.from_records' method does not yet support 'data' parameter of type 'DataFrame'", + ): + pd.DataFrame.from_records(data), diff --git a/tests/integ/modin/groupby/test_groupby_basic_agg.py b/tests/integ/modin/groupby/test_groupby_basic_agg.py index 10d1e84c568..ff4636a8bd9 100644 --- a/tests/integ/modin/groupby/test_groupby_basic_agg.py +++ b/tests/integ/modin/groupby/test_groupby_basic_agg.py @@ -284,6 +284,54 @@ def test_groupby_agg_with_float_dtypes_named_agg() -> None: ) +@pytest.mark.parametrize( + "grpby_fn", + [ + lambda gr: gr.quantile(), + lambda gr: gr.quantile(q=0.3), + ], +) +@sql_count_checker(query_count=1) +def test_groupby_agg_quantile_with_int_dtypes(grpby_fn) -> None: + native_df = native_pd.DataFrame( + { + "col1_grp": ["g1", "g2", "g0", "g0", "g2", "g3", "g0", "g2", "g3"], + "col2_int64": np.arange(9, dtype="int64") // 3, + "col3_int_identical": [2] * 9, + "col4_int32": np.arange(9, dtype="int32") // 4, + "col5_int16": np.arange(9, dtype="int16") // 3, + "col6_mixed": np.concatenate( + [ + np.arange(3, dtype="int64") // 3, + np.arange(3, dtype="int32") // 3, + np.arange(3, dtype="int16") // 3, + ] + ), + "col7_int_missing": [5, 6, np.nan, 2, 1, np.nan, 5, np.nan, np.nan], + "col8_mixed_missing": np.concatenate( + [ + np.arange(2, dtype="int64") // 3, + [np.nan], + np.arange(2, dtype="int32") // 3, + [np.nan], + np.arange(2, dtype="int16") // 3, + [np.nan], + ] + ), + } + ) + snowpark_pandas_df = pd.DataFrame(native_df) + by = "col1_grp" + snowpark_pandas_groupby = snowpark_pandas_df.groupby(by=by) + pandas_groupby = native_df.groupby(by=by) + eval_snowpark_pandas_result( + snowpark_pandas_groupby, + pandas_groupby, + grpby_fn, + comparator=assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64, + ) + + @sql_count_checker(query_count=2) def test_groupby_agg_with_int_dtypes(int_to_decimal_float_agg_method) -> None: snowpark_pandas_df = pd.DataFrame( diff --git a/tests/integ/modin/groupby/test_groupby_unique.py b/tests/integ/modin/groupby/test_groupby_unique.py new file mode 100644 index 00000000000..c1e77f81d32 --- /dev/null +++ b/tests/integ/modin/groupby/test_groupby_unique.py @@ -0,0 +1,107 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. 
All rights reserved. +# +import pandas as native_pd +import pytest + +import snowflake.snowpark.modin.plugin # noqa: F401 +from tests.integ.modin.utils import eval_snowpark_pandas_result, create_test_dfs +import re +from tests.integ.utils.sql_counter import sql_count_checker +import json + + +@pytest.mark.parametrize( + "by,level", + [ + ("animal", None), + (["animal"], None), + (["animal", "height_in"], None), + (None, (0, 1)), + (None, (0,)), + (None, "animal_index"), + ], +) +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("dropna", [True, False]) +@pytest.mark.parametrize("axis", ["index", 0]) +@sql_count_checker(query_count=1) +def test_all_params(by, level, as_index, sort, dropna, axis): + eval_snowpark_pandas_result( + *create_test_dfs( + [ + ("Chihuahua", "dog", 6.1), + ("Beagle", "dog", 15.2), + ("Chihuahua", "dog", 6.9), + ("Persian", "cat", 9.2), + ("Persian", None, None), + ("Chihuahua", "dog", 7), + ("Persian", "cat", 8.8), + ], + columns=["breed", "animal", "height_in"], + index=native_pd.MultiIndex.from_tuples( + [ + ("c", "d"), + ("b", "d"), + ("c", "d"), + ("p", "c"), + (None, None), + ("c", "d"), + ("p", "c"), + ], + names=("breed_index", "animal_index"), + ), + ), + lambda df: df.groupby( + by=by, level=level, as_index=as_index, sort=sort, dropna=dropna, axis=axis + )["breed"].unique(), + # pandas fails to propagate attrs through SeriesGroupBy.unique() + test_attrs=False + ) + + +@pytest.mark.xfail( + strict=True, raises=json.decoder.JSONDecodeError, reason="SNOW-1859090" +) +def test_aggregating_string_column_with_nulls(): + eval_snowpark_pandas_result( + *create_test_dfs( + [ + ( + "Chihuahua", + "dog", + ), + ( + "Beagle", + "dog", + ), + ( + "Chihuahua", + "dog", + ), + ( + "Persian", + "cat", + ), + ("Persian", "cat"), + ("Chihuahua", "dog"), + (None, "cat"), + ], + columns=["breed", "animal"], + ), + lambda df: df.groupby("animal")["breed"].unique(), + # pandas fails to propagate attrs through SeriesGroupBy.unique() + test_attrs=False + ) + + +@sql_count_checker(query_count=0) +def test_axis_1(): + eval_snowpark_pandas_result( + *create_test_dfs([["a", "a"], ["c", "d"]]), + lambda df: df.groupby(0, axis=1)[1], + expect_exception=True, + expect_exception_type=ValueError, + expect_exception_match=re.escape(r"Cannot subset columns when using axis=1") + ) diff --git a/tests/integ/modin/interoperability/plotly/test_plotly.py b/tests/integ/modin/interoperability/plotly/test_plotly.py new file mode 100644 index 00000000000..2dc9ae59d55 --- /dev/null +++ b/tests/integ/modin/interoperability/plotly/test_plotly.py @@ -0,0 +1,211 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. +# + +import modin.pandas as pd +import numpy as np +import plotly.express as px +import pytest +import pandas as native_pd + +import snowflake.snowpark.modin.plugin # noqa: F401 +from tests.integ.utils.sql_counter import sql_count_checker +from tests.integ.modin.utils import eval_snowpark_pandas_result + +# Integration tests for plotly.express module (https://plotly.com/python-api-reference/plotly.express.html). +# To add tests for additional APIs, +# - Call the method with Snowpark pandas and native pandas df input and get the JSON representation with +# `to_plotly_json()`. +# - Assert correctness of the plot produced using `assert_plotly_equal` function defined below. 
+ + +def assert_plotly_equal(expect, got): + # referenced from cudf plotly integration test + # https://github.com/rapidsai/cudf/blob/main/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py#L10 + + assert type(expect) == type(got) + if isinstance(expect, dict): + assert expect.keys() == got.keys() + for k in expect.keys(): + assert_plotly_equal(expect[k], got[k]) + elif isinstance(got, list): + assert len(expect) == len(got) + for i in range(len(expect)): + assert_plotly_equal(expect[i], got[i]) + elif isinstance(expect, np.ndarray): + if isinstance(expect[0], float): + np.testing.assert_allclose(expect, got) + else: + assert (expect == got).all() + else: + assert expect == got + + +@pytest.fixture() +def test_dfs(): + nsamps = 50 + rng = np.random.default_rng(seed=42) + data = { + "x": rng.random(nsamps), + "y": rng.random(nsamps), + "category": rng.integers(0, 5, nsamps), + "category2": rng.integers(0, 5, nsamps), + } + snow_df = pd.DataFrame(data) + native_df = native_pd.DataFrame(data) + return snow_df, native_df + + +@sql_count_checker(query_count=1) +def test_scatter(test_dfs): + eval_snowpark_pandas_result( + *test_dfs, + lambda df: px.scatter(df, x="x", y="y").to_plotly_json(), + comparator=assert_plotly_equal + ) + + +@sql_count_checker(query_count=1) +def test_line(test_dfs): + eval_snowpark_pandas_result( + *test_dfs, + lambda df: px.line(df, x="category", y="y").to_plotly_json(), + comparator=assert_plotly_equal + ) + + +@sql_count_checker(query_count=1) +def test_area(test_dfs): + eval_snowpark_pandas_result( + *test_dfs, + lambda df: px.area(df, x="category", y="y").to_plotly_json(), + comparator=assert_plotly_equal + ) + + +@sql_count_checker(query_count=1) +def test_timeline(): + native_df = native_pd.DataFrame( + [ + dict(Task="Job A", Start="2009-01-01", Finish="2009-02-28"), + dict(Task="Job B", Start="2009-03-05", Finish="2009-04-15"), + dict(Task="Job C", Start="2009-02-20", Finish="2009-05-30"), + ] + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: px.timeline( + df, x_start="Start", x_end="Finish", y="Task" + ).to_plotly_json(), + comparator=assert_plotly_equal, + ) + + +@sql_count_checker(query_count=1) +def test_violin(test_dfs): + eval_snowpark_pandas_result( + *test_dfs, + lambda df: px.violin(df, y="y").to_plotly_json(), + comparator=assert_plotly_equal + ) + + +@sql_count_checker(query_count=1) +def test_bar(test_dfs): + eval_snowpark_pandas_result( + *test_dfs, + lambda df: px.bar(df, x="category", y="y").to_plotly_json(), + comparator=assert_plotly_equal + ) + + +@sql_count_checker(query_count=1) +def test_histogram(test_dfs): + eval_snowpark_pandas_result( + *test_dfs, + lambda df: px.histogram(df, x="category").to_plotly_json(), + comparator=assert_plotly_equal + ) + + +@sql_count_checker(query_count=1) +def test_pie(test_dfs): + eval_snowpark_pandas_result( + *test_dfs, + lambda df: px.pie(df, values="category", names="category2").to_plotly_json(), + comparator=assert_plotly_equal + ) + + +@sql_count_checker(query_count=1) +def test_treemap(test_dfs): + eval_snowpark_pandas_result( + *test_dfs, + lambda df: px.treemap(df, names="category", values="y").to_plotly_json(), + comparator=assert_plotly_equal + ) + + +@sql_count_checker(query_count=1) +def test_sunburst(test_dfs): + eval_snowpark_pandas_result( + *test_dfs, + lambda df: px.sunburst(df, names="category", values="y").to_plotly_json(), + comparator=assert_plotly_equal + ) + + +@sql_count_checker(query_count=1) 
+def test_icicle(test_dfs): + eval_snowpark_pandas_result( + *test_dfs, + lambda df: px.icicle(df, names="category", values="y").to_plotly_json(), + comparator=assert_plotly_equal + ) + + +@sql_count_checker(query_count=1) +def test_scatter_matrix(test_dfs): + eval_snowpark_pandas_result( + *test_dfs, + lambda df: px.scatter_matrix(df, dimensions=["category"]).to_plotly_json(), + comparator=assert_plotly_equal + ) + + +@sql_count_checker(query_count=1) +def test_funnel(test_dfs): + eval_snowpark_pandas_result( + *test_dfs, + lambda df: px.funnel(df, x="x", y="y").to_plotly_json(), + comparator=assert_plotly_equal + ) + + +@sql_count_checker(query_count=1) +def test_density_heatmap(test_dfs): + eval_snowpark_pandas_result( + *test_dfs, + lambda df: px.density_heatmap(df, x="x", y="y").to_plotly_json(), + comparator=assert_plotly_equal + ) + + +@sql_count_checker(query_count=1) +def test_box(test_dfs): + eval_snowpark_pandas_result( + *test_dfs, + lambda df: px.box(df, x="category", y="y").to_plotly_json(), + comparator=assert_plotly_equal + ) + + +@sql_count_checker(query_count=4) +def test_imshow(test_dfs): + eval_snowpark_pandas_result( + *test_dfs, + lambda df: px.imshow(df, x=df.columns, y=df.index).to_plotly_json(), + comparator=assert_plotly_equal + ) diff --git a/tests/integ/modin/interoperability/scikit-learn/test_scikit_learn.py b/tests/integ/modin/interoperability/scikit-learn/test_scikit_learn.py new file mode 100644 index 00000000000..971b47c07ae --- /dev/null +++ b/tests/integ/modin/interoperability/scikit-learn/test_scikit_learn.py @@ -0,0 +1,211 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. +# + +from sklearn.decomposition import PCA +from sklearn.preprocessing import MaxAbsScaler + +import snowflake.snowpark.modin.plugin # noqa: F401 +from tests.integ.utils.sql_counter import sql_count_checker +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import RandomizedSearchCV +from sklearn.cluster import KMeans +from tests.integ.modin.utils import create_test_dfs, eval_snowpark_pandas_result +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +import numpy as np +import pytest + +""" +------ +README +------ + +This test suite tests scikit-learn's interoperability with Snowpark pandas. + +Generally, scikit-learn seems to work with Snowpark pandas inputs via a +combination of the dataframe interchange protocol and converting Snowpark +pandas inputs to numpy with methods like __array__() and np.asarray(). Some +scikit-learn methods may cause Snowpark pandas to execute many Snowflake +queries or to materialize Snowpark pandas data one or more times. We don't +plan to fix the performance of scikit-learn with Snowpark pandas inputs, and +we recommend that users convert their data to native pandas before passing it +to scikit-learn if scikit-learn takes too long with Snowpark pandas inputs. + +We group the tests into the following use cases, listed under +https://scikit-learn.org/stable/index.html: + +- Classification +- Regression +- Clustering +- Dimensionality reduction +- Model selection +- Preprocessing + +Many scikit-learn methods produce non-deterministic results, and not all of +them provide a way to seed the results so that they are consistent for a test.
+Generally, we only validate that 1) we can pass Snowpark pandas dataframe/series +into methods that accept native pandas inputs and 2) the outputs have the correct +type and, in case they are numpy arrays, they have the correct shape. + +To test interoperability with a particular scikit-learn method: + +1) Read about what the method does and how to use it +2) Start writing a test case under the test class for the category that the + method belongs to (e.g. under TestClassification for + LinearDiscriminantAnalysis) +3) Construct a use case that works with native pandas and produces a meaningful + result (for example, train a model on pandas training data and fit it to test + data) +4) Write a test case checking that replacing the pandas input with Snowpark + pandas produces results of the same type and, in the case of array-like + outputs, of the same dimensions. `assert_numpy_results_valid` can validate + numpy results. Avoid checking that the values in the result are the same + values we would get if we use pandas, because many scikit-learn methods + are non-deterministic. +5) Wrap the test with an empty sql_count_checker() decorator to see how many + queries and joins it requires. If it requires a very large number of + queries, see if you can simplify the test case so that it causes fewer + queries and the test finishes quickly. If you can't reduce the number of + queries to a reasonable level, you should pass the SQL count checker the + `no_check=True` parameter because the number of queries is likely to vary + across scikit-learn and Snowpark pandas versions, and we don't gain much + insight by adjusting the query count every time it changes. +6) Add a row describing interoperability with the new method in the + [documentation](docs/source/modin/interoperability.rst) +""" + + +def assert_numpy_results_valid(snow_result, pandas_result) -> None: + assert isinstance(snow_result, np.ndarray) + assert isinstance(pandas_result, np.ndarray) + # Generally a meaningful test case should produce a non-empty result + assert pandas_result.size > 0 + assert snow_result.shape == pandas_result.shape + + +@pytest.fixture() +def test_dfs(): + data = { + "feature1": [1, 5, 3, 4, 4, 6, 7, 2, 9, 70], + "feature2": [2, 4, 1, 3, 5, 7, 6, 3, 10, 9], + "target": [0, 0, 1, 0, 1, 1, 1, 0, 1, 0], + } + return create_test_dfs(data) + + +class TestClassification: + @sql_count_checker(query_count=6) + def test_linear_discriminant_analysis(self, test_dfs): + def get_predictions(df) -> np.ndarray: + X = df[["feature1", "feature2"]] + y = df["target"] + train_size = 8 + X_train, X_test = X.iloc[:train_size], X.iloc[train_size:] + y_train = y.iloc[:train_size] + return LinearDiscriminantAnalysis().fit(X_train, y_train).predict(X_test) + + eval_snowpark_pandas_result( + *test_dfs, get_predictions, comparator=assert_numpy_results_valid + ) + + +class TestRegression: + @sql_count_checker(query_count=6) + def test_logistic_regression(self, test_dfs): + def get_predictions(df) -> np.ndarray: + X = df[["feature1", "feature2"]] + y = df["target"] + train_size = 8 + X_train, X_test = X.iloc[:train_size], X.iloc[train_size:] + y_train = y.iloc[:train_size] + return LogisticRegression().fit(X_train, y_train).predict(X_test) + + eval_snowpark_pandas_result( + *test_dfs, get_predictions, comparator=assert_numpy_results_valid + ) + + +class TestClustering: + @sql_count_checker(query_count=3) + def test_clustering(self, test_dfs): + def get_cluster_centers(df) -> np.ndarray: + return
KMeans(n_clusters=3).fit(df).cluster_centers_ + + eval_snowpark_pandas_result( + *test_dfs, get_cluster_centers, comparator=assert_numpy_results_valid + ) + + +class TestDimensionalityReduction: + @sql_count_checker(query_count=3) + def test_principal_component_analysis(self, test_dfs): + def get_principal_components(df) -> np.ndarray: + return PCA(n_components=2).fit(df).components_ + + eval_snowpark_pandas_result( + *test_dfs, get_principal_components, comparator=assert_numpy_results_valid + ) + + +class TestModelSelection: + @sql_count_checker( + # Model search is a complex, iterative process. Pushing it down to + # Snowflake requires many queries (approximately 31 for this case). + # Since the number of queries and the number of joins are so large, they + # are likely to change due to changes in both scikit-learn and Snowpark + # pandas. We don't get much insight from the exact number of queries, so + # we skip the query count check. The recommended solution to this query + # explosion is for users to convert the Snowpark pandas object to pandas + # with to_pandas() and pass the result to scikit-learn. + no_check=True + ) + def test_randomized_search_cv(self, test_dfs): + def get_best_estimator(df) -> dict: + # Initialize the hyperparameter search with parameters that will + # reduce the search time as much as possible. + return ( + RandomizedSearchCV( + LogisticRegression(), + param_distributions={ + "C": [0.001], + }, + # cv=2 means 2-fold validation, which requires the fewest queries. + cv=2, + # Test just one combination of parameters. + n_iter=1, + # refit=False means that the search doesn't have to actually + # train a model using the parameters that it chooses. Setting + # refit=False should further reduce the number of queries. + refit=False, + ) + .fit(df[["feature1", "feature2"]], df["target"]) + .best_params_ + ) + + def validate_search_results(snow_estimator, pandas_estimator): + assert isinstance(snow_estimator, dict) + assert isinstance(pandas_estimator, dict) + + eval_snowpark_pandas_result( + *test_dfs, get_best_estimator, comparator=validate_search_results + ) + + +class TestPreprocessing: + @sql_count_checker(query_count=5) + def test_maxabs(self, test_dfs): + eval_snowpark_pandas_result( + *test_dfs, + MaxAbsScaler().fit_transform, + comparator=assert_numpy_results_valid + ) + + +""" +------ +README +------ + +Please see the README at the top of this file for instructions on adding test +cases. 
+""" diff --git a/tests/integ/modin/series/test_apply_and_map.py b/tests/integ/modin/series/test_apply_and_map.py index f776863fa6e..81a64bee99a 100644 --- a/tests/integ/modin/series/test_apply_and_map.py +++ b/tests/integ/modin/series/test_apply_and_map.py @@ -169,7 +169,7 @@ def create_func_with_return_type_hint(func: Callable, return_type: str) -> Calla return d["f"] -TEST_NUMPY_FUNCS = [np.min, np.sqrt, np.tan, np.sum, np.median] +TEST_NUMPY_FUNCS = [np.min, np.sqrt, np.tan, np.sum, np.square, np.log1p, np.exp2] @pytest.mark.parametrize("method", ["apply", "map"]) @@ -412,7 +412,7 @@ def test_builtin_function(self, method, func): ) @pytest.mark.parametrize("func", TEST_NUMPY_FUNCS) - @sql_count_checker(query_count=4, udf_count=1) + @sql_count_checker(query_count=1) def test_apply_and_map_numpy(self, method, func): data = [1.0, 2.0, 3.0] native_series = native_pd.Series(data) diff --git a/tests/integ/modin/series/test_between.py b/tests/integ/modin/series/test_between.py new file mode 100644 index 00000000000..bb40808eed8 --- /dev/null +++ b/tests/integ/modin/series/test_between.py @@ -0,0 +1,135 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. +# + +import datetime as dt + +import numpy as np +import pandas as native_pd +import pytest + +import modin.pandas as pd +import snowflake.snowpark.modin.plugin # noqa: F401 +from snowflake.snowpark.exceptions import SnowparkSQLException +from tests.integ.modin.utils import ( + eval_snowpark_pandas_result, + create_test_series, + assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, +) +from tests.integ.utils.sql_counter import sql_count_checker + + +@sql_count_checker(query_count=1) +def test_series_between_default_inclusive(): + eval_snowpark_pandas_result( + *create_test_series(list(range(0, 10))), lambda ser: ser.between(2, 8) + ) + + +# tuples of (data, low, high) +BETWEEN_TEST_ARGUMENTS = [ + ([0.8, 1.1, 0.9, 1.2], 0.9, 1.1), + ([0.8, -1.1, 0.9, 1.2], -1, 1.14), + # strings are compared lexicographically + (["quick", "brown", "fox", "Quick", "Brown", "Fox"], "Zeta", "alpha"), + (["quick", "brown", "fox", "Quick", "Brown", "Fox"], "Delta", "kappa"), + ( + [ + dt.datetime(2024, 10, 11, 17, 5), + dt.datetime(2020, 1, 2, 2, 40), + dt.datetime(1998, 7, 7, 12, 33), + ], + dt.datetime(2019, 1, 1, 0, 0), + dt.datetime(2021, 1, 1, 0, 0), + ), +] + + +@pytest.mark.parametrize("data, low, high", BETWEEN_TEST_ARGUMENTS) +@pytest.mark.parametrize("inclusive", ["both", "neither", "left", "right"]) +@sql_count_checker(query_count=1) +def test_series_between(data, low, high, inclusive): + eval_snowpark_pandas_result( + *create_test_series(data), lambda ser: ser.between(low, high, inclusive) + ) + + +@pytest.mark.parametrize( + "data, low, high", + [ + ([0.8, 1.1, 0.9, 1.2, np.nan], np.nan, 1.1), + ([0.8, 1.1, 0.9, 1.2, np.nan], np.nan, np.nan), + ([0.8, 1.1, 0.9, 1.2, np.nan], -1, 1.1), + ([None, "", "aa", "aaaa"], "", "aaa"), + ([None, "", "aa", "aaaa"], None, "aaa"), + ([None, "", "aa", "aaaa"], None, None), + ], +) +@pytest.mark.parametrize("inclusive", ["both", "neither", "left", "right"]) +@sql_count_checker(query_count=1) +def test_series_between_with_nulls(data, low, high, inclusive): + # In Snowflake SQL, comparisons with NULL will always result in a NULL value in the output. + # Any comparison with NULL will return NULL, though the conjunction FALSE AND NULL will + # short-circuit and return FALSE. 
+ eval_snowpark_pandas_result( + *create_test_series(data), + lambda ser: ser.between(low, high, inclusive).astype(bool), + ) + + +@pytest.mark.parametrize("data, low, high", BETWEEN_TEST_ARGUMENTS) +@sql_count_checker(query_count=1) +def test_series_between_flip_left_right(data, low, high): + # When left/right are out of order, comparisons are still performed (high >= low is not enforced) + eval_snowpark_pandas_result( + *create_test_series(data), lambda ser: ser.between(high, low) + ) + + +@pytest.mark.parametrize( + "data, low, high", + [ + ([1, 2, 3, 4], [0, 1, 2, 3], [1.1, 1.9, 3.1, 3.9]), + (["a", "b", "aa", "aaa"], ["", "a", "ccc", "aaaa"], ["c", "bb", "aaa", "d"]), + ], +) +@pytest.mark.parametrize("inclusive", ["both", "neither", "left", "right"]) +@sql_count_checker(query_count=1, join_count=3) +def test_series_between_series(data, low, high, inclusive): + eval_snowpark_pandas_result( + *create_test_series(data), + lambda ser: ser.between( + pd.Series(high) if isinstance(ser, pd.Series) else native_pd.Series(high), + pd.Series(low) if isinstance(ser, pd.Series) else native_pd.Series(low), + inclusive, + ), + ) + + +@sql_count_checker(query_count=1, join_count=3) +def test_series_between_series_different_dimensions(): + # When attempting to compare with low/high of different lengths, Snowflake will leave NULLs + # but pandas will raise an error. + data = [1.1] + low = [1, 2] + high = [1, 2, 3] + assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( + pd.Series(data).between(low, high), + native_pd.Series([False]), + ) + with pytest.raises( + ValueError, match="Can only compare identically-labeled Series objects" + ): + native_pd.Series(data).between(native_pd.Series(low), native_pd.Series(high)) + + +@sql_count_checker(query_count=0) +def test_series_between_invalid_comparison(): + with pytest.raises( + TypeError, match="Invalid comparison between dtype=int64 and str" + ): + native_pd.Series([1]).between("a", "b") + with pytest.raises( + SnowparkSQLException, match="Numeric value 'a' is not recognized" + ): + pd.Series([1]).between("a", "b").to_pandas() diff --git a/tests/integ/modin/series/test_dt_accessor.py b/tests/integ/modin/series/test_dt_accessor.py index b41ad143afb..d687f8fc277 100644 --- a/tests/integ/modin/series/test_dt_accessor.py +++ b/tests/integ/modin/series/test_dt_accessor.py @@ -13,6 +13,7 @@ import snowflake.snowpark.modin.plugin # noqa: F401 from tests.integ.modin.utils import create_test_series, eval_snowpark_pandas_result from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker +from tests.utils import IS_WINDOWS dt_properties = pytest.mark.parametrize( "property_name", @@ -433,6 +434,79 @@ def test_days_in_month(property): ) +@sql_count_checker(query_count=1) +@pytest.mark.parametrize( + "date_format", + [ + "a%d-%m-%Y-%H-%M-%S-%f-%j-%X-%%b", + "%d-%m-%Y-%H-%M-%S-%f-%j-%X-%%b", + "a%d-%m-%Y-%H-%M-%S-%f-%j-%X-%%", + "%d-%m-%Y-%H-%M-%S-%f-%j-%X-%%", + "%%%M", + "%%M", + "abc%", + ], +) +def test_strftime(date_format): + if IS_WINDOWS and date_format == "abc%": + pytest.skip( + "Windows test shows that native pandas leaves the input value unchanged when date_format='abc%'" + ) + + datetime_index = native_pd.DatetimeIndex( + [ + "2014-04-04 23:56:01.000000001", + "2014-07-18 21:24:02.000000002", + "2015-11-22 22:14:03.000000003", + "2015-11-23 20:12:04.1234567890", + pd.NaT, + ], + ) + native_ser = native_pd.Series(datetime_index) + snow_ser = pd.Series(native_ser) + eval_snowpark_pandas_result( + snow_ser, + native_ser, + lambda s: 
s.dt.strftime(date_format=date_format), + ) + + +@sql_count_checker(query_count=0) +@pytest.mark.parametrize( + "date_format", + [ + "%a", + "%A", + "%w", + "%b", + "%B", + "%y", + "%I", + "%p", + "%z", + "%Z", + "%U", + "%W", + "%c", + "%x", + ], +) +def test_strftime_neg(date_format): + datetime_index = native_pd.DatetimeIndex( + [ + "2014-04-04 23:56:01.000000001", + "2014-07-18 21:24:02.000000002", + "2015-11-22 22:14:03.000000003", + "2015-11-23 20:12:04.1234567890", + pd.NaT, + ], + ) + native_ser = native_pd.Series(datetime_index) + snow_ser = pd.Series(native_ser) + with pytest.raises(NotImplementedError): + snow_ser.dt.strftime(date_format=date_format) + + @dt_properties @sql_count_checker(query_count=1) def test_dt_property_with_tz(property_name): diff --git a/tests/integ/modin/series/test_dt_accessor_unsupported.py b/tests/integ/modin/series/test_dt_accessor_unsupported.py index 82978886f1e..dbe70dac01e 100644 --- a/tests/integ/modin/series/test_dt_accessor_unsupported.py +++ b/tests/integ/modin/series/test_dt_accessor_unsupported.py @@ -26,42 +26,3 @@ def test_dt_namespace_accessor_datetime64(self, freq): msg = "Snowpark pandas doesn't yet support the property 'Series.dt.freq'" with pytest.raises(NotImplementedError, match=msg): ser.dt.freq - - @pytest.mark.parametrize( - "date, format_string, expected", - [ - ( - native_pd.date_range("20130101", periods=5), - "%Y/%m/%d", - native_pd.Series( - [ - "2013/01/01", - "2013/01/02", - "2013/01/03", - "2013/01/04", - "2013/01/05", - ] - ), - ), - ( - native_pd.date_range("2015-02-03 11:22:33.4567", periods=5), - "%Y/%m/%d %H-%M-%S", - native_pd.Series( - [ - "2015/02/03 11-22-33", - "2015/02/04 11-22-33", - "2015/02/05 11-22-33", - "2015/02/06 11-22-33", - "2015/02/07 11-22-33", - ] - ), - ), - ], - ) - @sql_count_checker(query_count=0) - def test_strftime(self, date, format_string, expected): - # GH 10086 - ser = pd.Series(date) - msg = "Snowpark pandas doesn't yet support the method 'Series.dt.strftime'" - with pytest.raises(NotImplementedError, match=msg): - ser.dt.strftime(format_string) diff --git a/tests/integ/modin/test_apply_snowpark_python_functions.py b/tests/integ/modin/test_apply_snowpark_python_functions.py index 5e5911a92c5..61982714380 100644 --- a/tests/integ/modin/test_apply_snowpark_python_functions.py +++ b/tests/integ/modin/test_apply_snowpark_python_functions.py @@ -10,7 +10,8 @@ import pytest from tests.integ.modin.utils import assert_frame_equal, assert_series_equal -from tests.integ.utils.sql_counter import sql_count_checker +from tests.integ.utils.sql_counter import sql_count_checker, SqlCounter +from tests.utils import running_on_jenkins @sql_count_checker(query_count=4) @@ -71,21 +72,48 @@ def test_apply_snowpark_python_function_not_implemented(): pd.DataFrame({"a": [1, 2, 3]}).apply(asc, args=(1, 2)) -@sql_count_checker(query_count=1) -@pytest.mark.skip("SNOW-1758914 snowflake.cortex.summarize error on GCP") -def test_apply_snowflake_cortex_summarize(): +@pytest.mark.skipif( + running_on_jenkins(), + reason="TODO: SNOW-1859087 snowflake.cortex.summarize SSL error", +) +def test_apply_snowflake_cortex_summarize(session): from snowflake.snowpark.functions import snowflake_cortex_summarize - content = """pandas on Snowflake lets you run your pandas code in a distributed manner directly on your data in - Snowflake. Just by changing the import statement and a few lines of code, you can get the familiar pandas experience - you know and love with the scalability and security benefits of Snowflake. 
With pandas on Snowflake, you can work - with much larger datasets and avoid the time and expense of porting your pandas pipelines to other big data - frameworks or provisioning large and expensive machines. It runs workloads natively in Snowflake through - transpilation to SQL, enabling it to take advantage of parallelization and the data governance and security - benefits of Snowflake. pandas on Snowflake is delivered through the Snowpark pandas API as part of the Snowpark - Python library, which enables scalable data processing of Python code within the Snowflake platform. -""" - s = pd.Series([content]) - summary = s.apply(snowflake_cortex_summarize).iloc[0] - # this length check is to get around the fact that this function may not be deterministic - assert 0 < len(summary) < len(content) + # TODO: SNOW-1758914 snowflake.cortex.summarize error on GCP + with SqlCounter(query_count=0): + if session.connection.host == "sfctest0.us-central1.gcp.snowflakecomputing.com": + return + + with SqlCounter(query_count=1): + content = """pandas on Snowflake lets you run your pandas code in a distributed manner directly on your data in + Snowflake. Just by changing the import statement and a few lines of code, you can get the familiar pandas experience + you know and love with the scalability and security benefits of Snowflake. With pandas on Snowflake, you can work + with much larger datasets and avoid the time and expense of porting your pandas pipelines to other big data + frameworks or provisioning large and expensive machines. It runs workloads natively in Snowflake through + transpilation to SQL, enabling it to take advantage of parallelization and the data governance and security + benefits of Snowflake. pandas on Snowflake is delivered through the Snowpark pandas API as part of the Snowpark + Python library, which enables scalable data processing of Python code within the Snowflake platform. + """ + s = pd.Series([content]) + summary = s.apply(snowflake_cortex_summarize).iloc[0] + # this length check is to get around the fact that this function may not be deterministic + assert 0 < len(summary) < len(content) + + +@pytest.mark.skipif( + running_on_jenkins(), + reason="TODO: SNOW-1859087 snowflake.cortex.sentiment SSL error", +) +def test_apply_snowflake_cortex_sentiment(session): + from snowflake.snowpark.functions import snowflake_cortex_sentiment + + # TODO: SNOW-1758914 snowflake.cortex.sentiment error on GCP + with SqlCounter(query_count=0): + if session.connection.host == "sfctest0.us-central1.gcp.snowflakecomputing.com": + return + + with SqlCounter(query_count=1): + content = "A very very bad review!" + s = pd.Series([content]) + sentiment = s.apply(snowflake_cortex_sentiment).iloc[0] + assert -1 <= sentiment <= 0 diff --git a/tests/integ/modin/test_scikit.py b/tests/integ/modin/test_scikit.py deleted file mode 100644 index c28a6f59ba9..00000000000 --- a/tests/integ/modin/test_scikit.py +++ /dev/null @@ -1,25 +0,0 @@ -# -# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. 
-# - -import modin.pandas as pd -from sklearn.decomposition import PCA -from sklearn.preprocessing import MaxAbsScaler - -import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.utils.sql_counter import sql_count_checker - - -@sql_count_checker(query_count=5) -def test_maxabs(): - data = [[1.0, -1.0, 2.0], [2.0, 0.0, 0.0], [0.0, 1.0, -1.0]] - X = pd.DataFrame(data) - MaxAbsScaler().fit_transform(X) - - -@sql_count_checker(query_count=3) -def test_pca(): - data = [[1.0, -1.0, 2.0], [2.0, 0.0, 0.0], [0.0, 1.0, -1.0]] - X = pd.DataFrame(data) - pca = PCA() - pca.fit(X) diff --git a/tests/integ/scala/test_datatype_suite.py b/tests/integ/scala/test_datatype_suite.py index 63c24b7e35c..71dd6d6b1d8 100644 --- a/tests/integ/scala/test_datatype_suite.py +++ b/tests/integ/scala/test_datatype_suite.py @@ -9,6 +9,7 @@ import pytest +import snowflake.snowpark.context as context from snowflake.connector.options import installed_pandas from snowflake.snowpark import Row from snowflake.snowpark.exceptions import SnowparkSQLException @@ -57,24 +58,28 @@ # make sure dataframe creation is the same as _create_test_dataframe -_STRUCTURE_DATAFRAME_QUERY = """ +_STRUCTURED_DATAFRAME_QUERY = """ select object_construct('k1', 1) :: map(varchar, int) as map, - object_construct('A', 'foo', 'B', 0.05) :: object(A varchar, B float) as obj, + object_construct('A', 'foo', 'b', 0.05) :: object(A varchar, b float) as obj, [1.0, 3.1, 4.5] :: array(float) as arr """ -# make sure dataframe creation is the same as _STRUCTURE_DATAFRAME_QUERY -def _create_test_dataframe(s): +# make sure dataframe creation is the same as _STRUCTURED_DATAFRAME_QUERY +def _create_test_dataframe(s, structured_type_support): + nested_field_name = "b" if structured_type_support else "B" df = s.create_dataframe([1], schema=["a"]).select( object_construct(lit("k1"), lit(1)) .cast(MapType(StringType(), IntegerType(), structured=True)) .alias("map"), - object_construct(lit("A"), lit("foo"), lit("B"), lit(0.05)) + object_construct(lit("A"), lit("foo"), lit(nested_field_name), lit(0.05)) .cast( StructType( - [StructField("A", StringType()), StructField("B", DoubleType())], + [ + StructField("A", StringType()), + StructField(nested_field_name, DoubleType()), + ], structured=True, ) ) @@ -86,55 +91,6 @@ def _create_test_dataframe(s): return df -STRUCTURED_TYPES_EXAMPLES = { - True: ( - _STRUCTURE_DATAFRAME_QUERY, - [ - ("MAP", "map"), - ("OBJ", "struct"), - ("ARR", "array"), - ], - StructType( - [ - StructField( - "MAP", - MapType(StringType(16777216), LongType(), structured=True), - nullable=True, - ), - StructField( - "OBJ", - StructType( - [ - StructField("A", StringType(16777216), nullable=True), - StructField("B", DoubleType(), nullable=True), - ], - structured=True, - ), - nullable=True, - ), - StructField( - "ARR", ArrayType(DoubleType(), structured=True), nullable=True - ), - ] - ), - ), - False: ( - _STRUCTURE_DATAFRAME_QUERY, - [ - ("MAP", "map"), - ("OBJ", "map"), - ("ARR", "array"), - ], - StructType( - [ - StructField("MAP", MapType(StringType(), StringType()), nullable=True), - StructField("OBJ", MapType(StringType(), StringType()), nullable=True), - StructField("ARR", ArrayType(StringType()), nullable=True), - ] - ), - ), -} - ICEBERG_CONFIG = { "catalog": "SNOWFLAKE", "external_volume": "python_connector_iceberg_exvol", @@ -142,6 +98,61 @@ def _create_test_dataframe(s): } +def _create_example(structured_types_enabled): + if structured_types_enabled: + return ( + _STRUCTURED_DATAFRAME_QUERY, + [ + ("MAP", "map"), + ("OBJ", 
"struct"), + ("ARR", "array"), + ], + StructType( + [ + StructField( + "MAP", + MapType(StringType(16777216), LongType(), structured=True), + nullable=True, + ), + StructField( + "OBJ", + StructType( + [ + StructField("A", StringType(16777216), nullable=True), + StructField("b", DoubleType(), nullable=True), + ], + structured=True, + ), + nullable=True, + ), + StructField( + "ARR", ArrayType(DoubleType(), structured=True), nullable=True + ), + ] + ), + ) + else: + return ( + _STRUCTURED_DATAFRAME_QUERY, + [ + ("MAP", "map"), + ("OBJ", "map"), + ("ARR", "array"), + ], + StructType( + [ + StructField( + "MAP", MapType(StringType(), StringType()), nullable=True + ), + StructField( + "OBJ", MapType(StringType(), StringType()), nullable=True + ), + StructField("ARR", ArrayType(StringType()), nullable=True), + ] + ), + ) + + @pytest.fixture(scope="module") def structured_type_support(session, local_testing_mode): yield structured_types_supported(session, local_testing_mode) @@ -149,14 +160,17 @@ def structured_type_support(session, local_testing_mode): @pytest.fixture(scope="module") def examples(structured_type_support): - yield STRUCTURED_TYPES_EXAMPLES[structured_type_support] + yield _create_example(structured_type_support) @pytest.fixture(scope="module") def structured_type_session(session, structured_type_support): if structured_type_support: with structured_types_enabled_session(session) as sess: + semantics_enabled = context._should_use_structured_type_semantics + context._should_use_structured_type_semantics = True yield sess + context._should_use_structured_type_semantics = semantics_enabled else: yield session @@ -365,9 +379,9 @@ def test_dtypes(session): "config.getoption('local_testing_mode', default=False)", reason="FEAT: SNOW-1372813 Cast to StructType not supported", ) -def test_structured_dtypes(structured_type_session, examples): +def test_structured_dtypes(structured_type_session, examples, structured_type_support): query, expected_dtypes, expected_schema = examples - df = _create_test_dataframe(structured_type_session) + df = _create_test_dataframe(structured_type_session, structured_type_support) assert df.schema == expected_schema assert df.dtypes == expected_dtypes @@ -380,13 +394,16 @@ def test_structured_dtypes(structured_type_session, examples): "config.getoption('local_testing_mode', default=False)", reason="FEAT: SNOW-1372813 Cast to StructType not supported", ) -def test_structured_dtypes_select(structured_type_session, examples): +def test_structured_dtypes_select( + structured_type_session, examples, structured_type_support +): query, expected_dtypes, expected_schema = examples - df = _create_test_dataframe(structured_type_session) + df = _create_test_dataframe(structured_type_session, structured_type_support) + nested_field_name = "b" if context._should_use_structured_type_semantics else "B" flattened_df = df.select( df.map["k1"].alias("value1"), df.obj["A"].alias("a"), - col("obj")["B"].alias("b"), + col("obj")[nested_field_name].alias("b"), df.arr[0].alias("value2"), df.arr[1].alias("value3"), col("arr")[2].alias("value4"), @@ -395,7 +412,7 @@ def test_structured_dtypes_select(structured_type_session, examples): [ StructField("VALUE1", LongType(), nullable=True), StructField("A", StringType(16777216), nullable=True), - StructField("B", DoubleType(), nullable=True), + StructField(nested_field_name, DoubleType(), nullable=True), StructField("VALUE2", DoubleType(), nullable=True), StructField("VALUE3", DoubleType(), nullable=True), StructField("VALUE4", 
DoubleType(), nullable=True), @@ -420,11 +437,13 @@ def test_structured_dtypes_select(structured_type_session, examples): reason="FEAT: SNOW-1372813 Cast to StructType not supported", ) def test_structured_dtypes_pandas(structured_type_session, structured_type_support): - pdf = _create_test_dataframe(structured_type_session).to_pandas() + pdf = _create_test_dataframe( + structured_type_session, structured_type_support + ).to_pandas() if structured_type_support: assert ( pdf.to_json() - == '{"MAP":{"0":[["k1",1.0]]},"OBJ":{"0":{"A":"foo","B":0.05}},"ARR":{"0":[1.0,3.1,4.5]}}' + == '{"MAP":{"0":[["k1",1.0]]},"OBJ":{"0":{"A":"foo","b":0.05}},"ARR":{"0":[1.0,3.1,4.5]}}' ) else: assert ( @@ -445,7 +464,7 @@ def test_structured_dtypes_iceberg( and iceberg_supported(structured_type_session, local_testing_mode) ): pytest.skip("Test requires iceberg support and structured type support.") - query, expected_dtypes, expected_schema = STRUCTURED_TYPES_EXAMPLES[True] + query, expected_dtypes, expected_schema = _create_example(True) table_name = f"snowpark_structured_dtypes_{uuid.uuid4().hex[:5]}".upper() dynamic_table_name = f"snowpark_dynamic_iceberg_{uuid.uuid4().hex[:5]}".upper() @@ -467,7 +486,7 @@ def test_structured_dtypes_iceberg( ) assert save_ddl[0][0] == ( f"create or replace ICEBERG TABLE {table_name.upper()} (\n\t" - "MAP MAP(STRING, LONG),\n\tOBJ OBJECT(A STRING, B DOUBLE),\n\tARR ARRAY(DOUBLE)\n)\n " + "MAP MAP(STRING, LONG),\n\tOBJ OBJECT(A STRING, b DOUBLE),\n\tARR ARRAY(DOUBLE)\n)\n " "EXTERNAL_VOLUME = 'PYTHON_CONNECTOR_ICEBERG_EXVOL'\n CATALOG = 'SNOWFLAKE'\n " "BASE_LOCATION = 'python_connector_merge_gate/';" ) @@ -524,27 +543,27 @@ def test_iceberg_nested_fields( "NESTED_DATA", StructType( [ - StructField('"camelCase"', StringType(), nullable=True), - StructField('"snake_case"', StringType(), nullable=True), - StructField('"PascalCase"', StringType(), nullable=True), + StructField("camelCase", StringType(), nullable=True), + StructField("snake_case", StringType(), nullable=True), + StructField("PascalCase", StringType(), nullable=True), StructField( - '"nested_map"', + "nested_map", MapType( StringType(), StructType( [ StructField( - '"inner_camelCase"', + "inner_camelCase", StringType(), nullable=True, ), StructField( - '"inner_snake_case"', + "inner_snake_case", StringType(), nullable=True, ), StructField( - '"inner_PascalCase"', + "inner_PascalCase", StringType(), nullable=True, ), @@ -600,11 +619,14 @@ def test_iceberg_nested_fields( Utils.drop_table(structured_type_session, transformed_table_name) -@pytest.mark.skip( - reason="SNOW-1819531: Error in _contains_external_cte_ref when analyzing lqb" +@pytest.mark.xfail( + "config.getoption('local_testing_mode', default=False)", + reason="local testing does not fully support structured types yet.", + run=False, ) +@pytest.mark.parametrize("cte_enabled", [True, False]) def test_struct_dtype_iceberg_lqb( - structured_type_session, local_testing_mode, structured_type_support + structured_type_session, local_testing_mode, structured_type_support, cte_enabled ): if not ( structured_type_support @@ -641,12 +663,14 @@ def test_struct_dtype_iceberg_lqb( is_query_compilation_stage_enabled = ( structured_type_session._query_compilation_stage_enabled ) + is_cte_optimization_enabled = structured_type_session._cte_optimization_enabled is_large_query_breakdown_enabled = ( structured_type_session._large_query_breakdown_enabled ) original_bounds = structured_type_session._large_query_breakdown_complexity_bounds try: 
structured_type_session._query_compilation_stage_enabled = True + structured_type_session._cte_optimization_enabled = cte_enabled structured_type_session._large_query_breakdown_enabled = True structured_type_session._large_query_breakdown_complexity_bounds = (300, 600) @@ -707,6 +731,7 @@ def test_struct_dtype_iceberg_lqb( structured_type_session._query_compilation_stage_enabled = ( is_query_compilation_stage_enabled ) + structured_type_session._cte_optimization_enabled = is_cte_optimization_enabled structured_type_session._large_query_breakdown_enabled = ( is_large_query_breakdown_enabled ) @@ -730,11 +755,11 @@ def test_structured_dtypes_iceberg_create_from_values( ): pytest.skip("Test requires iceberg support and structured type support.") - _, __, expected_schema = STRUCTURED_TYPES_EXAMPLES[True] + _, __, expected_schema = _create_example(True) table_name = f"snowpark_structured_dtypes_{uuid.uuid4().hex[:5]}" data = [ - ({"x": 1}, {"A": "a", "B": 1}, [1, 1, 1]), - ({"x": 2}, {"A": "b", "B": 2}, [2, 2, 2]), + ({"x": 1}, {"A": "a", "b": 1}, [1, 1, 1]), + ({"x": 2}, {"A": "b", "b": 2}, [2, 2, 2]), ] try: create_df = structured_type_session.create_dataframe( @@ -760,7 +785,7 @@ def test_structured_dtypes_iceberg_udf( and iceberg_supported(structured_type_session, local_testing_mode) ): pytest.skip("Test requires iceberg support and structured type support.") - query, expected_dtypes, expected_schema = STRUCTURED_TYPES_EXAMPLES[True] + query, expected_dtypes, expected_schema = _create_example(True) table_name = f"snowpark_structured_dtypes_udf_test{uuid.uuid4().hex[:5]}" @@ -945,8 +970,8 @@ def test_structured_type_print_schema( " | |-- key: StringType()\n" " | |-- value: ArrayType\n" " | | |-- element: StructType\n" - ' | | | |-- "FIELD1": StringType() (nullable = True)\n' - ' | | | |-- "FIELD2": LongType() (nullable = True)\n' + ' | | | |-- "Field1": StringType() (nullable = True)\n' + ' | | | |-- "Field2": LongType() (nullable = True)\n' ) # Test that depth works as expected diff --git a/tests/integ/test_catalog.py b/tests/integ/test_catalog.py new file mode 100644 index 00000000000..2f9430c645b --- /dev/null +++ b/tests/integ/test_catalog.py @@ -0,0 +1,527 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. 
+# + +from unittest.mock import patch +import uuid +import pytest + +from snowflake.snowpark._internal.analyzer.analyzer_utils import unquote_if_quoted +from snowflake.snowpark.catalog import Catalog +from snowflake.snowpark.session import Session +from snowflake.snowpark.types import IntegerType + +pytestmark = [ + pytest.mark.xfail( + "config.getoption('local_testing_mode', default=False)", + reason="deepcopy is not supported and required by local testing", + run=False, + ) +] + +CATALOG_TEMP_OBJECT_PREFIX = "SP_CATALOG_TEMP" + + +def get_temp_name(type: str) -> str: + return f"{CATALOG_TEMP_OBJECT_PREFIX}_{type}_{uuid.uuid4().hex[:6]}".upper() + + +def create_temp_db(session) -> str: + original_db = session.get_current_database() + temp_db = get_temp_name("DB") + session._run_query(f"create or replace database {temp_db}") + session.use_database(original_db) + return temp_db + + +@pytest.fixture(scope="module") +def temp_db1(session): + temp_db = create_temp_db(session) + yield temp_db + session._run_query(f"drop database if exists {temp_db}") + + +@pytest.fixture(scope="module") +def temp_db2(session): + temp_db = create_temp_db(session) + yield temp_db + session._run_query(f"drop database if exists {temp_db}") + + +def create_temp_schema(session, db: str) -> str: + original_db = session.get_current_database() + original_schema = session.get_current_schema() + temp_schema = get_temp_name("SCHEMA") + session._run_query(f"create or replace schema {db}.{temp_schema}") + + session.use_database(original_db) + session.use_schema(original_schema) + return temp_schema + + +@pytest.fixture(scope="module") +def temp_schema1(session, temp_db1): + temp_schema = create_temp_schema(session, temp_db1) + yield temp_schema + session._run_query(f"drop schema if exists {temp_db1}.{temp_schema}") + + +@pytest.fixture(scope="module") +def temp_schema2(session, temp_db1): + temp_schema = create_temp_schema(session, temp_db1) + yield temp_schema + session._run_query(f"drop schema if exists {temp_db1}.{temp_schema}") + + +def create_temp_table(session, db: str, schema: str) -> str: + temp_table = get_temp_name("TABLE") + session._run_query( + f"create or replace temp table {db}.{schema}.{temp_table} (a int, b string)" + ) + return temp_table + + +@pytest.fixture(scope="module") +def temp_table1(session, temp_db1, temp_schema1): + temp_table = create_temp_table(session, temp_db1, temp_schema1) + yield temp_table + session._run_query(f"drop table if exists {temp_db1}.{temp_schema1}.{temp_table}") + + +@pytest.fixture(scope="module") +def temp_table2(session, temp_db1, temp_schema1): + temp_table = create_temp_table(session, temp_db1, temp_schema1) + yield temp_table + session._run_query(f"drop table if exists {temp_db1}.{temp_schema1}.{temp_table}") + + +def create_temp_view(session, db: str, schema: str) -> str: + temp_schema = get_temp_name("VIEW") + session._run_query( + f"create or replace temp view {db}.{schema}.{temp_schema} as select 1 as a, '2' as b" + ) + return temp_schema + + +@pytest.fixture(scope="module") +def temp_view1(session, temp_db1, temp_schema1): + temp_view = create_temp_view(session, temp_db1, temp_schema1) + yield temp_view + session._run_query(f"drop view if exists {temp_db1}.{temp_schema1}.{temp_view}") + + +@pytest.fixture(scope="module") +def temp_view2(session, temp_db1, temp_schema1): + temp_view = create_temp_view(session, temp_db1, temp_schema1) + yield temp_view + session._run_query(f"drop view if exists {temp_db1}.{temp_schema1}.{temp_view}") + + +def 
create_temp_procedure(session: Session, db, schema) -> str: + temp_procedure = get_temp_name("PROCEDURE") + session.sproc.register( + lambda _, x: x + 1, + return_type=IntegerType(), + input_types=[IntegerType()], + name=f"{db}.{schema}.{temp_procedure}", + packages=["snowflake-snowpark-python"], + ) + return temp_procedure + + +@pytest.fixture(scope="module") +def temp_procedure1(session, temp_db1, temp_schema1): + temp_procedure = create_temp_procedure(session, temp_db1, temp_schema1) + yield temp_procedure + session._run_query( + f"drop procedure if exists {temp_db1}.{temp_schema1}.{temp_procedure}(int)" + ) + + +@pytest.fixture(scope="module") +def temp_procedure2(session, temp_db1, temp_schema1): + temp_procedure = create_temp_procedure(session, temp_db1, temp_schema1) + yield temp_procedure + session._run_query( + f"drop procedure if exists {temp_db1}.{temp_schema1}.{temp_procedure}(int)" + ) + + +def create_temp_udf(session: Session, db, schema) -> str: + temp_udf = get_temp_name("UDF") + session.udf.register( + lambda x: x + 1, + return_type=IntegerType(), + input_types=[IntegerType()], + name=f"{db}.{schema}.{temp_udf}", + ) + return temp_udf + + +@pytest.fixture(scope="module") +def temp_udf1(session, temp_db1, temp_schema1): + temp_udf = create_temp_udf(session, temp_db1, temp_schema1) + yield temp_udf + session._run_query( + f"drop function if exists {temp_db1}.{temp_schema1}.{temp_udf}(int)" + ) + + +@pytest.fixture(scope="module") +def temp_udf2(session, temp_db1, temp_schema1): + temp_udf = create_temp_udf(session, temp_db1, temp_schema1) + yield temp_udf + session._run_query( + f"drop function if exists {temp_db1}.{temp_schema1}.{temp_udf}(int)" + ) + + +DOES_NOT_EXIST_PATTERN = "does_not_exist_.*" + + +def test_list_db(session, temp_db1, temp_db2): + catalog: Catalog = session.catalog + db_list = catalog.list_databases(pattern=f"{CATALOG_TEMP_OBJECT_PREFIX}_DB_*") + assert {db.name for db in db_list} >= {temp_db1, temp_db2} + + db_list = catalog.list_databases(like=f"{CATALOG_TEMP_OBJECT_PREFIX}_DB_%") + assert {db.name for db in db_list} >= {temp_db1, temp_db2} + + +def test_list_schema(session, temp_db1, temp_schema1, temp_schema2): + catalog: Catalog = session.catalog + assert ( + len(catalog.list_databases(pattern=f"{CATALOG_TEMP_OBJECT_PREFIX}_SCHEMA_.*")) + == 0 + ) + + schema_list = catalog.list_schemas( + pattern=f"{CATALOG_TEMP_OBJECT_PREFIX}_SCHEMA_.*", database=temp_db1 + ) + assert {schema.name for schema in schema_list} >= {temp_schema1, temp_schema2} + + schema_list = catalog.list_schemas( + like=f"{CATALOG_TEMP_OBJECT_PREFIX}_SCHEMA_%", database=temp_db1 + ) + assert {schema.name for schema in schema_list} >= {temp_schema1, temp_schema2} + + +def test_list_tables(session, temp_db1, temp_schema1, temp_table1, temp_table2): + catalog: Catalog = session.catalog + + assert len(catalog.list_tables(pattern=DOES_NOT_EXIST_PATTERN)) == 0 + assert ( + len( + catalog.list_tables( + pattern=DOES_NOT_EXIST_PATTERN, database=temp_db1, schema=temp_schema1 + ) + ) + == 0 + ) + + table_list = catalog.list_tables(database=temp_db1, schema=temp_schema1) + assert {table.name for table in table_list} == {temp_table1, temp_table2} + + table_list = catalog.list_tables( + database=temp_db1, + schema=temp_schema1, + like=f"{CATALOG_TEMP_OBJECT_PREFIX}_TABLE_%", + ) + assert {table.name for table in table_list} == {temp_table1, temp_table2} + + cols = catalog.list_columns(temp_table1, database=temp_db1, schema=temp_schema1) + assert len(cols) == 2 + assert cols[0].name == "A" + 
assert cols[0].datatype == "NUMBER(38,0)" + assert cols[0].nullable is True + assert cols[1].name == "B" + assert cols[1].datatype == "VARCHAR(16777216)" + assert cols[1].nullable is True + + +def test_list_views(session, temp_db1, temp_schema1, temp_view1, temp_view2): + catalog: Catalog = session.catalog + + assert len(catalog.list_views(pattern=DOES_NOT_EXIST_PATTERN)) == 0 + assert ( + len( + catalog.list_views( + pattern=DOES_NOT_EXIST_PATTERN, database=temp_db1, schema=temp_schema1 + ) + ) + == 0 + ) + + view_list = catalog.list_views(database=temp_db1, schema=temp_schema1) + assert {view.name for view in view_list} >= {temp_view1, temp_view2} + + view_list = catalog.list_views( + database=temp_db1, + schema=temp_schema1, + pattern=f"{CATALOG_TEMP_OBJECT_PREFIX}_VIEW.*", + ) + assert {view.name for view in view_list} >= {temp_view1, temp_view2} + + view_list = catalog.list_views( + database=temp_db1, + schema=temp_schema1, + like=f"{CATALOG_TEMP_OBJECT_PREFIX}_VIEW%", + ) + assert {view.name for view in view_list} >= {temp_view1, temp_view2} + + +def test_list_procedures( + session, temp_db1, temp_schema1, temp_procedure1, temp_procedure2 +): + catalog: Catalog = session.catalog + + assert len(catalog.list_procedures(pattern=DOES_NOT_EXIST_PATTERN)) == 0 + assert ( + len( + catalog.list_procedures( + pattern=DOES_NOT_EXIST_PATTERN, database=temp_db1, schema=temp_schema1 + ) + ) + == 0 + ) + + procedure_list = catalog.list_procedures( + database=temp_db1, + schema=temp_schema1, + pattern=f"{CATALOG_TEMP_OBJECT_PREFIX}_PROCEDURE_.*", + ) + assert {procedure.name for procedure in procedure_list} >= { + temp_procedure1, + temp_procedure2, + } + + procedure_list = catalog.list_procedures( + database=temp_db1, + schema=temp_schema1, + like=f"{CATALOG_TEMP_OBJECT_PREFIX}_PROCEDURE_%", + ) + assert {procedure.name for procedure in procedure_list} >= { + temp_procedure1, + temp_procedure2, + } + + +@pytest.mark.xfail(reason="SNOW-1787268: Bug in snowflake api functions iter") +def test_list_udfs(session, temp_db1, temp_schema1, temp_udf1, temp_udf2): + catalog: Catalog = session.catalog + + assert len(catalog.list_functions(pattern=DOES_NOT_EXIST_PATTERN)) == 0 + assert ( + len( + catalog.list_functions( + pattern=DOES_NOT_EXIST_PATTERN, database=temp_db1, schema=temp_schema1 + ) + ) + == 0 + ) + udf_list = catalog.list_functions( + database=temp_db1, + schema=temp_schema1, + pattern=f"{CATALOG_TEMP_OBJECT_PREFIX}_UDF_.*", + ) + assert {udf.name for udf in udf_list} >= {temp_udf1, temp_udf2} + + +def test_get_db_schema(session): + catalog: Catalog = session.catalog + current_db = session.get_current_database() + current_schema = session.get_current_schema() + assert catalog.get_database(current_db).name == unquote_if_quoted(current_db) + assert catalog.get_schema(current_schema).name == unquote_if_quoted(current_schema) + + +def test_get_table_view(session, temp_db1, temp_schema1, temp_table1, temp_view1): + catalog: Catalog = session.catalog + table = catalog.get_table(temp_table1, database=temp_db1, schema=temp_schema1) + assert table.name == temp_table1 + assert table.database_name == temp_db1 + assert table.schema_name == temp_schema1 + + view = catalog.get_view(temp_view1, database=temp_db1, schema=temp_schema1) + assert view.name == temp_view1 + assert view.database_name == temp_db1 + assert view.schema_name == temp_schema1 + + +def test_get_function_procedure_udf( + session, temp_db1, temp_schema1, temp_procedure1, temp_udf1 +): + catalog: Catalog = session.catalog + + procedure = 
catalog.get_procedure( + temp_procedure1, [IntegerType()], database=temp_db1, schema=temp_schema1 + ) + assert procedure.name == temp_procedure1 + assert procedure.database_name == temp_db1 + assert procedure.schema_name == temp_schema1 + + udf = catalog.get_user_defined_function( + temp_udf1, [IntegerType()], database=temp_db1, schema=temp_schema1 + ) + assert udf.name == temp_udf1 + assert udf.database_name == temp_db1 + assert udf.schema_name == temp_schema1 + + +def test_set_db_schema(session, temp_db1, temp_db2, temp_schema1, temp_schema2): + catalog = session.catalog + + original_db = session.get_current_database() + original_schema = session.get_current_schema() + try: + catalog.set_current_database(temp_db1) + catalog.set_current_schema(temp_schema1) + assert session.get_current_database() == f'"{temp_db1}"' + assert session.get_current_schema() == f'"{temp_schema1}"' + + catalog.set_current_schema(temp_schema2) + assert session.get_current_schema() == f'"{temp_schema2}"' + + catalog.set_current_database(temp_db2) + assert session.get_current_database() == f'"{temp_db2}"' + finally: + session.use_database(original_db) + session.use_schema(original_schema) + + +def test_exists_db_schema(session, temp_db1, temp_schema1): + catalog = session.catalog + assert catalog.database_exists(temp_db1) + assert not catalog.database_exists("does_not_exist") + + assert catalog.schema_exists(temp_schema1, database=temp_db1) + assert not catalog.schema_exists(temp_schema1, database="does_not_exist") + + +def test_exists_table_view(session, temp_db1, temp_schema1, temp_table1, temp_view1): + catalog = session.catalog + db1_obj = catalog._root.databases[temp_db1].fetch() + schema1_obj = catalog._root.databases[temp_db1].schemas[temp_schema1].fetch() + + assert catalog.table_exists(temp_table1, database=temp_db1, schema=temp_schema1) + assert catalog.table_exists(temp_table1, database=db1_obj, schema=schema1_obj) + table = catalog.get_table(temp_table1, database=temp_db1, schema=temp_schema1) + assert catalog.table_exists(table) + assert not catalog.table_exists( + "does_not_exist", database=temp_db1, schema=temp_schema1 + ) + + assert catalog.view_exists(temp_view1, database=temp_db1, schema=temp_schema1) + assert catalog.view_exists(temp_view1, database=db1_obj, schema=schema1_obj) + view = catalog.get_view(temp_view1, database=temp_db1, schema=temp_schema1) + assert catalog.view_exists(view) + assert not catalog.view_exists( + "does_not_exist", database=temp_db1, schema=temp_schema1 + ) + + +def test_exists_function_procedure_udf( + session, temp_db1, temp_schema1, temp_procedure1, temp_udf1 +): + catalog = session.catalog + db1_obj = catalog._root.databases[temp_db1].fetch() + schema1_obj = catalog._root.databases[temp_db1].schemas[temp_schema1].fetch() + + assert catalog.procedure_exists( + temp_procedure1, [IntegerType()], database=temp_db1, schema=temp_schema1 + ) + assert catalog.procedure_exists( + temp_procedure1, [IntegerType()], database=db1_obj, schema=schema1_obj + ) + proc = catalog.get_procedure( + temp_procedure1, [IntegerType()], database=temp_db1, schema=temp_schema1 + ) + assert catalog.procedure_exists(proc) + assert not catalog.procedure_exists( + "does_not_exist", [], database=temp_db1, schema=temp_schema1 + ) + + assert catalog.user_defined_function_exists( + temp_udf1, [IntegerType()], database=temp_db1, schema=temp_schema1 + ) + assert catalog.user_defined_function_exists( + temp_udf1, [IntegerType()], database=db1_obj, schema=schema1_obj + ) + udf = 
catalog.get_user_defined_function( + temp_udf1, [IntegerType()], database=temp_db1, schema=temp_schema1 + ) + assert catalog.user_defined_function_exists(udf) + assert not catalog.user_defined_function_exists( + "does_not_exist", [], database=temp_db1, schema=temp_schema1 + ) + + +@pytest.mark.parametrize("use_object", [True, False]) +def test_drop(session, use_object): + catalog = session.catalog + + original_db = session.get_current_database() + original_schema = session.get_current_schema() + try: + temp_db = create_temp_db(session) + temp_schema = create_temp_schema(session, temp_db) + temp_table = create_temp_table(session, temp_db, temp_schema) + temp_view = create_temp_view(session, temp_db, temp_schema) + if use_object: + temp_schema = catalog._root.databases[temp_db].schemas[temp_schema].fetch() + temp_db = catalog._root.databases[temp_db].fetch() + + assert catalog.database_exists(temp_db) + assert catalog.schema_exists(temp_schema, database=temp_db) + assert catalog.table_exists(temp_table, database=temp_db, schema=temp_schema) + assert catalog.view_exists(temp_view, database=temp_db, schema=temp_schema) + + catalog.drop_table(temp_table, database=temp_db, schema=temp_schema) + catalog.drop_view(temp_view, database=temp_db, schema=temp_schema) + + assert not catalog.table_exists( + temp_table, database=temp_db, schema=temp_schema + ) + assert not catalog.view_exists(temp_view, database=temp_db, schema=temp_schema) + + catalog.drop_schema(temp_schema, database=temp_db) + assert not catalog.schema_exists(temp_schema, database=temp_db) + + catalog.drop_database(temp_db) + assert not catalog.database_exists(temp_db) + finally: + session.use_database(original_db) + session.use_schema(original_schema) + + +def test_parse_names_negative(session): + catalog = session.catalog + with pytest.raises( + ValueError, + match="Unexpected type. Expected str or Database, got ''", + ): + catalog.database_exists(123) + + with pytest.raises( + ValueError, match="Unexpected type. Expected str or Schema, got ''" + ): + catalog.schema_exists(123) + + with pytest.raises( + ValueError, + match="arg_types must be provided when function/procedure is a string", + ): + catalog.procedure_exists("proc") + + with patch.object(session, "get_current_database", return_value=None): + with pytest.raises( + ValueError, + match="No database detected. Please provide database to proceed.", + ): + catalog._parse_database(database=None) + + with patch.object(session, "get_current_schema", return_value=None): + with pytest.raises( + ValueError, match="No schema detected. Please provide schema to proceed." 
+ ): + catalog._parse_schema(schema=None) diff --git a/tests/integ/test_function.py b/tests/integ/test_function.py index 2b22fe692df..05d912fc0ce 100644 --- a/tests/integ/test_function.py +++ b/tests/integ/test_function.py @@ -127,6 +127,7 @@ reverse, sequence, size, + snowflake_cortex_sentiment, snowflake_cortex_summarize, split, sqrt, @@ -174,7 +175,7 @@ TimestampType, VariantType, ) -from tests.utils import TestData, Utils +from tests.utils import TestData, Utils, running_on_jenkins def test_order(session): @@ -2272,8 +2273,15 @@ def test_ln(session): "config.getoption('local_testing_mode', default=False)", reason="FEAT: snowflake_cortex functions not supported", ) -@pytest.mark.skip("SNOW-1758914 snowflake.cortex.summarize error on GCP") +@pytest.mark.skipif( + running_on_jenkins(), + reason="TODO: SNOW-1859087 snowflake.cortex.summarize SSL error", +) def test_snowflake_cortex_summarize(session): + # TODO: SNOW-1758914 snowflake.cortex.summarize error on GCP + if session.connection.host == "sfctest0.us-central1.gcp.snowflakecomputing.com": + return + content = """In Snowpark, the main way in which you query and process data is through a DataFrame. This topic explains how to work with DataFrames. To retrieve and manipulate data, you use the DataFrame class. A DataFrame represents a relational dataset that is evaluated lazily: it only executes when a specific action is triggered. In a sense, a DataFrame is like a query that needs to be evaluated in order to retrieve data. @@ -2302,3 +2310,27 @@ def test_snowflake_cortex_summarize(session): # this length check is to get around the fact that this function may not be deterministic assert 0 < len(summary_from_col) < len(content) assert 0 < len(summary_from_str) < len(content) + + +@pytest.mark.skipif( + "config.getoption('local_testing_mode', default=False)", + reason="FEAT: snowflake_cortex functions not supported", +) +@pytest.mark.skipif( + running_on_jenkins(), + reason="TODO: SNOW-1859087 snowflake.cortex.sentiment SSL error", +) +def test_snowflake_cortex_sentiment(session): + # TODO: SNOW-1758914 snowflake.cortex.sentiment error on GCP + if session.connection.host == "sfctest0.us-central1.gcp.snowflakecomputing.com": + return + content = "A very very bad review!" 
+ df = session.create_dataframe([[content]], schema=["content"]) + + sentiment_from_col = df.select( + snowflake_cortex_sentiment(col("content")) + ).collect()[0][0] + sentiment_from_str = df.select(snowflake_cortex_sentiment(content)).collect()[0][0] + + assert -1 <= sentiment_from_col <= 0 + assert -1 <= sentiment_from_str <= 0 diff --git a/tests/integ/test_lineage.py b/tests/integ/test_lineage.py index 3a18fcf26d0..b5a0e942183 100644 --- a/tests/integ/test_lineage.py +++ b/tests/integ/test_lineage.py @@ -200,19 +200,10 @@ def test_lineage_trace(session): df = remove_created_on_field(df.to_pandas()) expected_data = { - "SOURCE_OBJECT": [ - {"domain": "VIEW", "name": f"{db}.{schema}.V2", "status": "ACTIVE"}, - {"domain": "VIEW", "name": f"{db}.{schema}.V3", "status": "DELETED"}, - ], - "TARGET_OBJECT": [ - {"domain": "VIEW", "name": f"{db}.{schema}.V3", "status": "DELETED"}, - {"domain": "VIEW", "name": f"{db}.{schema}.V4", "status": "ACTIVE"}, - ], - "DIRECTION": [ - "Downstream", - "Downstream", - ], - "DISTANCE": [1, 2], + "SOURCE_OBJECT": [], + "TARGET_OBJECT": [], + "DIRECTION": [], + "DISTANCE": [], } expected_df = pd.DataFrame(expected_data) diff --git a/tests/integ/test_stored_procedure.py b/tests/integ/test_stored_procedure.py index 20c63d78642..9345bca0bb8 100644 --- a/tests/integ/test_stored_procedure.py +++ b/tests/integ/test_stored_procedure.py @@ -388,8 +388,8 @@ def test_stored_procedure_with_structured_returns( "OBJ", StructType( [ - StructField('"a"', StringType(16777216), nullable=True), - StructField('"b"', DoubleType(), nullable=True), + StructField("a", StringType(16777216), nullable=True), + StructField("b", DoubleType(), nullable=True), ], structured=True, ), diff --git a/tests/unit/compiler/test_large_query_breakdown.py b/tests/unit/compiler/test_large_query_breakdown.py index 7d9658ad78f..e7e18cd4dc0 100644 --- a/tests/unit/compiler/test_large_query_breakdown.py +++ b/tests/unit/compiler/test_large_query_breakdown.py @@ -135,3 +135,42 @@ def test_pipeline_breaker_node(mock_session, mock_analyzer, node_generator, expe large_query_breakdown._is_node_pipeline_breaker(select_snowflake_plan) is expected ), "SelectSnowflakePlan node is not detected as a pipeline breaker node" + + +@pytest.mark.parametrize( + "node_generator,expected", + [ + ( + lambda x: SelectStatement( + from_=empty_selectable, order_by=[empty_expression], analyzer=x + ), + True, + ), + ], +) +def test_relaxed_pipeline_breaker_node( + mock_session, mock_analyzer, node_generator, expected +): + large_query_breakdown = LargeQueryBreakdown( + mock_session, + mock_analyzer, + [], + mock_session.large_query_breakdown_complexity_bounds, + ) + node = node_generator(mock_analyzer) + + assert ( + large_query_breakdown._is_relaxed_pipeline_breaker(node) is expected + ), f"Node {type(node)} is not detected as a pipeline breaker node" + + resolved_node = mock_analyzer.resolve(node) + assert isinstance(resolved_node, SnowflakePlan) + assert ( + large_query_breakdown._is_relaxed_pipeline_breaker(resolved_node) is expected + ), f"Resolved node of {type(node)} is not detected as a pipeline breaker node" + + select_snowflake_plan = SelectSnowflakePlan(resolved_node, analyzer=mock_analyzer) + assert ( + large_query_breakdown._is_relaxed_pipeline_breaker(select_snowflake_plan) + is expected + ), "SelectSnowflakePlan node is not detected as a pipeline breaker node" diff --git a/tests/unit/compiler/test_replace_child_and_update_node.py b/tests/unit/compiler/test_replace_child_and_update_node.py index 
b3dcef9f180..de235a16d90 100644 --- a/tests/unit/compiler/test_replace_child_and_update_node.py +++ b/tests/unit/compiler/test_replace_child_and_update_node.py @@ -3,6 +3,7 @@ # import copy +from functools import partial from unittest import mock import pytest @@ -67,6 +68,7 @@ def mock_snowflake_plan() -> SnowflakePlan: with_query_block = WithQueryBlock(name="TEST_CTE", child=LogicalPlan()) fake_snowflake_plan.referenced_ctes = {with_query_block: 1} fake_snowflake_plan._cumulative_node_complexity = {} + fake_snowflake_plan._is_valid_for_replacement = True return fake_snowflake_plan @@ -82,6 +84,9 @@ def mock_resolve(x): fake_query_generator = mock.create_autospec(QueryGenerator) fake_query_generator.resolve.side_effect = mock_resolve fake_query_generator.session = mock_session + fake_query_generator.to_selectable = partial( + QueryGenerator.to_selectable, fake_query_generator + ) return fake_query_generator diff --git a/tests/unit/modin/test_groupby_unsupported.py b/tests/unit/modin/test_groupby_unsupported.py index 17c0aca1059..0ec9ecd3bfe 100644 --- a/tests/unit/modin/test_groupby_unsupported.py +++ b/tests/unit/modin/test_groupby_unsupported.py @@ -24,7 +24,6 @@ (lambda se: se.groupby("A").nlargest(4), "nlargest"), (lambda se: se.groupby("A").nsmallest(4), "nsmallest"), (lambda se: se.groupby("A").nth(5), "nth"), - (lambda se: se.groupby("A").unique(), "unique"), (lambda se: se.groupby("A").ohlc(), "ohlc"), (lambda se: se.groupby("A").prod(), "prod"), (lambda se: se.groupby("A").resample("3T"), "resample"), diff --git a/tests/unit/modin/test_series_dt.py b/tests/unit/modin/test_series_dt.py index d3265c5554a..70c4daf08c1 100644 --- a/tests/unit/modin/test_series_dt.py +++ b/tests/unit/modin/test_series_dt.py @@ -35,7 +35,6 @@ def mock_query_compiler_for_dt_series() -> SnowflakeQueryCompiler: [ (lambda s: s.dt.timetz, "timetz"), (lambda s: s.dt.to_period(), "to_period"), - (lambda s: s.dt.strftime(date_format="YY/MM/DD"), "strftime"), (lambda s: s.dt.qyear, "qyear"), (lambda s: s.dt.start_time, "start_time"), (lambda s: s.dt.end_time, "end_time"), diff --git a/tests/unit/modin/test_unsupported.py b/tests/unit/modin/test_unsupported.py index 3fa742074bf..5bc96f2e572 100644 --- a/tests/unit/modin/test_unsupported.py +++ b/tests/unit/modin/test_unsupported.py @@ -71,8 +71,6 @@ def test_unsupported_general(general_method, kwargs): ["combine", {"other": "", "func": ""}], ["combine_first", {"other": ""}], ["filter", {}], - ["from_dict", {"data": ""}], - ["from_records", {"data": ""}], ["hist", {}], ["infer_objects", {}], ["interpolate", {}], @@ -135,7 +133,6 @@ def test_unsupported_df(df_method, kwargs): ["asof", {"where": ""}], ["at_time", {"time": ""}], ["autocorr", {}], - ["between", {"left": "", "right": ""}], ["between_time", {"start_time": "", "end_time": ""}], ["bool", {}], ["clip", {}], diff --git a/tests/unit/test_datatype_mapper.py b/tests/unit/test_datatype_mapper.py index af8b9cd3c1c..210ddd50866 100644 --- a/tests/unit/test_datatype_mapper.py +++ b/tests/unit/test_datatype_mapper.py @@ -5,14 +5,18 @@ import datetime from decimal import Decimal +from unittest.mock import MagicMock import pytest +from snowflake.snowpark import Session from snowflake.snowpark._internal.analyzer.datatype_mapper import ( numeric_to_sql_without_cast, schema_expression, to_sql, + to_sql_no_cast, ) +from snowflake.snowpark._internal.udf_utils import generate_call_python_sp_sql from snowflake.snowpark.types import ( ArrayType, BinaryType, @@ -156,6 +160,118 @@ def test_to_sql(): ) +def 
test_to_sql_system_function():
+    # Test nulls
+    assert to_sql_no_cast(None, NullType()) == "NULL"
+    assert to_sql_no_cast(None, ArrayType(DoubleType())) == "NULL"
+    assert to_sql_no_cast(None, MapType(IntegerType(), ByteType())) == "NULL"
+    assert to_sql_no_cast(None, StructType([])) == "NULL"
+    assert to_sql_no_cast(None, GeographyType()) == "NULL"
+    assert to_sql_no_cast(None, GeometryType()) == "NULL"
+
+    assert to_sql_no_cast(None, IntegerType()) == "NULL"
+    assert to_sql_no_cast(None, ShortType()) == "NULL"
+    assert to_sql_no_cast(None, ByteType()) == "NULL"
+    assert to_sql_no_cast(None, LongType()) == "NULL"
+    assert to_sql_no_cast(None, FloatType()) == "NULL"
+    assert to_sql_no_cast(None, StringType()) == "NULL"
+    assert to_sql_no_cast(None, DoubleType()) == "NULL"
+    assert to_sql_no_cast(None, BooleanType()) == "NULL"
+
+    assert to_sql_no_cast(None, "Not any of the previous types") == "NULL"
+
+    # Test non-nulls
+    assert (
+        to_sql_no_cast("\\ ' ' abc \n \\", StringType())
+        == "'\\\\ '' '' abc \\n \\\\'"
+    )
+    assert (
+        to_sql_no_cast("\\ ' ' abc \n \\", StringType())
+        == "'\\\\ '' '' abc \\n \\\\'"
+    )
+    assert to_sql_no_cast(1, ByteType()) == "1"
+    assert to_sql_no_cast(1, ShortType()) == "1"
+    assert to_sql_no_cast(1, IntegerType()) == "1"
+    assert to_sql_no_cast(1, LongType()) == "1"
+    assert to_sql_no_cast(1, BooleanType()) == "1"
+    assert to_sql_no_cast(0, ByteType()) == "0"
+    assert to_sql_no_cast(0, ShortType()) == "0"
+    assert to_sql_no_cast(0, IntegerType()) == "0"
+    assert to_sql_no_cast(0, LongType()) == "0"
+    assert to_sql_no_cast(0, BooleanType()) == "0"
+
+    assert to_sql_no_cast(float("nan"), FloatType()) == "'NAN'"
+    assert to_sql_no_cast(float("inf"), FloatType()) == "'INF'"
+    assert to_sql_no_cast(float("-inf"), FloatType()) == "'-INF'"
+    assert to_sql_no_cast(1.2, FloatType()) == "1.2"
+
+    assert to_sql_no_cast(float("nan"), DoubleType()) == "'NAN'"
+    assert to_sql_no_cast(float("inf"), DoubleType()) == "'INF'"
+    assert to_sql_no_cast(float("-inf"), DoubleType()) == "'-INF'"
+    assert to_sql_no_cast(1.2, DoubleType()) == "1.2"
+
+    assert to_sql_no_cast(Decimal(0.5), DecimalType(2, 1)) == "0.5"
+
+    assert to_sql_no_cast(397, DateType()) == "'1971-02-02'"
+
+    assert to_sql_no_cast(datetime.date(1971, 2, 2), DateType()) == "'1971-02-02'"
+
+    assert (
+        to_sql_no_cast(1622002533000000, TimestampType())
+        == "'2021-05-26 04:15:33+00:00'"
+    )
+
+    assert (
+        to_sql_no_cast(bytearray.fromhex("2Ef0 F1f2 "), BinaryType())
+        == "b'.\\xf0\\xf1\\xf2'"
+    )
+
+    assert to_sql_no_cast([1, "2", 3.5], ArrayType()) == "PARSE_JSON('[1, \"2\", 3.5]')"
+    assert to_sql_no_cast({"'": '"'}, MapType()) == 'PARSE_JSON(\'{"\'\'": "\\\\""}\')'
+    assert to_sql_no_cast([{1: 2}], ArrayType()) == "PARSE_JSON('[{\"1\": 2}]')"
+    assert to_sql_no_cast({1: [2]}, MapType()) == "PARSE_JSON('{\"1\": [2]}')"
+
+    assert to_sql_no_cast([1, bytearray(1)], ArrayType()) == "PARSE_JSON('[1, \"00\"]')"
+
+    assert (
+        to_sql_no_cast(["2", Decimal(0.5)], ArrayType()) == "PARSE_JSON('[\"2\", 0.5]')"
+    )
+
+    dt = datetime.datetime.today()
+    assert (
+        to_sql_no_cast({1: dt}, MapType())
+        == 'PARSE_JSON(\'{"1": "' + dt.isoformat() + "\"}')"
+    )
+
+    assert to_sql_no_cast([1, 2, 3.5], VectorType(float, 3)) == "[1, 2, 3.5]"
+    assert (
+        to_sql_no_cast("POINT(-122.35 37.55)", GeographyType())
+        == "TO_GEOGRAPHY('POINT(-122.35 37.55)')"
+    )
+    assert (
+        to_sql_no_cast("POINT(-122.35 37.55)", GeometryType())
+        == "TO_GEOMETRY('POINT(-122.35 37.55)')"
+    )
+    assert to_sql_no_cast("1", VariantType()) == "PARSE_JSON('\"1\"')"
+    assert (
+        to_sql_no_cast([1, 2, 3.5, 4.1234567, -3.8], VectorType("float", 5))
+        == "[1, 2, 3.5, 4.1234567, -3.8]"
+    )
+    assert to_sql_no_cast([1, 2, 3], VectorType(int, 3)) == "[1, 2, 3]"
+    assert (
+        to_sql_no_cast([1, 2, 31234567, -1928, 0, -3], VectorType(int, 5))
+        == "[1, 2, 31234567, -1928, 0, -3]"
+    )
+
+
+def test_generate_call_python_sp_sql():
+    fake_session = MagicMock(Session)
+    assert (
+        generate_call_python_sp_sql(fake_session, "system$wait", 1)
+        == "CALL system$wait(1)"
+    )
+
+
 @pytest.mark.parametrize(
     "timezone, expected",
     [
diff --git a/tox.ini b/tox.ini
index 462138104a0..86c3736d246 100644
--- a/tox.ini
+++ b/tox.ini
@@ -184,6 +184,7 @@ commands =
     pyright src/snowflake/snowpark/_internal/analyzer
     pyright src/snowflake/snowpark/_internal/compiler
     pyright src/snowflake/snowpark/stored_procedure_profiler.py
+    pyright src/snowflake/snowpark/catalog.py
 
 [testenv:protoc]
 description = generate python code from protobuf