diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1104ab872f6..fef749247d5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,9 +5,11 @@
 ### Snowpark Python API Updates
 
 #### Improvements
-
 - Added support server side string size limitations.
 
+#### Bug Fixes
+- Fixed a bug where SQL generated for selecting `*` column has an incorrect subquery.
+
 ### Snowpark Local Testing Updates
 
 #### New Features
diff --git a/src/snowflake/snowpark/_internal/analyzer/expression.py b/src/snowflake/snowpark/_internal/analyzer/expression.py
index 03a2fbd68bc..fe4446749d0 100644
--- a/src/snowflake/snowpark/_internal/analyzer/expression.py
+++ b/src/snowflake/snowpark/_internal/analyzer/expression.py
@@ -226,7 +226,14 @@ def __init__(
         self.df_alias = df_alias
 
     def dependent_column_names(self) -> Optional[AbstractSet[str]]:
-        return derive_dependent_columns(*self.expressions)
+        # For `df['*']`, `expressions` holds an Attribute for every column, so the
+        # dependencies can be derived from them. For `col('*')` or a plain '*' string,
+        # `expressions` is empty, yet the star still depends on all columns.
+        return (
+            derive_dependent_columns(*self.expressions)
+            if self.expressions
+            else COLUMN_DEPENDENCY_ALL
+        )
 
     @property
     def individual_node_complexity(self) -> Dict[PlanNodeCategory, int]:
diff --git a/tests/integ/test_simplifier_suite.py b/tests/integ/test_simplifier_suite.py
index 9d60f64eca8..1c7dbb4abec 100644
--- a/tests/integ/test_simplifier_suite.py
+++ b/tests/integ/test_simplifier_suite.py
@@ -22,6 +22,7 @@
     iff,
     lit,
     min as min_,
+    object_construct_keep_null,
     row_number,
     seq1,
     sql_expr,
@@ -1331,3 +1332,19 @@ def test_data_generator_with_filter(session):
         df.with_column("B", seq1()).with_column("C", min_("B").over()).filter(df.A == 1)
     )
     Utils.check_answer(df, [Row(1, 1, 0)])
+
+
+def test_star_column(session):
+    # convert the dataframe to a table by caching its result
+    df = session.create_dataframe(
+        [[0, "a"], [1, "b"]], schema=["a", "b"]
+    ).cache_result()
+    # rename column `a` twice through two chained selects: a -> x -> y
+    df1 = df.select(col("a").as_("x"), "b").select(col("x").as_("y"), "b")
+    df2 = df1.select(object_construct_keep_null("*"))
+    # expect that no subquery is flattened, i.e. all 3 SELECTs are kept
+    query = df2.queries["queries"][0]
+    assert query.count("SELECT") == 3
+    Utils.check_answer(
+        df2, [Row('{\n "B": "a",\n "Y": 0\n}'), Row('{\n "B": "b",\n "Y": 1\n}')]
+    )