SNOW-1539454: Fix Star expression column dependency (#1945)

snowflakedb · Jul 19, 2024 · c482c53 · c482c53
1 parent 2fbd524
commit c482c53
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 2 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,9 +5,11 @@
 ### Snowpark Python API Updates
 
 #### Improvements
-
 - Added support for server-side string size limitations.
 
+#### Bug Fixes
+- Fixed a bug where the SQL generated when selecting the `*` column contained an incorrect subquery.
+
 ### Snowpark Local Testing Updates
 #### New Features
 

diff --git a/src/snowflake/snowpark/_internal/analyzer/expression.py b/src/snowflake/snowpark/_internal/analyzer/expression.py
@@ -226,7 +226,14 @@ def __init__(
         self.df_alias = df_alias
 
     def dependent_column_names(self) -> Optional[AbstractSet[str]]:
-        return derive_dependent_columns(*self.expressions)
+        # When the column is `df['*']`, `expressions` contains Attributes for all columns.
+        # When the column is `col('*')` or just the string '*', `expressions` is empty,
+        # but the expression still depends on all columns.
+        return (
+            derive_dependent_columns(*self.expressions)
+            if self.expressions
+            else COLUMN_DEPENDENCY_ALL
+        )
 
     @property
     def individual_node_complexity(self) -> Dict[PlanNodeCategory, int]:

diff --git a/tests/integ/test_simplifier_suite.py b/tests/integ/test_simplifier_suite.py
@@ -22,6 +22,7 @@
     iff,
     lit,
     min as min_,
+    object_construct_keep_null,
     row_number,
     seq1,
     sql_expr,
@@ -1331,3 +1332,19 @@ def test_data_generator_with_filter(session):
         df.with_column("B", seq1()).with_column("C", min_("B").over()).filter(df.A == 1)
     )
     Utils.check_answer(df, [Row(1, 1, 0)])
+
+
+def test_star_column(session):
+    # convert the dataframe to a table via cache_result()
+    df = session.create_dataframe(
+        [[0, "a"], [1, "b"]], schema=["a", "b"]
+    ).cache_result()
+    # select a column and rename it twice
+    df1 = df.select(col("a").as_("x"), "b").select(col("x").as_("y"), "b")
+    df2 = df1.select(object_construct_keep_null("*"))
+    # expect that no subquery is flattened, so all three SELECTs remain
+    query = df2.queries["queries"][0]
+    assert query.count("SELECT") == 3
+    Utils.check_answer(
+        df2, [Row('{\n  "B": "a",\n  "Y": 0\n}'), Row('{\n  "B": "b",\n  "Y": 1\n}')]
+    )