From e9422a10d81dbc3e3a9b49765f8edd3fd3f64e9b Mon Sep 17 00:00:00 2001
From: Varnika Budati <varnika.budati@snowflake.com>
Date: Mon, 29 Apr 2024 16:43:53 -0700
Subject: [PATCH] Add details about loc bug and test

---
 .../snowpark/modin/plugin/PANDAS_CHANGELOG.md |  3 +++
 .../modin/plugin/_internal/indexing_utils.py  |  4 ++--
 tests/integ/modin/frame/test_loc.py           | 20 +++++++++++--------
 3 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/src/snowflake/snowpark/modin/plugin/PANDAS_CHANGELOG.md b/src/snowflake/snowpark/modin/plugin/PANDAS_CHANGELOG.md
index 9afd0560f38..ff6e9431084 100644
--- a/src/snowflake/snowpark/modin/plugin/PANDAS_CHANGELOG.md
+++ b/src/snowflake/snowpark/modin/plugin/PANDAS_CHANGELOG.md
@@ -6,6 +6,9 @@
 - Fixed incorrect regex used in `Series.str.contains`.
 - Fixed DataFrame's `__getitem__` with boolean DataFrame key.
 
+### Behavior Changes
+- As part of the transition to pandas 2.2.1: pandas `df.loc` and `__setitem__` behave incorrectly when a scalar column key is used to assign a DataFrame item to a DataFrame (see https://github.com/pandas-dev/pandas/issues/58482). Snowpark pandas deviates from this behavior and maintains the same behavior as pandas 1.5.x.
+
 ## 1.14.0a2 (2024-04-18)
 
 ### Behavior Changes
diff --git a/src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py
index 3906dd748df..42ef43cae20 100644
--- a/src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py
+++ b/src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py
@@ -1791,7 +1791,7 @@ def _set_2d_labels_helper_for_frame_item(
 
     Args:
         internal_frame: the internal frame for the main dataframe/series
-        index: the internal frame for the index. Note that index can be None and we can save one join for this case.
+        index: the internal frame for the index. Note that index can be None, and we can save one join for this case.
         item: the internal frame for the item
         matching_item_columns_by_label: whether matching item columns by labels or positions
         matching_item_rows_by_label: whether matching item rows by labels or positions
@@ -1799,7 +1799,7 @@ def _set_2d_labels_helper_for_frame_item(
         index_is_bool_indexer: if True, the index is a boolean indexer
 
     Returns:
-        the frame joined with internal frame, index, and, item
+        the frame joined with internal frame, index, and item
     """
     if not matching_item_columns_by_label:
         expected_num_cols_item = len(col_info.column_pandas_labels)
diff --git a/tests/integ/modin/frame/test_loc.py b/tests/integ/modin/frame/test_loc.py
index f9de4c9b071..0472d4d5f8d 100644
--- a/tests/integ/modin/frame/test_loc.py
+++ b/tests/integ/modin/frame/test_loc.py
@@ -2736,13 +2736,7 @@ def set_loc_helper(df):
     "key",
     [
         ["A"],  # matching_item_columns_by_label = True
-        pytest.param(
-            "A",  # matching_item_columns_by_label = False
-            marks=pytest.mark.xfail(
-                strict=True,
-                reason="SNOW-1057861: Investigate locset behavior with missing index value",
-            ),
-        ),
+        "A",  # matching_item_columns_by_label = False
     ],
 )
 def test_df_loc_set_item_df_single_value(key, val_index, val_columns):
@@ -2750,6 +2744,7 @@ def test_df_loc_set_item_df_single_value(key, val_index, val_columns):
         [[91, -2, 83, 74], [95, -6, 87, 78], [99, -10, 811, 712], [913, -14, 815, 716]],
         index=["x", "x", "z", "w"],
         columns=["A", "B", "C", "D"],
+        dtype=float,
     )
 
     val = native_pd.DataFrame([100], columns=val_columns, index=val_index)
@@ -2758,7 +2753,16 @@ def setitem(df):
         if isinstance(df, pd.DataFrame):
             df.loc[:, key] = pd.DataFrame(val)
         else:
-            df.loc[:, key] = val
+            # There is a bug in pandas when assigning a DataFrame item with a scalar column key.
+            # In this test, that is the `key == "A"` case.
+            # To build the result expected for `df.loc[:, "A"] = val`, the column key is hard coded to ["A"]; the one
+            # case checked below (val_index == ["x"], val_columns == ["Z"]) instead writes the values via iloc.
+            # SNOW-1057861, pandas issue: https://github.com/pandas-dev/pandas/issues/58482
+            if key == "A" and val_index == ["x"] and val_columns == ["Z"]:
+                df.iloc[[0, 1], 0] = 100
+                df.iloc[[2, 3], 0] = np.nan
+            else:
+                df.loc[:, ["A"]] = val
 
     with SqlCounter(query_count=1, join_count=1):
         eval_snowpark_pandas_result(