From e9422a10d81dbc3e3a9b49765f8edd3fd3f64e9b Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Mon, 29 Apr 2024 16:43:53 -0700 Subject: [PATCH] Add details about loc bug and test --- .../snowpark/modin/plugin/PANDAS_CHANGELOG.md | 3 +++ .../modin/plugin/_internal/indexing_utils.py | 4 ++-- tests/integ/modin/frame/test_loc.py | 20 +++++++++++-------- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/PANDAS_CHANGELOG.md b/src/snowflake/snowpark/modin/plugin/PANDAS_CHANGELOG.md index 9afd0560f38..ff6e9431084 100644 --- a/src/snowflake/snowpark/modin/plugin/PANDAS_CHANGELOG.md +++ b/src/snowflake/snowpark/modin/plugin/PANDAS_CHANGELOG.md @@ -6,6 +6,9 @@ - Fixed incorrect regex used in `Series.str.contains`. - Fixed DataFrame's `__getitem__` with boolean DataFrame key. +### Behavior Changes +- As a part of the transition to pandas 2.2.1, pandas `df.loc` and `__setitem__` have buggy behavior when a scalar column key is used to assign a DataFrame item to a DataFrame (see https://github.com/pandas-dev/pandas/issues/58482). Snowpark pandas deviates from this behavior and will maintain the same behavior as pandas from versions 1.5.x. + ## 1.14.0a2 (2024-04-18) ### Behavior Changes diff --git a/src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py index 3906dd748df..42ef43cae20 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py @@ -1791,7 +1791,7 @@ def _set_2d_labels_helper_for_frame_item( Args: internal_frame: the internal frame for the main dataframe/series - index: the internal frame for the index. Note that index can be None and we can save one join for this case. + index: the internal frame for the index. Note that index can be None, and we can save one join for this case. 
item: the internal frame for the item matching_item_columns_by_label: whether matching item columns by labels or positions matching_item_rows_by_label: whether matching item rows by labels or positions @@ -1799,7 +1799,7 @@ def _set_2d_labels_helper_for_frame_item( index_is_bool_indexer: if True, the index is a boolean indexer Returns: - the frame joined with internal frame, index, and, item + the frame joined with internal frame, index, and item """ if not matching_item_columns_by_label: expected_num_cols_item = len(col_info.column_pandas_labels) diff --git a/tests/integ/modin/frame/test_loc.py b/tests/integ/modin/frame/test_loc.py index f9de4c9b071..0472d4d5f8d 100644 --- a/tests/integ/modin/frame/test_loc.py +++ b/tests/integ/modin/frame/test_loc.py @@ -2736,13 +2736,7 @@ def set_loc_helper(df): "key", [ ["A"], # matching_item_columns_by_label = True - pytest.param( - "A", # matching_item_columns_by_label = False - marks=pytest.mark.xfail( - strict=True, - reason="SNOW-1057861: Investigate locset behavior with missing index value", - ), - ), + "A", # matching_item_columns_by_label = False ], ) def test_df_loc_set_item_df_single_value(key, val_index, val_columns): @@ -2750,6 +2744,7 @@ def test_df_loc_set_item_df_single_value(key, val_index, val_columns): [[91, -2, 83, 74], [95, -6, 87, 78], [99, -10, 811, 712], [913, -14, 815, 716]], index=["x", "x", "z", "w"], columns=["A", "B", "C", "D"], + dtype=float, ) val = native_pd.DataFrame([100], columns=val_columns, index=val_index) @@ -2758,7 +2753,16 @@ def setitem(df): if isinstance(df, pd.DataFrame): df.loc[:, key] = pd.DataFrame(val) else: - df.loc[:, key] = val + # There is a bug in pandas when assigning a DataFrame item when the key is a scalar. + # In the case of this test, that is when `key == "A"`. + # To make sure Snowpark pandas works as expected, the column key is hard coded to ["A"], and the result + # for `df.loc[:, "A"] = val` is evaluated. 
+ # SNOW-1057861, pandas issue: https://github.com/pandas-dev/pandas/issues/58482 + if key == "A" and val_index == ["x"] and val_columns == ["Z"]: + df.iloc[[0, 1], 0] = 100 + df.iloc[[2, 3], 0] = np.nan + else: + df.loc[:, ["A"]] = val with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result(