Skip to content

Commit

Permalink
Add details about loc bug and test
Browse files Browse the repository at this point in the history
  • Loading branch information
sfc-gh-vbudati committed Apr 29, 2024
1 parent 8d97559 commit e9422a1
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 10 deletions.
3 changes: 3 additions & 0 deletions src/snowflake/snowpark/modin/plugin/PANDAS_CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
- Fixed incorrect regex used in `Series.str.contains`.
- Fixed DataFrame's `__getitem__` with boolean DataFrame key.

### Behavior Changes
- As part of the transition to pandas 2.2.1: pandas `df.loc` and `__setitem__` behave incorrectly when a scalar column key is used to assign a DataFrame item to a DataFrame (see https://github.com/pandas-dev/pandas/issues/58482). Snowpark pandas deviates from this behavior and maintains the same behavior as pandas 1.5.x.

## 1.14.0a2 (2024-04-18)

### Behavior Changes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1791,15 +1791,15 @@ def _set_2d_labels_helper_for_frame_item(
Args:
internal_frame: the internal frame for the main dataframe/series
index: the internal frame for the index. Note that index can be None and we can save one join for this case.
index: the internal frame for the index. Note that index can be None, and we can save one join for this case.
item: the internal frame for the item
matching_item_columns_by_label: whether matching item columns by labels or positions
matching_item_rows_by_label: whether matching item rows by labels or positions
col_info: the column information extracted from columns input
index_is_bool_indexer: if True, the index is a boolean indexer
Returns:
the frame joined with internal frame, index, and, item
the frame joined with internal frame, index, and item
"""
if not matching_item_columns_by_label:
expected_num_cols_item = len(col_info.column_pandas_labels)
Expand Down
20 changes: 12 additions & 8 deletions tests/integ/modin/frame/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -2736,20 +2736,15 @@ def set_loc_helper(df):
"key",
[
["A"], # matching_item_columns_by_label = True
pytest.param(
"A", # matching_item_columns_by_label = False
marks=pytest.mark.xfail(
strict=True,
reason="SNOW-1057861: Investigate locset behavior with missing index value",
),
),
"A", # matching_item_columns_by_label = False
],
)
def test_df_loc_set_item_df_single_value(key, val_index, val_columns):
native_df = native_pd.DataFrame(
[[91, -2, 83, 74], [95, -6, 87, 78], [99, -10, 811, 712], [913, -14, 815, 716]],
index=["x", "x", "z", "w"],
columns=["A", "B", "C", "D"],
dtype=float,
)

val = native_pd.DataFrame([100], columns=val_columns, index=val_index)
Expand All @@ -2758,7 +2753,16 @@ def setitem(df):
if isinstance(df, pd.DataFrame):
df.loc[:, key] = pd.DataFrame(val)
else:
df.loc[:, key] = val
# There is a bug in pandas when assigning a DataFrame item with a scalar column key
# (in this test, that is when `key == "A"`).
# To check that Snowpark pandas produces the expected result for `df.loc[:, "A"] = val`, this branch
# avoids the scalar key: it either writes the expected values directly with `iloc` or hard-codes
# the column key to ["A"].
# SNOW-1057861, pandas issue: https://github.com/pandas-dev/pandas/issues/58482
if key == "A" and val_index == ["x"] and val_columns == ["Z"]:
df.iloc[[0, 1], 0] = 100
df.iloc[[2, 3], 0] = np.nan
else:
df.loc[:, ["A"]] = val

with SqlCounter(query_count=1, join_count=1):
eval_snowpark_pandas_result(
Expand Down

0 comments on commit e9422a1

Please sign in to comment.