Skip to content

Commit

Permalink
TST (string dtype): fix groupby xfails with using_infer_string + update error message (pandas-dev#59430)
Browse files Browse the repository at this point in the history

Co-authored-by: Joris Van den Bossche <[email protected]>
(cherry picked from commit e5dd89d)
  • Loading branch information
jbrockmendel authored and jorisvandenbossche committed Nov 8, 2024
1 parent cacd4bb commit a3cf70f
Show file tree
Hide file tree
Showing 13 changed files with 166 additions and 43 deletions.
14 changes: 14 additions & 0 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -2285,6 +2285,20 @@ def _groupby_op(
**kwargs,
):
if isinstance(self.dtype, StringDtype):
if how in [
"prod",
"mean",
"median",
"cumsum",
"cumprod",
"std",
"sem",
"var",
"skew",
]:
raise TypeError(
f"dtype '{self.dtype}' does not support operation '{how}'"
)
return super()._groupby_op(
how=how,
has_dropped_na=has_dropped_na,
Expand Down
14 changes: 14 additions & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2369,6 +2369,20 @@ def _groupby_op(
# GH#43682
if isinstance(self.dtype, StringDtype):
# StringArray
if op.how in [
"prod",
"mean",
"median",
"cumsum",
"cumprod",
"std",
"sem",
"var",
"skew",
]:
raise TypeError(
f"dtype '{self.dtype}' does not support operation '{how}'"
)
if op.how not in ["any", "all"]:
# Fail early to avoid conversion to object
op._get_cython_function(op.kind, op.how, np.dtype(object), False)
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -4394,9 +4394,9 @@ def quantile(
starts, ends = lib.generate_slices(splitter._slabels, splitter.ngroups)

def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]:
if is_object_dtype(vals.dtype):
if isinstance(vals.dtype, StringDtype) or is_object_dtype(vals.dtype):
raise TypeError(
"'quantile' cannot be performed against 'object' dtypes!"
f"dtype '{vals.dtype}' does not support operation 'quantile'"
)

inference: DtypeObj | None = None
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -2083,7 +2083,7 @@ def test_unstack_period_frame(self):
@pytest.mark.filterwarnings(
"ignore:The previous implementation of stack is deprecated"
)
def test_stack_multiple_bug(self, future_stack):
def test_stack_multiple_bug(self, future_stack, using_infer_string):
# bug when some uniques are not present in the data GH#3170
id_col = ([1] * 3) + ([2] * 3)
name = (["a"] * 3) + (["b"] * 3)
Expand All @@ -2095,6 +2095,8 @@ def test_stack_multiple_bug(self, future_stack):
multi.columns.name = "Params"
unst = multi.unstack("ID")
msg = re.escape("agg function failed [how->mean,dtype->")
if using_infer_string:
msg = "dtype 'str' does not support operation 'mean'"
with pytest.raises(TypeError, match=msg):
unst.resample("W-THU").mean()
down = unst.resample("W-THU").mean(numeric_only=True)
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/groupby/aggregate/test_cython.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,14 +166,14 @@ def test_cython_agg_return_dict():

def test_cython_fail_agg():
dr = bdate_range("1/1/2000", periods=50)
ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr)
ts = Series(["A", "B", "C", "D", "E"] * 10, dtype=object, index=dr)

grouped = ts.groupby(lambda x: x.month)
summed = grouped.sum()
msg = "using SeriesGroupBy.sum"
with tm.assert_produces_warning(FutureWarning, match=msg):
# GH#53425
expected = grouped.agg(np.sum)
expected = grouped.agg(np.sum).astype(object)
tm.assert_series_equal(summed, expected)


Expand Down
9 changes: 4 additions & 5 deletions pandas/tests/groupby/methods/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,8 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby,
def test_quantile_raises():
df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])

with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"):
msg = "dtype 'object' does not support operation 'quantile'"
with pytest.raises(TypeError, match=msg):
df.groupby("key").quantile()


Expand Down Expand Up @@ -253,7 +254,6 @@ def test_groupby_quantile_nullable_array(values, q):
tm.assert_series_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
@pytest.mark.parametrize("numeric_only", [True, False])
def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
Expand All @@ -263,9 +263,8 @@ def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
expected = df.groupby("a")[["b"]].quantile(q)
tm.assert_frame_equal(result, expected)
else:
with pytest.raises(
TypeError, match="'quantile' cannot be performed against 'object' dtypes!"
):
msg = "dtype '.*' does not support operation 'quantile'"
with pytest.raises(TypeError, match=msg):
df.groupby("a").quantile(q, numeric_only=numeric_only)


Expand Down
56 changes: 42 additions & 14 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -640,7 +640,7 @@ def test_frame_multi_key_function_list():
tm.assert_frame_equal(agged, expected)


def test_frame_multi_key_function_list_partial_failure():
def test_frame_multi_key_function_list_partial_failure(using_infer_string):
data = DataFrame(
{
"A": [
Expand Down Expand Up @@ -691,6 +691,8 @@ def test_frame_multi_key_function_list_partial_failure():
grouped = data.groupby(["A", "B"])
funcs = ["mean", "std"]
msg = re.escape("agg function failed [how->mean,dtype->")
if using_infer_string:
msg = "dtype 'str' does not support operation 'mean'"
with pytest.raises(TypeError, match=msg):
grouped.agg(funcs)

Expand Down Expand Up @@ -981,9 +983,11 @@ def test_groupby_multi_corner(df):
tm.assert_frame_equal(agged, expected)


def test_raises_on_nuisance(df):
def test_raises_on_nuisance(df, using_infer_string):
grouped = df.groupby("A")
msg = re.escape("agg function failed [how->mean,dtype->")
if using_infer_string:
msg = "dtype 'str' does not support operation 'mean'"
with pytest.raises(TypeError, match=msg):
grouped.agg("mean")
with pytest.raises(TypeError, match=msg):
Expand Down Expand Up @@ -1026,15 +1030,18 @@ def test_keep_nuisance_agg(df, agg_function):
["sum", "mean", "prod", "std", "var", "sem", "median"],
)
@pytest.mark.parametrize("numeric_only", [True, False])
def test_omit_nuisance_agg(df, agg_function, numeric_only):
def test_omit_nuisance_agg(df, agg_function, numeric_only, using_infer_string):
# GH 38774, GH 38815
grouped = df.groupby("A")

no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median")
if agg_function in no_drop_nuisance and not numeric_only:
# Added numeric_only as part of GH#46560; these do not drop nuisance
# columns when numeric_only is False
if agg_function in ("std", "sem"):
if using_infer_string:
msg = f"dtype 'str' does not support operation '{agg_function}'"
klass = TypeError
elif agg_function in ("std", "sem"):
klass = ValueError
msg = "could not convert string to float: 'one'"
else:
Expand All @@ -1055,16 +1062,24 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only):
tm.assert_frame_equal(result, expected)


def test_raise_on_nuisance_python_single(df):
def test_raise_on_nuisance_python_single(df, using_infer_string):
# GH 38815
grouped = df.groupby("A")
with pytest.raises(ValueError, match="could not convert"):

err = ValueError
msg = "could not convert"
if using_infer_string:
err = TypeError
msg = "dtype 'str' does not support operation 'skew'"
with pytest.raises(err, match=msg):
grouped.skew()


def test_raise_on_nuisance_python_multiple(three_group):
def test_raise_on_nuisance_python_multiple(three_group, using_infer_string):
grouped = three_group.groupby(["A", "B"])
msg = re.escape("agg function failed [how->mean,dtype->")
if using_infer_string:
msg = "dtype 'str' does not support operation 'mean'"
with pytest.raises(TypeError, match=msg):
grouped.agg("mean")
with pytest.raises(TypeError, match=msg):
Expand Down Expand Up @@ -1102,12 +1117,16 @@ def test_nonsense_func():
df.groupby(lambda x: x + "foo")


def test_wrap_aggregated_output_multindex(multiindex_dataframe_random_data):
def test_wrap_aggregated_output_multindex(
multiindex_dataframe_random_data, using_infer_string
):
df = multiindex_dataframe_random_data.T
df["baz", "two"] = "peekaboo"

keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
msg = re.escape("agg function failed [how->mean,dtype->")
if using_infer_string:
msg = "dtype 'str' does not support operation 'mean'"
with pytest.raises(TypeError, match=msg):
df.groupby(keys).agg("mean")
agged = df.drop(columns=("baz", "two")).groupby(keys).agg("mean")
Expand Down Expand Up @@ -1299,8 +1318,10 @@ def test_groupby_with_hier_columns():

def test_grouping_ndarray(df):
grouped = df.groupby(df["A"].values)
grouped2 = df.groupby(df["A"].rename(None))

result = grouped.sum()
expected = df.groupby(df["A"].rename(None)).sum()
expected = grouped2.sum()
tm.assert_frame_equal(result, expected)


Expand Down Expand Up @@ -1793,8 +1814,8 @@ def test_no_dummy_key_names(df):
result = df.groupby(df["A"].values).sum()
assert result.index.name is None

result = df.groupby([df["A"].values, df["B"].values]).sum()
assert result.index.names == (None, None)
result2 = df.groupby([df["A"].values, df["B"].values]).sum()
assert result2.index.names == (None, None)


def test_groupby_sort_multiindex_series():
Expand Down Expand Up @@ -2099,6 +2120,7 @@ def get_categorical_invalid_expected():
is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype)
is_dt64 = df.dtypes.iloc[0].kind == "M"
is_cat = isinstance(values, Categorical)
is_str = isinstance(df.dtypes.iloc[0], pd.StringDtype)

if (
isinstance(values, Categorical)
Expand All @@ -2123,13 +2145,15 @@ def get_categorical_invalid_expected():

if op in ["prod", "sum", "skew"]:
# ops that require more than just ordered-ness
if is_dt64 or is_cat or is_per:
if is_dt64 or is_cat or is_per or (is_str and op != "sum"):
# GH#41291
# datetime64 -> prod and sum are invalid
if is_dt64:
msg = "datetime64 type does not support"
elif is_per:
msg = "Period type does not support"
elif is_str:
msg = f"dtype 'str' does not support operation '{op}'"
else:
msg = "category type does not support"
if op == "skew":
Expand Down Expand Up @@ -3083,7 +3107,7 @@ def test_obj_with_exclusions_duplicate_columns():
def test_groupby_numeric_only_std_no_result(numeric_only):
# GH 51080
dicts_non_numeric = [{"a": "foo", "b": "bar"}, {"a": "car", "b": "dar"}]
df = DataFrame(dicts_non_numeric)
df = DataFrame(dicts_non_numeric, dtype=object)
dfgb = df.groupby("a", as_index=False, sort=False)

if numeric_only:
Expand Down Expand Up @@ -3142,10 +3166,14 @@ def test_grouping_with_categorical_interval_columns():
def test_groupby_sum_on_nan_should_return_nan(bug_var):
# GH 24196
df = DataFrame({"A": [bug_var, bug_var, bug_var, np.nan]})
if isinstance(bug_var, str):
df = df.astype(object)
dfgb = df.groupby(lambda x: x)
result = dfgb.sum(min_count=1)

expected_df = DataFrame([bug_var, bug_var, bug_var, None], columns=["A"])
expected_df = DataFrame(
[bug_var, bug_var, bug_var, None], columns=["A"], dtype=df["A"].dtype
)
tm.assert_frame_equal(result, expected_df)


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_groupby_subclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def test_groupby_resample_preserves_subclass(obj):

df = obj(
{
"Buyer": "Carl Carl Carl Carl Joe Carl".split(),
"Buyer": Series("Carl Carl Carl Carl Joe Carl".split(), dtype=object),
"Quantity": [18, 3, 5, 1, 9, 3],
"Date": [
datetime(2013, 9, 1, 13, 0),
Expand Down
20 changes: 14 additions & 6 deletions pandas/tests/groupby/test_numeric_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ def df(self):
"group": [1, 1, 2],
"int": [1, 2, 3],
"float": [4.0, 5.0, 6.0],
"string": list("abc"),
"string": Series(["a", "b", "c"], dtype="str"),
"object": Series(["a", "b", "c"], dtype=object),
"category_string": Series(list("abc")).astype("category"),
"category_int": [7, 8, 9],
"datetime": date_range("20130101", periods=3),
Expand All @@ -41,6 +42,7 @@ def df(self):
"int",
"float",
"string",
"object",
"category_string",
"category_int",
"datetime",
Expand Down Expand Up @@ -113,6 +115,7 @@ def test_first_last(self, df, method):
"int",
"float",
"string",
"object",
"category_string",
"category_int",
"datetime",
Expand Down Expand Up @@ -160,7 +163,9 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):

# object dtypes for transformations are not implemented in Cython and
# have no Python fallback
exception = NotImplementedError if method.startswith("cum") else TypeError
exception = (
(NotImplementedError, TypeError) if method.startswith("cum") else TypeError
)

if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"):
# The methods default to numeric_only=False and raise TypeError
Expand All @@ -171,6 +176,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
re.escape(f"agg function failed [how->{method},dtype->object]"),
# cumsum/cummin/cummax/cumprod
"function is not implemented for this dtype",
f"dtype 'str' does not support operation '{method}'",
]
)
with pytest.raises(exception, match=msg):
Expand All @@ -181,7 +187,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
"category type does not support sum operations",
re.escape(f"agg function failed [how->{method},dtype->object]"),
re.escape(f"agg function failed [how->{method},dtype->string]"),
re.escape(f"agg function failed [how->{method},dtype->str]"),
f"dtype 'str' does not support operation '{method}'",
]
)
with pytest.raises(exception, match=msg):
Expand All @@ -199,7 +205,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
f"Cannot perform {method} with non-ordered Categorical",
re.escape(f"agg function failed [how->{method},dtype->object]"),
re.escape(f"agg function failed [how->{method},dtype->string]"),
re.escape(f"agg function failed [how->{method},dtype->str]"),
f"dtype 'str' does not support operation '{method}'",
]
)
with pytest.raises(exception, match=msg):
Expand Down Expand Up @@ -384,7 +390,9 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys):
re.escape(f"agg function failed [how->{kernel},dtype->object]"),
]
)
if kernel == "idxmin":
if kernel == "quantile":
msg = "dtype 'object' does not support operation 'quantile'"
elif kernel == "idxmin":
msg = "'<' not supported between instances of 'type' and 'type'"
elif kernel == "idxmax":
msg = "'>' not supported between instances of 'type' and 'type'"
Expand Down Expand Up @@ -458,7 +466,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request):
# that succeed should not be allowed to fail (without deprecation, at least)
if groupby_func in fails_on_numeric_object and dtype is object:
if groupby_func == "quantile":
msg = "cannot be performed against 'object' dtypes"
msg = "dtype 'object' does not support operation 'quantile'"
else:
msg = "is not supported for object dtype"
warn = FutureWarning if groupby_func == "fillna" else None
Expand Down
Loading

0 comments on commit a3cf70f

Please sign in to comment.