From 5034b780e90f499d4da8dc2e461aca19304c5263 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 2 Feb 2024 03:38:20 +0100 Subject: [PATCH] Backport PR #57174 on branch 2.2.x (BUG: Interchange protocol implementation allows non-string column names) (#57203) Backport PR #57174: BUG: Interchange protocol implementation allows non-string column names Co-authored-by: Marco Edward Gorelli --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/interchange/column.py | 8 ++++++++ pandas/core/interchange/dataframe.py | 2 +- pandas/tests/interchange/test_impl.py | 26 ++++++++++++++++++++++++-- 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 56e645d4c55db..13d5024b5a131 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -32,6 +32,7 @@ Bug fixes ~~~~~~~~~ - Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for Nullable integers (:issue:`55069`) - Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for empty inputs (:issue:`56700`) +- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which wasn't converting columns names to strings (:issue:`55069`) - Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index 96757072dd854..e273ecad8b51e 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -77,6 +77,14 @@ def __init__(self, column: pd.Series, allow_copy: bool = True) -> None: Note: doesn't deal with extension arrays yet, just assume a regular Series/ndarray for now. """ + if isinstance(column, pd.DataFrame): + raise TypeError( + "Expected a Series, got a DataFrame. This likely happened " + "because you called __dataframe__ on a DataFrame which, " + "after converting column names to string, resulted in duplicated " + f"names: {column.columns}. Please rename these columns before " + "using the interchange protocol." + ) if not isinstance(column, pd.Series): raise NotImplementedError(f"Columns of type {type(column)} not handled yet") diff --git a/pandas/core/interchange/dataframe.py b/pandas/core/interchange/dataframe.py index 4f08b2c2b3a7b..1ffe0e8e8dbb0 100644 --- a/pandas/core/interchange/dataframe.py +++ b/pandas/core/interchange/dataframe.py @@ -32,7 +32,7 @@ def __init__(self, df: DataFrame, allow_copy: bool = True) -> None: Constructor - an instance of this (private) class is returned from `pd.DataFrame.__dataframe__`. """ - self._df = df + self._df = df.rename(columns=str, copy=False) self._allow_copy = allow_copy def __dataframe__( diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 209944427d5dc..d47b533f92235 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -180,8 +180,6 @@ def test_missing_from_masked(): } ) - df2 = df.__dataframe__() - rng = np.random.default_rng(2) dict_null = {col: rng.integers(low=0, high=len(df)) for col in df.columns} for col, num_nulls in dict_null.items(): @@ -382,6 +380,30 @@ def test_large_string(): tm.assert_frame_equal(result, expected) +def test_non_str_names(): + # https://github.com/pandas-dev/pandas/issues/56701 + df = pd.Series([1, 2, 3], name=0).to_frame() + names = df.__dataframe__().column_names() + assert names == ["0"] + + +def test_non_str_names_w_duplicates(): + # https://github.com/pandas-dev/pandas/issues/56701 + df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]}) + dfi = df.__dataframe__() + with pytest.raises( + TypeError, + match=( + "Expected a Series, got a DataFrame. This likely happened because you " + "called __dataframe__ on a DataFrame which, after converting column " + r"names to string, resulted in duplicated names: Index\(\['0', '0'\], " + r"dtype='object'\). Please rename these columns before using the " + "interchange protocol." + ), + ): + pd.api.interchange.from_dataframe(dfi, allow_copy=False) + + @pytest.mark.parametrize( "dtype", ["Int8", pytest.param("Int8[pyarrow]", marks=td.skip_if_no("pyarrow"))] )