Backport PR pandas-dev#57089: BUG: wide_to_long with string columns

meeseeksmachine · Jan 28, 2024 · a67defa · a67defa
1 parent 1550858
commit a67defa
Show file tree

Hide file tree

Showing 4 changed files with 34 additions and 4 deletions.
diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst
@@ -16,6 +16,7 @@ Fixed regressions
 - Fixed memory leak in :func:`read_csv` (:issue:`57039`)
 - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`)
 - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`)
+- Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`)
 - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`)
 - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`)
 - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`)

diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py
@@ -458,8 +458,7 @@ def wide_to_long(
 
     def get_var_names(df, stub: str, sep: str, suffix: str):
         regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$"
-        pattern = re.compile(regex)
-        return df.columns[df.columns.str.match(pattern)]
+        return df.columns[df.columns.str.match(regex)]
 
     def melt_stub(df, stub: str, i, j, value_vars, sep: str):
         newdf = melt(

diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -1336,14 +1336,14 @@ def contains(
         return self._wrap_result(result, fill_value=na, returns_string=False)
 
     @forbid_nonstring_types(["bytes"])
-    def match(self, pat, case: bool = True, flags: int = 0, na=None):
+    def match(self, pat: str, case: bool = True, flags: int = 0, na=None):
         """
         Determine if each string starts with a match of a regular expression.
 
         Parameters
         ----------
         pat : str
-            Character sequence or regular expression.
+            Character sequence.
         case : bool, default True
             If True, case sensitive.
         flags : int, default 0 (no flags)

diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py
@@ -1220,3 +1220,33 @@ def test_missing_stubname(self, dtype):
         new_level = expected.index.levels[0].astype(dtype)
         expected.index = expected.index.set_levels(new_level, level=0)
         tm.assert_frame_equal(result, expected)
+
+
+def test_wide_to_long_pyarrow_string_columns():
+    # GH 57066
+    pytest.importorskip("pyarrow")
+    df = DataFrame(
+        {
+            "ID": {0: 1},
+            "R_test1": {0: 1},
+            "R_test2": {0: 1},
+            "R_test3": {0: 2},
+            "D": {0: 1},
+        }
+    )
+    df.columns = df.columns.astype("string[pyarrow_numpy]")
+    result = wide_to_long(
+        df, stubnames="R", i="ID", j="UNPIVOTED", sep="_", suffix=".*"
+    )
+    expected = DataFrame(
+        [[1, 1], [1, 1], [1, 2]],
+        columns=Index(["D", "R"], dtype=object),
+        index=pd.MultiIndex.from_arrays(
+            [
+                [1, 1, 1],
+                Index(["test1", "test2", "test3"], dtype="string[pyarrow_numpy]"),
+            ],
+            names=["ID", "UNPIVOTED"],
+        ),
+    )
+    tm.assert_frame_equal(result, expected)