Skip to content

Commit

Permalink
Backport PR pandas-dev#56772: Support large strings in interchange protocol
Browse files Browse the repository at this point in the history
  • Loading branch information
phofl authored and meeseeksmachine committed Jan 9, 2024
1 parent c4e04e0 commit 8d5c021
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 7 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -906,6 +906,7 @@ Sparse

Other
^^^^^
- :meth:`DataFrame.__dataframe__` did not support pyarrow large strings (:issue:`56702`)
- Bug in :func:`DataFrame.describe` where the percentile 99.999% was rounded to 100% when formatting percentiles in the result (:issue:`55765`)
- Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`)
- Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`)
Expand Down
4 changes: 3 additions & 1 deletion pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2190,7 +2190,9 @@ def numpy_dtype(self) -> np.dtype:
# This can be removed if/when pyarrow addresses it:
# https://github.com/apache/arrow/issues/34462
return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]")
if pa.types.is_string(self.pyarrow_dtype):
if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string(
self.pyarrow_dtype
):
# pa.string().to_pandas_dtype() = object which we don't want
return np.dtype(str)
try:
Expand Down
9 changes: 3 additions & 6 deletions pandas/core/interchange/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,12 +301,9 @@ def _get_data_buffer(
buffer = PandasBuffer(np.frombuffer(b, dtype="uint8"))

# Define the dtype for the returned buffer
dtype = (
DtypeKind.STRING,
8,
ArrowCTypes.STRING,
Endianness.NATIVE,
) # note: currently only support native endianness
# TODO: this will need correcting
# https://github.com/pandas-dev/pandas/issues/54781
dtype = self.dtype
else:
raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")

Expand Down
1 change: 1 addition & 0 deletions pandas/core/interchange/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
"float": "f", # float32
"double": "g", # float64
"string": "u",
"large_string": "U",
"binary": "z",
"time32[s]": "tts",
"time32[ms]": "ttm",
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/interchange/test_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,3 +362,12 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None:
interchange.get_column_by_name = lambda _: column
monkeypatch.setattr(df, "__dataframe__", lambda allow_copy: interchange)
pd.api.interchange.from_dataframe(df)


def test_large_string():
    """
    Check that a pyarrow ``large_string`` column survives the interchange
    protocol: converting the frame's ``__dataframe__()`` object back with
    ``from_dataframe`` must yield an object-dtype frame with the same values.
    """
    # GH#56702
    pytest.importorskip("pyarrow")
    source = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]")
    interchange_obj = source.__dataframe__()
    roundtripped = pd.api.interchange.from_dataframe(interchange_obj)
    tm.assert_frame_equal(
        roundtripped,
        pd.DataFrame({"a": ["x"]}, dtype="object"),
    )

0 comments on commit 8d5c021

Please sign in to comment.