diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml
index 8f542265fd02d..886b058c8caac 100644
--- a/dev/tasks/tasks.yml
+++ b/dev/tasks/tasks.yml
@@ -1582,6 +1582,12 @@ tasks:
         # ensure we have at least one build with parquet encryption disabled
         PARQUET_REQUIRE_ENCRYPTION: "OFF"
       {% endif %}
+      {% if pandas_version == "nightly" %}
+        # TODO can be removed once this is enabled by default in pandas >= 3
+        # This is to enable the Pandas feature.
+        # See: https://github.com/pandas-dev/pandas/pull/58459
+        PANDAS_FUTURE_INFER_STRING: "1"
+      {% endif %}
       {% if not cache_leaf %}
       # use the latest pandas release, so prevent reusing any cached layers
       flags: --no-leaf-cache
diff --git a/docker-compose.yml b/docker-compose.yml
index 0882121d598bb..9fe1d247e01fc 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1323,6 +1323,7 @@ services:
       PYTEST_ARGS: # inherit
       HYPOTHESIS_PROFILE: # inherit
       PYARROW_TEST_HYPOTHESIS: # inherit
+      PANDAS_FUTURE_INFER_STRING: # inherit
     volumes: *conda-volumes
     command: *python-conda-command

diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index eaedbf1e38580..451701f7f02b3 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -117,6 +117,8 @@ def _handle_arrow_array_protocol(obj, type, mask, size):
                         "return a pyarrow Array or ChunkedArray.")
     if isinstance(res, ChunkedArray) and res.num_chunks==1:
         res = res.chunk(0)
+    if type is not None and res.type != type:
+        res = res.cast(type)
     return res


diff --git a/python/pyarrow/pandas-shim.pxi b/python/pyarrow/pandas-shim.pxi
index 74f0d981b52f4..5be6f03f86ed6 100644
--- a/python/pyarrow/pandas-shim.pxi
+++ b/python/pyarrow/pandas-shim.pxi
@@ -38,7 +38,7 @@ cdef class _PandasAPIShim(object):
     object _array_like_types, _is_extension_array_dtype, _lock
     bint has_sparse
     bint _pd024
-    bint _is_v1, _is_ge_v21, _is_ge_v3
+    bint _is_v1, _is_ge_v21, _is_ge_v3, _is_ge_v3_strict

     def __init__(self):
         self._lock = Lock()
@@ -80,6 +80,7 @@ cdef class _PandasAPIShim(object):
         self._is_v1 = self._loose_version < Version('2.0.0')
         self._is_ge_v21 = self._loose_version >= Version('2.1.0')
         self._is_ge_v3 = self._loose_version >= Version('3.0.0.dev0')
+        self._is_ge_v3_strict = self._loose_version >= Version('3.0.0')

         self._compat_module = pdcompat
         self._data_frame = pd.DataFrame
@@ -174,6 +175,20 @@ cdef class _PandasAPIShim(object):
         self._check_import()
         return self._is_ge_v3

+    def is_ge_v3_strict(self):
+        self._check_import()
+        return self._is_ge_v3_strict
+
+    def uses_string_dtype(self):
+        if self.is_ge_v3_strict():
+            return True
+        try:
+            if self.pd.options.future.infer_string:
+                return True
+        except:
+            pass
+        return False
+
     @property
     def categorical_type(self):
         self._check_import()
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 7fbde36bc23e9..072e3b84195f1 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -781,10 +781,12 @@ def table_to_dataframe(
         table, index = _reconstruct_index(table, index_descriptors,
                                           all_columns, types_mapper)
         ext_columns_dtypes = _get_extension_dtypes(
-            table, all_columns, types_mapper)
+            table, all_columns, types_mapper, options, categories)
     else:
         index = _pandas_api.pd.RangeIndex(table.num_rows)
-        ext_columns_dtypes = _get_extension_dtypes(table, [], types_mapper)
+        ext_columns_dtypes = _get_extension_dtypes(
+            table, [], types_mapper, options, categories
+        )

     _check_data_column_metadata_consistency(all_columns)
     columns = _deserialize_column_index(table, all_columns, column_indexes)
@@ -829,7 +831,7 @@ def table_to_dataframe(
 }


-def _get_extension_dtypes(table, columns_metadata, types_mapper=None):
+def _get_extension_dtypes(table, columns_metadata, types_mapper, options, categories):
     """
     Based on the stored column pandas metadata and the extension types
     in the arrow schema, infer which columns should be converted to a
@@ -842,12 +844,25 @@
     and then we can check if this dtype supports conversion from arrow.

     """
+    strings_to_categorical = options["strings_to_categorical"]
+    categories = categories or []
+
     ext_columns = {}

     # older pandas version that does not yet support extension dtypes
     if _pandas_api.extension_dtype is None:
         return ext_columns

+    # for pandas 3.0+, use pandas' new default string dtype
+    if _pandas_api.uses_string_dtype() and not strings_to_categorical:
+        for field in table.schema:
+            if (
+                pa.types.is_string(field.type)
+                or pa.types.is_large_string(field.type)
+                or pa.types.is_string_view(field.type)
+            ) and field.name not in categories:
+                ext_columns[field.name] = _pandas_api.pd.StringDtype(na_value=np.nan)
+
     # infer the extension columns from the pandas metadata
     for col_meta in columns_metadata:
         try:
@@ -861,6 +876,19 @@
             # that are certainly numpy dtypes
             pandas_dtype = _pandas_api.pandas_dtype(dtype)
             if isinstance(pandas_dtype, _pandas_api.extension_dtype):
+                if isinstance(pandas_dtype, _pandas_api.pd.StringDtype):
+                    # when the metadata indicate to use the string dtype,
+                    # ignore this in case:
+                    # - it is specified to convert strings / this column to categorical
+                    # - the column itself is dictionary encoded and would otherwise be
+                    #   converted to categorical
+                    if strings_to_categorical or name in categories:
+                        continue
+                    try:
+                        if pa.types.is_dictionary(table.schema.field(name).type):
+                            continue
+                    except KeyError:
+                        pass
                 if hasattr(pandas_dtype, "__from_arrow__"):
                     ext_columns[name] = pandas_dtype

@@ -1133,6 +1161,20 @@ def _reconstruct_columns_from_metadata(columns, column_indexes):
         # GH-41503: if the column index was decimal, restore to decimal
         elif pandas_dtype == "decimal":
             level = _pandas_api.pd.Index([decimal.Decimal(i) for i in level])
+        elif (
+            level.dtype == "str" and numpy_dtype == "object"
+            and ("mixed" in pandas_dtype or pandas_dtype in ["unicode", "string"])
+        ):
+            # the metadata indicate that the original dataframe used object dtype,
+            # but ignore this and keep string dtype if:
+            # - the original columns used mixed types -> we don't attempt to faithfully
+            #   roundtrip in this case, but keep the column names as strings
+            # - the original columns were inferred to be strings but stored in object
+            #   dtype -> we don't restore the object dtype because all metadata
+            #   generated using pandas < 3 will have this case by default, and
+            #   for pandas >= 3 we want to use the default string dtype for .columns
+            new_levels.append(level)
+            continue
         elif level.dtype != dtype:
             level = level.astype(dtype)
         # ARROW-9096: if original DataFrame was upcast we keep that
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
index 110dab7d35538..4e0a103e31899 100644
--- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
+++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -2539,7 +2539,8 @@ Status ConvertCategoricals(const PandasOptions& options, ChunkedArrayVector* arr
   }
   if (options.strings_to_categorical) {
     for (int i = 0; i < static_cast<int>(arrays->size()); i++) {
-      if (is_base_binary_like((*arrays)[i]->type()->id())) {
+      if (is_base_binary_like((*arrays)[i]->type()->id()) ||
+          is_binary_view_like((*arrays)[i]->type()->id())) {
         columns_to_encode.push_back(i);
       }
     }
@@ -2573,7 +2574,8 @@ Status ConvertChunkedArrayToPandas(const PandasOptions& options,
     py_ref = nullptr;
   }

-  if (options.strings_to_categorical && is_base_binary_like(arr->type()->id())) {
+  if (options.strings_to_categorical && (is_base_binary_like(arr->type()->id()) ||
+                                         is_binary_view_like(arr->type()->id()))) {
     if (options.zero_copy_only) {
       return Status::Invalid("Need to dictionary encode a column, but ",
                              "only zero-copy conversions allowed");
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index c16d2f9aacf74..46e4b3aa52768 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -1020,7 +1020,7 @@ def test_replace_slice():
     offsets = range(-3, 4)

     arr = pa.array([None, '', 'a', 'ab', 'abc', 'abcd', 'abcde'])
-    series = arr.to_pandas()
+    series = arr.to_pandas().astype(object).replace({np.nan: None})
     for start in offsets:
         for stop in offsets:
             expected = series.str.slice_replace(start, stop, 'XX')
@@ -1031,7 +1031,7 @@ def test_replace_slice():
             assert pc.binary_replace_slice(arr, start, stop, 'XX') == actual

     arr = pa.array([None, '', 'π', 'πb', 'πbθ', 'πbθd', 'πbθde'])
-    series = arr.to_pandas()
+    series = arr.to_pandas().astype(object).replace({np.nan: None})
     for start in offsets:
         for stop in offsets:
             expected = series.str.slice_replace(start, stop, 'XX')
@@ -2125,7 +2125,8 @@ def test_strftime():
         for fmt in formats:
             options = pc.StrftimeOptions(fmt)
             result = pc.strftime(tsa, options=options)
-            expected = pa.array(ts.strftime(fmt))
+            # cast to the same type as result to ignore string vs large_string
+            expected = pa.array(ts.strftime(fmt)).cast(result.type)
             assert result.equals(expected)

         fmt = "%Y-%m-%dT%H:%M:%S"
@@ -2133,34 +2134,34 @@
         # Default format
         tsa = pa.array(ts, type=pa.timestamp("s", timezone))
         result = pc.strftime(tsa, options=pc.StrftimeOptions())
-        expected = pa.array(ts.strftime(fmt))
+        expected = pa.array(ts.strftime(fmt)).cast(result.type)
         assert result.equals(expected)

         # Default format plus timezone
         tsa = pa.array(ts, type=pa.timestamp("s", timezone))
         result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z"))
-        expected = pa.array(ts.strftime(fmt + "%Z"))
+        expected = pa.array(ts.strftime(fmt + "%Z")).cast(result.type)
         assert result.equals(expected)

         # Pandas %S is equivalent to %S in arrow for unit="s"
         tsa = pa.array(ts, type=pa.timestamp("s", timezone))
         options = pc.StrftimeOptions("%S")
         result = pc.strftime(tsa, options=options)
-        expected = pa.array(ts.strftime("%S"))
+        expected = pa.array(ts.strftime("%S")).cast(result.type)
         assert result.equals(expected)

         # Pandas %S.%f is equivalent to %S in arrow for unit="us"
         tsa = pa.array(ts, type=pa.timestamp("us", timezone))
         options = pc.StrftimeOptions("%S")
         result = pc.strftime(tsa, options=options)
-        expected = pa.array(ts.strftime("%S.%f"))
+        expected = pa.array(ts.strftime("%S.%f")).cast(result.type)
         assert result.equals(expected)

         # Test setting locale
         tsa = pa.array(ts, type=pa.timestamp("s", timezone))
         options = pc.StrftimeOptions(fmt, locale="C")
         result = pc.strftime(tsa, options=options)
-        expected = pa.array(ts.strftime(fmt))
+        expected = pa.array(ts.strftime(fmt)).cast(result.type)
         assert result.equals(expected)

     # Test timestamps without timezone
@@ -2168,7 +2169,7 @@ def test_strftime():
     ts = pd.to_datetime(times)
     tsa = pa.array(ts, type=pa.timestamp("s"))
     result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt))
-    expected = pa.array(ts.strftime(fmt))
+    expected = pa.array(ts.strftime(fmt)).cast(result.type)

     # Positional format
     assert pc.strftime(tsa, fmt) == result
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index 18c8cd5b654e6..249fb621279a6 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -426,7 +426,11 @@ def test_empty_strings(version):
 @pytest.mark.pandas
 def test_all_none(version):
     df = pd.DataFrame({'all_none': [None] * 10})
-    _check_pandas_roundtrip(df, version=version)
+    if version == 1 and pa.pandas_compat._pandas_api.uses_string_dtype():
+        expected = df.astype("str")
+    else:
+        expected = df
+    _check_pandas_roundtrip(df, version=version, expected=expected)


 @pytest.mark.pandas
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 178a073ed59dc..606d798b1b32c 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -349,7 +349,10 @@ def test_integer_index_column(self):
         df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')])
         _check_pandas_roundtrip(df, preserve_index=True)

-    def test_index_metadata_field_name(self):
+    def test_index_metadata_field_name(self, request):
+        if _pandas_api.uses_string_dtype():
+            # https://github.com/pandas-dev/pandas/issues/59879
+            request.applymarker(pytest.mark.xfail(reason="bug in pandas string dtype"))
         # test None case, and strangely named non-index columns
         df = pd.DataFrame(
             [(1, 'a', 3.1), (2, 'b', 2.2), (3, 'c', 1.3)],
@@ -411,7 +414,9 @@ def test_string_column_index(self):
         column_indexes, = js['column_indexes']
         assert column_indexes['name'] == 'stringz'
         assert column_indexes['name'] == column_indexes['field_name']
-        assert column_indexes['numpy_type'] == 'object'
+        assert column_indexes['numpy_type'] == (
+            'str' if _pandas_api.uses_string_dtype() else 'object'
+        )
         assert column_indexes['pandas_type'] == 'unicode'

         md = column_indexes['metadata']
@@ -1680,7 +1685,10 @@ def test_pandas_unicode(self):
         repeats = 1000
         values = ['foo', None, 'bar', 'mañana', np.nan]
         df = pd.DataFrame({'strings': values * repeats})
-        field = pa.field('strings', pa.string())
+        field = pa.field(
+            'strings',
+            pa.large_string() if _pandas_api.uses_string_dtype() else pa.string()
+        )
         schema = pa.schema([field])
         ex_values = ['foo', None, 'bar', 'mañana', None]
         expected = pd.DataFrame({'strings': ex_values * repeats})
@@ -3299,6 +3307,10 @@ def _assert_nunique(obj, expected):


 def test_to_pandas_deduplicate_strings_array_types():
+    if _pandas_api.uses_string_dtype():
+        pytest.skip(
+            "pandas uses string dtype and not object dtype, keyword has no effect"
+        )
     nunique = 100
     repeats = 10
     values = _generate_dedup_example(nunique, repeats)
@@ -3311,6 +3323,10 @@ def test_to_pandas_deduplicate_strings_array_types():


 def test_to_pandas_deduplicate_strings_table_types():
+    if _pandas_api.uses_string_dtype():
+        pytest.skip(
+            "pandas uses string dtype and not object dtype, keyword has no effect"
+        )
     nunique = 100
     repeats = 10
     values = _generate_dedup_example(nunique, repeats)
@@ -3774,20 +3790,26 @@ def _check_to_pandas_memory_unchanged(obj, **kwargs):
     x = obj.to_pandas(**kwargs)  # noqa

     # Memory allocation unchanged -- either zero copy or self-destructing
-    assert pa.total_allocated_bytes() == prior_allocation
+    if _pandas_api.uses_string_dtype():
+        # for the string array of the columns Index
+        # -> increase the size to account for overallocation for small arrays
+        max_index_allocation = max(192, x.columns.nbytes * 2)
+        assert pa.total_allocated_bytes() <= (prior_allocation + max_index_allocation)
+    else:
+        assert pa.total_allocated_bytes() == prior_allocation


 def test_to_pandas_split_blocks():
     # ARROW-3789
     t = pa.table([
-        pa.array([1, 2, 3, 4, 5], type='i1'),
-        pa.array([1, 2, 3, 4, 5], type='i4'),
-        pa.array([1, 2, 3, 4, 5], type='i8'),
-        pa.array([1, 2, 3, 4, 5], type='f4'),
-        pa.array([1, 2, 3, 4, 5], type='f8'),
-        pa.array([1, 2, 3, 4, 5], type='f8'),
-        pa.array([1, 2, 3, 4, 5], type='f8'),
-        pa.array([1, 2, 3, 4, 5], type='f8'),
+        pa.array([1, 2, 3, 4, 5]*100, type='i1'),
+        pa.array([1, 2, 3, 4, 5]*100, type='i4'),
+        pa.array([1, 2, 3, 4, 5]*100, type='i8'),
+        pa.array([1, 2, 3, 4, 5]*100, type='f4'),
+        pa.array([1, 2, 3, 4, 5]*100, type='f8'),
+        pa.array([1, 2, 3, 4, 5]*100, type='f8'),
+        pa.array([1, 2, 3, 4, 5]*100, type='f8'),
+        pa.array([1, 2, 3, 4, 5]*100, type='f8'),
     ], ['f{}'.format(i) for i in range(8)])
     _check_blocks_created(t, 8)

@@ -3832,7 +3854,12 @@ def test_table_uses_memory_pool():
     prior_allocation = pa.total_allocated_bytes()
    x = t.to_pandas()

-    assert pa.total_allocated_bytes() == (prior_allocation + 3 * N * 8)
+    new_allocation = 3 * N * 8
+    if _pandas_api.uses_string_dtype():
+        # for the small columns Index
+        new_allocation += 128
+
+    assert pa.total_allocated_bytes() == (prior_allocation + new_allocation)

     # Check successful garbage collection
     x = None  # noqa
@@ -4110,7 +4137,10 @@ def test_dictionary_encoded_nested_to_pandas():

 def test_dictionary_from_pandas():
     cat = pd.Categorical(['a', 'b', 'a'])
-    expected_type = pa.dictionary(pa.int8(), pa.string())
+    expected_type = pa.dictionary(
+        pa.int8(),
+        pa.large_string() if _pandas_api.uses_string_dtype() else pa.string()
+    )

     result = pa.array(cat)
     assert result.to_pylist() == ['a', 'b', 'a']
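Illustrative usage sketch (not part of the patch), assuming pandas >= 3.0 or an older pandas with PANDAS_FUTURE_INFER_STRING=1 / pd.options.future.infer_string enabled; the column name "strings" is arbitrary. It shows the conversion behaviour this change targets: string columns round-trip to pandas' default string dtype, while strings_to_categorical keeps taking precedence.

    import pyarrow as pa

    table = pa.table({"strings": ["foo", None, "bar"]})

    # With pandas' new default string dtype enabled, pyarrow string columns
    # convert to the "str" dtype (StringDtype with NaN missing-value
    # semantics) instead of object dtype.
    df = table.to_pandas()
    print(df["strings"].dtype)

    # strings_to_categorical is honoured before the string dtype, matching
    # the _get_extension_dtypes() change above.
    df_cat = table.to_pandas(strings_to_categorical=True)
    print(df_cat["strings"].dtype)  # category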