diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 09f0f2af8e5c6..40e3ea6450647 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -464,7 +464,11 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: arrays = [] converters = self._clean_mapping(self.converters) - for i, arr in enumerate(index): + if self.index_names is not None: + names: Iterable = self.index_names + else: + names = itertools.cycle([None]) + for i, (arr, name) in enumerate(zip(index, names)): if try_parse_dates and self._should_parse_dates(i): arr = self._date_conv( arr, @@ -504,12 +508,17 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: arr, _ = self._infer_types( arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool ) - arrays.append(arr) - - names = self.index_names - index = ensure_index_from_sequences(arrays, names) + if cast_type is not None: + # Don't perform RangeIndex inference + idx = Index(arr, name=name, dtype=cast_type) + else: + idx = ensure_index_from_sequences([arr], [name]) + arrays.append(idx) - return index + if len(arrays) == 1: + return arrays[0] + else: + return MultiIndex.from_arrays(arrays) @final def _convert_to_ndarrays( @@ -1084,12 +1093,11 @@ def _get_empty_meta(self, columns, dtype: DtypeArg | None = None): dtype_dict: defaultdict[Hashable, Any] if not is_dict_like(dtype): # if dtype == None, default will be object. - default_dtype = dtype or object - dtype_dict = defaultdict(lambda: default_dtype) + dtype_dict = defaultdict(lambda: dtype) else: dtype = cast(dict, dtype) dtype_dict = defaultdict( - lambda: object, + lambda: None, {columns[k] if is_integer(k) else k: v for k, v in dtype.items()}, ) @@ -1106,8 +1114,14 @@ def _get_empty_meta(self, columns, dtype: DtypeArg | None = None): if (index_col is None or index_col is False) or index_names is None: index = default_index(0) else: - data = [Series([], dtype=dtype_dict[name]) for name in index_names] - index = ensure_index_from_sequences(data, names=index_names) + # TODO: We could return default_index(0) if dtype_dict[name] is None + data = [ + Index([], name=name, dtype=dtype_dict[name]) for name in index_names + ] + if len(data) == 1: + index = data[0] + else: + index = MultiIndex.from_arrays(data) index_col.sort() for i, n in enumerate(index_col): diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 7b70601addcad..5226476ef6eac 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import parsers as libparsers from pandas.errors import DtypeWarning @@ -230,8 +228,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): assert result.a.dtype == float -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -def test_warn_if_chunks_have_mismatched_type(all_parsers): +def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string): warning_type = None parser = all_parsers size = 10000 @@ -259,8 +256,12 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers): "Specify dtype option on import or set low_memory=False.", buf, ) - - assert df.a.dtype == object + if parser.engine == "c" and parser.low_memory: + assert df.a.dtype == object + elif using_infer_string: + assert df.a.dtype == "str" + else: + assert df.a.dtype == object @pytest.mark.parametrize("iterator", [True, False]) diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index c13b77f365496..d573b47bb3279 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -14,8 +14,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ( EmptyDataError, ParserError, @@ -69,14 +67,13 @@ def test_local_file(all_parsers, csv_dir_path): pytest.skip("Failing on: " + " ".join(platform.uname())) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # AssertionError: DataFrame.index are different def test_path_path_lib(all_parsers): parser = all_parsers df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0)) tm.assert_frame_equal(df, result) diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index 0121af53f1aa4..cdd65223a9c9f 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -8,8 +8,6 @@ import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Index, @@ -87,9 +85,13 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) -def test_multi_index_no_level_names(all_parsers, index_col): +def test_multi_index_no_level_names( + request, all_parsers, index_col, using_infer_string +): + if using_infer_string and all_parsers.engine == "pyarrow": + # result should have string columns instead of object dtype + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) data = """index1,index2,A,B,C,D foo,one,2,3,4,5 foo,two,7,8,9,10 diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 787941c5d0376..d28c43c45647a 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ParserWarning import pandas as pd @@ -24,6 +22,8 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) @@ -54,7 +54,6 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") def test_dtype_per_column(all_parsers): parser = all_parsers @@ -68,7 +67,6 @@ def test_dtype_per_column(all_parsers): [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] ) expected["one"] = expected["one"].astype(np.float64) - expected["two"] = expected["two"].astype(object) result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) tm.assert_frame_equal(result, expected) @@ -598,6 +596,7 @@ def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_accurate_parsing_of_large_integers(all_parsers): # GH#52505 data = """SYMBOL,MOMENT,ID,ID_DEAL @@ -608,7 +607,7 @@ def test_accurate_parsing_of_large_integers(all_parsers): AMZN,20230301181139587,2023552585717889759,2023552585717263360 MSFT,20230301181139587,2023552585717889863,2023552585717263361 NVDA,20230301181139587,2023552585717889827,2023552585717263361""" - orders = pd.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()}) + orders = all_parsers.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()}) assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1 assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1 assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2 @@ -630,3 +629,16 @@ def test_dtypes_with_usecols(all_parsers): values = ["1", "4"] expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]}) tm.assert_frame_equal(result, expected) + + +def test_index_col_with_dtype_no_rangeindex(all_parsers): + data = StringIO("345.5,519.5,0\n519.5,726.5,1") + result = all_parsers.read_csv( + data, + header=None, + names=["start", "stop", "bin_id"], + dtype={"start": np.float32, "stop": np.float32, "bin_id": np.uint32}, + index_col="bin_id", + ).index + expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id") + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 1501479510e17..5b72f76440349 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -17,8 +17,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import ( ParserError, @@ -185,8 +183,7 @@ def error(val: float, actual_val: Decimal) -> Decimal: assert max(precise_errors) <= max(normal_errors) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_usecols_dtypes(c_parser_only): +def test_usecols_dtypes(c_parser_only, using_infer_string): parser = c_parser_only data = """\ 1,2,3 @@ -211,8 +208,12 @@ def test_usecols_dtypes(c_parser_only): dtype={"b": int, "c": float}, ) - assert (result.dtypes == [object, int, float]).all() - assert (result2.dtypes == [object, float]).all() + if using_infer_string: + assert (result.dtypes == ["string", int, float]).all() + assert (result2.dtypes == ["string", float]).all() + else: + assert (result.dtypes == [object, int, float]).all() + assert (result2.dtypes == [object, float]).all() def test_disable_bool_parsing(c_parser_only): diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index a3c6dc8fd0898..1848e1e571fc1 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -186,7 +184,6 @@ def convert_score(x): tm.assert_frame_equal(results[0], results[1]) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("conv_f", [lambda x: x, str]) def test_converter_index_col_bug(all_parsers, conv_f): # see gh-1835 , GH#40589 @@ -205,7 +202,7 @@ def test_converter_index_col_bug(all_parsers, conv_f): StringIO(data), sep=";", index_col="A", converters={"A": conv_f} ) - xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object")) + xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A")) tm.assert_frame_equal(rs, xp) diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 6dbfed2b6ae83..9224b743b8917 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Index, @@ -344,7 +342,6 @@ def test_infer_types_boolean_sum(all_parsers): tm.assert_frame_equal(result, expected, check_index_type=False) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)]) def test_specify_dtype_for_index_col(all_parsers, dtype, val, request): # GH#9435 @@ -355,7 +352,7 @@ def test_specify_dtype_for_index_col(all_parsers, dtype, val, request): pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine") ) result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype}) - expected = DataFrame({"b": [2]}, index=Index([val], name="a")) + expected = DataFrame({"b": [2]}, index=Index([val], name="a", dtype=dtype)) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 32a8d3b81f470..80c32d3a6262e 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -7,9 +7,10 @@ import pytest -from pandas._config import using_string_dtype - -from pandas import DataFrame +from pandas import ( + DataFrame, + Index, +) import pandas._testing as tm xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") @@ -120,7 +121,6 @@ def test_thorough_mangle_names(all_parsers, data, names, expected): parser.read_csv(StringIO(data), names=names) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # AssertionError: DataFrame.columns are different def test_mangled_unnamed_placeholders(all_parsers): # xref gh-13017 @@ -132,7 +132,7 @@ def test_mangled_unnamed_placeholders(all_parsers): # This test recursively updates `df`. for i in range(3): - expected = DataFrame() + expected = DataFrame(columns=Index([], dtype="str")) for j in range(i + 1): col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 5f9823f7225f9..dd168aaa45808 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.parsers import STR_NA_VALUES from pandas import ( @@ -260,7 +258,6 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "kwargs,expected", [ @@ -306,7 +303,9 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): ), ], ) -def test_na_values_keep_default(all_parsers, kwargs, expected, request): +def test_na_values_keep_default( + all_parsers, kwargs, expected, request, using_infer_string +): data = """\ A,B,C a,1,one @@ -324,8 +323,9 @@ def test_na_values_keep_default(all_parsers, kwargs, expected, request): with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), **kwargs) return - mark = pytest.mark.xfail() - request.applymarker(mark) + if not using_infer_string or "na_values" in kwargs: + mark = pytest.mark.xfail() + request.applymarker(mark) result = parser.read_csv(StringIO(data), **kwargs) tm.assert_frame_equal(result, expected) @@ -435,8 +435,6 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -@xfail_pyarrow # mismatched dtypes in both cases, FutureWarning in the True case @pytest.mark.parametrize( "na_filter,row_data", [ @@ -444,14 +442,21 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v (False, [["1", "A"], ["nan", "B"], ["3", "C"]]), ], ) -def test_na_values_na_filter_override(all_parsers, na_filter, row_data): +def test_na_values_na_filter_override( + request, all_parsers, na_filter, row_data, using_infer_string +): + parser = all_parsers + if parser.engine == "pyarrow": + # mismatched dtypes in both cases, FutureWarning in the True case + if not (using_infer_string and na_filter): + mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") + request.applymarker(mark) data = """\ A,B 1,A nan,B 3,C """ - parser = all_parsers result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter) expected = DataFrame(row_data, columns=["A", "B"]) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index be2015fca27d1..616fcb81cf055 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -16,8 +16,6 @@ import pytest import pytz -from pandas._config import using_string_dtype - from pandas._libs.tslibs import parsing import pandas as pd @@ -1799,7 +1797,6 @@ def test_parse_timezone(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @skip_pyarrow # pandas.errors.ParserError: CSV parse error @pytest.mark.parametrize( "date_string", @@ -1807,7 +1804,7 @@ def test_parse_timezone(all_parsers): ) def test_invalid_parse_delimited_date(all_parsers, date_string): parser = all_parsers - expected = DataFrame({0: [date_string]}, dtype="object") + expected = DataFrame({0: [date_string]}, dtype="str") result = parser.read_csv( StringIO(date_string), header=None, @@ -2054,7 +2051,6 @@ def test_parse_dates_and_keep_original_column(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dayfirst_warnings(): # GH 12585 @@ -2087,7 +2083,7 @@ def test_dayfirst_warnings(): # first in DD/MM/YYYY, second in MM/DD/YYYY input = "date\n31/12/2014\n03/30/2011" - expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date") + expected = Index(["31/12/2014", "03/30/2011"], dtype="str", name="date") # A. use dayfirst=True res5 = read_csv( @@ -2204,7 +2200,6 @@ def test_parse_dates_and_string_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_parse_dot_separated_dates(all_parsers): # https://github.com/pandas-dev/pandas/issues/2586 parser = all_parsers @@ -2214,7 +2209,7 @@ def test_parse_dot_separated_dates(all_parsers): if parser.engine == "pyarrow": expected_index = Index( ["27.03.2003 14:55:00.000", "03.08.2003 15:20:00.000"], - dtype="object", + dtype="str", name="a", ) warn = None diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index 01e576ba40f26..bc4c4c2e24e9c 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ b/pandas/tests/io/parser/test_upcast.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.parsers import ( _maybe_upcast, na_values, @@ -86,7 +84,6 @@ def test_maybe_upcaste_all_nan(): tm.assert_extension_array_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("val", [na_values[np.object_], "c"]) def test_maybe_upcast_object(val, string_storage): # GH#36712