Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Backport PR #60321: TST (string dtype): resolve all xfails in IO parser tests #60330

Open
wants to merge 2 commits into
base: 2.3.x
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 25 additions & 11 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,7 +464,11 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
arrays = []
converters = self._clean_mapping(self.converters)

for i, arr in enumerate(index):
if self.index_names is not None:
names: Iterable = self.index_names
else:
names = itertools.cycle([None])
for i, (arr, name) in enumerate(zip(index, names)):
if try_parse_dates and self._should_parse_dates(i):
arr = self._date_conv(
arr,
Expand Down Expand Up @@ -504,12 +508,17 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
arr, _ = self._infer_types(
arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool
)
arrays.append(arr)

names = self.index_names
index = ensure_index_from_sequences(arrays, names)
if cast_type is not None:
# Don't perform RangeIndex inference
idx = Index(arr, name=name, dtype=cast_type)
else:
idx = ensure_index_from_sequences([arr], [name])
arrays.append(idx)

return index
if len(arrays) == 1:
return arrays[0]
else:
return MultiIndex.from_arrays(arrays)

@final
def _convert_to_ndarrays(
Expand Down Expand Up @@ -1084,12 +1093,11 @@ def _get_empty_meta(self, columns, dtype: DtypeArg | None = None):
dtype_dict: defaultdict[Hashable, Any]
if not is_dict_like(dtype):
# if dtype == None, default will be object.
default_dtype = dtype or object
dtype_dict = defaultdict(lambda: default_dtype)
dtype_dict = defaultdict(lambda: dtype)
else:
dtype = cast(dict, dtype)
dtype_dict = defaultdict(
lambda: object,
lambda: None,
{columns[k] if is_integer(k) else k: v for k, v in dtype.items()},
)

Expand All @@ -1106,8 +1114,14 @@ def _get_empty_meta(self, columns, dtype: DtypeArg | None = None):
if (index_col is None or index_col is False) or index_names is None:
index = default_index(0)
else:
data = [Series([], dtype=dtype_dict[name]) for name in index_names]
index = ensure_index_from_sequences(data, names=index_names)
# TODO: We could return default_index(0) if dtype_dict[name] is None
data = [
Index([], name=name, dtype=dtype_dict[name]) for name in index_names
]
if len(data) == 1:
index = data[0]
else:
index = MultiIndex.from_arrays(data)
index_col.sort()

for i, n in enumerate(index_col):
Expand Down
13 changes: 7 additions & 6 deletions pandas/tests/io/parser/common/test_chunksize.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas._libs import parsers as libparsers
from pandas.errors import DtypeWarning

Expand Down Expand Up @@ -230,8 +228,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
assert result.a.dtype == float


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_warn_if_chunks_have_mismatched_type(all_parsers):
def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string):
warning_type = None
parser = all_parsers
size = 10000
Expand Down Expand Up @@ -259,8 +256,12 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers):
"Specify dtype option on import or set low_memory=False.",
buf,
)

assert df.a.dtype == object
if parser.engine == "c" and parser.low_memory:
assert df.a.dtype == object
elif using_infer_string:
assert df.a.dtype == "str"
else:
assert df.a.dtype == object


@pytest.mark.parametrize("iterator", [True, False])
Expand Down
7 changes: 2 additions & 5 deletions pandas/tests/io/parser/common/test_file_buffer_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.errors import (
EmptyDataError,
ParserError,
Expand Down Expand Up @@ -69,14 +67,13 @@ def test_local_file(all_parsers, csv_dir_path):
pytest.skip("Failing on: " + " ".join(platform.uname()))


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@xfail_pyarrow # AssertionError: DataFrame.index are different
def test_path_path_lib(all_parsers):
parser = all_parsers
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)
result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0))
tm.assert_frame_equal(df, result)
Expand Down
10 changes: 6 additions & 4 deletions pandas/tests/io/parser/common/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@

import pytest

from pandas._config import using_string_dtype

from pandas import (
DataFrame,
Index,
Expand Down Expand Up @@ -87,9 +85,13 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
def test_multi_index_no_level_names(all_parsers, index_col):
def test_multi_index_no_level_names(
request, all_parsers, index_col, using_infer_string
):
if using_infer_string and all_parsers.engine == "pyarrow":
# result should have string columns instead of object dtype
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
data = """index1,index2,A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
Expand Down
22 changes: 17 additions & 5 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.errors import ParserWarning

import pandas as pd
Expand All @@ -24,6 +22,8 @@
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")


@pytest.mark.parametrize("dtype", [str, object])
@pytest.mark.parametrize("check_orig", [True, False])
Expand Down Expand Up @@ -54,7 +54,6 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_per_column(all_parsers):
parser = all_parsers
Expand All @@ -68,7 +67,6 @@ def test_dtype_per_column(all_parsers):
[[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"]
)
expected["one"] = expected["one"].astype(np.float64)
expected["two"] = expected["two"].astype(object)

result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str})
tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -598,6 +596,7 @@ def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_accurate_parsing_of_large_integers(all_parsers):
# GH#52505
data = """SYMBOL,MOMENT,ID,ID_DEAL
Expand All @@ -608,7 +607,7 @@ def test_accurate_parsing_of_large_integers(all_parsers):
AMZN,20230301181139587,2023552585717889759,2023552585717263360
MSFT,20230301181139587,2023552585717889863,2023552585717263361
NVDA,20230301181139587,2023552585717889827,2023552585717263361"""
orders = pd.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()})
orders = all_parsers.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()})
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2
Expand All @@ -630,3 +629,16 @@ def test_dtypes_with_usecols(all_parsers):
values = ["1", "4"]
expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]})
tm.assert_frame_equal(result, expected)


def test_index_col_with_dtype_no_rangeindex(all_parsers):
data = StringIO("345.5,519.5,0\n519.5,726.5,1")
result = all_parsers.read_csv(
data,
header=None,
names=["start", "stop", "bin_id"],
dtype={"start": np.float32, "stop": np.float32, "bin_id": np.uint32},
index_col="bin_id",
).index
expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id")
tm.assert_index_equal(result, expected)
13 changes: 7 additions & 6 deletions pandas/tests/io/parser/test_c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat.numpy import np_version_gte1p24
from pandas.errors import (
ParserError,
Expand Down Expand Up @@ -185,8 +183,7 @@ def error(val: float, actual_val: Decimal) -> Decimal:
assert max(precise_errors) <= max(normal_errors)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_usecols_dtypes(c_parser_only):
def test_usecols_dtypes(c_parser_only, using_infer_string):
parser = c_parser_only
data = """\
1,2,3
Expand All @@ -211,8 +208,12 @@ def test_usecols_dtypes(c_parser_only):
dtype={"b": int, "c": float},
)

assert (result.dtypes == [object, int, float]).all()
assert (result2.dtypes == [object, float]).all()
if using_infer_string:
assert (result.dtypes == ["string", int, float]).all()
assert (result2.dtypes == ["string", float]).all()
else:
assert (result.dtypes == [object, int, float]).all()
assert (result2.dtypes == [object, float]).all()


def test_disable_bool_parsing(c_parser_only):
Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/io/parser/test_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
DataFrame,
Expand Down Expand Up @@ -186,7 +184,6 @@ def convert_score(x):
tm.assert_frame_equal(results[0], results[1])


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("conv_f", [lambda x: x, str])
def test_converter_index_col_bug(all_parsers, conv_f):
# see gh-1835 , GH#40589
Expand All @@ -205,7 +202,7 @@ def test_converter_index_col_bug(all_parsers, conv_f):
StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
)

xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object"))
xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A"))
tm.assert_frame_equal(rs, xp)


Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/io/parser/test_index_col.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas import (
DataFrame,
Index,
Expand Down Expand Up @@ -344,7 +342,6 @@ def test_infer_types_boolean_sum(all_parsers):
tm.assert_frame_equal(result, expected, check_index_type=False)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)])
def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
# GH#9435
Expand All @@ -355,7 +352,7 @@ def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine")
)
result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype})
expected = DataFrame({"b": [2]}, index=Index([val], name="a"))
expected = DataFrame({"b": [2]}, index=Index([val], name="a", dtype=dtype))
tm.assert_frame_equal(result, expected)


Expand Down
10 changes: 5 additions & 5 deletions pandas/tests/io/parser/test_mangle_dupes.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@

import pytest

from pandas._config import using_string_dtype

from pandas import DataFrame
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
Expand Down Expand Up @@ -120,7 +121,6 @@ def test_thorough_mangle_names(all_parsers, data, names, expected):
parser.read_csv(StringIO(data), names=names)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@xfail_pyarrow # AssertionError: DataFrame.columns are different
def test_mangled_unnamed_placeholders(all_parsers):
# xref gh-13017
Expand All @@ -132,7 +132,7 @@ def test_mangled_unnamed_placeholders(all_parsers):

# This test recursively updates `df`.
for i in range(3):
expected = DataFrame()
expected = DataFrame(columns=Index([], dtype="str"))

for j in range(i + 1):
col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1)
Expand Down
Loading
Loading