Skip to content

Commit

Permalink
fix: Pandas parser fails to parse integer or boolean only datafra…
Browse files Browse the repository at this point in the history
…mes (#1683)

- fixes #1678 

Converting the Series returned by `iterrows()` to a dict converts
`np.int64` values to Python's native `int` type, which fixes the bug
(the same applies to booleans).

+ Adding non-regression tests

- I was also concerned about the lines that follow, especially `if value is
np.nan: value = None`
- They were untested, so I added a test. It looks like `to_dict` does not
change how `np.nan` is converted (see side note), so I left this code
unchanged.
- Primary keys are returned as `int`s or `tuple[int]`, no `np.int64`
there
- `Timestamp` values are kept unchanged, so the `isinstance(value,
pd.Timestamp)` check still applies.

## Side note

`np.nan` behaves rather strangely with `df.iterrows()`: in a numeric
column it is converted to `float("nan")`, whereas in a string column
it is kept as `np.nan`. Calling `to_dict()` on the row Series does
not change these types.
  • Loading branch information
pierrecamilleri authored Sep 16, 2024
1 parent 97f42ec commit 0e60e0b
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 7 deletions.
44 changes: 38 additions & 6 deletions frictionless/formats/pandas/__spec__/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from decimal import Decimal

import isodate
import numpy as np
import pandas as pd
import pytz
from dateutil.tz import tzoffset, tzutc
Expand All @@ -14,13 +15,44 @@


def test_pandas_parser():
dataframe = pd.DataFrame(data={"id": [1, 2], "name": ["english", "中国人"]})
test_cases = [
{
"name": "Integer type only dataframe, cf issue 1678",
"df_data": {"int": [1]},
"expected_header": ["int"],
"expected_rows": [{"int": 1}],
},
{
"name": "Boolean type only dataframe, cf issue 1678",
"df_data": {"bool": [True]},
"expected_header": ["bool"],
"expected_rows": [{"bool": True}],
},
{
"name": "Mixed types dataframe, chinese characters",
"df_data": {"id": [1, 2], "name": ["english", "中国人"]},
"expected_header": ["id", "name"],
"expected_rows": [
{"id": 1, "name": "english"},
{"id": 2, "name": "中国人"},
],
},
]
for tc in test_cases:
dataframe = pd.DataFrame(data=tc["df_data"])

with TableResource(data=dataframe) as resource:
assert resource.header == tc["expected_header"], tc["name"]
assert resource.read_rows() == tc["expected_rows"], tc["name"]


def test_pandas_parser_with_nan():
dataframe = pd.DataFrame(data={"x": [np.nan]})

with TableResource(data=dataframe) as resource:
assert resource.header == ["id", "name"]
assert resource.read_rows() == [
{"id": 1, "name": "english"},
{"id": 2, "name": "中国人"},
]
test_name = 'np.nan converted to Decimal("NaN")'
row = resource.read_rows()[0]
assert row["x"].is_nan(), test_name


def test_pandas_parser_from_dataframe_with_primary_key_having_datetime():
Expand Down
2 changes: 1 addition & 1 deletion frictionless/formats/pandas/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def read_cell_stream_create(self):
pk = pk if isinstance(pk, tuple) else [pk] # type: ignore
value = pk[schema.primary_key.index(field.name)] # type: ignore
else:
value = item[field.name]
value = item.to_dict()[field.name]
if value is np.nan:
value = None
elif isinstance(value, pd.Timestamp):
Expand Down

0 comments on commit 0e60e0b

Please sign in to comment.