-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
test: test data type inference with pyarrow dtypes (#215)
- Loading branch information
1 parent
9032158
commit e2889d4
Showing 1 changed file with 112 additions and 99 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,115 +1,128 @@ | ||
import numpy as np | ||
import pandas as pd | ||
import pytest | ||
|
||
from edvart import data_types | ||
|
||
from .pyarrow_utils import pyarrow_params | ||
|
||
def test_inference():
    """Check that ``infer_data_type`` returns the expected ``DataType`` for sample series.

    Every assertion compares the type inferred for one literal series against
    the expected ``DataType`` member.
    """
    assert (
        data_types.infer_data_type(pd.Series([0.12, 3565.3, 234, 1, -49, 14, 5, 88, 12312]))
        == data_types.DataType.NUMERIC
    ), "Should be numeric type"
    assert (
        data_types.infer_data_type(
            pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"])
        )
        == data_types.DataType.DATE
    ), "Should be date type"
    assert (
        data_types.infer_data_type(pd.Series(["A", "B", "C", "C", "A", "B"]))
        == data_types.DataType.CATEGORICAL
    ), "Should be categorical type"
    assert (
        data_types.infer_data_type(pd.Series([True, False, False, True, True]))
        == data_types.DataType.BOOLEAN
    ), "Should be boolean type"
    # The comparison has to sit outside the infer_data_type(...) call. Comparing
    # the Series itself to the enum member would pass a boolean Series to the
    # function and assert only on the truthiness of whatever type comes back.
    assert (
        data_types.infer_data_type(pd.Series([None, None, np.nan, float("nan")]))
        == data_types.DataType.MISSING
    ), "Should be missing"
    assert (
        data_types.infer_data_type(pd.Series(list(range(10)))) == data_types.DataType.UNIQUE
    ), "Should be unique"
    # The repeated 1 makes the values non-unique; the expected type is then NUMERIC.
    assert (
        data_types.infer_data_type(pd.Series([1] + list(range(100)))) == data_types.DataType.NUMERIC
    ), "Should be numeric"
    # No data is passed here, so the series is empty.
    assert (
        data_types.infer_data_type(pd.Series(dtype=pd.Float64Dtype)) == data_types.DataType.UNKNOWN
    ), "Should be unknown"
    # Same parenthesisation fix as for the MISSING case above.
    assert (
        data_types.infer_data_type(pd.Series([True, False])) == data_types.DataType.BOOLEAN
    ), "Should be boolean"
|
||
@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
@pytest.mark.parametrize(
    "data, expected",
    [
        (
            pd.Series([0.12, 3565.3, 234, 1, -49, 14, 5, 88, 12312]),
            data_types.DataType.NUMERIC,
        ),
        (
            pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"]),
            data_types.DataType.DATE,
        ),
        (
            pd.Series(["A", "B", "C", "C", "A", "B"]),
            data_types.DataType.CATEGORICAL,
        ),
        (
            pd.Series([True, False, False, True, True]),
            data_types.DataType.BOOLEAN,
        ),
        (
            pd.Series([None, None, np.nan, float("nan")]),
            data_types.DataType.MISSING,
        ),
        (
            pd.Series(list(range(10))),
            data_types.DataType.UNIQUE,
        ),
        (
            pd.Series([1] + list(range(100))),
            data_types.DataType.NUMERIC,
        ),
        (
            pd.Series(dtype=pd.Float64Dtype),
            data_types.DataType.UNKNOWN,
        ),
        (
            pd.Series([True, False]),
            data_types.DataType.BOOLEAN,
        ),
    ],
)
def test_inference(data, expected, pyarrow_dtypes):
    """Assert that ``infer_data_type`` yields ``expected`` for ``data``.

    When ``pyarrow_dtypes`` is truthy, the series is first converted with
    ``convert_dtypes(dtype_backend="pyarrow")``.
    """
    series = data.convert_dtypes(dtype_backend="pyarrow") if pyarrow_dtypes else data
    inferred = data_types.infer_data_type(series)
    assert inferred == expected
|
||
def test_missing_series():
    """Check ``is_missing`` on all-missing series and on partly populated ones."""
    all_missing = (
        pd.Series([None, None, np.nan, float("nan")]),
        pd.Series([pd.NA]),
    )
    for series in all_missing:
        assert data_types.is_missing(series), "Should be missing"

    partly_populated = (
        pd.Series([1, np.nan, None]),
        pd.Series(["2023-01-01", None]),
    )
    for series in partly_populated:
        assert not data_types.is_missing(series), "Should not be missing"
|
||
@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
@pytest.mark.parametrize(
    "data, is_missing",
    [
        (
            pd.Series([None, None, np.nan, float("nan")]),
            True,
        ),
        (
            pd.Series([pd.NA]),
            True,
        ),
        (
            pd.Series([1, np.nan, None]),
            False,
        ),
        (
            pd.Series(["2023-01-01", None]),
            False,
        ),
    ],
)
def test_missing_series(data, is_missing, pyarrow_dtypes):
    """Assert that ``data_types.is_missing`` matches the ``is_missing`` flag.

    When ``pyarrow_dtypes`` is truthy, the series is first converted with
    ``convert_dtypes(dtype_backend="pyarrow")``.
    """
    series = data.convert_dtypes(dtype_backend="pyarrow") if pyarrow_dtypes else data
    outcome = data_types.is_missing(series)
    assert outcome == is_missing
|
||
def test_numeric_series():
    """Check ``is_numeric`` on numeric, mixed numeric/string and all-missing series."""
    numeric_samples = (
        pd.Series([0.12, 3565.3, 234, 1, -49, 14, 5, 88, 12312]),
        pd.Series([23, 45, 2, 1.2, -3, -66]),
    )
    for series in numeric_samples:
        assert data_types.is_numeric(series), "Should be numeric type"

    # Each negative sample carries its own failure message.
    non_numeric_samples = (
        (pd.Series([23, 45, 2, 1, -3, -66, "NULL", "a string"]), "Should not be numeric type"),
        (pd.Series([23, 45, 2, 1, -3, -66, "99", "-1207"]), "Should not be numeric type"),
        (pd.Series([None, None, np.nan, float("nan")]), "Should not be numeric"),
    )
    for series, message in non_numeric_samples:
        assert not data_types.is_numeric(series), message
|
||
@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
@pytest.mark.parametrize(
    "data, is_numeric",
    [
        (
            pd.Series([0.12, 3565.3, 234, 1, -49, 14, 5, 88, 12312]),
            True,
        ),
        (
            pd.Series([23, 45, 2, 1.2, -3, -66]),
            True,
        ),
        (
            pd.Series([23, 45, 2, 1, -3, -66, "NULL", "a string"]),
            False,
        ),
        (
            pd.Series([23, 45, 2, 1, -3, -66, "99", "-1207"]),
            False,
        ),
        (
            pd.Series([None, None, np.nan, float("nan")]),
            False,
        ),
    ],
)
def test_numeric_series(data, is_numeric, pyarrow_dtypes):
    """Assert that ``data_types.is_numeric`` matches the ``is_numeric`` flag.

    When ``pyarrow_dtypes`` is truthy, the series is first converted with
    ``convert_dtypes(dtype_backend="pyarrow")``.
    """
    series = data.convert_dtypes(dtype_backend="pyarrow") if pyarrow_dtypes else data
    outcome = data_types.is_numeric(series)
    assert outcome == is_numeric
|
||
def test_categorical_series():
    """Check ``is_categorical`` on categorical, many-valued and all-missing series."""
    categorical_samples = (
        pd.Series(["A", "B", "C", "D"]),
        pd.Series([1, 2, 3, 4, 4, 4, 1, 1, 1, 2, 2, 3, 4]),
    )
    for series in categorical_samples:
        assert data_types.is_categorical(series), "Should be categorical"

    non_categorical_samples = (
        pd.Series([1, 2, 31, 4, 52, 6, 87, 87.7, 9, 1, 3, 4, 1, 10, 123123, 9876, 1.2, 6.8]),
        pd.Series([None, None, np.nan, float("nan")]),
        pd.Series([pd.NA]),
    )
    for series in non_categorical_samples:
        assert not data_types.is_categorical(series), "Should not be categorical"
|
||
@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
@pytest.mark.parametrize(
    "data, is_categorical",
    [
        (
            pd.Series(["A", "B", "C", "D"]),
            True,
        ),
        (
            pd.Series([1, 2, 3, 4, 4, 4, 1, 1, 1, 2, 2, 3, 4]),
            True,
        ),
        (
            pd.Series([1, 2, 31, 4, 52, 6, 87, 87.7, 9, 1, 3, 4, 1, 10, 123123, 9876, 1.2, 6.8]),
            False,
        ),
        (
            pd.Series([None, None, np.nan, float("nan")]),
            False,
        ),
        (
            pd.Series([pd.NA]),
            False,
        ),
    ],
)
def test_categorical_series(data, is_categorical, pyarrow_dtypes):
    """Assert that ``data_types.is_categorical`` matches the ``is_categorical`` flag.

    When ``pyarrow_dtypes`` is truthy, the series is first converted with
    ``convert_dtypes(dtype_backend="pyarrow")``.
    """
    series = data.convert_dtypes(dtype_backend="pyarrow") if pyarrow_dtypes else data
    outcome = data_types.is_categorical(series)
    assert outcome == is_categorical
|
||
def test_boolean_series():
    """Check ``is_boolean`` on True/False and 0/1 series and on non-boolean series."""
    boolean_samples = (
        pd.Series([True, False, False, True, True]),
        pd.Series([False, False, False]),
        pd.Series([True, True, True]),
        pd.Series([1, 0, 0, 1]),
        pd.Series([0, 0, 0, 0]),
        pd.Series([1, 1, 1, 1]),
    )
    for series in boolean_samples:
        assert data_types.is_boolean(series), "Should be boolean"

    non_boolean_samples = (
        pd.Series([True, False, False, True, True, "True"]),
        pd.Series([2, 2, 2, 2]),
        pd.Series([1, 0, 0, 1, 3]),
        pd.Series(["a", "abc", "2"]),
        pd.Series(["A", "B", "A", "A", "B"]),
        pd.Series([-0.2, 1.6567, 3, 4, 5]),
        pd.Series([None]),
    )
    for series in non_boolean_samples:
        assert not data_types.is_boolean(series), "Should not be boolean"
|
||
@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
@pytest.mark.parametrize(
    "data, is_boolean",
    [
        # Series expected to be recognised as boolean.
        (pd.Series([True, False, False, True, True]), True),
        (pd.Series([False, False, False]), True),
        (pd.Series([True, True, True]), True),
        (pd.Series([1, 0, 0, 1]), True),
        (pd.Series([0, 0, 0, 0]), True),
        (pd.Series([1, 1, 1, 1]), True),
        # Series expected to be rejected.
        (pd.Series([True, False, False, True, True, "True"]), False),
        (pd.Series([2, 2, 2, 2]), False),
        (pd.Series([1, 0, 0, 1, 3]), False),
        (pd.Series(["a", "abc", "2"]), False),
        (pd.Series(["A", "B", "A", "A", "B"]), False),
        (pd.Series([-0.2, 1.6567, 3, 4, 5]), False),
        (pd.Series([None]), False),
    ],
)
def test_boolean_series(data, is_boolean, pyarrow_dtypes):
    """Assert that ``data_types.is_boolean`` matches the ``is_boolean`` flag.

    When ``pyarrow_dtypes`` is truthy, the series is first converted with
    ``convert_dtypes(dtype_backend="pyarrow")``.
    """
    series = data.convert_dtypes(dtype_backend="pyarrow") if pyarrow_dtypes else data
    outcome = data_types.is_boolean(series)
    assert outcome == is_boolean
|
||
def test_date_series():
    """Check ``is_date`` on date strings, mixed content, numbers and timestamps."""
    # (series, expected) pairs, kept in the order the checks are meant to run.
    cases = (
        (pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"]), True),
        (pd.Series(["Mar 12 2018", "Dec 12 2018", "Jan 21 2020"]), True),
        (pd.Series(["2014-01-01", "2014-01-02", "2014-12-03T14:05:02", "nan"]), False),
        (pd.Series(["2014-01-01", "2014-01-02", "2014-12-03 14:05:02", 1, 2, 3]), False),
        (pd.Series([1, 2, 3, 4, 5]), False),
        (pd.Series([None, 2.0, 3, 4, 5]), False),
        (pd.Series([pd.Timestamp("20130101"), pd.Timestamp("20230102"), None]), True),
    )
    for series, should_be_date in cases:
        if should_be_date:
            assert data_types.is_date(series), "Should be type date"
        else:
            assert not data_types.is_date(series), "Should not be type date"
|
||
@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
@pytest.mark.parametrize(
    "data, is_date",
    [
        (
            pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"]),
            True,
        ),
        (
            pd.Series(["Mar 12 2018", "Dec 12 2018", "Jan 21 2020"]),
            True,
        ),
        (
            pd.Series(["2014-01-01", "2014-01-02", "2014-12-03T14:05:02", "nan"]),
            False,
        ),
        (
            pd.Series(["2014-01-01", "2014-01-02", "2014-12-03 14:05:02", 1, 2, 3]),
            False,
        ),
        (
            pd.Series([1, 2, 3, 4, 5]),
            False,
        ),
        (
            pd.Series([None, 2.0, 3, 4, 5]),
            False,
        ),
        (
            pd.Series([pd.Timestamp("20130101"), pd.Timestamp("20230102"), None]),
            True,
        ),
    ],
)
def test_date_series(data, is_date, pyarrow_dtypes):
    """Assert that ``data_types.is_date`` matches the ``is_date`` flag.

    When ``pyarrow_dtypes`` is truthy, the series is first converted with
    ``convert_dtypes(dtype_backend="pyarrow")``.
    """
    series = data.convert_dtypes(dtype_backend="pyarrow") if pyarrow_dtypes else data
    outcome = data_types.is_date(series)
    assert outcome == is_date