Skip to content

Commit

Permalink
test: test data type inference with pyarrow dtypes (#215)
Browse files Browse the repository at this point in the history
  • Loading branch information
mbelak-dtml authored Mar 12, 2024
1 parent 9032158 commit e2889d4
Showing 1 changed file with 112 additions and 99 deletions.
211 changes: 112 additions & 99 deletions tests/test_data_type_inference.py
Original file line number Diff line number Diff line change
@@ -1,115 +1,128 @@
import numpy as np
import pandas as pd
import pytest

from edvart import data_types

from .pyarrow_utils import pyarrow_params

def test_inference():
    """Check ``infer_data_type`` on one sample series per expected data type.

    Each assertion compares the value returned by ``infer_data_type`` with
    the expected ``DataType`` member.  The comparison is always written
    outside the call: if ``== DataType.X`` were placed inside the call's
    parentheses, the series would be compared to the enum member first and
    the assertion would only check that the returned value is truthy.
    """
    assert (
        data_types.infer_data_type(pd.Series([0.12, 3565.3, 234, 1, -49, 14, 5, 88, 12312]))
        == data_types.DataType.NUMERIC
    ), "Should be numeric type"
    assert (
        data_types.infer_data_type(
            pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"])
        )
        == data_types.DataType.DATE
    ), "Should be date type"
    assert (
        data_types.infer_data_type(pd.Series(["A", "B", "C", "C", "A", "B"]))
        == data_types.DataType.CATEGORICAL
    ), "Should be categorical type"
    assert (
        data_types.infer_data_type(pd.Series([True, False, False, True, True]))
        == data_types.DataType.BOOLEAN
    ), "Should be boolean type"
    # Fixed: the closing parenthesis of the call used to come after the
    # comparison, so the inferred type was never actually compared.
    assert (
        data_types.infer_data_type(pd.Series([None, None, np.nan, float("nan")]))
        == data_types.DataType.MISSING
    ), "Should be missing"
    assert (
        data_types.infer_data_type(pd.Series(list(range(10)))) == data_types.DataType.UNIQUE
    ), "Should be unique"
    # A single duplicated value is enough for the expected type to be
    # NUMERIC rather than UNIQUE.
    assert (
        data_types.infer_data_type(pd.Series([1] + list(range(100)))) == data_types.DataType.NUMERIC
    ), "Should be numeric"
    assert (
        data_types.infer_data_type(pd.Series(dtype=pd.Float64Dtype)) == data_types.DataType.UNKNOWN
    ), "Should be unknown"
    # Fixed: same misplaced parenthesis as in the "missing" case above.
    assert (
        data_types.infer_data_type(pd.Series([True, False])) == data_types.DataType.BOOLEAN
    ), "Should be boolean"

@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
@pytest.mark.parametrize(
    ("data", "expected"),
    [
        (pd.Series([0.12, 3565.3, 234, 1, -49, 14, 5, 88, 12312]), data_types.DataType.NUMERIC),
        (
            pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"]),
            data_types.DataType.DATE,
        ),
        (pd.Series(["A", "B", "C", "C", "A", "B"]), data_types.DataType.CATEGORICAL),
        (pd.Series([True, False, False, True, True]), data_types.DataType.BOOLEAN),
        (pd.Series([None, None, np.nan, float("nan")]), data_types.DataType.MISSING),
        (pd.Series(list(range(10))), data_types.DataType.UNIQUE),
        (pd.Series([1] + list(range(100))), data_types.DataType.NUMERIC),
        (pd.Series(dtype=pd.Float64Dtype), data_types.DataType.UNKNOWN),
        (pd.Series([True, False]), data_types.DataType.BOOLEAN),
    ],
)
def test_inference(data, expected, pyarrow_dtypes):
    """Check that ``infer_data_type`` yields ``expected`` for ``data``.

    When ``pyarrow_dtypes`` is truthy, the series is first converted with
    ``convert_dtypes(dtype_backend="pyarrow")``; otherwise it is used as is.
    """
    series = data.convert_dtypes(dtype_backend="pyarrow") if pyarrow_dtypes else data
    inferred = data_types.infer_data_type(series)
    assert inferred == expected

def test_missing_series():
    """Check ``is_missing`` on fully missing and partially filled series."""
    # Each case: (series, whether is_missing should be truthy, failure message).
    cases = [
        (pd.Series([None, None, np.nan, float("nan")]), True, "Should be missing"),
        (pd.Series([pd.NA]), True, "Should be missing"),
        (pd.Series([1, np.nan, None]), False, "Should not be missing"),
        (pd.Series(["2023-01-01", None]), False, "Should not be missing"),
    ]
    for series, should_match, message in cases:
        assert bool(data_types.is_missing(series)) is should_match, message

@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
@pytest.mark.parametrize(
    ("data", "is_missing"),
    [
        (pd.Series([None, None, np.nan, float("nan")]), True),
        (pd.Series([pd.NA]), True),
        (pd.Series([1, np.nan, None]), False),
        (pd.Series(["2023-01-01", None]), False),
    ],
)
def test_missing_series(data, is_missing, pyarrow_dtypes):
    """Check that ``data_types.is_missing(data)`` equals ``is_missing``.

    When ``pyarrow_dtypes`` is truthy, the series is first converted with
    ``convert_dtypes(dtype_backend="pyarrow")``; otherwise it is used as is.
    """
    series = data.convert_dtypes(dtype_backend="pyarrow") if pyarrow_dtypes else data
    outcome = data_types.is_missing(series)
    assert outcome == is_missing

def test_numeric_series():
    """Check ``is_numeric`` on numeric, mixed-with-strings and all-missing series."""
    # Each case: (series, whether is_numeric should be truthy, failure message).
    cases = [
        (
            pd.Series([0.12, 3565.3, 234, 1, -49, 14, 5, 88, 12312]),
            True,
            "Should be numeric type",
        ),
        (pd.Series([23, 45, 2, 1.2, -3, -66]), True, "Should be numeric type"),
        (
            pd.Series([23, 45, 2, 1, -3, -66, "NULL", "a string"]),
            False,
            "Should not be numeric type",
        ),
        (
            pd.Series([23, 45, 2, 1, -3, -66, "99", "-1207"]),
            False,
            "Should not be numeric type",
        ),
        (pd.Series([None, None, np.nan, float("nan")]), False, "Should not be numeric"),
    ]
    for series, should_match, message in cases:
        assert bool(data_types.is_numeric(series)) is should_match, message

@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
@pytest.mark.parametrize(
    ("data", "is_numeric"),
    [
        (pd.Series([0.12, 3565.3, 234, 1, -49, 14, 5, 88, 12312]), True),
        (pd.Series([23, 45, 2, 1.2, -3, -66]), True),
        (pd.Series([23, 45, 2, 1, -3, -66, "NULL", "a string"]), False),
        (pd.Series([23, 45, 2, 1, -3, -66, "99", "-1207"]), False),
        (pd.Series([None, None, np.nan, float("nan")]), False),
    ],
)
def test_numeric_series(data, is_numeric, pyarrow_dtypes):
    """Check that ``data_types.is_numeric(data)`` equals ``is_numeric``.

    When ``pyarrow_dtypes`` is truthy, the series is first converted with
    ``convert_dtypes(dtype_backend="pyarrow")``; otherwise it is used as is.
    """
    series = data.convert_dtypes(dtype_backend="pyarrow") if pyarrow_dtypes else data
    outcome = data_types.is_numeric(series)
    assert outcome == is_numeric

def test_categorical_series():
    """Check ``is_categorical`` on repetitive, varied and all-missing series."""
    # Each case: (series, whether is_categorical should be truthy, failure message).
    cases = [
        (pd.Series(["A", "B", "C", "D"]), True, "Should be categorical"),
        (
            pd.Series([1, 2, 3, 4, 4, 4, 1, 1, 1, 2, 2, 3, 4]),
            True,
            "Should be categorical",
        ),
        (
            pd.Series(
                [1, 2, 31, 4, 52, 6, 87, 87.7, 9, 1, 3, 4, 1, 10, 123123, 9876, 1.2, 6.8]
            ),
            False,
            "Should not be categorical",
        ),
        (
            pd.Series([None, None, np.nan, float("nan")]),
            False,
            "Should not be categorical",
        ),
        (pd.Series([pd.NA]), False, "Should not be categorical"),
    ]
    for series, should_match, message in cases:
        assert bool(data_types.is_categorical(series)) is should_match, message

@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
@pytest.mark.parametrize(
    ("data", "is_categorical"),
    [
        (pd.Series(["A", "B", "C", "D"]), True),
        (pd.Series([1, 2, 3, 4, 4, 4, 1, 1, 1, 2, 2, 3, 4]), True),
        (
            pd.Series([1, 2, 31, 4, 52, 6, 87, 87.7, 9, 1, 3, 4, 1, 10, 123123, 9876, 1.2, 6.8]),
            False,
        ),
        (pd.Series([None, None, np.nan, float("nan")]), False),
        (pd.Series([pd.NA]), False),
    ],
)
def test_categorical_series(data, is_categorical, pyarrow_dtypes):
    """Check that ``data_types.is_categorical(data)`` equals ``is_categorical``.

    When ``pyarrow_dtypes`` is truthy, the series is first converted with
    ``convert_dtypes(dtype_backend="pyarrow")``; otherwise it is used as is.
    """
    series = data.convert_dtypes(dtype_backend="pyarrow") if pyarrow_dtypes else data
    outcome = data_types.is_categorical(series)
    assert outcome == is_categorical

def test_boolean_series():
    """Check ``is_boolean`` on bool, 0/1, other numeric, string and ``None`` series."""
    accept = "Should be boolean"
    reject = "Should not be boolean"
    # Each case: (series, whether is_boolean should be truthy, failure message).
    cases = [
        (pd.Series([True, False, False, True, True]), True, accept),
        (pd.Series([False, False, False]), True, accept),
        (pd.Series([True, True, True]), True, accept),
        (pd.Series([1, 0, 0, 1]), True, accept),
        (pd.Series([0, 0, 0, 0]), True, accept),
        (pd.Series([1, 1, 1, 1]), True, accept),
        (pd.Series([True, False, False, True, True, "True"]), False, reject),
        (pd.Series([2, 2, 2, 2]), False, reject),
        (pd.Series([1, 0, 0, 1, 3]), False, reject),
        (pd.Series(["a", "abc", "2"]), False, reject),
        (pd.Series(["A", "B", "A", "A", "B"]), False, reject),
        (pd.Series([-0.2, 1.6567, 3, 4, 5]), False, reject),
        (pd.Series([None]), False, reject),
    ]
    for series, should_match, message in cases:
        assert bool(data_types.is_boolean(series)) is should_match, message

@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
@pytest.mark.parametrize(
    ("data", "is_boolean"),
    [
        (pd.Series([True, False, False, True, True]), True),
        (pd.Series([False, False, False]), True),
        (pd.Series([True, True, True]), True),
        (pd.Series([1, 0, 0, 1]), True),
        (pd.Series([0, 0, 0, 0]), True),
        (pd.Series([1, 1, 1, 1]), True),
        (pd.Series([True, False, False, True, True, "True"]), False),
        (pd.Series([2, 2, 2, 2]), False),
        (pd.Series([1, 0, 0, 1, 3]), False),
        (pd.Series(["a", "abc", "2"]), False),
        (pd.Series(["A", "B", "A", "A", "B"]), False),
        (pd.Series([-0.2, 1.6567, 3, 4, 5]), False),
        (pd.Series([None]), False),
    ],
)
def test_boolean_series(data, is_boolean, pyarrow_dtypes):
    """Check that ``data_types.is_boolean(data)`` equals ``is_boolean``.

    When ``pyarrow_dtypes`` is truthy, the series is first converted with
    ``convert_dtypes(dtype_backend="pyarrow")``; otherwise it is used as is.
    """
    series = data.convert_dtypes(dtype_backend="pyarrow") if pyarrow_dtypes else data
    outcome = data_types.is_boolean(series)
    assert outcome == is_boolean

def test_date_series():
    """Check ``is_date`` on date strings, mixed content, numbers and timestamps."""
    accept = "Should be type date"
    reject = "Should not be type date"
    # Each case: (series, whether is_date should be truthy, failure message).
    # The original assertion order is kept, including the trailing positive case.
    cases = [
        (
            pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"]),
            True,
            accept,
        ),
        (pd.Series(["Mar 12 2018", "Dec 12 2018", "Jan 21 2020"]), True, accept),
        (
            pd.Series(["2014-01-01", "2014-01-02", "2014-12-03T14:05:02", "nan"]),
            False,
            reject,
        ),
        (
            pd.Series(["2014-01-01", "2014-01-02", "2014-12-03 14:05:02", 1, 2, 3]),
            False,
            reject,
        ),
        (pd.Series([1, 2, 3, 4, 5]), False, reject),
        (pd.Series([None, 2.0, 3, 4, 5]), False, reject),
        (
            pd.Series([pd.Timestamp("20130101"), pd.Timestamp("20230102"), None]),
            True,
            accept,
        ),
    ]
    for series, should_match, message in cases:
        assert bool(data_types.is_date(series)) is should_match, message

@pytest.mark.parametrize("pyarrow_dtypes", pyarrow_params)
@pytest.mark.parametrize(
    ("data", "is_date"),
    [
        (pd.Series(["2014-01-01 12:05:02", "2014-01-02 13:05:02", "2014-12-03 14:05:02"]), True),
        (pd.Series(["Mar 12 2018", "Dec 12 2018", "Jan 21 2020"]), True),
        (pd.Series(["2014-01-01", "2014-01-02", "2014-12-03T14:05:02", "nan"]), False),
        (pd.Series(["2014-01-01", "2014-01-02", "2014-12-03 14:05:02", 1, 2, 3]), False),
        (pd.Series([1, 2, 3, 4, 5]), False),
        (pd.Series([None, 2.0, 3, 4, 5]), False),
        (pd.Series([pd.Timestamp("20130101"), pd.Timestamp("20230102"), None]), True),
    ],
)
def test_date_series(data, is_date, pyarrow_dtypes):
    """Check that ``data_types.is_date(data)`` equals ``is_date``.

    When ``pyarrow_dtypes`` is truthy, the series is first converted with
    ``convert_dtypes(dtype_backend="pyarrow")``; otherwise it is used as is.
    """
    series = data.convert_dtypes(dtype_backend="pyarrow") if pyarrow_dtypes else data
    outcome = data_types.is_date(series)
    assert outcome == is_date

0 comments on commit e2889d4

Please sign in to comment.