Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add missing data type #24

Merged
merged 4 commits into from
Jul 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 36 additions & 6 deletions edvart/data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class DataType(IntEnum):
BOOLEAN = (3,)
DATE = (4,)
UNKNOWN = 5
MISSING = 6

def __str__(self):
return self.name.lower()
Expand All @@ -38,7 +39,8 @@ def infer_data_type(series: pd.Series, string_representation: bool = False) -> U
Inferred custom edvart data type or its string representation.
"""
ret = None

if is_missing(series):
ret = DataType.MISSING
if is_boolean(series):
ret = DataType.BOOLEAN
elif is_date(series):
Expand Down Expand Up @@ -67,6 +69,8 @@ def is_numeric(series: pd.Series) -> bool:
bool
Boolean indicating whether series contains only numbers.
"""
if is_missing(series):
return False
# When an unknown dtype is encountered, `np.issubdtype(series.dtype, np.number)`
# raises a TypeError. This happens, for example, if `series` is `pd.Categorical`.
# If the dtype is unknown, we treat it as non-numeric, therefore return False.
Expand All @@ -76,6 +80,22 @@ def is_numeric(series: pd.Series) -> bool:
return False


def is_missing(series: pd.Series) -> bool:
    """Tell whether the series contains only missing values.

    Parameters
    ----------
    series : pd.Series
        Series to check for missing values.

    Returns
    -------
    bool
        True if all values in the series are missing (`None`, `NaN`, `pd.NA`, ...),
        False otherwise. An empty series has no non-missing value, so it is
        reported as missing as well.
    """
    # `Series.all()` yields a `numpy.bool_`; convert it so that the function really
    # returns the builtin `bool` promised by the annotation (identity checks such as
    # `is True` and JSON serialization then behave as callers expect).
    return bool(series.isnull().all())


def is_categorical(series: pd.Series, unique_value_count_threshold: int = 10) -> bool:
"""Heuristic to tell if a series is categorical.

Expand All @@ -93,7 +113,8 @@ def is_categorical(series: pd.Series, unique_value_count_threshold: int = 10) ->
Boolean indicating if series is categorical.
"""
return (
not is_boolean(series)
not is_missing(series)
and not is_boolean(series)
and not is_date(series)
and (
(
Expand All @@ -118,7 +139,9 @@ def is_boolean(series: pd.Series) -> bool:
bool
Boolean indicating if series is boolean.
"""
return pd.api.types.is_bool_dtype(series) or set(series.unique()) <= {1, 0, pd.NA}
return not is_missing(series) and (
pd.api.types.is_bool_dtype(series) or set(series.unique()) <= {1, 0, pd.NA}
)


def is_date(series: pd.Series) -> bool:
Expand All @@ -136,6 +159,13 @@ def is_date(series: pd.Series) -> bool:
"""
if isinstance(series.dtype, pd.PeriodDtype):
return True
no_numerics = np.all(~series.astype(str).str.isnumeric())
converted_series = pd.to_datetime(series, errors="coerce", infer_datetime_format=True)
return converted_series.notna().all() and no_numerics and not is_numeric(series)
if is_missing(series) or is_numeric(series):
return False
contains_numerics = np.any(series.astype(str).str.isnumeric())
if contains_numerics:
return False
try:
converted_series = pd.to_datetime(series, errors="coerce", infer_datetime_format=True)
except ValueError:
return False
return converted_series.notna().all()
19 changes: 19 additions & 0 deletions tests/test_data_type_inference.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import numpy as np
import pandas as pd

from edvart import data_types
Expand All @@ -20,6 +21,16 @@ def test_inference():
data_types.infer_data_type(pd.Series([True, False, False, True, True]))
== data_types.DataType.BOOLEAN
), "Should be boolean type"
# Compare the inferred type itself: the `==` belongs outside the
# `infer_data_type(...)` call, so that the all-missing series (and not the result
# of comparing it with the enum member) is what gets inferred.
assert (
    data_types.infer_data_type(pd.Series([None, None, np.nan, float("nan")]))
    == data_types.DataType.MISSING
), "Should be missing"


def test_missing_series():
    """Check `is_missing` on all-missing and on partially filled series."""
    # Series made up exclusively of missing-value markers.
    all_missing = (
        pd.Series([None, None, np.nan, float("nan")]),
        pd.Series([pd.NA]),
    )
    for candidate in all_missing:
        assert data_types.is_missing(candidate), "Should be missing"

    # Series holding at least one real value next to the missing ones.
    partially_filled = (
        pd.Series([1, np.nan, None]),
        pd.Series(["2023-01-01", None]),
    )
    for candidate in partially_filled:
        assert not data_types.is_missing(candidate), "Should not be missing"


def test_numeric_series():
Expand All @@ -33,6 +44,9 @@ def test_numeric_series():
assert not data_types.is_numeric(
pd.Series([23, 45, 2, 1, -3, -66, "99", "-1207"])
), "Should not be numeric type"
assert not data_types.is_numeric(
pd.Series([None, None, np.nan, float("nan")])
), "Should not be numeric"


def test_categorical_series():
Expand All @@ -43,6 +57,10 @@ def test_categorical_series():
assert not data_types.is_categorical(
pd.Series([1, 2, 31, 4, 52, 6, 87, 87.7, 9, 1, 3, 4, 1, 10, 123123, 9876, 1.2, 6.8])
), "Should not be categorical"
assert not data_types.is_categorical(
pd.Series([None, None, np.nan, float("nan")])
), "Should not be categorical"
assert not data_types.is_categorical(pd.Series([pd.NA])), "Should not be categorical"


def test_boolean_series():
Expand All @@ -60,6 +78,7 @@ def test_boolean_series():
assert not data_types.is_boolean(pd.Series(["a", "abc", "2"])), "Should not be boolean"
assert not data_types.is_boolean(pd.Series(["A", "B", "A", "A", "B"])), "Should not be boolean"
assert not data_types.is_boolean(pd.Series([-0.2, 1.6567, 3, 4, 5])), "Should not be boolean"
assert not data_types.is_boolean(pd.Series([None])), "Should not be boolean"


def test_date_series():
Expand Down
1 change: 0 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,5 @@ def test_full_na_series():
warnings.simplefilter(action="error", category=RuntimeWarning)
result = func(series)
assert math.isnan(float(result))
assert utils.is_numeric(series)
assert utils.is_categorical(series)
lukany marked this conversation as resolved.
Show resolved Hide resolved
assert utils.num_unique_values(series) == 0