diff --git a/edvart/data_types.py b/edvart/data_types.py
index 73c8159..a7d9bf8 100644
--- a/edvart/data_types.py
+++ b/edvart/data_types.py
@@ -17,6 +17,7 @@ class DataType(IntEnum):
     BOOLEAN = (3,)
     DATE = (4,)
     UNKNOWN = 5
+    MISSING = 6
 
     def __str__(self):
         return self.name.lower()
@@ -38,7 +39,9 @@ def infer_data_type(series: pd.Series, string_representation: bool = False) -> U
         Inferred custom edvart data type or its string representation.
     """
     ret = None
-
-    if is_boolean(series):
+    # Chain with `elif` so an all-missing series is not reclassified by a later check.
+    if is_missing(series):
+        ret = DataType.MISSING
+    elif is_boolean(series):
         ret = DataType.BOOLEAN
     elif is_date(series):
@@ -67,6 +70,8 @@ def is_numeric(series: pd.Series) -> bool:
     bool
         Boolean indicating whether series contains only numbers.
     """
+    if is_missing(series):
+        return False
     # When an unkown dtype is encountered, `np.issubdtype(series.dtype, np.number)`
     # raises a TypeError. This happens for example if `series` is `pd.Categorical`
     # If the dtype is unknown, we treat it as non-numeric, therefore return False.
@@ -76,6 +81,22 @@ def is_numeric(series: pd.Series) -> bool:
         return False
 
 
+def is_missing(series: pd.Series) -> bool:
+    """Function to tell if the series contains only missing values.
+
+    Parameters
+    ----------
+    series : pd.Series
+        Series from which to infer data type.
+
+    Returns
+    -------
+    bool
+        True if all values in the series are missing, False otherwise.
+    """
+    return series.isnull().all()
+
+
 def is_categorical(series: pd.Series, unique_value_count_threshold: int = 10) -> bool:
     """Heuristic to tell if a series is categorical.
 
@@ -93,7 +114,8 @@ def is_categorical(series: pd.Series, unique_value_count_threshold: int = 10) ->
         Boolean indicating if series is categorical.
     """
     return (
-        not is_boolean(series)
+        not is_missing(series)
+        and not is_boolean(series)
         and not is_date(series)
         and (
             (
@@ -118,7 +140,9 @@ def is_boolean(series: pd.Series) -> bool:
     bool
         Boolean indicating if series is boolean.
     """
-    return pd.api.types.is_bool_dtype(series) or set(series.unique()) <= {1, 0, pd.NA}
+    return not is_missing(series) and (
+        pd.api.types.is_bool_dtype(series) or set(series.unique()) <= {1, 0, pd.NA}
+    )
 
 
 def is_date(series: pd.Series) -> bool:
@@ -136,6 +160,13 @@ def is_date(series: pd.Series) -> bool:
     """
     if isinstance(series.dtype, pd.PeriodDtype):
         return True
-    no_numerics = np.all(~series.astype(str).str.isnumeric())
-    converted_series = pd.to_datetime(series, errors="coerce", infer_datetime_format=True)
-    return converted_series.notna().all() and no_numerics and not is_numeric(series)
+    if is_missing(series) or is_numeric(series):
+        return False
+    contains_numerics = np.any(series.astype(str).str.isnumeric())
+    if contains_numerics:
+        return False
+    try:
+        converted_series = pd.to_datetime(series, errors="coerce", infer_datetime_format=True)
+    except ValueError:
+        return False
+    return converted_series.notna().all()
diff --git a/tests/test_data_type_inference.py b/tests/test_data_type_inference.py
index 385cdf8..dcc905b 100644
--- a/tests/test_data_type_inference.py
+++ b/tests/test_data_type_inference.py
@@ -1,3 +1,4 @@
+import numpy as np
 import pandas as pd
 
 from edvart import data_types
@@ -20,6 +21,17 @@ def test_inference():
         data_types.infer_data_type(pd.Series([True, False, False, True, True]))
         == data_types.DataType.BOOLEAN
     ), "Should be boolean type"
+    assert (
+        data_types.infer_data_type(pd.Series([None, None, np.nan, float("nan")]))
+        == data_types.DataType.MISSING
+    ), "Should be missing"
+
+
+def test_missing_series():
+    assert data_types.is_missing(pd.Series([None, None, np.nan, float("nan")])), "Should be missing"
+    assert data_types.is_missing(pd.Series([pd.NA])), "Should be missing"
+    assert not data_types.is_missing(pd.Series([1, np.nan, None])), "Should not be missing"
+    assert not data_types.is_missing(pd.Series(["2023-01-01", None])), "Should not be missing"
 
 
 def test_numeric_series():
@@ -33,6 +45,9 @@ def test_numeric_series():
     assert not data_types.is_numeric(
         pd.Series([23, 45, 2, 1, -3, -66, "99", "-1207"])
     ), "Should not be numeric type"
+    assert not data_types.is_numeric(
+        pd.Series([None, None, np.nan, float("nan")])
+    ), "Should not be numeric"
 
 
 def test_categorical_series():
@@ -43,6 +58,10 @@ def test_categorical_series():
     assert not data_types.is_categorical(
         pd.Series([1, 2, 31, 4, 52, 6, 87, 87.7, 9, 1, 3, 4, 1, 10, 123123, 9876, 1.2, 6.8])
     ), "Should not be categorical"
+    assert not data_types.is_categorical(
+        pd.Series([None, None, np.nan, float("nan")])
+    ), "Should not be categorical"
+    assert not data_types.is_categorical(pd.Series([pd.NA])), "Should not be categorical"
 
 
 def test_boolean_series():
@@ -60,6 +79,7 @@ def test_boolean_series():
     assert not data_types.is_boolean(pd.Series(["a", "abc", "2"])), "Should not be boolean"
     assert not data_types.is_boolean(pd.Series(["A", "B", "A", "A", "B"])), "Should not be boolean"
     assert not data_types.is_boolean(pd.Series([-0.2, 1.6567, 3, 4, 5])), "Should not be boolean"
+    assert not data_types.is_boolean(pd.Series([None])), "Should not be boolean"
 
 
 def test_date_series():
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 5d11adf..b67be4b 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -28,6 +28,5 @@ def test_full_na_series():
             warnings.simplefilter(action="error", category=RuntimeWarning)
             result = func(series)
         assert math.isnan(float(result))
-    assert utils.is_numeric(series)
     assert utils.is_categorical(series)
     assert utils.num_unique_values(series) == 0