From 7bbba9ec9111f49407a9fa089142a6ec68154b0f Mon Sep 17 00:00:00 2001 From: Jan Cap Date: Fri, 22 Sep 2023 16:01:27 +0200 Subject: [PATCH] fix: unify description classes --- .../pandas/var_description/counts_pandas.py | 100 +++++++----------- .../pandas/var_description/default_pandas.py | 62 +++-------- .../model/var_description/counts.py | 3 + .../model/var_description/default.py | 17 +++ 4 files changed, 74 insertions(+), 108 deletions(-) diff --git a/src/ydata_profiling/model/pandas/var_description/counts_pandas.py b/src/ydata_profiling/model/pandas/var_description/counts_pandas.py index c7da54769..6ffc3e6d5 100644 --- a/src/ydata_profiling/model/pandas/var_description/counts_pandas.py +++ b/src/ydata_profiling/model/pandas/var_description/counts_pandas.py @@ -1,71 +1,53 @@ -from dataclasses import dataclass - import pandas as pd from ydata_profiling.config import Settings from ydata_profiling.model.var_description.counts import VarCounts -@dataclass -class VarCountsPandas(VarCounts): - value_counts_without_nan: pd.Series - """Counts of values in the series without NaN.""" - value_counts_index_sorted: pd.Series - """Sorted counts of values in the series without NaN.""" +def get_counts_pandas(config: Settings, series: pd.Series) -> VarCounts: + """Get a VarCounts object for a pandas series.""" + length = len(series) - def __init__(self, config: Settings, series: pd.Series): - """Counts the values in a series (with and without NaN, distinct). + try: + value_counts_with_nan = series.value_counts(dropna=False) + _ = set(value_counts_with_nan.index) + hashable = True + except: # noqa: E722 + hashable = False - Args: - config: report Settings object - series: Series for which we want to calculate the values. - summary: series' summary + value_counts_without_nan = None + value_counts_index_sorted = None + if hashable: + value_counts_with_nan = value_counts_with_nan[value_counts_with_nan > 0] - Returns: - A dictionary with the count values (with and without NaN, distinct). - """ - length = len(series) + null_index = value_counts_with_nan.index.isnull() + if null_index.any(): + n_missing = value_counts_with_nan[null_index].sum() + value_counts_without_nan = value_counts_with_nan[~null_index] + else: + n_missing = 0 + value_counts_without_nan = value_counts_with_nan try: - value_counts_with_nan = series.value_counts(dropna=False) - _ = set(value_counts_with_nan.index) - hashable = True - except: # noqa: E722 - hashable = False - - value_counts_without_nan = None - value_counts_index_sorted = None - if hashable: - value_counts_with_nan = value_counts_with_nan[value_counts_with_nan > 0] - - null_index = value_counts_with_nan.index.isnull() - if null_index.any(): - n_missing = value_counts_with_nan[null_index].sum() - value_counts_without_nan = value_counts_with_nan[~null_index] - else: - n_missing = 0 - value_counts_without_nan = value_counts_with_nan - - try: - value_counts_index_sorted = value_counts_without_nan.sort_index( - ascending=True - ) - ordering = True - except TypeError: - ordering = False - else: - n_missing = series.isna().sum() + value_counts_index_sorted = value_counts_without_nan.sort_index( + ascending=True + ) + ordering = True + except TypeError: ordering = False - - super().__init__( - hashable=hashable, - value_counts_without_nan=value_counts_without_nan, - value_counts_index_sorted=value_counts_index_sorted, - ordering=ordering, - n_missing=n_missing, - n=length, - p_missing=series.isna().sum() / length if length > 0 else 0, - count=length - series.isna().sum(), - memory_size=series.memory_usage(deep=config.memory_deep), - value_counts=None, - ) + else: + n_missing = series.isna().sum() + ordering = False + + return VarCounts( + hashable=hashable, + value_counts_without_nan=value_counts_without_nan, + value_counts_index_sorted=value_counts_index_sorted, + ordering=ordering, + n_missing=n_missing, + n=length, + p_missing=series.isna().sum() / length if length > 0 else 0, + count=length - series.isna().sum(), + memory_size=series.memory_usage(deep=config.memory_deep), + value_counts=None, + ) diff --git a/src/ydata_profiling/model/pandas/var_description/default_pandas.py b/src/ydata_profiling/model/pandas/var_description/default_pandas.py index 29c076511..af4ffa6bb 100644 --- a/src/ydata_profiling/model/pandas/var_description/default_pandas.py +++ b/src/ydata_profiling/model/pandas/var_description/default_pandas.py @@ -1,55 +1,27 @@ from __future__ import annotations -from dataclasses import dataclass - import pandas as pd from ydata_profiling.config import Settings -from ydata_profiling.model.pandas.var_description.counts_pandas import VarCountsPandas +from ydata_profiling.model.pandas.var_description.counts_pandas import get_counts_pandas from ydata_profiling.model.var_description.default import ( VarDescription, VarDescriptionHashable, ) -@dataclass -class VarDescriptionPandas(VarDescription): - """Default description for pandas columns.""" - - @classmethod - def from_var_counts( - cls, var_counts: VarCountsPandas, init_dict: dict - ) -> VarDescriptionPandas: - """Get a default description from a VarCountsPandas object.""" - return VarDescriptionPandas( - n=var_counts.n, - count=var_counts.count, - n_missing=var_counts.n_missing, - p_missing=var_counts.p_missing, - hashable=var_counts.hashable, - memory_size=var_counts.memory_size, - ordering=var_counts.ordering, - value_counts_index_sorted=var_counts.value_counts_index_sorted, - value_counts_without_nan=var_counts.value_counts_without_nan, - var_specific=init_dict, - ) - - -@dataclass -class VarDescriptionPandasHashable(VarDescriptionHashable): - """Default description for pandas columns that are hashable (common types).""" +def get_default_pandas_description( + config: Settings, series: pd.Series, init_dict: dict +) -> VarDescription | VarDescriptionHashable: + var_counts = get_counts_pandas(config, series) - @classmethod - def from_var_counts( - cls, var_counts: VarCountsPandas, init_dict: dict - ) -> VarDescriptionPandasHashable: - """Get a default description for a hashable column from a VarCountsPandas object.""" - _count = var_counts.count + if var_counts.hashable: + count = var_counts.count value_counts = var_counts.value_counts_without_nan distinct_count = len(value_counts) unique_count = value_counts.where(value_counts == 1).count() - return VarDescriptionPandasHashable( + return VarDescriptionHashable( n=var_counts.n, count=var_counts.count, n_missing=var_counts.n_missing, @@ -60,19 +32,11 @@ def from_var_counts( value_counts_index_sorted=var_counts.value_counts_index_sorted, value_counts_without_nan=var_counts.value_counts_without_nan, n_distinct=distinct_count, - p_distinct=distinct_count / _count if _count > 0 else 0, - is_unique=unique_count == _count and _count > 0, + p_distinct=distinct_count / count if count > 0 else 0, + is_unique=unique_count == count and count > 0, n_unique=unique_count, - p_unique=unique_count / _count if _count > 0 else 0, + p_unique=unique_count / count if count > 0 else 0, + value_counts=None, var_specific=init_dict, - value_counts=var_counts.value_counts, ) - - -def get_default_pandas_description( - config: Settings, series: pd.Series, init_dict: dict -) -> VarDescriptionPandas | VarDescriptionPandasHashable: - _var_counts = VarCountsPandas(config, series) - if _var_counts.hashable: - return VarDescriptionPandasHashable.from_var_counts(_var_counts, init_dict) - return VarDescriptionPandas.from_var_counts(_var_counts, init_dict) + return VarDescription.from_var_counts(var_counts, init_dict) diff --git a/src/ydata_profiling/model/var_description/counts.py b/src/ydata_profiling/model/var_description/counts.py index 4937e3f7e..70f96af20 100644 --- a/src/ydata_profiling/model/var_description/counts.py +++ b/src/ydata_profiling/model/var_description/counts.py @@ -22,3 +22,6 @@ class VarCounts: """Sorted counts of values in the series without NaN. Sorted by counts.""" ordering: Union[bool, list] memory_size: Union[int, list] + + value_counts: Any + """Counts of values in original series type. Values as index, counts as values.""" diff --git a/src/ydata_profiling/model/var_description/default.py b/src/ydata_profiling/model/var_description/default.py index c9da3363d..c9f4851c6 100644 --- a/src/ydata_profiling/model/var_description/default.py +++ b/src/ydata_profiling/model/var_description/default.py @@ -38,6 +38,23 @@ def __iter__(self) -> Iterator: """To support old dict like interface.""" return self.var_specific.__iter__() + @classmethod + def from_var_counts(cls, var_counts: VarCounts, init_dict: dict) -> VarDescription: + """Get a default description from a VarCounts object.""" + return VarDescription( + n=var_counts.n, + count=var_counts.count, + n_missing=var_counts.n_missing, + p_missing=var_counts.p_missing, + hashable=var_counts.hashable, + memory_size=var_counts.memory_size, + ordering=var_counts.ordering, + var_specific=init_dict, + value_counts_index_sorted=var_counts.value_counts_index_sorted, + value_counts_without_nan=var_counts.value_counts_without_nan, + value_counts=var_counts.value_counts, + ) + @dataclass class VarDescriptionHashable(VarDescription):