Skip to content

Commit

Permalink
fix: unify description classes
Browse files Browse the repository at this point in the history
  • Loading branch information
vorel99 committed Sep 22, 2023
1 parent 3fa69a9 commit 7bbba9e
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 108 deletions.
100 changes: 41 additions & 59 deletions src/ydata_profiling/model/pandas/var_description/counts_pandas.py
Original file line number Diff line number Diff line change
@@ -1,71 +1,53 @@
from dataclasses import dataclass

import pandas as pd

from ydata_profiling.config import Settings
from ydata_profiling.model.var_description.counts import VarCounts


@dataclass
class VarCountsPandas(VarCounts):
value_counts_without_nan: pd.Series
"""Counts of values in the series without NaN."""
value_counts_index_sorted: pd.Series
"""Sorted counts of values in the series without NaN."""
def get_counts_pandas(config: Settings, series: pd.Series) -> VarCounts:
"""Get a VarCounts object for a pandas series."""
length = len(series)

def __init__(self, config: Settings, series: pd.Series):
"""Counts the values in a series (with and without NaN, distinct).
try:
value_counts_with_nan = series.value_counts(dropna=False)
_ = set(value_counts_with_nan.index)
hashable = True
except: # noqa: E722
hashable = False

Args:
config: report Settings object
series: Series for which we want to calculate the values.
summary: series' summary
value_counts_without_nan = None
value_counts_index_sorted = None
if hashable:
value_counts_with_nan = value_counts_with_nan[value_counts_with_nan > 0]

Returns:
A dictionary with the count values (with and without NaN, distinct).
"""
length = len(series)
null_index = value_counts_with_nan.index.isnull()
if null_index.any():
n_missing = value_counts_with_nan[null_index].sum()
value_counts_without_nan = value_counts_with_nan[~null_index]
else:
n_missing = 0
value_counts_without_nan = value_counts_with_nan

try:
value_counts_with_nan = series.value_counts(dropna=False)
_ = set(value_counts_with_nan.index)
hashable = True
except: # noqa: E722
hashable = False

value_counts_without_nan = None
value_counts_index_sorted = None
if hashable:
value_counts_with_nan = value_counts_with_nan[value_counts_with_nan > 0]

null_index = value_counts_with_nan.index.isnull()
if null_index.any():
n_missing = value_counts_with_nan[null_index].sum()
value_counts_without_nan = value_counts_with_nan[~null_index]
else:
n_missing = 0
value_counts_without_nan = value_counts_with_nan

try:
value_counts_index_sorted = value_counts_without_nan.sort_index(
ascending=True
)
ordering = True
except TypeError:
ordering = False
else:
n_missing = series.isna().sum()
value_counts_index_sorted = value_counts_without_nan.sort_index(
ascending=True
)
ordering = True
except TypeError:
ordering = False

super().__init__(
hashable=hashable,
value_counts_without_nan=value_counts_without_nan,
value_counts_index_sorted=value_counts_index_sorted,
ordering=ordering,
n_missing=n_missing,
n=length,
p_missing=series.isna().sum() / length if length > 0 else 0,
count=length - series.isna().sum(),
memory_size=series.memory_usage(deep=config.memory_deep),
value_counts=None,
)
else:
n_missing = series.isna().sum()
ordering = False

return VarCounts(
hashable=hashable,
value_counts_without_nan=value_counts_without_nan,
value_counts_index_sorted=value_counts_index_sorted,
ordering=ordering,
n_missing=n_missing,
n=length,
p_missing=series.isna().sum() / length if length > 0 else 0,
count=length - series.isna().sum(),
memory_size=series.memory_usage(deep=config.memory_deep),
value_counts=None,
)
62 changes: 13 additions & 49 deletions src/ydata_profiling/model/pandas/var_description/default_pandas.py
Original file line number Diff line number Diff line change
@@ -1,55 +1,27 @@
from __future__ import annotations

from dataclasses import dataclass

import pandas as pd

from ydata_profiling.config import Settings
from ydata_profiling.model.pandas.var_description.counts_pandas import VarCountsPandas
from ydata_profiling.model.pandas.var_description.counts_pandas import get_counts_pandas
from ydata_profiling.model.var_description.default import (
VarDescription,
VarDescriptionHashable,
)


@dataclass
class VarDescriptionPandas(VarDescription):
"""Default description for pandas columns."""

@classmethod
def from_var_counts(
cls, var_counts: VarCountsPandas, init_dict: dict
) -> VarDescriptionPandas:
"""Get a default description from a VarCountsPandas object."""
return VarDescriptionPandas(
n=var_counts.n,
count=var_counts.count,
n_missing=var_counts.n_missing,
p_missing=var_counts.p_missing,
hashable=var_counts.hashable,
memory_size=var_counts.memory_size,
ordering=var_counts.ordering,
value_counts_index_sorted=var_counts.value_counts_index_sorted,
value_counts_without_nan=var_counts.value_counts_without_nan,
var_specific=init_dict,
)


@dataclass
class VarDescriptionPandasHashable(VarDescriptionHashable):
"""Default description for pandas columns that are hashable (common types)."""
def get_default_pandas_description(
config: Settings, series: pd.Series, init_dict: dict
) -> VarDescription | VarDescriptionHashable:
var_counts = get_counts_pandas(config, series)

@classmethod
def from_var_counts(
cls, var_counts: VarCountsPandas, init_dict: dict
) -> VarDescriptionPandasHashable:
"""Get a default description for a hashable column from a VarCountsPandas object."""
_count = var_counts.count
if var_counts.hashable:
count = var_counts.count
value_counts = var_counts.value_counts_without_nan
distinct_count = len(value_counts)
unique_count = value_counts.where(value_counts == 1).count()

return VarDescriptionPandasHashable(
return VarDescriptionHashable(
n=var_counts.n,
count=var_counts.count,
n_missing=var_counts.n_missing,
Expand All @@ -60,19 +32,11 @@ def from_var_counts(
value_counts_index_sorted=var_counts.value_counts_index_sorted,
value_counts_without_nan=var_counts.value_counts_without_nan,
n_distinct=distinct_count,
p_distinct=distinct_count / _count if _count > 0 else 0,
is_unique=unique_count == _count and _count > 0,
p_distinct=distinct_count / count if count > 0 else 0,
is_unique=unique_count == count and count > 0,
n_unique=unique_count,
p_unique=unique_count / _count if _count > 0 else 0,
p_unique=unique_count / count if count > 0 else 0,
value_counts=None,
var_specific=init_dict,
value_counts=var_counts.value_counts,
)


def get_default_pandas_description(
config: Settings, series: pd.Series, init_dict: dict
) -> VarDescriptionPandas | VarDescriptionPandasHashable:
_var_counts = VarCountsPandas(config, series)
if _var_counts.hashable:
return VarDescriptionPandasHashable.from_var_counts(_var_counts, init_dict)
return VarDescriptionPandas.from_var_counts(_var_counts, init_dict)
return VarDescription.from_var_counts(var_counts, init_dict)
3 changes: 3 additions & 0 deletions src/ydata_profiling/model/var_description/counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,6 @@ class VarCounts:
"""Sorted counts of values in the series without NaN. Sorted by counts."""
ordering: Union[bool, list]
memory_size: Union[int, list]

value_counts: Any
"""Counts of values in original series type. Values as index, counts as values."""
17 changes: 17 additions & 0 deletions src/ydata_profiling/model/var_description/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,23 @@ def __iter__(self) -> Iterator:
"""To support old dict like interface."""
return self.var_specific.__iter__()

@classmethod
def from_var_counts(cls, var_counts: VarCounts, init_dict: dict) -> VarDescription:
    """Build a default description out of a VarCounts object.

    Args:
        var_counts: counts object whose fields are copied one-to-one.
        init_dict: stored unchanged as the ``var_specific`` field.

    Returns:
        A new ``VarDescription``. The class is named explicitly rather
        than taken from ``cls``, so the result is a plain
        ``VarDescription`` regardless of which class the method is
        called on.
    """
    # Fields carried over verbatim from the counts object.
    copied_fields = (
        "n",
        "count",
        "n_missing",
        "p_missing",
        "hashable",
        "memory_size",
        "ordering",
        "value_counts_index_sorted",
        "value_counts_without_nan",
        "value_counts",
    )
    from_counts = {name: getattr(var_counts, name) for name in copied_fields}
    return VarDescription(var_specific=init_dict, **from_counts)


@dataclass
class VarDescriptionHashable(VarDescription):
Expand Down

0 comments on commit 7bbba9e

Please sign in to comment.