Skip to content

Commit

Permalink
feat: first version of the gap analysis tab for ts (#1410)
Browse files Browse the repository at this point in the history
* feat: first version of the gap analysis tab for ts

* feat: add gap stats table

* fix: adjust the gap plot image size

* feat: new gap analysis visualization
  • Loading branch information
alexbarros authored Aug 8, 2023
1 parent 7e7634f commit 51fc7c7
Show file tree
Hide file tree
Showing 5 changed files with 192 additions and 7 deletions.
45 changes: 45 additions & 0 deletions src/ydata_profiling/model/pandas/describe_timeseries_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from statsmodels.tsa.stattools import adfuller

from ydata_profiling.config import Settings
from ydata_profiling.model.pandas.utils_pandas import get_period_and_frequency
from ydata_profiling.model.summary_algorithms import (
describe_numeric_1d,
describe_timeseries_1d,
Expand Down Expand Up @@ -141,6 +142,49 @@ def get_fft_peaks(
return threshold, orig_peaks, peaks


def compute_gap_stats(series: pd.Series) -> dict:
    """Compute interval statistics and locate the gaps along the series index.

    Rows holding missing values are dropped before the intervals between
    consecutive index entries are measured. An interval is reported as a gap
    when it is strictly larger than the mean spacing (the period) of the
    full, un-dropped index.

    Args:
        series (pd.Series): time series data to analyse.

    Returns:
        A dict with the keys ``period``, ``min``, ``max``, ``mean`` and
        ``std`` (each divided by the base frequency), ``series`` (the input
        series), ``gaps`` (a list of two-element arrays with the index values
        on both sides of each gap) and, for a ``DatetimeIndex`` only,
        ``frequency`` (the unit name returned by
        ``get_period_and_frequency``).
    """
    # Build the position -> index-value series straight from the index.
    # Going through ``reset_index()`` and picking the column by name selects
    # the *data* column whenever the series itself is called like the
    # inserted index column (e.g. a variable named "index").
    gap = pd.Series(series.dropna().index)

    has_datetime_index = isinstance(series.index, pd.DatetimeIndex)
    if has_datetime_index:
        period, frequency = get_period_and_frequency(series.index)
        period = pd.Timedelta(f"{period} {frequency}")
        base_frequency = pd.Timedelta(f"1 {frequency}")
    else:
        period = np.abs(np.diff(series.index)).mean()
        base_frequency = 1

    diff = gap.diff()
    # ``gap`` has a default integer index, so the anchors are positions.
    # The first diff is NaN and never exceeds the period, hence ``i - 1``
    # is always a valid position.
    anchors = gap[diff > period].index
    gaps = [gap.iloc[[i - 1, i]].values for i in anchors]

    stats = {
        "period": period / base_frequency,
        "min": diff.min() / base_frequency,
        "max": diff.max() / base_frequency,
        "mean": diff.mean() / base_frequency,
        "std": diff.std() / base_frequency,
        "series": series,
        "gaps": gaps,
    }
    if has_datetime_index:
        stats["frequency"] = frequency
    return stats


@describe_timeseries_1d.register
@series_hashable
@series_handle_nulls
Expand All @@ -164,5 +208,6 @@ def pandas_describe_timeseries_1d(
stats["stationary"] = is_stationary and not stats["seasonal"]
stats["addfuller"] = p_value
stats["series"] = series
stats["gap_stats"] = compute_gap_stats(series)

return config, series, stats
6 changes: 2 additions & 4 deletions src/ydata_profiling/model/pandas/timeseries_index_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pandas.api.types import is_numeric_dtype

from ydata_profiling.config import Settings
from ydata_profiling.model.pandas.utils_pandas import get_period_and_frequency
from ydata_profiling.model.timeseries_index import get_time_index_description


Expand All @@ -21,10 +22,7 @@ def pandas_get_time_index_description(
start = df.index.min()
end = df.index.max()
if isinstance(df.index, pd.DatetimeIndex):
freq = df.index.inferred_freq
delta = abs(np.diff(df.index)).mean()
delta = delta.astype(f"timedelta64[{df.index.inferred_freq}]")
period = delta.astype(float)
period, freq = get_period_and_frequency(df.index)
else:
freq = None
period = abs(np.diff(df.index)).mean()
Expand Down
21 changes: 21 additions & 0 deletions src/ydata_profiling/model/pandas/utils_pandas.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from typing import Tuple

import numpy as np
import pandas as pd


def weighted_median(data: np.ndarray, weights: np.ndarray) -> int:
Expand All @@ -25,3 +28,21 @@ def weighted_median(data: np.ndarray, weights: np.ndarray) -> int:
else:
w_median = s_data[idx + 1]
return w_median


def get_period_and_frequency(index: pd.DatetimeIndex) -> Tuple[float, str]:
    """Return the mean spacing of a datetime index and the unit it is given in.

    The mean absolute difference between consecutive index entries is
    expressed in the coarsest unit (days, seconds, microseconds or
    nanoseconds) whose ``Timedelta`` component is non-zero.

    Args:
        index (pd.DatetimeIndex): the index to analyse.

    Returns:
        A ``(period, frequency)`` tuple: the mean spacing as a float and the
        unit name ("Days", "Seconds", "Microseconds" or "Nanoseconds").
    """
    delta = pd.Timedelta(abs(np.diff(index)).mean())
    if delta.days > 0:
        frequency, unit = "Days", pd.Timedelta(days=1)
    elif delta.seconds > 0:
        frequency, unit = "Seconds", pd.Timedelta(seconds=1)
    elif delta.microseconds > 0:
        frequency, unit = "Microseconds", pd.Timedelta(microseconds=1)
    else:
        frequency, unit = "Nanoseconds", pd.Timedelta(nanoseconds=1)
    # Always divide the whole delta by the unit: dividing the integer
    # ``delta.nanoseconds`` component by a Timedelta raises a TypeError.
    return delta / unit, frequency
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,80 @@
VariableInfo,
)
from ydata_profiling.report.structure.variables.render_common import render_common
from ydata_profiling.visualisation.plot import histogram, mini_ts_plot, plot_acf_pacf
from ydata_profiling.visualisation.plot import (
histogram,
mini_ts_plot,
plot_acf_pacf,
plot_timeseries_gap_analysis,
)


def _render_gap_tab(config: Settings, summary: dict) -> Container:
    """Build the "Gap analysis" tab: an interval statistics table plus a plot.

    Args:
        config: report Settings object; supplies the numeric precision, the
            HTML style and the image format.
        summary: variable summary; must hold ``varid`` and a ``gap_stats``
            dict with the keys ``period``, ``min``, ``max``, ``mean``,
            ``std``, ``series`` and ``gaps`` (``frequency`` is optional).

    Returns:
        A grid Container holding the statistics table and the gap plot.
    """
    stats = summary["gap_stats"]
    precision = config.report.precision

    def numeric_row(label: str, key: str) -> dict:
        # One table row whose value is formatted with the report precision.
        return {
            "name": label,
            "value": fmt_numeric(stats[key], precision=precision),
        }

    gap_stats = [numeric_row("period", "period")]
    # The frequency is only reported when the gap statistics provide it.
    if "frequency" in stats:
        gap_stats.append({"name": "frequency", "value": stats["frequency"]})
    gap_stats.extend(
        numeric_row(label, key)
        for label, key in (
            ("min interval", "min"),
            ("max interval", "max"),
            ("mean interval", "mean"),
            ("interval std", "std"),
        )
    )

    gap_table = Table(
        gap_stats,
        name="Intervals statistics",
        style=config.html.style,
    )

    gap_plot = Image(
        plot_timeseries_gap_analysis(config, stats["series"], stats["gaps"]),
        image_format=config.plot.image_format,
        alt="Gap plot",
        name="",
        anchor_id=f"{summary['varid']}_gap_plot",
    )
    return Container(
        [gap_table, gap_plot],
        image_format=config.plot.image_format,
        sequence_type="grid",
        name="Gap analysis",
        anchor_id=f"{summary['varid']}_gap_analysis",
    )


def render_timeseries(config: Settings, summary: dict) -> dict:
Expand Down Expand Up @@ -289,8 +362,10 @@ def render_timeseries(config: Settings, summary: dict) -> dict:
anchor_id=f"{varid}_ts_plot",
)

ts_gap = _render_gap_tab(config, summary)

template_variables["bottom"] = Container(
[statistics, hist, ts_plot, fq, evs, acf_pacf],
[statistics, hist, ts_plot, ts_gap, fq, evs, acf_pacf],
sequence_type="tabs",
anchor_id=f"{varid}bottom",
)
Expand Down
48 changes: 47 additions & 1 deletion src/ydata_profiling/visualisation/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from matplotlib.colors import Colormap, LinearSegmentedColormap, ListedColormap, rgb2hex
from matplotlib.dates import AutoDateLocator, ConciseDateFormatter
from matplotlib.patches import Patch
from matplotlib.ticker import FuncFormatter
from matplotlib.ticker import FuncFormatter, MaxNLocator
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from typeguard import typechecked
from wordcloud import WordCloud
Expand Down Expand Up @@ -557,6 +557,52 @@ def _format_ts_date_axis(
return axis


@manage_matplotlib_context()
def plot_timeseries_gap_analysis(
    config: Settings,
    series: Union[pd.Series, List[pd.Series]],
    gaps: Union[pd.Series, List[pd.Series]],
    figsize: tuple = (6, 3),
) -> matplotlib.figure.Figure:
    """Plot the time series as a line plot and shade the given gaps.

    Args:
        config: report Settings object.
        series: The series to plot, or a list of series drawn on one axis.
        gaps: For a single series, an iterable of gaps; for a list of
            series, one such iterable per series. Each gap is passed as the
            ``x`` argument of ``fill_between`` and shaded over the full
            value range of the data.
        figsize: The size of the figure (width, height) in inches,
            default (6, 3).

    Returns:
        The value returned by ``plot_360_n0sc0pe(config)``.
        NOTE(review): the annotation says ``Figure``; confirm it matches
        what ``plot_360_n0sc0pe`` actually returns.
    """
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)

    colors = create_comparison_color_list(config)
    if isinstance(series, list):
        # Shade every gap over the value range shared by all series.
        min_ = min(s.min() for s in series)
        max_ = max(s.max() for s in series)
        labels = config.html.style._labels
        for serie, gaps_, color, label in zip(series, gaps, colors, labels):
            serie.plot(
                ax=ax,
                label=label,
                color=color,
                alpha=0.65,
            )
            _format_ts_date_axis(serie, ax)
            ax.yaxis.set_major_locator(MaxNLocator(integer=True))
            for gap in gaps_:
                ax.fill_between(x=gap, y1=min_, y2=max_, color=color, alpha=0.25)
    else:
        series.plot(ax=ax)
        _format_ts_date_axis(series, ax)
        ax.yaxis.set_major_locator(MaxNLocator(integer=True))

        # The value range does not change between gaps: compute it once.
        lower, upper = series.min(), series.max()
        for gap in gaps:
            ax.fill_between(
                x=gap, y1=lower, y2=upper, color=colors[0], alpha=0.25
            )

    return plot_360_n0sc0pe(config)


@manage_matplotlib_context()
def plot_overview_timeseries(
config: Settings,
Expand Down

0 comments on commit 51fc7c7

Please sign in to comment.