diff --git a/examples/ih/Conversion_Reporting.ipynb b/examples/ih/Conversion_Reporting.ipynb new file mode 100644 index 00000000..7cbb0fa2 --- /dev/null +++ b/examples/ih/Conversion_Reporting.ipynb @@ -0,0 +1,158 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pdstools import IH\n", + "\n", + "import plotly.io as pio\n", + "import plotly as plotly\n", + "\n", + "plotly.offline.init_notebook_mode()\n", + "pio.renderers.default = \"vscode\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conversion Results\n", + "\n", + "Visualization of conversion modeling results from IH data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "import polars as pl\n", + "\n", + "ih_export_file = Path(\n", + " \"./Data-pxStrategyResult_InteractionFiles_20241213T091932_GMT.zip\"\n", + ")\n", + "\n", + "if not ih_export_file.exists():\n", + " ih = IH.from_mock_data()\n", + "else:\n", + " ih = IH.from_ds_export(\n", + " ih_export_file,\n", + " query=pl.col.ExperimentGroup.is_not_null() & (pl.col.ExperimentGroup != \"\"),\n", + " )\n", + "\n", + "ih.aggregates.summary_success_rates(by=[\"ExperimentGroup\", \"Channel\"]).drop(\n", + " \"Outcomes\"\n", + ").collect().to_pandas().style.hide()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ih.plots.conversion_overall_gauges(\n", + " experiment_field=\"ExperimentGroup\",\n", + " by=\"Channel\",\n", + " reference_values={\"Web\": 0.055, \"Email\": 0.09},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Detailed View \n", + "\n", + "Showing conversion rates for all actions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ih.plots.conversion_success_rates_tree_map()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conversion Rate Trends\n", + "\n", + "Side-by-side bars and lines (separate methods) with error bars" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ih.plots.conversion_success_rates_trend_bar(\n", + " experiment_field=\"ExperimentGroup\",\n", + " every=\"1w\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Engagement" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ih.plots.engagement_overall_gauges(\n", + " experiment_field=\"ExperimentGroup\",\n", + " by=\"Channel\",\n", + " reference_values={\"Web\": 0.20, \"Email\": 0.20},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ih.plots.conversion_success_rates_trend_line(\n", + " by=\"Channel\"\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/ih/Example_IH_Analysis.ipynb b/examples/ih/Example_IH_Analysis.ipynb index 49c257d7..2430ac13 100644 --- a/examples/ih/Example_IH_Analysis.ipynb +++ b/examples/ih/Example_IH_Analysis.ipynb @@ -4,7 +4,19 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'cdhtools'", + "output_type": 
"error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msys\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcdhtools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mIHanalysis\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcdhtools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcdh_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m readDSExport\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mmatplotlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpyplot\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mplt\u001b[39;00m\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'cdhtools'" + ] + } + ], "source": [ "import pandas as pd\n", "import sys\n", @@ -18,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -35,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -51,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -274,7 +286,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -548,7 +560,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": 
{}, "outputs": [ { @@ -578,7 +590,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -607,7 +619,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -638,7 +650,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -668,7 +680,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -699,7 +711,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -728,7 +740,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -780,7 +792,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -809,7 +821,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -826,7 +838,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -856,7 +868,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -886,7 +898,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -895,7 +907,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -932,7 +944,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -968,11 +980,9 @@ } ], "metadata": { - "interpreter": { - "hash": "0c5c31b7614ab5f7bbff6555bdc6f3ec4cea8754d51936ee45052251e94c1071" - }, "kernelspec": { - "display_name": "Python 3.9.4 64-bit ('newvfenv': conda)", + "display_name": ".venv", + "language": "python", "name": 
"python3" }, "language_info": { @@ -985,7 +995,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.9" + "version": "3.12.3" } }, "nbformat": 4, diff --git a/python/pdstools/__init__.py b/python/pdstools/__init__.py index dadd39e1..cc881f7d 100644 --- a/python/pdstools/__init__.py +++ b/python/pdstools/__init__.py @@ -7,6 +7,7 @@ from polars import enable_string_cache from .adm.ADMDatamart import ADMDatamart +from .ih.IH import IH from .infinity import Infinity from .pega_io import Anonymization, read_ds_export from .prediction.Prediction import Prediction @@ -23,6 +24,7 @@ __all__ = [ "ADMDatamart", + "IH", "Anonymization", "read_ds_export", "Prediction", diff --git a/python/pdstools/ih/Aggregates.py b/python/pdstools/ih/Aggregates.py index d4c9a8dd..b22b968e 100644 --- a/python/pdstools/ih/Aggregates.py +++ b/python/pdstools/ih/Aggregates.py @@ -1,11 +1,145 @@ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, List, Optional, Union +import polars as pl from ..utils.namespaces import LazyNamespace +from ..utils.cdh_utils import safe_flatten_list if TYPE_CHECKING: from .IH import IH as IH_Class class Aggregates(LazyNamespace): + def __init__(self, ih: "IH_Class"): + super().__init__() self.ih = ih + + def summary_success_rates( + self, + by: Optional[Union[str, List[str]]] = None, + every: Optional[str] = None, + ) -> pl.LazyFrame: + """Groups the IH data summarizing into success rates (SuccessRate) and standard error (StdErr). + + It optionally groups by one or more dimensions (e.g. Experiment, Channel, Issue etc). When + given, the 'every' argument is used to divide the timerange into buckets. It uses the same string + language as Polars. + + Every interaction is considered to have only one outcome: positive, negative or none. When any + outcome in the interaction is in the positive labels, the outcome is considered positive. 
Next, + when any is in the negative labels, the outcome of the interaction is considered negative. Otherwise + there is no defined outcome and the interaction is ignored in calculations of success rate or error. + + Parameters + ---------- + by : Optional[Union[str, List[str]]], optional + Grouping keys, by default None + every : Optional[str], optional + Every interval start and period length, by default None + + Returns + ------- + pl.LazyFrame + A polars frame with the grouping keys and columns for the total number of Positives, Negatives, + number of Interactions, success rate (SuccessRate) and standard error (StdErr). + """ + + if every is not None: + source = self.ih.data.with_columns(pl.col.OutcomeTime.dt.truncate(every)) + else: + source = self.ih.data + + group_by_clause = safe_flatten_list( + [by] + (["OutcomeTime"] if every is not None else []) + ) + + # TODO filter out nulls for the by arguments + # source.filter( + # pl.col.ExperimentGroup.is_not_null() & (pl.col.ExperimentGroup != "") + # ) + + summary = ( + source.group_by( + (group_by_clause + ["InteractionID"]) + if group_by_clause is not None + else ["InteractionID"] + ) + .agg( + # Take only one outcome per interaction. TODO should perhaps be the last one. 
+ [ + pl.when( + pl.col.Outcome.is_in( + self.ih.positive_outcome_labels[metric] + ).any() + ) + .then(pl.lit(True)) + .when( + pl.col.Outcome.is_in( + self.ih.negative_outcome_labels[metric] + ).any() + ) + .then(pl.lit(False)) + .alias(f"Interaction_Outcome_{metric}") + for metric in self.ih.positive_outcome_labels.keys() + ], + Outcomes=pl.col.Outcome.unique().sort(), # for debugging + ) + .group_by(group_by_clause) + .agg( + [ + pl.col(f"Interaction_Outcome_{metric}") + .filter(pl.col(f"Interaction_Outcome_{metric}")) + .len() + .alias(f"Positives_{metric}") + for metric in self.ih.positive_outcome_labels.keys() + ] + + [ + pl.col(f"Interaction_Outcome_{metric}") + .filter(pl.col(f"Interaction_Outcome_{metric}").not_()) + .len() + .alias(f"Negatives_{metric}") + for metric in self.ih.positive_outcome_labels.keys() + ], + Interactions=pl.len(), + Outcomes=pl.col.Outcomes.list.explode() + .unique() + .sort() + .drop_nulls(), # for debugging + ) + .with_columns( + [ + ( + pl.col(f"Positives_{metric}") + / ( + pl.col(f"Positives_{metric}") + + pl.col(f"Negatives_{metric}") + ) + ).alias(f"SuccessRate_{metric}") + for metric in self.ih.positive_outcome_labels.keys() + ] + ) + .with_columns( + [ + ( + ( + pl.col(f"SuccessRate_{metric}") + * (1 - pl.col(f"SuccessRate_{metric}")) + ) + / ( + pl.col(f"Positives_{metric}") + + pl.col(f"Negatives_{metric}") + ) + ) + .sqrt() + .alias(f"StdErr_{metric}") + for metric in self.ih.positive_outcome_labels.keys() + ] + ) + ) + + if group_by_clause is None: + summary = summary.drop("literal") # created by empty group_by + else: + summary = summary.sort(group_by_clause) + + return summary diff --git a/python/pdstools/ih/IH.py b/python/pdstools/ih/IH.py index d25b32db..c97382be 100644 --- a/python/pdstools/ih/IH.py +++ b/python/pdstools/ih/IH.py @@ -1,14 +1,139 @@ +import datetime +import os +import random +from typing import Dict, List, Optional, Union import polars as pl + from .Aggregates import Aggregates from .Plots import 
Plots +from ..utils.cdh_utils import to_prpc_date_time, _polars_capitalize, _apply_query +from ..utils.types import QUERY +from ..pega_io.File import read_ds_export class IH: data: pl.LazyFrame + positive_outcome_labels: Dict[str, List[str]] def __init__(self, data: pl.LazyFrame): - self.data = data + self.data = _polars_capitalize(data) self.aggregates = Aggregates(ih=self) self.plots = Plots(ih=self) + self.positive_outcome_labels = { + "Engagement": ["Accepted", "Accept", "Clicked", "Click"], + "Conversion": ["Conversion"], + } + self.negative_outcome_labels = { + "Engagement": [ + "Impression", + "Impressed", + "Pending", + "NoResponse", + ], + "Conversion": ["Impression", "Pending"], + } + + @classmethod + def from_ds_export( + cls, + ih_filename: Union[os.PathLike, str], + query: Optional[QUERY] = None, + ): + """Import from a Pega Dataset Export""" + + data = read_ds_export(ih_filename).with_columns( + # TODO this should come from some polars func in utils + pl.col("pxOutcomeTime").str.strptime(pl.Datetime, "%Y%m%dT%H%M%S%.3f %Z") + ) + if query is not None: + data = _apply_query(data, query=query) + + return IH(data) + + @classmethod + def from_mock_data(cls, days=90, n=100000): + """Generate sample data""" + accept_rate = 0.2 + accept_avg_duration_minutes = 10 + convert_over_accept_rate_test = 0.5 + convert_over_accept_rate_control = 0.3 + convert_avg_duration_days = 2 + + now = datetime.datetime.now() + + # TODO maybe this should be changed in PDS tools - w/o __TimeStamp__ flag + # def to_prpc_time_str(__TimeStamp__): + # return to_prpc_date_time(__TimeStamp__)[0:15] + + ih_fake_impressions = pl.DataFrame( + { + "pxInteractionID": [str(int(1e9 + i)) for i in range(n)], + "pyChannel": random.choices(["Web", "Email"], k=n), + "pyIssue": "Acquisition", + "pyGroup": "Phones", + "pyName": "AppleIPhone1564GB", + "ExperimentGroup": ["Conversion-Test", "Conversion-Control"] + * int(n / 2), + "pxOutcomeTime": [ + (now - datetime.timedelta(days=i * days / n)) for i 
in range(n) + ], + "__AcceptDurationMinutes__": [ + random.uniform(0, 2 * accept_avg_duration_minutes) for i in range(n) + ], + "__ConvertDurationDays__": [ + random.uniform(0, 2 * convert_avg_duration_days) for i in range(n) + ], + } + ).with_columns( + pyOutcome=pl.when(pl.col.pyChannel == "Web") + .then(pl.lit("Impression")) + .otherwise(pl.lit("Pending")) + ) + ih_fake_accepts = ih_fake_impressions.sample(fraction=accept_rate).with_columns( + pl.col.pxOutcomeTime + + pl.duration(minutes=pl.col("__AcceptDurationMinutes__")), + pyOutcome=pl.when(pl.col.pyChannel == "Web") + .then(pl.lit("Clicked")) + .otherwise(pl.lit("Accepted")), + ) + ih_fake_converts_test = ( + ih_fake_accepts.filter(pl.col.ExperimentGroup == "Conversion-Test") + .sample(fraction=convert_over_accept_rate_test) + .with_columns( + pl.col.pxOutcomeTime + + pl.duration(days=pl.col("__ConvertDurationDays__")), + pyOutcome=pl.lit("Conversion"), + ) + ) + ih_fake_converts_control = ( + ih_fake_accepts.filter(pl.col.ExperimentGroup == "Conversion-Control") + .sample(fraction=convert_over_accept_rate_control) + .with_columns( + pl.col.pxOutcomeTime + + pl.duration(days=pl.col("__ConvertDurationDays__")), + pyOutcome=pl.lit("Conversion"), + ) + ) + + ih_data = ( + pl.concat( + [ + ih_fake_impressions, + ih_fake_accepts, + ih_fake_converts_test, + ih_fake_converts_control, + ] + ) + .filter(pl.col("pxOutcomeTime") < pl.lit(now)) + .drop( + [ + "__AcceptDurationMinutes__", + "__ConvertDurationDays__", + ] + ) + .sort("pxInteractionID", "pxOutcomeTime") + ) + + return IH(ih_data.lazy()) diff --git a/python/pdstools/ih/Plots.py b/python/pdstools/ih/Plots.py index 63f8924b..4951001a 100644 --- a/python/pdstools/ih/Plots.py +++ b/python/pdstools/ih/Plots.py @@ -1,4 +1,9 @@ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Dict, List, Optional +import polars as pl +import plotly as plotly +import plotly.express as px +import plotly.graph_objs as go +from plotly.subplots import make_subplots 
from ..utils.namespaces import LazyNamespace @@ -8,4 +13,322 @@ class Plots(LazyNamespace): def __init__(self, ih: "IH_Class"): + super().__init__() self.ih = ih + + def overall_gauges( + self, + metric: str, + experiment_field: str, + by: Optional[str] = "Channel", + reference_values: Optional[Dict[str, float]] = None, + title: Optional[str] = None, + return_df: Optional[bool] = False, + ): + plot_data = self.ih.aggregates.summary_success_rates( + by=[experiment_field, by], + ) + + if return_df: + return plot_data + + if title is None: + title = f"{metric} Overall Rates" + + plot_data = plot_data.collect() + + cols = plot_data[by].unique().shape[0] # TODO can be None + rows = plot_data[experiment_field].unique().shape[0] + + fig = make_subplots( + rows=rows, + cols=cols, + specs=[[{"type": "indicator"} for c in range(cols)] for t in range(rows)], + ) + fig.update_layout( + height=270 * rows, + autosize=True, + title=title, + margin=dict(b=10, t=120, l=10, r=10), + ) + index = 0 + for row in plot_data.iter_rows(named=True): + ref_value = ( + reference_values.get(row[by], None) if reference_values else None + ) + gauge = { + "axis": {"tickformat": ",.2%"}, + "threshold": { + "line": {"color": "red", "width": 2}, + "thickness": 0.75, + "value": ref_value, + }, + } + if ref_value: + if row[f"SuccessRate_{metric}"] < ref_value: + gauge = { + "axis": {"tickformat": ",.2%"}, + "bar": { + "color": ( + "#EC5300" + if row[f"SuccessRate_{metric}"] < (0.75 * ref_value) + else "#EC9B00" + ) + }, + "threshold": { + "line": {"color": "red", "width": 2}, + "thickness": 0.75, + "value": ref_value, + }, + } + + trace1 = go.Indicator( + mode="gauge+number+delta", + number={"valueformat": ",.2%"}, + value=row[f"SuccessRate_{metric}"], + delta={"reference": ref_value, "valueformat": ",.2%"}, + title={"text": f"{row[by]}: {row[experiment_field]}"}, + gauge=gauge, + ) + r, c = divmod(index, cols) + fig.add_trace(trace1, row=(r + 1), col=(c + 1)) + index = index + 1 + + return fig + + 
def conversion_overall_gauges( + self, + experiment_field: str, + by: Optional[str] = "Channel", + reference_values: Optional[Dict[str, float]] = None, + title: Optional[str] = None, + return_df: Optional[bool] = False, + ): + return self.overall_gauges( + metric="Conversion", + experiment_field=experiment_field, + by=by, + reference_values=reference_values, + title=title, + return_df=return_df, + ) + + def engagement_overall_gauges( + self, + experiment_field: str, + by: Optional[str] = "Channel", + reference_values: Optional[Dict[str, float]] = None, + title: Optional[str] = None, + return_df: Optional[bool] = False, + ): + return self.overall_gauges( + metric="Engagement", + experiment_field=experiment_field, + by=by, + reference_values=reference_values, + title=title, + return_df=return_df, + ) + + def success_rates_tree_map( + self, + metric: str, + by: Optional[List[str]] = None, + title: Optional[str] = None, + return_df: Optional[bool] = False, + ): + if by is None: + by = [ + f + for f in ["Direction", "Channel", "Issue", "Group", "Name"] + if f in self.ih.data.collect_schema().names() + ] + + plot_data = self.ih.aggregates.summary_success_rates( + by=by, + ) + + if return_df: + return plot_data + + if title is None: + title = f"{metric} Rates for All Actions" + + plot_data = plot_data.collect().with_columns( + CTR_DisplayValue=pl.col(f"SuccessRate_{metric}").round(3), + ) + + fig = px.treemap( + plot_data, + path=[px.Constant("ALL")] + by, + values="CTR_DisplayValue", + color="CTR_DisplayValue", + color_continuous_scale=px.colors.sequential.RdBu, + title=title, + hover_data=[ + f"StdErr_{metric}", + f"Positives_{metric}", + f"Negatives_{metric}", + ], + height=640, + template="pega", + ) + fig.update_coloraxes(showscale=False) + fig.update_traces(textinfo="label+value") + fig.update_layout(margin=dict(t=50, l=25, r=25, b=25)) + + return fig + + def conversion_success_rates_tree_map( + self, + by: Optional[List[str]] = None, + title: Optional[str] = None, + 
return_df: Optional[bool] = False, + ): + return self.success_rates_tree_map( + metric="Conversion", + by=by, + title=title, + return_df=return_df, + ) + + def engagement_success_rates_tree_map( + self, + by: Optional[List[str]] = None, + title: Optional[str] = None, + return_df: Optional[bool] = False, + ): + return self.success_rates_tree_map( + metric="Engagement", + by=by, + title=title, + return_df=return_df, + ) + + def success_rates_trend_bar( + self, + metric: str, + experiment_field: str, + every: str = "1d", + by: Optional[str] = None, + title: Optional[str] = None, + return_df: Optional[bool] = False, + ): + + plot_data = self.ih.aggregates.summary_success_rates( + every=every, + by=[experiment_field] + [by], + ) + if return_df: + return plot_data + + if title is None: + title = f"{metric} Rates over Time" + + fig = px.bar( + plot_data.collect(), + x="OutcomeTime", + y=f"SuccessRate_{metric}", + color=experiment_field, + error_y=f"StdErr_{metric}", + facet_row=by, + barmode="group", + custom_data=[experiment_field], + template="pega", + title=title, + ) + fig.update_yaxes(tickformat=",.3%").update_layout(xaxis_title=None) + return fig + + def conversion_success_rates_trend_bar( + self, + experiment_field: str, + every: str = "1d", + by: Optional[str] = None, + title: Optional[str] = None, + return_df: Optional[bool] = False, + ): + return self.success_rates_trend_bar( + metric="Conversion", + experiment_field=experiment_field, + every=every, + by=by, + title=title, + return_df=return_df, + ) + + def engagement_success_rates_trend_bar( + self, + experiment_field: str, + every: str = "1d", + by: Optional[str] = None, + title: Optional[str] = None, + return_df: Optional[bool] = False, + ): + return self.success_rates_trend_bar( + metric="Engagement", + experiment_field=experiment_field, + every=every, + by=by, + title=title, + return_df=return_df, + ) + + def success_rates_trend_line( + self, + metric: str, + every: Optional[str] = "1d", + by: Optional[str] 
= None, + title: Optional[str] = None, + return_df: Optional[bool] = False, + ): + plot_data = self.ih.aggregates.summary_success_rates( + every=every, + by=by, + ) + if return_df: + return plot_data + + fig = px.line( + plot_data.collect(), + x="OutcomeTime", + y=f"SuccessRate_{metric}", + color=by, + facet_row=by, + # custom_data=[experiment_field] if experiment_field is not None else None, + template="pega", + title=title, + ) + + fig.update_yaxes(tickformat=",.3%").update_layout(xaxis_title=None) + return fig + + def conversion_success_rates_trend_line( + self, + every: Optional[str] = "1d", + by: Optional[str] = None, + title: Optional[str] = None, + return_df: Optional[bool] = False, + ): + return self.success_rates_trend_line( + metric="Conversion", + every=every, + by=by, + title=title, + return_df=return_df, + ) + + def engagement_success_rates_trend_line( + self, + every: Optional[str] = "1d", + by: Optional[str] = None, + title: Optional[str] = None, + return_df: Optional[bool] = False, + ): + return self.success_rates_trend_line( + metric="Engagement", + every=every, + by=by, + title=title, + return_df=return_df, + ) diff --git a/python/pdstools/ih/__init__.py b/python/pdstools/ih/__init__.py index e69de29b..9d3c5be9 100644 --- a/python/pdstools/ih/__init__.py +++ b/python/pdstools/ih/__init__.py @@ -0,0 +1,3 @@ +from .IH import IH + +__all__ = ["IH"] \ No newline at end of file diff --git a/python/pdstools/reports/HealthCheck.qmd b/python/pdstools/reports/HealthCheck.qmd index 359d5b27..d817674a 100644 --- a/python/pdstools/reports/HealthCheck.qmd +++ b/python/pdstools/reports/HealthCheck.qmd @@ -1032,7 +1032,6 @@ if datamart.predictor_data is None: ) ``` - ## Number of Predictors per Predictor Category The Predictor Categories identify the source of the predictors. By default we split by the first dot, so this distinguishes between between e.g. *Customer*, *Account*, *IH* and parameterized (*Param.*) predictors. 
@@ -1041,7 +1040,6 @@ You can override this behavior when the data is read. The numbers here can differ from the totals above, these ones are leading. - ::: {.callout-tip title="Guidance"} - Total number of predictors per model 200 - 700 to stay within service limits - There should be some “IH” predictors but no more than ca 100 of them diff --git a/python/pdstools/utils/cdh_utils.py b/python/pdstools/utils/cdh_utils.py index 2e5a1088..953ffbb2 100644 --- a/python/pdstools/utils/cdh_utils.py +++ b/python/pdstools/utils/cdh_utils.py @@ -1,6 +1,8 @@ import datetime +from functools import partial import io import logging +from operator import is_not import re import tempfile import warnings @@ -216,8 +218,7 @@ def _extract_keys( .alias(c) for c in overlap ] - ) - .drop([f"{c}_decoded" for c in overlap]) + ).drop([f"{c}_decoded" for c in overlap]) ) @@ -477,7 +478,9 @@ def auc_to_gini(auc: float) -> float: return 2 * safe_range_auc(auc) - 1 -def _capitalize(fields: Union[str, Iterable[str]]) -> List[str]: +def _capitalize( + fields: Union[str, Iterable[str]], extra_endwords: Optional[Iterable[str]] = None +) -> List[str]: """Applies automatic capitalization, aligned with the R couterpart. 
Parameters @@ -567,6 +570,7 @@ def _capitalize(fields: Union[str, Iterable[str]]) -> List[str]: "Strategy", "ModelTechnique", ] + if not isinstance(fields, list): fields = [fields] fields = [re.sub("^p(x|y|z)", "", field.lower()) for field in fields] @@ -579,9 +583,9 @@ def _capitalize(fields: Union[str, Iterable[str]]) -> List[str]: return fields -def _polars_capitalize(df: F) -> F: +def _polars_capitalize(df: F, extra_endwords: Optional[Iterable[str]] = None) -> F: cols = df.collect_schema().names() - renamed_cols = _capitalize(cols) + renamed_cols = _capitalize(cols, extra_endwords) def deduplicate(columns: List[str]): seen: Dict[str, int] = {} @@ -809,7 +813,9 @@ def lift_impl(bin_pos, bin_neg, total_pos, total_neg): # TODO not sure how polars (mis)behaves when there are no positives at all # I would hope for a NaN but base python doesn't do that. Polars perhaps. # Stijn: It does have proper None value support, may work like you say - bin_pos * (total_pos + total_neg) / ((bin_pos + bin_neg) * total_pos) + bin_pos + * (total_pos + total_neg) + / ((bin_pos + bin_neg) * total_pos) ).alias("Lift") return lift_impl(pos_col, neg_col, pos_col.sum(), neg_col.sum()) @@ -1149,3 +1155,17 @@ def create_working_and_temp_dir( else tempfile.mkdtemp(prefix="tmp_", dir=working_dir) ) return working_dir, Path(temp_dir_name) + + +# Safe flattening of nested lists, removing None elements, and not splitting strings +def safe_flatten_list(alist: Optional[List]) -> Optional[List]: + if alist is None: + return None + alist = list(filter(partial(is_not, None), alist)) + alist = [ + item + for sublist in [[item] if type(item) is not list else item for item in alist] + for item in sublist + ] + alist = list(filter(partial(is_not, None), alist)) + return alist if len(alist) > 0 else None