V1 of IH analysis on conversion models
operdeck committed Dec 19, 2024
1 parent e5daf2e commit e5c150c
Showing 5 changed files with 214 additions and 132 deletions.
@@ -6,16 +6,10 @@
"metadata": {},
"outputs": [],
"source": [
"import polars as pl\n",
"from pdstools import read_ds_export, IH\n",
"from pdstools.utils import cdh_utils\n",
"from ih_helper import interaction_history\n",
"from pdstools import IH\n",
"\n",
"import plotly.io as pio\n",
"import plotly as plotly\n",
"import plotly.express as px\n",
"import plotly.graph_objs as go\n",
"from plotly.subplots import make_subplots\n",
"\n",
"plotly.offline.init_notebook_mode()\n",
"pio.renderers.default = \"vscode\""
@@ -25,7 +19,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Conversion Results"
"# Conversion Results\n",
"\n",
"Visualization of conversion modeling results from IH data."
]
},
{
@@ -37,7 +33,7 @@
"from pathlib import Path\n",
"\n",
"ih_export_file = Path(\n",
" \"./Data-pxStrategyResult_InteractionFiles_20241213T091932_GMT.zip\"\n",
" \"./Data-pxStrategyResult_InteractionFiles_20241213T091932_GMT.zip \"\n",
")\n",
"\n",
"if not ih_export_file.exists():\n",
@@ -103,17 +99,17 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ih.plots.trend_line(\n",
"# experiment_field=\"ExperimentGroup\", # should be optional, can also give query to select only the Test group\n",
"# granularity=\"1d\", # string language polars\n",
"# positive_labels=[\"Conversion\"],\n",
"# negative_labels=[\"Impression\", \"Pending\"],\n",
"# title=\"Conversion Rate trends\",\n",
"# )"
"ih.plots.trend_bar(\n",
" experiment_field=\"ExperimentGroup\",\n",
" every=\"1w\",\n",
" positive_labels=[\"Conversion\"],\n",
" negative_labels=[\"Impression\", \"Pending\"],\n",
" title=\"Conversion Rates over Time\",\n",
")"
]
},
{
@@ -136,6 +132,17 @@
" title = \"Overall Engagement\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ih.plots.trend_line(\n",
" title=\"Engagement Rates over Time\",\n",
")"
]
}
],
"metadata": {
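The new notebook cell calls `ih.plots.trend_bar(..., every="1w", ...)`. According to the docstring added in this commit, the `every` argument uses the same duration string language as Polars. Below is a minimal sketch of the weekly bucketing that `every="1w"` implies, using toy data and illustrative column names rather than the pdstools API:

```python
import polars as pl
from datetime import datetime

# Toy IH-like data; OutcomeTime is truncated to the start of its week,
# which is the bucketing "1w" implies before grouping and plotting.
ih_sample = pl.DataFrame(
    {
        "OutcomeTime": [
            datetime(2024, 12, 2, 10, 30),
            datetime(2024, 12, 4, 15, 0),
            datetime(2024, 12, 11, 9, 45),
        ],
        "Outcome": ["Conversion", "Impression", "Impression"],
    }
)

weekly = ih_sample.with_columns(
    pl.col("OutcomeTime").dt.truncate("1w").alias("Week")
)
print(weekly)
```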
87 changes: 0 additions & 87 deletions examples/ih/ih_helper.py

This file was deleted.

74 changes: 59 additions & 15 deletions python/pdstools/ih/Aggregates.py
@@ -1,8 +1,8 @@
from itertools import chain
from typing import TYPE_CHECKING, Dict, List, Optional
from typing import TYPE_CHECKING, List, Optional
import polars as pl

from ..utils.namespaces import LazyNamespace
from ..utils.cdh_utils import safe_flatten_list

if TYPE_CHECKING:
from .IH import IH as IH_Class
@@ -15,17 +15,42 @@ def __init__(self, ih: "IH_Class"):

def summary_by_experiment(
self,
experiment_field: str,
experiment_field: Optional[str] = None,
every: Optional[str] = None,
by: Optional[List[str]] = None,
positive_labels: List[str] = None,
negative_labels: List[str] = None,
):
positive_labels: Optional[List[str]] = None,
negative_labels: Optional[List[str]] = None,
) -> pl.LazyFrame:
"""Groups the IH data summarizing into success rate (CTR) and standard error (StdErr).
if by is not None:
if isinstance(by, str):
by = [by]
else:
by = []
It groups by the "experiment field" (TODO in the future this can be optional or multiple). When
given, the 'every' argument is used to divide the timerange into buckets. It uses the same string
language as Polars.
Every interaction is considered to have only one outcome: positive, negative or none. When any
outcome in the interaction is in the positive labels, the outcome is considered positive. Next,
when any is in the negative labels, the outcome of the interaction is considered negative. Otherwise
there is no defined outcome and the interaction is ignored in calculations of success rate or error.
Parameters
----------
experiment_field : Optional[str], optional
Optional field that contains the experiments
every : Optional[str], optional
Every interval start and period length, by default None
by : Optional[List[str]], optional
Extra grouping keys, by default None
positive_labels : Optional[List[str]], optional
Outcome label(s) for the positive responses, by default None
negative_labels : Optional[List[str]], optional
Outcome label(s) for the negative responses, by default None
Returns
-------
pl.LazyFrame
A polars frame with the grouping keys and columns for the total number of Positives, Negatives,
number of Interactions, success rate (CTR) and standard error (StdErr).
"""

if positive_labels is None:
positive_labels = ["Accepted", "Accept", "Clicked", "Click"]
@@ -38,11 +63,26 @@
"NoResponse",
]

if every is not None:
source = self.ih.data.with_columns(pl.col.OutcomeTime.dt.truncate(every))
else:
source = self.ih.data

group_by_clause = safe_flatten_list(
[experiment_field] + [by] + (["OutcomeTime"] if every is not None else [])
)
if len(group_by_clause) == 0:
group_by_clause = None

summary = (
self.ih.data.filter(
source.filter(
pl.col.ExperimentGroup.is_not_null() & (pl.col.ExperimentGroup != "")
)
.group_by([experiment_field] + by + ["InteractionID"])
.group_by(
(group_by_clause + ["InteractionID"])
if group_by_clause is not None
else ["InteractionID"]
)
.agg(
# Take only one outcome per interaction. TODO should perhaps be the last one.
InteractionOutcome=pl.when(pl.col.Outcome.is_in(positive_labels).any())
@@ -51,7 +91,7 @@
.then(pl.lit(False)),
Outcomes=pl.col.Outcome.unique().sort(), # for debugging
)
.group_by([experiment_field] + by)
.group_by(group_by_clause)
.agg(
Positives=pl.col.InteractionOutcome.filter(
pl.col.InteractionOutcome
@@ -74,7 +114,11 @@
).sqrt()
)
)
.sort([experiment_field] + by)
)

if group_by_clause is None:
summary = summary.drop("literal") # created by empty group_by
else:
summary = summary.sort(group_by_clause)

return summary
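
The reworked summary_by_experiment reduces every interaction to at most one outcome (positive wins over negative, otherwise null and ignored) and then aggregates to counts, success rate (CTR) and the binomial standard error sqrt(CTR * (1 - CTR) / n). The following is a minimal standalone sketch of that aggregation in Polars, assuming toy data and illustrative column names; the exact column expressions in the committed code may differ:

```python
import polars as pl

# Toy IH-like data: several outcome rows per interaction.
ih_data = pl.LazyFrame(
    {
        "InteractionID": ["i1", "i1", "i2", "i3", "i4"],
        "ExperimentGroup": ["Test", "Test", "Test", "Control", "Control"],
        "Outcome": ["Impression", "Conversion", "Pending", "Impression", "Conversion"],
    }
)
positive_labels = ["Conversion"]
negative_labels = ["Impression", "Pending"]

summary = (
    ih_data
    # One outcome per interaction: True if any outcome is positive,
    # else False if any is negative, else null (dropped from the rates).
    .group_by(["ExperimentGroup", "InteractionID"])
    .agg(
        InteractionOutcome=pl.when(pl.col("Outcome").is_in(positive_labels).any())
        .then(pl.lit(True))
        .when(pl.col("Outcome").is_in(negative_labels).any())
        .then(pl.lit(False))
    )
    # Per experiment group: counts, CTR and its standard error.
    .group_by("ExperimentGroup")
    .agg(
        Positives=pl.col("InteractionOutcome").filter(pl.col("InteractionOutcome")).len(),
        Negatives=pl.col("InteractionOutcome").filter(~pl.col("InteractionOutcome")).len(),
        Interactions=pl.len(),
    )
    .with_columns(CTR=pl.col("Positives") / (pl.col("Positives") + pl.col("Negatives")))
    .with_columns(
        StdErr=(
            pl.col("CTR") * (1 - pl.col("CTR")) / (pl.col("Positives") + pl.col("Negatives"))
        ).sqrt()
    )
)

print(summary.collect())
```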
