Merge pull request #291 from pegasystems/Prediction_Support
Prediction analysis support in Health Check
operdeck authored Dec 8, 2024
2 parents d91aa2d + 518dc43 commit 716f28c
Showing 12 changed files with 766 additions and 423 deletions.
7 changes: 3 additions & 4 deletions examples/adm/ADMBinningInsights.ipynb
@@ -109,9 +109,7 @@
" predictor_name=\"Customer.AnnualIncome\",\n",
")\n",
"fig.update_layout(height=400, width=700, xaxis_title=\"\")\n",
"fig.show()\n",
"\n",
"# TODO: y-order is not correct"
"fig.show()"
]
},
{
@@ -339,7 +337,8 @@
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3"
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
3 changes: 2 additions & 1 deletion examples/articles/ADMExplained.ipynb
@@ -944,7 +944,8 @@
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3"
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
107 changes: 36 additions & 71 deletions examples/prediction_studio/Predictions.ipynb
@@ -6,6 +6,10 @@
"source": [
"# Predictions Overview\n",
"\n",
"__Pega__\n",
"\n",
"__2024-12-04__\n",
"\n",
"This is a small notebook to report and analyse Prediction Studio data on Predictions. The underlying data is from the Data-DM-Snapshot table that is used to populate the Prediction Studio screen with Prediction Performance, Lift, CTR etc.\n",
"\n",
"This notebook accepts data exported from PDC - which has a slightly altered format - as well as data exported directly from the pyGetSnapshot dataset in Pega.\n",
@@ -16,12 +20,12 @@
]
},
{
"cell_type": "code",
"execution_count": 1,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"import polars as pl"
"## Raw data\n",
"\n",
"First, we show the raw data. The raw data is in a \"long\" format, with e.g. test and control groups in separate rows."
]
},
{
@@ -33,31 +37,37 @@
"from pathlib import Path\n",
"import sys\n",
"import polars as pl\n",
"import json\n",
"from pdstools import readDSExport, Prediction\n",
"from pdstools import read_ds_export, Prediction\n",
"\n",
"# path to dataset export here\n",
"# e.g. PR_DATA_DM_SNAPSHOTS.parquet\n",
"data_export = \"<Your Export Here>\"\n",
"\n",
"prediction = None\n",
"predictions_raw_data = None\n",
"if data_export.endswith(\".parquet\"):\n",
" predictions_raw_data = pl.read_parquet(Path(data_export).expanduser())\n",
" prediction = Prediction(predictions_raw_data.lazy())\n",
" predictions_raw_data = pl.scan_parquet(Path(data_export).expanduser())\n",
" prediction = Prediction(predictions_raw_data)\n",
"elif data_export.endswith(\".json\"):\n",
" print(\"Import of PDC JSON data not supported\")\n",
" sys.exit()\n",
"elif data_export.endswith(\".zip\"):\n",
" # Assuming a direct export from the dataset\n",
" predictions_raw_data = readDSExport(data_export).collect()\n",
" prediction = Prediction(predictions_raw_data.lazy())\n",
" predictions_raw_data = read_ds_export(data_export)\n",
" prediction = Prediction(predictions_raw_data)\n",
"else:\n",
" prediction = Prediction.from_mock_data(days=60)\n",
"\n",
"predictions_raw_data.head().to_pandas().style"
"if predictions_raw_data is not None:\n",
" predictions_raw_data.head(5).collect().to_pandas().style"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Peek at the internal data"
"## Prediction Data\n",
"\n",
"The actual prediction data is in a \"wide\" format, with separate fields for the Test and Control groups. Also, it contains only the \"daily\" snapshots, and the numbers and dates are cast to regular Polars types."
]
},
{
@@ -73,7 +83,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Summary by Channel, over all time"
"## Summary by Channel\n",
"\n",
"Standard functionality exists to summarize the predictions per channel. Note that we do not have the prediction-to-channel mapping in the data (this is an outstanding product issue), so we apply the implicit naming conventions of NBAD. For a specific customer, custom mappings can be passed into the summarization function."
]
},
{
@@ -89,20 +101,11 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick glance at the available data aggregated by day."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"prediction_summary_by_channel = (\n",
" prediction.summary_by_channel(by_period=\"1d\")\n",
" .with_columns(Prediction=pl.format(\"{} ({})\", pl.col.Channel, pl.col.ModelName))\n",
" .collect()\n",
")"
"## Prediction Trends\n",
"\n",
"Summarization by default is over all time. You can pass in an argument to summarize by day, week or any other period supported by the [Polars time offset string language](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.dt.offset_by.html).\n",
"\n",
"This trend data can then easily be visualized."
]
},
{
"metadata": {},
"outputs": [],
"source": [
"import plotly.express as px\n",
"\n",
"px.line(\n",
" prediction_summary_by_channel.filter(pl.col(\"isMultiChannelPrediction\").not_())\n",
" .filter(pl.col(\"Channel\") != \"Unknown\")\n",
" .sort([\"Period\"]),\n",
" x=\"Period\",\n",
" y=\"Performance\",\n",
" color=\"Prediction\",\n",
" title=\"Prediction Performance\",\n",
")"
"prediction.plot.performance_trend(\"1w\")"
]
},
{
"metadata": {},
"outputs": [],
"source": [
"px.line(\n",
" prediction_summary_by_channel.filter(pl.col(\"isMultiChannelPrediction\").not_())\n",
" .filter(pl.col(\"Channel\") != \"Unknown\")\n",
" .sort([\"Period\"]),\n",
" x=\"Period\",\n",
" y=\"Lift\",\n",
" color=\"Prediction\",\n",
" title=\"Prediction Lift\",\n",
").update_yaxes(tickformat=\",.2%\")"
"prediction.plot.lift_trend(\"1w\")"
]
},
{
"metadata": {},
"outputs": [],
"source": [
"px.line(\n",
" prediction_summary_by_channel.filter(pl.col(\"isMultiChannelPrediction\").not_())\n",
" .filter(pl.col(\"Channel\") != \"Unknown\")\n",
" .sort([\"Period\"]),\n",
" x=\"Period\",\n",
" y=\"CTR\",\n",
" facet_row=\"Prediction\",\n",
" color=\"Prediction\",\n",
" title=\"Prediction CTR\",\n",
").update_yaxes(tickformat=\",.3%\", matches=None).for_each_annotation(\n",
" lambda a: a.update(text=\"\")\n",
")"
"prediction.plot.ctr_trend(\"1w\", facetting=False)"
]
},
{
"metadata": {},
"outputs": [],
"source": [
"px.line(\n",
" prediction_summary_by_channel.filter(pl.col(\"isMultiChannelPrediction\").not_())\n",
" .filter(pl.col(\"Channel\") != \"Unknown\")\n",
" .sort([\"Period\"]),\n",
" x=\"Period\",\n",
" y=\"ResponseCount\",\n",
" facet_row=\"Prediction\",\n",
" color=\"Prediction\",\n",
" title=\"Prediction Responses\",\n",
").update_yaxes(matches=None).for_each_annotation(lambda a: a.update(text=\"\"))"
"prediction.plot.responsecount_trend(\"1w\", facetting=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": ".venv",
"language": "python",
"name": "python3"
},
2 changes: 1 addition & 1 deletion python/pdstools/adm/CDH_Guidelines.py
@@ -14,7 +14,7 @@
"Actions per Group": [1, 100, 250, None],
"Channels": [1, 2, None, None],
"Configurations per Channel": [1, 1, 2, None],
"Predictors": [10, 200, 700, 2000],
"Predictors": [50, 200, 700, 2000],
"Active Predictors per Model": [2, 5, 100, None],
"Model Performance": [52, 55, 80, 90],
"Engagement Lift": [0.0, 0.2, 2.0, None],
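The four numbers per metric read as (minimum, best-practice low, best-practice high, maximum) bounds - that interpretation is an assumption here - and checking a measured value against such a band can be sketched as:

```python
# Hypothetical helper: classify a measured value against a
# (min, best_min, best_max, max) guideline band; None means unbounded.
def classify(value, band):
    lo, best_lo, best_hi, hi = band
    if (lo is not None and value < lo) or (hi is not None and value > hi):
        return "outside limits"
    if (best_lo is None or value >= best_lo) and (best_hi is None or value <= best_hi):
        return "best practice"
    return "acceptable"

# Two of the bands from the table above
guidelines = {
    "Predictors": (50, 200, 700, 2000),
    "Model Performance": (52, 55, 80, 90),
}

print(classify(120, guidelines["Predictors"]))        # → acceptable
print(classify(75, guidelines["Model Performance"]))  # → best practice
```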
8 changes: 6 additions & 2 deletions python/pdstools/adm/Plots.py
@@ -1143,7 +1143,7 @@ def binning_lift(
return plot_df

fig = px.bar(
plot_df.collect().to_pandas(use_pyarrow_extension_array=False),
plot_df.collect(), #.to_pandas(use_pyarrow_extension_array=False),
x="Lift",
y="BinSymbolAbbreviated",
color="Direction",
@@ -1158,6 +1158,7 @@ def binning_lift(
template="pega",
custom_data=["PredictorName", "BinSymbol"],
facet_col_wrap=3,
category_orders=plot_df.collect().to_dict(),
)
fig.update_traces(
hovertemplate="<br>".join(
@@ -1175,7 +1176,6 @@ def binning_lift(
type="category",
categoryorder="array",
automargin=True,
autorange="reversed",
title="",
dtick=1, # show all bins
matches=None, # allow independent y-labels if there are row facets
@@ -1209,6 +1209,10 @@ def partitioned_plot(
fig.show()
return figs


# TODO I took the propensity distrib plot out of the HC as
# it wasn't very clear, also didn't look great visually.

@requires(
predictor_columns={
"BinPropensity",