From b94ac3de78d32c2a6aeeb08f119a9e9cc27f145b Mon Sep 17 00:00:00 2001 From: yusufuyanik1 <54443450+yusufuyanik1@users.noreply.github.com> Date: Fri, 8 Dec 2023 10:56:25 +0100 Subject: [PATCH 1/4] Update ADMExplained.ipynb --- examples/articles/ADMExplained.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/articles/ADMExplained.ipynb b/examples/articles/ADMExplained.ipynb index 3207fcda..727f63a0 100644 --- a/examples/articles/ADMExplained.ipynb +++ b/examples/articles/ADMExplained.ipynb @@ -11,7 +11,7 @@ "\n", "__2023-03-15__\n", "\n", - "This notebook shows exactly how all the values in an ADM model report\n", + "This notebook shows exactly how all the values in an adm model report\n", "are calculated. It also shows how the propensity is calculated for a\n", "particular customer.\n", "\n", From fc84f98f56af5a308563c90aacbb21c5c1ad0cdc Mon Sep 17 00:00:00 2001 From: "Uyanik, Yusuf" Date: Fri, 8 Dec 2023 11:04:16 +0100 Subject: [PATCH 2/4] Revert "Update ADMExplained.ipynb" This reverts commit b94ac3de78d32c2a6aeeb08f119a9e9cc27f145b. --- examples/articles/ADMExplained.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/articles/ADMExplained.ipynb b/examples/articles/ADMExplained.ipynb index 727f63a0..3207fcda 100644 --- a/examples/articles/ADMExplained.ipynb +++ b/examples/articles/ADMExplained.ipynb @@ -11,7 +11,7 @@ "\n", "__2023-03-15__\n", "\n", - "This notebook shows exactly how all the values in an adm model report\n", + "This notebook shows exactly how all the values in an ADM model report\n", "are calculated. It also shows how the propensity is calculated for a\n", "particular customer.\n", "\n", From d94c0b30a7e0c7acfd49534322a63a1a77424871 Mon Sep 17 00:00:00 2001 From: perdo Date: Fri, 8 Dec 2023 15:05:04 +0100 Subject: [PATCH 3/4] Doc cleanup --- examples/adm/AGBModelVisualisation.ipynb | 2 +- examples/articles/HealthCheckSetUp.ipynb | 166 ----------- examples/datamart/report_utils.R | 113 +++++--- examples/graph_gallery/graph_gallery.ipynb | 265 ------------------ examples/hds/Example_Data_Anonymization.ipynb | 2 +- examples/helloworld/hello_cdhtools.ipynb | 144 ---------- python/docs/Makefile | 2 +- python/docs/source/Articles.rst | 14 - python/docs/source/examples.rst | 13 - python/docs/source/index.rst | 34 ++- python/docs/source/reference.rst | 11 + python/docs/source/tutorial_notebooks.rst | 12 - python/tests/test_docScenarios.py | 3 +- 13 files changed, 108 insertions(+), 673 deletions(-) delete mode 100644 examples/articles/HealthCheckSetUp.ipynb delete mode 100644 examples/graph_gallery/graph_gallery.ipynb delete mode 100644 examples/helloworld/hello_cdhtools.ipynb delete mode 100644 python/docs/source/Articles.rst delete mode 100644 python/docs/source/examples.rst create mode 100644 python/docs/source/reference.rst delete mode 100644 python/docs/source/tutorial_notebooks.rst diff --git a/examples/adm/AGBModelVisualisation.ipynb b/examples/adm/AGBModelVisualisation.ipynb index e21f861b..ec13cea6 100644 --- a/examples/adm/AGBModelVisualisation.ipynb +++ b/examples/adm/AGBModelVisualisation.ipynb @@ -16,7 +16,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Analyzing ADM Decision Trees\n", + "# Analyzing ADM AGB models\n", "\n", "With the introduction of ADM Gradient Boosting, we now support tree-based models in ADM as an alternative to the traditional Bayesian approach. In prediction studio, there is some information on the predictors, the model performance et cetera. 
However, it is also possible to export the trees themselves to analyze them further. This example demonstrates some of the info you can extract yourself, including a visualisation of the actual trees - which also allows you to check the exact 'path' a prediction used through each individual tree. \n", "\n", diff --git a/examples/articles/HealthCheckSetUp.ipynb b/examples/articles/HealthCheckSetUp.ipynb deleted file mode 100644 index 558d08ca..00000000 --- a/examples/articles/HealthCheckSetUp.ipynb +++ /dev/null @@ -1,166 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# ADM Health Check\n", - "\n", - "__Pega__\n", - "\n", - "__2023-05-10__\n", - "\n", - "Pdstools version 3 brought over the Health Check from the R tools to Python. Along with this change, we made it easier to run by providing an easy to use webapp. The Health Check provides a global overview of Adaptive models and predictors using Pega Machine Learning." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Downloading the Required Libraries\n", - "\n", - "To get started, you need Python with version 3.8 or greater and [pip](https://pypi.org/project/pip/) installed in your system. You can check which python version you have by running this in your command line:\n", - "\n", - "```\n", - "python --version\n", - "```\n", - "![python version check](../../../../images/python_version.png)\n", - "\n", - "It's also best practice to regularly update your version of pip. If you get any errors during installation, first try:\n", - "```\n", - "pip install --upgrade pip\n", - "```\n", - "\n", - "Now, you can proceed to download pdstools and other libraries required to run Health Check app. Simply run:\n", - "```\n", - "pip install --upgrade pdstools[app]\n", - "```\n", - "\n", - "For zsh on Mac use quotes:\n", - "```\n", - "pip install --upgrade pdstools'[app]'\n", - "```\n", - "\n", - "This will install pdstools, along with all required _optional dependencies_ we use in the webapp.\n", - "\n", - "Once you've got pdstools and other necessary packages installed, run `pip show` to verify the install.\n", - "\n", - "![pdstools version](../../../../images/pdstools_version.png)\n", - "\n", - "If your pdstools version is below 3.1, you can upgrade it with this command:\n", - "```\n", - "pip install --upgrade pdstools\n", - "```\n", - "\n", - "Finally, download Quarto cli from [quarto.org](https://quarto.org/docs/get-started/). It is an open-source publishing library for creating dynamic html content." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Running the App\n", - "To launch the Health Check application, simply enter “pdstools run” in your system’s command line. \n", - "\n", - "```\n", - "pdstools run\n", - "```\n", - "![run](../../../../images/run.png)\n", - "\n", - "Upon executing this command, the app should open up in your system browser. If the app does not open up automatically, simply copy the Local URL from your terminal and paste it into your browser.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "## Using the App: A Step-by-Step Guide\n", - "\n", - "### 1. 
Introduction\n", - "- The browser opens up with the Home page, from the left hand side, simply click on Health Check Check to get started with the HealthCheck app.\n", - "\n", - "![introduction](../../../../images/introduction.png)\n", - "\n", - "- Please read the introduction page to learn more about the correct usage of the app, then you can move to Data Import tab to upload your data.\n", - "\n", - "![data_import](../../../../images/data_import.png)\n", - "\n", - "### 2. Data import\n", - "- You can upload your ADMDatamart data in this tab. \n", - "\n", - "For testing the app, you may use the provided sample data from CDH Sample. For instructions on how to export your own datamart data, [please refer to this article](https://github.com/pegasystems/pega-datascientist-tools/wiki/How-to-export-and-use-the-ADM-Datamart)\n", - "\n", - "If you're using Treatments, make sure to add that to the context keys in the *Configure Advanced Options* section by clicking on *Extract Additional Keys*.\n", - "Additionally, if you are not using all the default context keys(Channel, Direction, Issue, and Group), feel free to remove the ones you won't be using.\n", - "\n", - "- Next, you can select your import method from the dropdown menu under Data Import. You will see 4 options:\n", - " - **Direct file path:** Simply paste the folder path where the ADM files are located. *Ex. /User/Downloads/*. pdstools should automatically find the relevant files in that directory. *Note*: there is no need to extract the zip files, we will also take care of that for you.\n", - " - **Direct file upload:** Browse your local files with this option. If you don't have access Predictor Binning files, you can upload only the Model Snapshot file and then click on the checkbox to confirm that you want to run only model-based Health Check.\n", - " Note: Maximum file upload size is 2000 MB. You can increase it by launching the app with this command\n", - " \"pdstools run --server.maxUploadSize *desired file size*\"\n", - " Ex.\n", - " :\n", - " ```\n", - " pdstools run --server.maxUploadSize 5000\n", - " ```\n", - - " - **CDH Sample:** You can generate a sample HealthCheck with this option. It is a convenient way to test out the Health Check using a CDH Sample dataset.\n", - " - **Download from S3:** This feature is not implemented yet but soon, you will be able to run HealthCheck without downloading the files to your local system if they are located in S3.\n", - "\n", - "### 3. Data filters\n", - "After the data is imported into the app you can easily add custom filters to specify the Health Check to your needs.\n", - "For just analyzing models in the Web channel, select \"Channel\" from dropdown, delete other channels (if any), move to next tab.\n", - "\n", - "![data_filters](../../../../images/data_filters.png)\n", - "\n", - "### 4. Report configuration\n", - "In this tab you can configure some options for the outputs and then generate your Health Check document. The options are:\n", - "\n", - "- **Customer name:** Change the name of the Health Check file that will generated\n", - "- **Select output type:** Change output format under this field\n", - "- **Change working directory:** You can change folder name where the temporary files will be stored while running the code. Please choose a directory where Python has read and write access.\n", - "\n", - "Check Boxes:\n", - "\n", - "- **Remove temporary files:** Chose whether you want to keep temporary files or not. 
*Note:* If you get an error while the Health Check is being generated, you can deselect this checkbox and share the log which should be created under the directory you named in *Change working directory* section\n", - "- **Include tables in document:** Whether to include the overview tables embedded in the document itself or to separately recieve these in a tabbed Excel file. If you have a large datamart file, it may be worth _unchecking_ this, as the final health check will be smaller and faster.\n", - "\n", - "Finally, you can click on the **Generate Health Check** button. Once the process is complete, a **Download Health Check** button will appear. You can download Health Check by clicking on it.\n", - "\n", - "## Troubleshooting\n", - "\n", - "In case you run into any issues, please [file an issue on GitHub](https://github.com/pegasystems/pega-datascientist-tools/issues) to let us know, and we'll get it fixed as soon as possible!\n", - "\n", - "## Contributing\n", - "\n", - "If you'd like to see more or different information in the Health Check - we'd appreciate any contributions, [preferably in the form of a pull request](https://github.com/pegasystems/pega-datascientist-tools/pulls). If you're not comfortable in Python or want to discuss the value of your proposed solution, also feel free to [open a feature request](https://github.com/pegasystems/pega-datascientist-tools/issues)." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "playground", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.0" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/datamart/report_utils.R b/examples/datamart/report_utils.R index 922e931b..43517d2b 100644 --- a/examples/datamart/report_utils.R +++ b/examples/datamart/report_utils.R @@ -5,6 +5,7 @@ library(pdstools) library(arrow) library(jsonlite) library(lubridate) +library(R.utils) # Sets a few global variables for the run_report functions. Call this first. Acts # like a constructor in a proper language. 
@@ -117,44 +118,60 @@ report_utils_write_cached_files <- function(dm, model_filename, preds_filename) } # Drop old HTML files and orphaned hash files (that have no reference) -report_utils_cleanup_cache <- function(folder = report_utils_results_folder, keep_days = 7) +report_utils_cleanup_helper <- function(folder, keep_days, file_pattern, is_obsolete) { - is_obsolete <- function(f, before = now() - days(keep_days)) { - fileModificationTime(f) < before + generated_files <- list.files(folder, + pattern=file_pattern, + full.names = TRUE, recursive = FALSE) + obsolete_files <- c() + if (length(generated_files) > 0) { + obsolete_files <- generated_files[sapply(generated_files, is_obsolete)] } - obsolete_files <- sapply(list.files(folder, pattern=".*[.]html$", full.names = TRUE, recursive = FALSE), is_obsolete) - obsolete_files <- obsolete_files[obsolete_files] - - print(names(obsolete_files)) - cat("Removing", length(obsolete_files), "obsolete HTML files from", folder, fill = T) + # print(obsolete_files) + cat("Removing", length(obsolete_files), "obsolete", file_pattern, "files from", folder, fill = T) if (length(obsolete_files) > 0) { - file.remove(names(obsolete_files)) + file.remove(obsolete_files) } +} - hashFiles <- list.files(path=folder, - pattern = ".*[.]hash$", full.names = T) - hashFileReferences <- gsub("(.*)[.]hash$", "\\1", hashFiles) # reverse of report_utils_hashfilename - hashFileReferencesExist <- sapply(hashFileReferences, file.exists) - orphanedHashFiles <- hashFiles[!hashFileReferencesExist] - - cat("Removing", length(orphanedHashFiles), "orphaned hash files from", folder, fill = T) +report_utils_cleanup_cache <- function(folder = report_utils_results_folder, keep_days = 7) +{ + is_too_old <- function(f, before = now() - days(keep_days)) { + return(fileModificationTime(f) < before) + } - if (length(orphanedHashFiles) > 0) { - file.remove(orphanedHashFiles) + has_no_reference <- function(f) { + hashFileReference <- gsub("(.*)[.]hash$", "\\1", f) # reverse of report_utils_hashfilename + return(!file.exists(hashFileReference)) } + + report_utils_cleanup_helper(folder, keep_days, ".*[.]html$", is_too_old) + report_utils_cleanup_helper(folder, keep_days, ".*[.]hash$", has_no_reference) + + # Do it for the report subfolders as well + sapply(list.files(folder, pattern = ".*Generated Model Reports$", full.names = T, include.dirs = T), + function(d) { + report_utils_cleanup_helper(d, keep_days, ".*[.]html$", is_too_old) + + report_utils_cleanup_helper(d, keep_days, ".*[.]hash$", has_no_reference) + } + ) + + return() } + + # Change time of a file # TODO maybe this is overly sensitive on onedrive folders fileModificationTime <- function(f) { as.POSIXct(file.info(f)$mtime) } # Generic markdown/quarto call that will check hashes and dates to prevent # unnecessary re-creation -report_utils_run_report <- function(customer, dm, target_filename, target_generator_hash, renderer, quiet) +report_utils_run_report <- function(customer, dm, target_fullfilename, target_generator_hash, renderer, quiet) { - destinationFullPath <- file.path(report_utils_results_folder, target_filename) cachedDMFilesFullName <- file.path(report_utils_intermediates_folder, report_utils_cached_dm_filenames(customer)) # make sure cached source exist, otherwise re-create from dm data @@ -164,15 +181,15 @@ report_utils_run_report <- function(customer, dm, target_filename, target_genera } # check if generator script has changed - if (report_utils_is_target_current(destinationFullPath, target_generator_hash, quiet = quiet)) 
{ - if (!quiet) cat("Modification date", target_filename, ":", fileModificationTime(destinationFullPath), fill=T) + if (report_utils_is_target_current(target_fullfilename, target_generator_hash, quiet = quiet)) { + if (!quiet) cat("Modification date", basename(target_fullfilename), ":", fileModificationTime(target_fullfilename), fill=T) if (!quiet) cat("Modification date DM data", report_utils_cached_dm_filenames(customer), ":", fileModificationTime(cachedDMFilesFullName), fill=T) # or if source files are newer - if (is.na(fileModificationTime(destinationFullPath)) | any(is.na(fileModificationTime(cachedDMFilesFullName))) | - any(fileModificationTime(cachedDMFilesFullName) > fileModificationTime(destinationFullPath))) { + if (is.na(fileModificationTime(target_fullfilename)) | any(is.na(fileModificationTime(cachedDMFilesFullName))) | + any(fileModificationTime(cachedDMFilesFullName) > fileModificationTime(target_fullfilename))) { - cat(target_filename, "out of date wrt source files", fill = TRUE) + cat(target_fullfilename, "out of date wrt source files", fill = TRUE) doRegenerate <- TRUE } else { @@ -183,7 +200,7 @@ report_utils_run_report <- function(customer, dm, target_filename, target_genera } if (doRegenerate) { - cat("Creating", target_filename, fill = TRUE) + cat("Creating", basename(target_fullfilename), fill = TRUE) title <- paste0(customer, ' - Adaptive Models') subtitle <- paste(unique(c( @@ -196,19 +213,19 @@ report_utils_run_report <- function(customer, dm, target_filename, target_genera ifelse(!is.null(dm$predictordata), basename(cachedDMFilesFullName[2]), ""), title, subtitle, - target_filename) + target_fullfilename) # writer renderer hash - report_utils_write_hashfiles(destinationFullPath, target_generator_hash) + report_utils_write_hashfiles(target_fullfilename, target_generator_hash) } else { # Touch target - Sys.setFileTime(normalizePath(destinationFullPath), lubridate::now()) - Sys.setFileTime(report_utils_hashfilename(normalizePath(destinationFullPath)), lubridate::now()) + Sys.setFileTime(normalizePath(target_fullfilename), lubridate::now()) + Sys.setFileTime(report_utils_hashfilename(normalizePath(target_fullfilename)), lubridate::now()) - cat("Skipped re-generation of", target_filename, fill = T) + cat("Skipped re-generation of", basename(target_fullfilename), fill = T) } - return(target_filename) + return(basename(target_fullfilename)) } run_r_healthcheck <- function(customer, dm, quiet = T) @@ -217,7 +234,8 @@ run_r_healthcheck <- function(customer, dm, quiet = T) report_utils_run_report(customer, dm, - target_filename = paste0(customer, ' - ADM Health Check - classic.html'), + target_fullfilename = file.path(report_utils_results_folder, + paste0(customer, ' - ADM Health Check - classic.html')), target_generator_hash = r_health_check_hash, renderer = function(filenameModelData, filenamePredictorData, @@ -229,6 +247,7 @@ run_r_healthcheck <- function(customer, dm, quiet = T) if (!quiet) cat(" modelfile:", paste0('"', file.path(report_utils_intermediates_folder, filenameModelData), '"'), fill=T) if (!quiet) cat(" predictordatafile:", paste0('"', file.path(report_utils_intermediates_folder, filenamePredictorData), '"'), fill=T) + R.utils::mkdirs(dirname(destinationfile)) rmarkdown::render( report_utils_healthcheck_notebook_R, params = list( @@ -237,8 +256,8 @@ run_r_healthcheck <- function(customer, dm, quiet = T) "title" = title, "subtitle" = subtitle ), - output_dir = report_utils_results_folder, - output_file = destinationfile, + output_dir = 
dirname(destinationfile), + output_file = basename(destinationfile), quiet = quiet, intermediates_dir = report_utils_intermediates_folder, knit_root_dir = report_utils_intermediates_folder @@ -254,7 +273,8 @@ run_python_healthcheck <- function(customer, dm, quiet = T) report_utils_run_report(customer, dm, - target_filename = paste0(customer, ' - ADM Health Check - new.html'), + target_fullfilename = file.path(report_utils_results_folder, + paste0(customer, ' - ADM Health Check - new.html')), target_generator_hash = python_health_check_hash, renderer = function(filenameModelData, filenamePredictorData, @@ -299,8 +319,9 @@ run_python_healthcheck <- function(customer, dm, quiet = T) } # TODO check status?? + R.utils::mkdirs(dirname(destinationfile)) file.copy(paste0(sub('\\..[^\\.]*$', '', report_utils_healthcheck_notebook_python), ".html"), - file.path(report_utils_results_folder, destinationfile), + destinationfile, overwrite = TRUE, copy.date = TRUE ) @@ -351,7 +372,9 @@ run_r_model_reports <-function(customer, dm, report_utils_run_report(customer, dm, - target_filename = paste0(customer, " ", modelName, " - classic", ".html"), + target_fullfilename = file.path(report_utils_results_folder, + paste(customer, "Generated Model Reports", sep = " - "), + paste0(customer, " ", modelName, " - classic", ".html")), target_generator_hash = r_model_report_hash, renderer = function(filenameModelData, filenamePredictorData, @@ -362,6 +385,7 @@ run_r_model_reports <-function(customer, dm, if (!quiet) cat(" predictordatafile:", paste0('"', file.path(report_utils_intermediates_folder, filenamePredictorData), '"'), fill=T) if (!quiet) cat(" modelid:", paste0('"', id, '"'), fill=T) + R.utils::mkdirs(dirname(destinationfile)) rmarkdown::render( report_utils_offlinemodelreport_notebook_R, params = list( @@ -369,8 +393,8 @@ run_r_model_reports <-function(customer, dm, "modeldescription" = modelName, "modelid" = id ), - output_dir = report_utils_results_folder, - output_file = destinationfile, + output_dir = dirname(destinationfile), + output_file = basename(destinationfile), quiet = quiet, intermediates_dir = report_utils_intermediates_folder, knit_root_dir = report_utils_intermediates_folder @@ -410,7 +434,9 @@ run_python_model_reports <-function(customer, dm, report_utils_run_report(customer, dm, - target_filename = paste0(customer, " ", modelName, " - new", ".html"), + target_fullfilename = file.path(report_utils_results_folder, + paste(customer, "Generated Model Reports", sep = " - "), + paste0(customer, " ", modelName, " - new", ".html")), target_generator_hash = python_model_report_hash, renderer = function(filenameModelData, filenamePredictorData, @@ -456,13 +482,14 @@ run_python_model_reports <-function(customer, dm, } # TODO check status?? 
+ R.utils::mkdirs(dirname(destinationfile)) file.copy(paste0(sub('\\..[^\\.]*$', '', report_utils_offlinemodelreport_notebook_python), ".html"), - file.path(report_utils_results_folder, destinationfile), + destinationfile, overwrite = TRUE, copy.date = TRUE ) - if (!quiet) cat("Created", file.path(report_utils_results_folder, destinationfile), fill=T) + if (!quiet) cat("Created", destinationfile, fill=T) }, quiet = quiet ) diff --git a/examples/graph_gallery/graph_gallery.ipynb b/examples/graph_gallery/graph_gallery.ipynb deleted file mode 100644 index 849e72d8..00000000 --- a/examples/graph_gallery/graph_gallery.ipynb +++ /dev/null @@ -1,265 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "nbsphinx": "hidden" - }, - "source": [ - "## Link to article\n", - "\n", - "This notebook is included in the documentation, where the interactive Plotly charts show up. See:\n", - "https://pegasystems.github.io/pega-datascientist-tools/Python/articles/graph_gallery.html" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Graph Gallery" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "nbsphinx": "hidden" - }, - "outputs": [], - "source": [ - "# These lines are only for rendering in the docs, and are hidden through Jupyter tags\n", - "# Do not run if you're running the notebook seperately\n", - "import plotly.io as pio\n", - "\n", - "pio.renderers.default = \"notebook_connected\"\n", - "\n", - "import sys\n", - "\n", - "sys.path.append(\"../../../\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import polars as pl\n", - "from pdstools import datasets\n", - "\n", - "data = datasets.CDHSample()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Models By Positives" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.plotModelsByPositives()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Over Time" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.plotOverTime(query=pl.col(\"Channel\") == \"Web\", by=\"Name\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Performance Success Rate Bubble Chart" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.plotPerformanceSuccessRateBubbleChart()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Predictor Binning" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.plotPredictorBinning(\n", - " query=(pl.col(\"ModelID\") == \"08ca1302-9fc0-57bf-9031-d4179d400493\")\n", - " & pl.col(\"PredictorName\").is_in(\n", - " [\n", - " \"Customer.Age\",\n", - " \"Customer.AnnualIncome\",\n", - " \"IH.Email.Outbound.Accepted.pxLastGroupID\",\n", - " ]\n", - " )\n", - ",show_each=True);\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Predictor Performance" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.plotPredictorPerformance(top_n=30)\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Predictor Contribution" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": {}, - "outputs": [], - "source": [ - "data.plotPredictorContribution()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Predictor Performance Heatmap" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.plotPredictorPerformanceHeatmap(top_n=20)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Proposition Success Rates" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.plotPropositionSuccessRates(query=pl.col(\"Channel\") == \"Web\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Response Gain" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.plotResponseGain()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Score Distribution" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.plotScoreDistribution(show_each=True);\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Tree Map" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.plotTreeMap()\n" - ] - } - ], - "metadata": { - "interpreter": { - "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" - }, - "kernelspec": { - "display_name": "Python 3.10.2 64-bit", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/hds/Example_Data_Anonymization.ipynb b/examples/hds/Example_Data_Anonymization.ipynb index efb50a30..d6d28db4 100644 --- a/examples/hds/Example_Data_Anonymization.ipynb +++ b/examples/hds/Example_Data_Anonymization.ipynb @@ -5,7 +5,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Example data anonymization\n", + "# Data Anonymization\n", "\n", "In Pega CDH 8.5 and up, it's now possible to record the historical data as seen by the Adaptive Models. See [this academy challenge](https://academy.pega.com/challenge/exporting-historical-data/v4) for reference. This historical data can be further used to experiment with offline models, but also to fine-tune the OOTB Gradient Boosting model. However, sharing this information with Pega can be sensitive as it contains raw predictor data. \n", "\n", diff --git a/examples/helloworld/hello_cdhtools.ipynb b/examples/helloworld/hello_cdhtools.ipynb deleted file mode 100644 index d4623536..00000000 --- a/examples/helloworld/hello_cdhtools.ipynb +++ /dev/null @@ -1,144 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "nbsphinx": "hidden" - }, - "source": [ - "## Link to article\n", - "\n", - "This notebook is included in the documentation, where the interactive Plotly charts show up. 
See:\n", - "https://pegasystems.github.io/pega-datascientist-tools/Python/articles/hello_cdhtools.html" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Hello pdstools\n", - "\n", - "This is a basic example of using pdstools to visualize the ADM datamart.\n", - "\n", - "To run this first install the `pdstools` library as per the [installation instructions](https://github.com/pegasystems/pega-datascientist-tools/wiki#using-the-python-tools) of the GitHub repository https://github.com/pegasystems/pega-datascientist-tools. Then you can import the ADMDatamart class as such:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "nbsphinx": "hidden" - }, - "outputs": [], - "source": [ - "# These lines are only for rendering in the docs, and are hidden through Jupyter tags\n", - "# Do not run if you're running the notebook seperately\n", - "import plotly.io as pio\n", - "pio.renderers.default='notebook_connected'\n", - "\n", - "import sys\n", - "sys.path.append('../../../')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pdstools import ADMDatamart, datasets" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then to run this example, use a sample dataset included or export your own from Pega. These steps are detailed on the [Wiki](https://github.com/pegasystems/pega-datascientist-tools/wiki/How-to-export-and-use-the-ADM-Datamart).\n", - "\n", - "Change the path if it is different from the current working directory. The class `ADMDatamart` will read ADM datamart and standardize the fields returned. It is very flexible and can read from datamart export zips, from CSV, parquet etc and has a number of arguments to fine tune the behavior.\n", - "\n", - "Because we import from the CDH Sample dataset hosted on GitHub, these two commands are identical:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# ADMDatamart = ADMDatamart(\"../../data\")\n", - "ADMDatamart = datasets.CDHSample()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Bubble Chart\n", - "\n", - "The following plot shows the bubble chart as shown on the Pega out-of-the-box model report landing page. The \n", - "methods has options to zoom in into only certain channels/issues etc." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ADMDatamart.plotPerformanceSuccessRateBubbleChart()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Predictor Overview\n", - "\n", - "Load predictor data and show a basic predictor performance overview." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ADMDatamart.plotPredictorPerformance(top_n=30)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Next steps\n", - "\n", - "For more examples and sample code please see the [pdstools Wiki](https://github.com/pegasystems/pega-datascientist-tools/wiki) and the documentation." 
- ] - } - ], - "metadata": { - "interpreter": { - "hash": "0c5c31b7614ab5f7bbff6555bdc6f3ec4cea8754d51936ee45052251e94c1071" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/python/docs/Makefile b/python/docs/Makefile index 4d674db3..19329da1 100644 --- a/python/docs/Makefile +++ b/python/docs/Makefile @@ -23,7 +23,7 @@ help: %: Makefile mkdir -p source/articles - cp ../../examples/datamart/Example_ADM_Analysis.ipynb ../../examples/graph_gallery/graph_gallery.ipynb ../../examples/helloworld/hello_cdhtools.ipynb ../../examples/valuefinder/* ../../examples/adm/AGBModelVisualisation.ipynb ../../examples/hds/Example_Data_Anonymization.ipynb ../../examples/articles/*.ipynb source/articles + cp ../../examples/datamart/Example_ADM_Analysis.ipynb ../../examples/valuefinder/* ../../examples/adm/AGBModelVisualisation.ipynb ../../examples/hds/Example_Data_Anonymization.ipynb ../../examples/articles/*.ipynb source/articles @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) rm -rf source/articles/output rm source/articles/* diff --git a/python/docs/source/Articles.rst b/python/docs/source/Articles.rst deleted file mode 100644 index 3e581d90..00000000 --- a/python/docs/source/Articles.rst +++ /dev/null @@ -1,14 +0,0 @@ -Articles -==================================== - -Here you can find some written articles. - - -.. toctree:: - :maxdepth: 1 - :caption: Notebooks included: - - articles/pdstoolsv3 - articles/thompsonsampling - articles/ADMExplained - articles/HealthCheckSetUp \ No newline at end of file diff --git a/python/docs/source/examples.rst b/python/docs/source/examples.rst deleted file mode 100644 index 39993532..00000000 --- a/python/docs/source/examples.rst +++ /dev/null @@ -1,13 +0,0 @@ -Example notebooks -==================================== - -Here you can find some example analyses. - - -.. toctree:: - :maxdepth: 1 - :caption: Notebooks included: - - articles/Example_ADM_Analysis - articles/AGBModelVisualisation - articles/vf_analysis \ No newline at end of file diff --git a/python/docs/source/index.rst b/python/docs/source/index.rst index ca41bc59..58350e61 100644 --- a/python/docs/source/index.rst +++ b/python/docs/source/index.rst @@ -3,27 +3,39 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to pdstools's Python documentation! +Welcome to the PDS Tools Python documentation =========================================== +.. toctree:: + :maxdepth: 1 + :caption: Getting Started + + Installation + Getting Started + ADM Health Check + +.. toctree:: + :maxdepth: 1 + :caption: Examples + + articles/Example_ADM_Analysis + articles/AGBModelVisualisation + articles/vf_analysis + articles/Example_Data_Anonymization + .. toctree:: :maxdepth: 1 :caption: Articles - articles/graph_gallery - tutorial_notebooks - examples - Articles + articles/thompsonsampling + articles/ADMExplained .. toctree:: :maxdepth: 1 - :caption: Links + :caption: Reference Information - Installation - Getting Started - Changelog - GitHub Repository - R Documentation + reference +* :ref:`genindex` .. 
toctree:: :maxdepth: 1 diff --git a/python/docs/source/reference.rst b/python/docs/source/reference.rst new file mode 100644 index 00000000..29a6f726 --- /dev/null +++ b/python/docs/source/reference.rst @@ -0,0 +1,11 @@ +Additional References +==================================== + +.. toctree:: + :maxdepth: 1 + :caption: References: + + articles/pdstoolsv3 + Changelog + GitHub Repository + R Documentation diff --git a/python/docs/source/tutorial_notebooks.rst b/python/docs/source/tutorial_notebooks.rst deleted file mode 100644 index 6613d688..00000000 --- a/python/docs/source/tutorial_notebooks.rst +++ /dev/null @@ -1,12 +0,0 @@ -Tutorial notebooks -==================================== - -Here you can find some tutorial notebooks. - - -.. toctree:: - :maxdepth: 1 - :caption: Notebooks included: - - articles/hello_cdhtools - articles/Example_Data_Anonymization diff --git a/python/tests/test_docScenarios.py b/python/tests/test_docScenarios.py index c46cddb4..227a8122 100644 --- a/python/tests/test_docScenarios.py +++ b/python/tests/test_docScenarios.py @@ -24,9 +24,8 @@ def test_all_notebooks(): files = [ str(basePath / f) for f in [ + # TODO shouldn't we have all the notebooks here? "examples/datamart/Example_ADM_Analysis.ipynb", - "examples/graph_gallery/graph_gallery.ipynb", - "examples/helloworld/hello_cdhtools.ipynb", "examples/adm/AGBModelVisualisation.ipynb", ] ] From ed132557efd2481b236618b7eb783ea5bb5b92b4 Mon Sep 17 00:00:00 2001 From: perdo Date: Fri, 8 Dec 2023 15:44:46 +0100 Subject: [PATCH 4/4] Dropped old batch scripts in favor of recently introduced new ones --- examples/datamart/createModelReport.sh | 20 ---- examples/datamart/offlinereports.R | 140 ------------------------- 2 files changed, 160 deletions(-) delete mode 100755 examples/datamart/createModelReport.sh delete mode 100644 examples/datamart/offlinereports.R diff --git a/examples/datamart/createModelReport.sh b/examples/datamart/createModelReport.sh deleted file mode 100755 index bc00a768..00000000 --- a/examples/datamart/createModelReport.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -# Run an R notebook on the given inputfile with predictor binning data - -# Location of the GIT checkout of the Pega Data Scientist tools -pdstools="~/Documents/pega/pega-datascientist-tools" -modelreportnotebook="$pdstools/examples/datamart/modelreport.Rmd" - -# Predictor data. This can be a CSV or any other of the supported formats. -source="$pdstools/data/pr_data_dm_admmart_pred.csv" - -# Model ID to use -modelid="277a2c48-8888-5b71-911e-443d52c9b50f" -modeldescription="Banner Model - BMOBILEAPP" - -# Generated file -output="`pwd`/$modeldescription.html" - -R -e "rmarkdown::render('$modelreportnotebook',params = list(predictordatafile='$source', modeldescription='$modeldescription', modelid='$modelid'), output_file='$output')" - diff --git a/examples/datamart/offlinereports.R b/examples/datamart/offlinereports.R deleted file mode 100644 index 96e8457a..00000000 --- a/examples/datamart/offlinereports.R +++ /dev/null @@ -1,140 +0,0 @@ -# Example R script to create off-line model reports. There is a similar -# but much less complex bash script to do the same. Any language can really -# be used here - the fact that the off-line model reports are in an R notebook -# does not mean the preprocessing and batch processing needs to be in R as well. - -# You can run this script from R, from R Studio, VS Code or really any editor -# of your choice. You will need to change some of the paths defined here. 
- -library(pdstools) -library(data.table) -library(rmarkdown) -library(arrow) - -# Pandoc is needed by RMarkdown and part of RStudio. If you run this -# script outside of RStudio you'll need to make sure pandoc is installed -# and known to R / markdown. For now this is the best I could think of. To -# make it slightly more generic, dir can be a character vector of paths: -if (!rmarkdown::pandoc_available()) { - rmarkdown::find_pandoc(dir = c("/opt/anaconda3/bin")) - cat("Pandoc:", rmarkdown::pandoc_available(), fill = T) -} - -customer <- "SampleCustomer" # just for titles, change to your customer name -datamart_datasets_folder <- "~/Downloads" # will pick the latest from there - -# Path to the checked out versions of the notebooks. You'll need them locally -# so make sure to to a "clone" of the PDS Tools GitHub repository at -# https://github.com/pegasystems/pega-datascientist-tools. Update the path -# below to reflect the folder where you cloned the repo. - -pdstools_repo_folder <- "~/Documents/pega/pega-datascientist-tools" - -healthcheck_notebook_R <- file.path(pdstools_repo_folder, "examples/datamart/healthcheck.Rmd") -offlinemodelreport_notebook_R <- file.path(pdstools_repo_folder, "examples/datamart/modelreport.Rmd") - -working_folder <- tempdir(TRUE) -output_folder <- file.path(getwd(), "reports") -if (!dir.exists(output_folder)) dir.create(output_folder) - -# Read ADM Datamart from the folder specified above. You can also give -# explicit paths to both dataset. See help on ADMDatamart, in R Studio -# with ?pdstools::ADMDatamart or online at -# https://pegasystems.github.io/pega-datascientist-tools/R/reference/ADMDatamart.html - -# Example of a function you can implement to highlight certain -# types of predictors based on their names. By default the system will -# highlight IH.* and Param.* predictors - simply splitting on the first dot, -# but you can customize this as shown below: - -myPredictorCategorization <- function(name) -{ - if (startsWith(name, "Param.ExtGroup")) return("External Model") - if (endsWith(name, "Score")) return("External Model") - if (endsWith(name, "RiskCode")) return("External Model") - - return(defaultPredictorCategorization(name)) -} - -dm <- ADMDatamart(datamart_datasets_folder, - # optional predictor categorization, see above - predictorCategorization = myPredictorCategorization, - - # filtering the data to be used - filterModelData = function(mdls) { - return(mdls[ConfigurationName %in% c("OmniAdaptiveModel") & - Group == "CreditCards" & - Direction == "Outbound"]) - }) - -# Write back temp files with the filtered data - not strictly necessary, you -# can also refer to the full files in the call to the notebooks. 
- -tempModelFile <- tempfile(fileext = "_mdls.arrow", tmpdir = working_folder) -arrow::write_ipc_file(dm$modeldata, sink = tempModelFile) -tempPredictorFile <- tempfile(fileext = "_preds.arrow", tmpdir = working_folder) -arrow::write_ipc_file(dm$predictordata, sink = tempPredictorFile) - -# Create Health Check (legacy R version - now superseded by the new Python version) - -rmarkdown::render(healthcheck_notebook_R, - params = list( - modelfile = tempModelFile, - predictordatafile = tempPredictorFile, - title = paste("ADM Health Check", customer, sep = " - "), - subtitle = "legacy R version" - ), - output_dir = working_folder, - output_file = paste("ADM Health Check ", customer, ".html", sep = ""), - quiet = FALSE, intermediates_dir = working_folder -) - -# Individual Model reports - -# In real life situations you probably want to select a subset of the -# models, not run a model report for every possible ADM instance, which -# would typically be in the 100's or 1000's. - -# Below we select 5 of the models from every channel with the largest -# response counts. This is just a simple example that can easily be -# extended. - -recentModels <- filterLatestSnapshotOnly(dm$modeldata)[Positives > 10] -recentModels[, PosRank := frank(-Positives, ties.method="random"), by=c("Direction", "Channel", "ConfigurationName")] -ids <- recentModels[PosRank <= 5, "ModelID"] - -# Associate a name with the model IDs -modelNames <- sapply(ids, function(id) { - make.names(paste( - sapply(unique(dm$modeldata[ - ModelID == id, - c("ConfigurationName", "Channel", "Direction", "Issue", "Group", "Name", "Treatment") - ]), as.character), - collapse = "_" - )) -}) - -# Create a report for every of these models -for (n in seq_along(ids)) { - id <- ids[order(modelNames)][n] - modelName <- modelNames[id] - - cat("Model:", modelName, n, "of", length(ids), fill = T) - - localModelReportHTMLFile <- paste0(customer, "_", modelName, ".html") - - rmarkdown::render(offlinemodelreport_notebook_R, - params = list( - predictordatafile = tempPredictorFile, - modeldescription = modelName, - modelid = id - ), - output_dir = output_folder, - output_file = localModelReportHTMLFile, - quiet = F, intermediates_dir = working_folder - ) -} - -cat("Done. Output is in", output_folder, fill=T) - -
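
Note on PATCH 4/4: the series deletes the `offlinereports.R` batch driver in favor of the `report_utils.R` helpers reworked in PATCH 3/4, but no replacement driver appears in the diffs themselves. The sketch below shows how such a driver might look, using only the functions visible in this series. It is an illustration under stated assumptions: the initialisation call is hypothetical (the comment at the top of `report_utils.R` says to "call this first", but the function itself is outside the hunks shown), and the model-report helpers take additional model-selection arguments that are likewise not visible in these diffs.

```r
# Hypothetical driver script replacing the deleted offlinereports.R, built on
# the report_utils.R helpers touched in PATCH 3/4. Sketch only: anything
# marked "hypothetical" is an assumption, not visible in this series.
library(pdstools)
source("examples/datamart/report_utils.R")

customer <- "SampleCustomer"

# report_utils_init(...)  # hypothetical name: the setup "constructor"
#                         # mentioned at the top of report_utils.R that sets
#                         # globals such as report_utils_results_folder

# Read an ADM datamart export (model + predictor snapshots) from a folder
dm <- ADMDatamart("~/Downloads")

# Health checks; report_utils_run_report() compares generator hashes and file
# modification times, so unchanged reports are touched rather than re-rendered
run_r_healthcheck(customer, dm, quiet = FALSE)
run_python_healthcheck(customer, dm, quiet = FALSE)

# Per-model reports; both helpers take further model-selection arguments that
# are outside the hunks shown in this diff, hence left as placeholders here
# run_r_model_reports(customer, dm, ...)
# run_python_model_reports(customer, dm, ...)

# Remove HTML output older than keep_days and orphaned .hash files, from the
# results folder and from the per-customer report subfolders
report_utils_cleanup_cache(keep_days = 7)
```

With the switch from `target_filename` to `target_fullfilename`, per-model reports now land in a "<customer> - Generated Model Reports" subfolder (created via `R.utils::mkdirs()`), which `report_utils_cleanup_cache()` also sweeps alongside the top-level results folder.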