From 4e0940aab789fce9c7ce595f402c90d9e7cccc95 Mon Sep 17 00:00:00 2001 From: Christine Draper Date: Mon, 22 Apr 2024 10:21:24 -0400 Subject: [PATCH 01/14] Add diagnoser to toolkit --- README.md | 9 +- .../customized_diagnoser.ipynb | 616 ++++++++++++++ examples/example_notebooks/diagnoser.ipynb | 767 ++++++++++++++++++ pyproject.toml | 5 +- whylabs_toolkit/monitor/diagnoser/README.md | 21 + whylabs_toolkit/monitor/diagnoser/__init__.py | 0 .../monitor/diagnoser/constants.py | 4 + .../monitor/diagnoser/converters/__init__.py | 0 .../diagnoser/converters/granularity.py | 53 ++ .../diagnoser/converters/test_granularity.py | 33 + .../monitor/diagnoser/helpers/__init__.py | 0 .../monitor/diagnoser/helpers/describe.py | 30 + .../monitor/diagnoser/helpers/utils.py | 66 ++ .../monitor/diagnoser/models/__init__.py | 2 + .../diagnoser/models/diagnosis_report.py | 228 ++++++ .../diagnoser/models/noisy_monitors.py | 46 ++ .../monitor/diagnoser/monitor_diagnoser.py | 326 ++++++++ .../diagnoser/recommendation/__init__.py | 0 .../recommendation/change_recommender.py | 163 ++++ .../diagnoser/recommendation/manual_change.py | 15 + .../recommendation/recommended_change.py | 59 ++ .../recommendation/remove_columns.py | 29 + .../diagnoser/recommendation/test_changes.py | 21 + .../recommendation/test_remove_columns.py | 38 + .../monitor/diagnoser/targeting.py | 33 + .../monitor/diagnoser/test/__init__.py | 0 .../monitor/diagnoser/test/test_helpers.py | 12 + 27 files changed, 2570 insertions(+), 6 deletions(-) create mode 100644 examples/example_notebooks/customized_diagnoser.ipynb create mode 100644 examples/example_notebooks/diagnoser.ipynb create mode 100644 whylabs_toolkit/monitor/diagnoser/README.md create mode 100644 whylabs_toolkit/monitor/diagnoser/__init__.py create mode 100644 whylabs_toolkit/monitor/diagnoser/constants.py create mode 100644 whylabs_toolkit/monitor/diagnoser/converters/__init__.py create mode 100644 whylabs_toolkit/monitor/diagnoser/converters/granularity.py 
create mode 100644 whylabs_toolkit/monitor/diagnoser/converters/test_granularity.py create mode 100644 whylabs_toolkit/monitor/diagnoser/helpers/__init__.py create mode 100644 whylabs_toolkit/monitor/diagnoser/helpers/describe.py create mode 100644 whylabs_toolkit/monitor/diagnoser/helpers/utils.py create mode 100644 whylabs_toolkit/monitor/diagnoser/models/__init__.py create mode 100644 whylabs_toolkit/monitor/diagnoser/models/diagnosis_report.py create mode 100644 whylabs_toolkit/monitor/diagnoser/models/noisy_monitors.py create mode 100644 whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py create mode 100644 whylabs_toolkit/monitor/diagnoser/recommendation/__init__.py create mode 100644 whylabs_toolkit/monitor/diagnoser/recommendation/change_recommender.py create mode 100644 whylabs_toolkit/monitor/diagnoser/recommendation/manual_change.py create mode 100644 whylabs_toolkit/monitor/diagnoser/recommendation/recommended_change.py create mode 100644 whylabs_toolkit/monitor/diagnoser/recommendation/remove_columns.py create mode 100644 whylabs_toolkit/monitor/diagnoser/recommendation/test_changes.py create mode 100644 whylabs_toolkit/monitor/diagnoser/recommendation/test_remove_columns.py create mode 100644 whylabs_toolkit/monitor/diagnoser/targeting.py create mode 100644 whylabs_toolkit/monitor/diagnoser/test/__init__.py create mode 100644 whylabs_toolkit/monitor/diagnoser/test/test_helpers.py diff --git a/README.md b/README.md index 5484946..8a4bd60 100644 --- a/README.md +++ b/README.md @@ -13,10 +13,11 @@ pip install whylabs_toolkit The available packages that we have enable different use-cases for the `whylabs_toolkit`. To get started, navigate to one of the following sections and find useful tutorials there. 
-| Package | Usage | -|---------------------|----------------------| -| [Monitor Manager](https://github.com/whylabs/whylabs-toolkit/blob/mainline/whylabs_toolkit/monitor/manager/README.md) | Author and modify existing WhyLabs monitor with Python | -| [WhyLabs Helpers](https://github.com/whylabs/whylabs-toolkit/blob/mainline/whylabs_toolkit/helpers/README.md) | Interact with and modify your Datasets and ML Models specs in WhyLabs. | +| Package | Usage | +|---------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------| +| [Monitor Manager](https://github.com/whylabs/whylabs-toolkit/blob/mainline/whylabs_toolkit/monitor/manager/README.md) | Author and modify existing WhyLabs monitor with Python. | +| [Monitor Diagnoser](https://github.com/whylabs/whylabs-toolkit/blob/mainline/whylabs_toolkit/monitor/diagnoser/README.md) | Diagnose problems with monitors. | +| [WhyLabs Helpers](https://github.com/whylabs/whylabs-toolkit/blob/mainline/whylabs_toolkit/helpers/README.md) | Interact with and modify your Datasets and ML Models specs in WhyLabs. | ## Development diff --git a/examples/example_notebooks/customized_diagnoser.ipynb b/examples/example_notebooks/customized_diagnoser.ipynb new file mode 100644 index 0000000..49e6fe4 --- /dev/null +++ b/examples/example_notebooks/customized_diagnoser.ipynb @@ -0,0 +1,616 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# Detecting noisy monitors\n", + "\n", + "This notebook shows how to use the WhyLabs Monitor Diagnoser to customize the diagnosis of a noisy monitor. 
It interacts with the diagnoser to get information on noisy and failing monitors, and to make selections about which monitor, segment and columns to diagnose.\n", + "\n", + "## Install requirements" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "ExecuteTime": { + "end_time": "2024-04-16T15:01:13.012745Z", + "start_time": "2024-04-16T15:01:09.165663Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Obtaining file:///Volumes/Workspace/hack/smart-config\r\n", + " Installing build dependencies ... \u001B[?25ldone\r\n", + "\u001B[?25h Checking if build backend supports build_editable ... \u001B[?25ldone\r\n", + "\u001B[?25h Getting requirements to build editable ... \u001B[?25ldone\r\n", + "\u001B[?25h Installing backend dependencies ... \u001B[?25ldone\r\n", + "\u001B[?25h Preparing editable metadata (pyproject.toml) ... \u001B[?25ldone\r\n", + "\u001B[?25hRequirement already satisfied: tabulate in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (0.9.0)\r\n", + "Requirement already satisfied: pandas in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (2.0.1)\r\n", + "Requirement already satisfied: numpy in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (1.24.3)\r\n", + "Requirement already satisfied: whylabs-client in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (0.6.2)\r\n", + "Requirement already satisfied: whylabs-toolkit in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (0.0.18)\r\n", + "Requirement already satisfied: pydantic<2 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (1.10.14)\r\n", + 
"Requirement already satisfied: isodate in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (0.6.1)\r\n", + "Requirement already satisfied: python-dateutil in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (2.8.2)\r\n", + "Requirement already satisfied: fastapi in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (0.110.0)\r\n", + "Requirement already satisfied: uvicorn[standard] in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (0.28.0)\r\n", + "Requirement already satisfied: requests in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (2.31.0)\r\n", + "Requirement already satisfied: typing-extensions>=4.2.0 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from pydantic<2->WhyLabs-Monitor-Diagnoser==0.0.1) (4.9.0)\r\n", + "Requirement already satisfied: starlette<0.37.0,>=0.36.3 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from fastapi->WhyLabs-Monitor-Diagnoser==0.0.1) (0.36.3)\r\n", + "Requirement already satisfied: six in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from isodate->WhyLabs-Monitor-Diagnoser==0.0.1) (1.16.0)\r\n", + "Requirement already satisfied: pytz>=2020.1 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from pandas->WhyLabs-Monitor-Diagnoser==0.0.1) (2022.7.1)\r\n", + "Requirement already satisfied: tzdata>=2022.1 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from pandas->WhyLabs-Monitor-Diagnoser==0.0.1) (2023.3)\r\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from requests->WhyLabs-Monitor-Diagnoser==0.0.1) (3.3.2)\r\n", + "Requirement 
already satisfied: idna<4,>=2.5 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from requests->WhyLabs-Monitor-Diagnoser==0.0.1) (3.6)\r\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from requests->WhyLabs-Monitor-Diagnoser==0.0.1) (2.2.0)\r\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from requests->WhyLabs-Monitor-Diagnoser==0.0.1) (2024.2.2)\r\n", + "Requirement already satisfied: click>=7.0 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from uvicorn[standard]->WhyLabs-Monitor-Diagnoser==0.0.1) (8.0.4)\r\n", + "Requirement already satisfied: h11>=0.8 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from uvicorn[standard]->WhyLabs-Monitor-Diagnoser==0.0.1) (0.14.0)\r\n", + "Requirement already satisfied: httptools>=0.5.0 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from uvicorn[standard]->WhyLabs-Monitor-Diagnoser==0.0.1) (0.6.1)\r\n", + "Requirement already satisfied: python-dotenv>=0.13 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from uvicorn[standard]->WhyLabs-Monitor-Diagnoser==0.0.1) (1.0.1)\r\n", + "Requirement already satisfied: pyyaml>=5.1 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from uvicorn[standard]->WhyLabs-Monitor-Diagnoser==0.0.1) (6.0)\r\n", + "Requirement already satisfied: uvloop!=0.15.0,!=0.15.1,>=0.14.0 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from uvicorn[standard]->WhyLabs-Monitor-Diagnoser==0.0.1) (0.19.0)\r\n", + "Requirement already satisfied: watchfiles>=0.13 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from uvicorn[standard]->WhyLabs-Monitor-Diagnoser==0.0.1) (0.21.0)\r\n", + "Requirement already satisfied: websockets>=10.4 in 
/Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from uvicorn[standard]->WhyLabs-Monitor-Diagnoser==0.0.1) (12.0)\r\n", + "Requirement already satisfied: jsonschema<5.0.0,>=4.17.3 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from whylabs-toolkit->WhyLabs-Monitor-Diagnoser==0.0.1) (4.21.1)\r\n", + "Requirement already satisfied: whylogs<2.0.0,>=1.1.26 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from whylabs-toolkit->WhyLabs-Monitor-Diagnoser==0.0.1) (1.2.8)\r\n", + "Requirement already satisfied: attrs>=22.2.0 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from jsonschema<5.0.0,>=4.17.3->whylabs-toolkit->WhyLabs-Monitor-Diagnoser==0.0.1) (23.2.0)\r\n", + "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from jsonschema<5.0.0,>=4.17.3->whylabs-toolkit->WhyLabs-Monitor-Diagnoser==0.0.1) (2023.12.1)\r\n", + "Requirement already satisfied: referencing>=0.28.4 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from jsonschema<5.0.0,>=4.17.3->whylabs-toolkit->WhyLabs-Monitor-Diagnoser==0.0.1) (0.33.0)\r\n", + "Requirement already satisfied: rpds-py>=0.7.1 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from jsonschema<5.0.0,>=4.17.3->whylabs-toolkit->WhyLabs-Monitor-Diagnoser==0.0.1) (0.18.0)\r\n", + "Requirement already satisfied: anyio<5,>=3.4.0 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from starlette<0.37.0,>=0.36.3->fastapi->WhyLabs-Monitor-Diagnoser==0.0.1) (3.6.2)\r\n", + "Requirement already satisfied: platformdirs<4.0.0,>=3.5.0 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from whylogs<2.0.0,>=1.1.26->whylabs-toolkit->WhyLabs-Monitor-Diagnoser==0.0.1) (3.11.0)\r\n", + "Requirement already satisfied: protobuf>=3.19.4 in 
/Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from whylogs<2.0.0,>=1.1.26->whylabs-toolkit->WhyLabs-Monitor-Diagnoser==0.0.1) (4.25.2)\r\n", + "Requirement already satisfied: types-requests<3.0.0.0,>=2.30.0.0 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from whylogs<2.0.0,>=1.1.26->whylabs-toolkit->WhyLabs-Monitor-Diagnoser==0.0.1) (2.31.0.20240125)\r\n", + "Requirement already satisfied: whylogs-sketching>=3.4.1.dev3 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from whylogs<2.0.0,>=1.1.26->whylabs-toolkit->WhyLabs-Monitor-Diagnoser==0.0.1) (3.4.1.dev3)\r\n", + "Requirement already satisfied: sniffio>=1.1 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from anyio<5,>=3.4.0->starlette<0.37.0,>=0.36.3->fastapi->WhyLabs-Monitor-Diagnoser==0.0.1) (1.3.0)\r\n", + "Building wheels for collected packages: WhyLabs-Monitor-Diagnoser\r\n", + " Building editable for WhyLabs-Monitor-Diagnoser (pyproject.toml) ... 
\u001B[?25ldone\r\n", + "\u001B[?25h Created wheel for WhyLabs-Monitor-Diagnoser: filename=WhyLabs_Monitor_Diagnoser-0.0.1-0.editable-py3-none-any.whl size=3253 sha256=7b4cbfe8c7d43b46817562de75e01238943321354a771ca71eae6da224702c26\r\n", + " Stored in directory: /private/var/folders/kg/k2sb6xms2650ty85vy98q5qr0000gn/T/pip-ephem-wheel-cache-mw1sol4x/wheels/3b/90/fd/b769d4b005362ce18dbd94fe781f74806d1a79ffbe447812d7\r\n", + "Successfully built WhyLabs-Monitor-Diagnoser\r\n", + "Installing collected packages: WhyLabs-Monitor-Diagnoser\r\n", + " Attempting uninstall: WhyLabs-Monitor-Diagnoser\r\n", + " Found existing installation: WhyLabs-Monitor-Diagnoser 0.0.1\r\n", + " Uninstalling WhyLabs-Monitor-Diagnoser-0.0.1:\r\n", + " Successfully uninstalled WhyLabs-Monitor-Diagnoser-0.0.1\r\n", + "Successfully installed WhyLabs-Monitor-Diagnoser-0.0.1\r\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -e .\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Set up WhyLabs API connection\n", + "\n", + "First, set up the information to connect to WhyLabs. 
Update the org_id, dataset_id and api_key in the following before running it.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [], + "source": [ + "import getpass\n", + "from whylabs_toolkit.monitor.diagnoser.helpers.utils import env_setup\n", + "\n", + "org_id = 'org-0'\n", + "dataset_id = 'model-0'\n", + "api_key = getpass.getpass()\n", + "api_endpoint = 'https://songbird.development.whylabsdev.com'\n", + "\n", + "env_setup(\n", + " org_id=org_id,\n", + " dataset_id=dataset_id,\n", + " api_key=api_key,\n", + " whylabs_endpoint=api_endpoint\n", + ")" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-04-16T15:01:16.123058Z", + "start_time": "2024-04-16T15:01:13.014131Z" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Then initialize the Monitor Diagnoser with the org_id and dataset_id." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [], + "source": [ + "from whylabs_toolkit.monitor.diagnoser.monitor_diagnoser import MonitorDiagnoser\n", + "diagnoser = MonitorDiagnoser(org_id, dataset_id)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-04-16T15:01:16.451964Z", + "start_time": "2024-04-16T15:01:16.124858Z" + } + } + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Running a customized diagnosis\n", + "## Get the recommended diagnostic interval\n", + "\n", + "Get the dataset start/end time, granularity, and a recommended diagnostic interval for the dataset. The diagnoser will use this interval unless you override it by setting the `diagnostic_interval` property." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T15:01:17.021550Z", + "start_time": "2024-04-16T15:01:16.452760Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "(TimeRange(start=datetime.datetime(2020, 10, 8, 0, 0, tzinfo=datetime.timezone.utc), end=datetime.datetime(2024, 4, 15, 21, 0, tzinfo=datetime.timezone.utc)),\n ,\n '2024-03-16T00:00:00.000Z/2024-04-15T00:00:00.000Z')" + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lineage, granularity, interval = diagnoser.choose_dataset_batches()\n", + "lineage, granularity, interval" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get information on noisy and failing monitors\n", + "\n", + "Get information on how many anomalies are detected by each monitor in the dataset. The results are ordered so that the monitors with the most anomalies per column are first (i.e. monitors which are firing on many batches for certain columns). Beyond that, results with a higher average number of anomalies per column are considered noisier." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "outputs": [ + { + "data": { + "text/plain": " monitor_id analyzer_id \n0 adorable-goldenrod-lion-9438 adorable-goldenrod-lion-9438-analyzer \\\n1 unsightly-orchid-gorilla-4971 unsightly-orchid-gorilla-4971-analyzer \n2 concerned-skyblue-penguin-6734 concerned-skyblue-penguin-6734-analyzer \n3 proud-seagreen-carabeef-65 proud-seagreen-carabeef-65-analyzer \n4 kind-cyan-kangaroo-1253 kind-cyan-kangaroo-1253-analyzer \n.. ... ... 
\n93 numerical-drift-monitor-60dfcc numerical-drift-analyzer-60dfcc \n94 stormy-olive-butterfly-8693 stormy-olive-butterfly-8693-analyzer \n95 fine-magenta-nightingale-9708 fine-magenta-nightingale-9708-analyzer \n96 None eager-violet-newt-4599-analyzer \n97 unsightly-bisque-lemur-1917 unsightly-bisque-lemur-1917-analyzer \n\n metric column_count segment_count anomaly_count \n0 frequent_items 2 1 31 \\\n1 frequent_items 3 1 33 \n2 frequent_items 3 1 32 \n3 histogram 1 1 28 \n4 histogram 1 1 28 \n.. ... ... ... ... \n93 histogram 1 1 2 \n94 histogram 1 1 2 \n95 unique_est_ratio 26 1 39 \n96 count_null_ratio 21 1 28 \n97 frequent_items 1 1 1 \n\n max_anomaly_per_column min_anomaly_per_column avg_anomaly_per_column \n0 30 1 15 \\\n1 30 1 11 \n2 30 1 10 \n3 28 28 28 \n4 28 28 28 \n.. ... ... ... \n93 2 2 2 \n94 2 2 2 \n95 2 1 1 \n96 2 1 1 \n97 1 1 1 \n\n action_count action_targets \n0 0 [] \n1 0 [] \n2 0 [] \n3 0 [] \n4 0 [] \n.. ... ... \n93 2 [email, slack] \n94 0 [] \n95 0 [] \n96 0 [] \n97 0 [] \n\n[98 rows x 11 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
monitor_idanalyzer_idmetriccolumn_countsegment_countanomaly_countmax_anomaly_per_columnmin_anomaly_per_columnavg_anomaly_per_columnaction_countaction_targets
0adorable-goldenrod-lion-9438adorable-goldenrod-lion-9438-analyzerfrequent_items2131301150[]
1unsightly-orchid-gorilla-4971unsightly-orchid-gorilla-4971-analyzerfrequent_items3133301110[]
2concerned-skyblue-penguin-6734concerned-skyblue-penguin-6734-analyzerfrequent_items3132301100[]
3proud-seagreen-carabeef-65proud-seagreen-carabeef-65-analyzerhistogram11282828280[]
4kind-cyan-kangaroo-1253kind-cyan-kangaroo-1253-analyzerhistogram11282828280[]
....................................
93numerical-drift-monitor-60dfccnumerical-drift-analyzer-60dfcchistogram1122222[email, slack]
94stormy-olive-butterfly-8693stormy-olive-butterfly-8693-analyzerhistogram1122220[]
95fine-magenta-nightingale-9708fine-magenta-nightingale-9708-analyzerunique_est_ratio261392110[]
96Noneeager-violet-newt-4599-analyzercount_null_ratio211282110[]
97unsightly-bisque-lemur-1917unsightly-bisque-lemur-1917-analyzerfrequent_items1111110[]
\n

98 rows × 11 columns

\n
" + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "noisy_monitors = diagnoser.detect_noisy_monitors()\n", + "noisy_monitors_df = pd.DataFrame.from_records([m.dict() for m in noisy_monitors])\n", + "noisy_monitors_df" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-04-16T15:01:18.724361Z", + "start_time": "2024-04-16T15:01:17.024927Z" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Once you have run `detect_noisy_monitors`, you can retrieve the result at any time via the `noisy_monitors` property. You can also retrieve\n", + " information about monitors with analysis failures using `failed_monitors`. " + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [ + { + "data": { + "text/plain": " monitor_id \n0 energetic-black-cobra-7838 \\\n1 None \n2 good-cornsilk-bear-9359 \n3 elated-gray-baboon-4620 \n4 expensive-tomato-moose-6522 \n5 missing-values-ratio-monitor-v9uywi \n6 curious-lemonchiffon-rabbit-7000 \n7 clear-azure-starling-8883 \n8 light-mintcream-rhinoceros-3655 \n9 handsome-lemonchiffon-eel-4222 \n10 witty-blue-koala-8098 \n11 dark-blanchedalmond-ferret-7729 \n12 eager-limegreen-hedgehog-1312 \n13 famous-yellow-baboon-2243 \n14 gifted-coral-bison-842 \n15 glamorous-orchid-turtle-6425 \n16 inexpensive-maroon-donkey-7562 \n17 inferred-data-type-monitor-vjwbpo \n18 fancy-chocolate-wasp-8247 \n19 None \n20 plain-fuchsia-stinkbug-4064 \n21 stormy-olive-butterfly-8693 \n22 tame-beige-sardine-3501 \n23 tough-green-hare-1322 \n24 uninterested-blueviolet-reindeer-9950 \n25 uninterested-red-alpaca-2523 \n26 unique-estimate-ratio-monitor-ccf7cl \n27 unique-ratio-29f3ef1c-monitor \n28 busy-hotpink-gaur-9703 \n29 happy-snow-grouse-452 \n\n analyzer_id metric \n0 energetic-black-cobra-7838-analyzer unique_est \\\n1 expensive-tomato-moose-6522-analyzer median \n2 
good-cornsilk-bear-9359-analyzer count_null \n3 elated-gray-baboon-4620-analyzer count_null_ratio \n4 csw-analyzer-2 median \n5 missing-values-ratio-analyzer-v9uywi count_null_ratio \n6 curious-lemonchiffon-rabbit-7000-analyzer frequent_items \n7 clear-azure-starling-8883-analyzer frequent_items \n8 light-mintcream-rhinoceros-3655-analyzer frequent_items \n9 handsome-lemonchiffon-eel-4222-analyzer frequent_items \n10 witty-blue-koala-8098-analyzer histogram \n11 dark-blanchedalmond-ferret-7729-analyzer frequent_items \n12 eager-limegreen-hedgehog-1312-analyzer histogram \n13 famous-yellow-baboon-2243-analyzer histogram \n14 gifted-coral-bison-842-analyzer histogram \n15 glamorous-orchid-turtle-6425-analyzer histogram \n16 inexpensive-maroon-donkey-7562-analyzer histogram \n17 inferred-data-type-analyzer-vjwbpo inferred_data_type \n18 fancy-chocolate-wasp-8247-analyzer count \n19 eager-violet-newt-4599-analyzer count_null_ratio \n20 plain-fuchsia-stinkbug-4064-analyzer count_null_ratio \n21 stormy-olive-butterfly-8693-analyzer histogram \n22 tame-beige-sardine-3501-analyzer count_null_ratio \n23 tough-green-hare-1322-analyzer count_null_ratio \n24 uninterested-blueviolet-reindeer-9950-analyzer count \n25 uninterested-red-alpaca-2523-analyzer count_null_ratio \n26 unique-estimate-ratio-analyzer-ccf7cl unique_est_ratio \n27 unique-ratio-29f3ef1c unique_est_ratio \n28 busy-hotpink-gaur-9703-analyzer count_null_ratio \n29 happy-snow-grouse-452-analyzer count_null_ratio \n\n failed_count max_failed_per_column min_failed_per_column \n0 56 28 28 \\\n1 2191 28 7 \n2 2163 28 7 \n3 58 28 2 \n4 1190 28 7 \n5 2609 25 2 \n6 15 15 15 \n7 15 15 15 \n8 70 15 1 \n9 17 15 2 \n10 7 7 7 \n11 7 7 7 \n12 7 7 7 \n13 7 7 7 \n14 7 7 7 \n15 7 7 7 \n16 7 7 7 \n17 2 2 2 \n18 1 1 1 \n19 1 1 1 \n20 1 1 1 \n21 1 1 1 \n22 1 1 1 \n23 1 1 1 \n24 1 1 1 \n25 1 1 1 \n26 1 1 1 \n27 1 1 1 \n28 1 1 1 \n29 1 1 1 \n\n avg_failed_per_column action_count action_targets \n0 28 1 [email] \n1 27 0 [] \n2 27 0 [] 
\n3 19 1 [email] \n4 15 0 [] \n5 24 1 [email] \n6 15 1 [test-sort] \n7 15 1 [test-sort] \n8 8 0 [] \n9 8 0 [] \n10 7 0 [] \n11 7 0 [] \n12 7 0 [] \n13 7 0 [] \n14 7 0 [] \n15 7 0 [] \n16 7 0 [] \n17 2 0 [] \n18 1 0 [] \n19 1 0 [] \n20 1 0 [] \n21 1 0 [] \n22 1 0 [] \n23 1 0 [] \n24 1 1 [christine-test-email] \n25 1 0 [] \n26 1 2 [email, slack] \n27 1 0 [] \n28 1 0 [] \n29 1 0 [] ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
monitor_idanalyzer_idmetricfailed_countmax_failed_per_columnmin_failed_per_columnavg_failed_per_columnaction_countaction_targets
0energetic-black-cobra-7838energetic-black-cobra-7838-analyzerunique_est562828281[email]
1Noneexpensive-tomato-moose-6522-analyzermedian2191287270[]
2good-cornsilk-bear-9359good-cornsilk-bear-9359-analyzercount_null2163287270[]
3elated-gray-baboon-4620elated-gray-baboon-4620-analyzercount_null_ratio58282191[email]
4expensive-tomato-moose-6522csw-analyzer-2median1190287150[]
5missing-values-ratio-monitor-v9uywimissing-values-ratio-analyzer-v9uywicount_null_ratio2609252241[email]
6curious-lemonchiffon-rabbit-7000curious-lemonchiffon-rabbit-7000-analyzerfrequent_items151515151[test-sort]
7clear-azure-starling-8883clear-azure-starling-8883-analyzerfrequent_items151515151[test-sort]
8light-mintcream-rhinoceros-3655light-mintcream-rhinoceros-3655-analyzerfrequent_items7015180[]
9handsome-lemonchiffon-eel-4222handsome-lemonchiffon-eel-4222-analyzerfrequent_items1715280[]
10witty-blue-koala-8098witty-blue-koala-8098-analyzerhistogram77770[]
11dark-blanchedalmond-ferret-7729dark-blanchedalmond-ferret-7729-analyzerfrequent_items77770[]
12eager-limegreen-hedgehog-1312eager-limegreen-hedgehog-1312-analyzerhistogram77770[]
13famous-yellow-baboon-2243famous-yellow-baboon-2243-analyzerhistogram77770[]
14gifted-coral-bison-842gifted-coral-bison-842-analyzerhistogram77770[]
15glamorous-orchid-turtle-6425glamorous-orchid-turtle-6425-analyzerhistogram77770[]
16inexpensive-maroon-donkey-7562inexpensive-maroon-donkey-7562-analyzerhistogram77770[]
17inferred-data-type-monitor-vjwbpoinferred-data-type-analyzer-vjwbpoinferred_data_type22220[]
18fancy-chocolate-wasp-8247fancy-chocolate-wasp-8247-analyzercount11110[]
19Noneeager-violet-newt-4599-analyzercount_null_ratio11110[]
20plain-fuchsia-stinkbug-4064plain-fuchsia-stinkbug-4064-analyzercount_null_ratio11110[]
21stormy-olive-butterfly-8693stormy-olive-butterfly-8693-analyzerhistogram11110[]
22tame-beige-sardine-3501tame-beige-sardine-3501-analyzercount_null_ratio11110[]
23tough-green-hare-1322tough-green-hare-1322-analyzercount_null_ratio11110[]
24uninterested-blueviolet-reindeer-9950uninterested-blueviolet-reindeer-9950-analyzercount11111[christine-test-email]
25uninterested-red-alpaca-2523uninterested-red-alpaca-2523-analyzercount_null_ratio11110[]
26unique-estimate-ratio-monitor-ccf7clunique-estimate-ratio-analyzer-ccf7clunique_est_ratio11112[email, slack]
27unique-ratio-29f3ef1c-monitorunique-ratio-29f3ef1cunique_est_ratio11110[]
28busy-hotpink-gaur-9703busy-hotpink-gaur-9703-analyzercount_null_ratio11110[]
29happy-snow-grouse-452happy-snow-grouse-452-analyzercount_null_ratio11110[]
\n
" + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "failed_monitors_df = pd.DataFrame.from_records([n.dict() for n in diagnoser.failed_monitors])\n", + "failed_monitors_df" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-04-16T15:01:18.736520Z", + "start_time": "2024-04-16T15:01:18.725303Z" + } + } + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From this information, the diagnoser chooses the most noisy monitor that has notification actions to diagnose. This choice can be overridden by setting the `monitor_id_to_diagnose` property of the diagnoser to the desired monitor id. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T15:01:18.740411Z", + "start_time": "2024-04-16T15:01:18.737542Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "'adorable-goldenrod-lion-9438'" + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "diagnoser.monitor_id_to_diagnose" + ] + }, + { + "cell_type": "markdown", + "source": [ + "We can get the monitor object from the diagnoser, to see its display name and any other useful information." 
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 8, + "outputs": [ + { + "data": { + "text/plain": "Monitor(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1676498472577, author='system', description=None), id='adorable-goldenrod-lion-9438', displayName='wrong-drift-crowded-orchid-coyote-2773', tags=None, analyzerIds=['adorable-goldenrod-lion-9438-analyzer'], schedule=ImmediateSchedule(type='immediate'), disabled=None, severity=3, mode=DigestMode(type='DIGEST', filter=None, creationTimeOffset=None, datasetTimestampOffset='P7D', groupBy=None), actions=[])" + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "diagnoser.monitor_to_diagnose" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-04-16T15:01:18.743620Z", + "start_time": "2024-04-16T15:01:18.741222Z" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "We can similarly see the configuration of the analyzer that is being diagnosed.\n" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 9, + "outputs": [ + { + "data": { + "text/plain": "Analyzer(metadata=Metadata(version=2, schemaVersion=1, updatedTimestamp=1713279603124, author='user_c9292ec40407f7b580f0a2c90745ebfba2b9e6ea81c848ef944d31e48a45f98', description=None), id='adorable-goldenrod-lion-9438-analyzer', displayName=None, tags=['featureSelection:all'], schedule=FixedCadenceSchedule(type='fixed', cadence=, exclusionRanges=None), disabled=None, disableTargetRollup=None, targetMatrix=ColumnMatrix(segments=[], type=, include=['*'], exclude=['issue_d', , 'url'], profileId=None), dataReadinessDuration=None, batchCoolDownPeriod=None, backfillGracePeriodDuration=None, config=DriftConfig(schemaVersion=None, params=None, metric=, type=, algorithm='hellinger', threshold=0.7, minBatchSize=1, baseline=TrailingWindowBaseline(datasetId=None, inheritSegment=None, 
type=, size=7, offset=None, exclusionRanges=None)))" + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "diagnoser.analyzer_to_diagnose" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-04-16T15:01:19.544392Z", + "start_time": "2024-04-16T15:01:18.744499Z" + } + } + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get information on noisy and failing segments in the analyzer\n", + "\n", + "Now we use the diagnoser to get information about noisy and failing segments in the analyzer, so we can choose a segment to diagnose. The results are sorted so the segment with the most anomalies for the selected monitor is first." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "outputs": [ + { + "data": { + "text/plain": " segment total_anomalies batch_count\n0 overall 31 30", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
segmenttotal_anomaliesbatch_count
0overall3130
\n
" + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from whylabs_toolkit.monitor.diagnoser.helpers.utils import segment_as_readable_text\n", + "\n", + "noisy_segments = diagnoser.detect_noisy_segments()\n", + "noisy_segments_df = pd.DataFrame.from_records([n.dict() for n in noisy_segments])\n", + "noisy_segments_df['segment'] = [segment_as_readable_text(n.segment.tags) for n in noisy_segments]\n", + "noisy_segments_df" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-04-16T15:01:19.860254Z", + "start_time": "2024-04-16T15:01:19.545452Z" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "The diagnoser chooses the noisiest segment to diagnose. This can be changed by setting the `diagnostic_segment` property." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T15:01:19.863407Z", + "start_time": "2024-04-16T15:01:19.861065Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "'overall'" + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "segment_as_readable_text(diagnoser.diagnostic_segment.tags)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get information on noisy columns \n", + "\n", + "The next step is to get information on the noisy columns within the segment, so we can choose a subset of columns to diagnose. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T15:01:20.057746Z", + "start_time": "2024-04-16T15:01:19.864287Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": " column total_anomalies\n0 issue_d 30\n1 url 1\n2 debt_settlement_flag 0\n3 desc 0\n4 disbursement_method 0\n5 earliest_cr_line 0\n6 emp_length 0\n7 emp_title 0\n8 grade 0\n9 hardship_flag 0\n10 home_ownership 0\n11 id 0\n12 initial_list_status 0\n13 last_credit_pull_d 0\n14 last_pymnt_d 0\n15 loan_status 0\n16 next_pymnt_d 0\n17 purpose 0\n18 pymnt_plan 0\n19 sub_grade 0\n20 term 0\n21 title 0\n22 verification_status 0\n23 verification_status_joint 0\n24 addr_state 0\n25 zip_code 0\n26 application_type 0", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
columntotal_anomalies
0issue_d30
1url1
2debt_settlement_flag0
3desc0
4disbursement_method0
5earliest_cr_line0
6emp_length0
7emp_title0
8grade0
9hardship_flag0
10home_ownership0
11id0
12initial_list_status0
13last_credit_pull_d0
14last_pymnt_d0
15loan_status0
16next_pymnt_d0
17purpose0
18pymnt_plan0
19sub_grade0
20term0
21title0
22verification_status0
23verification_status_joint0
24addr_state0
25zip_code0
26application_type0
\n
" + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "noisy_columns = diagnoser.detect_noisy_columns()\n", + "noisy_columns_df = pd.DataFrame.from_records([n.dict() for n in noisy_columns])\n", + "noisy_columns_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The API limits diagnosis to 100 columns at a time, so we choose the top 100 noisy columns. We could then iterate through other columns if desired." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T15:01:20.061262Z", + "start_time": "2024-04-16T15:01:20.058650Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "['issue_d',\n 'url',\n 'debt_settlement_flag',\n 'desc',\n 'disbursement_method',\n 'earliest_cr_line',\n 'emp_length',\n 'emp_title',\n 'grade',\n 'hardship_flag',\n 'home_ownership',\n 'id',\n 'initial_list_status',\n 'last_credit_pull_d',\n 'last_pymnt_d',\n 'loan_status',\n 'next_pymnt_d',\n 'purpose',\n 'pymnt_plan',\n 'sub_grade',\n 'term',\n 'title',\n 'verification_status',\n 'verification_status_joint',\n 'addr_state',\n 'zip_code',\n 'application_type']" + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "columns = list(noisy_columns_df.column[:100])\n", + "columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Ask for a monitor diagnosis\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T15:03:35.516085Z", + "start_time": "2024-04-16T15:03:30.514723Z" + } + }, + "outputs": [], + "source": [ + "# for now, we need to enforce this to run using local server\n", + "import os\n", + "os.environ['USE_LOCAL_SERVER'] = 'server'\n", + "monitor_report = diagnoser.diagnose(columns)" + ] + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "Diagnosis is for monitor \"wrong-drift-crowded-orchid-coyote-2773\" [adorable-goldenrod-lion-9438] in model-0 org-0, over interval 2024-03-16T00:00:00.000Z/2024-04-15T00:00:00.000Z.\n", + "\n", + "Analyzer is drift configuration for frequent_items metric with TrailingWindow baseline.\n", + "Analyzer \"adorable-goldenrod-lion-9438-analyzer\" targets 123 columns and ran on 27 columns in the diagnosed segment.\n", + "\n", + "\n", + "Diagnostic segment is \"overall\".\n", + "Diagnostic interval contains 30 batches.\n", + "\n", + "Diagnostic interval rollup contains 2494691 rows for the diagnosed columns.\n", + "\n", + "Analysis results summary:\n", + "Found non-failed results for 27 columns and 30 batches.\n", + "Found 31 anomalies in 2 columns, with up to 100.0% (30) batches having anomalies per column and 50.0% (15.0) on average.\n", + "Columns with anomalies are:\n", + "| | 0 |\n", + "|:--------|----:|\n", + "| issue_d | 30 |\n", + "| url | 1 |\n", + "\n", + "No failures were detected.\n", + "\n", + "Conditions that may impact diagnosis quality include:\n", + "\t* analyzer_changed: Analyzer changed within the diagnostic interval - detectors ['stale_analysis', 'changing_discrete', 'low_drift_threshold', 'missing_baseline_batches', 'small_nonnull_batches']\n", + "\n", + "Conditions that may contribute to noise include:\n", + "\t* Condition changing_discrete (many values are unique across batches) for 2 columns: ['issue_d', 'url']\n", + "\n", + "Anomalies for columns with these conditions:\n", + "| | 0 |\n", + "|:--------|----:|\n", + "| issue_d | 30 |\n", + "| url | 1 |\n", + "Accounting for 31 anomalies out of 31\n" + ] + } + ], + "source": [ + "print(monitor_report.describe())" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-04-16T15:03:35.522329Z", + "start_time": "2024-04-16T15:03:35.518688Z" + } + }, + "execution_count": 18 + }, + { + "cell_type": "code", + "outputs": [], + "source": [], + "metadata": { + 
"collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/examples/example_notebooks/diagnoser.ipynb b/examples/example_notebooks/diagnoser.ipynb new file mode 100644 index 0000000..ab86e52 --- /dev/null +++ b/examples/example_notebooks/diagnoser.ipynb @@ -0,0 +1,767 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# Detecting noisy monitors\n", + "\n", + "This notebook shows how to detect noisy monitors in a dataset using the WhyLabs Monitor Diagnoser. It uses the diagnoser to automatically detect the noisiest monitor for a dataset, get a diagnosis of\n", + "the conditions causing the noise, get recommended changes and, where automatable, apply those changes.\n", + "\n", + "## Install requirements" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "ExecuteTime": { + "end_time": "2024-04-16T14:58:15.366726Z", + "start_time": "2024-04-16T14:58:15.361250Z" + } + }, + "outputs": [], + "source": [ + "%pip install ." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Set up the WhyLabs API connection\n", + "\n", + "First, set up the information to connect to WhyLabs. 
Update the org_id, dataset_id and api_key in the following cell before running it.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T14:58:19.211882Z", + "start_time": "2024-04-16T14:58:15.369321Z" + } + }, + "outputs": [], + "source": [ + "import getpass\n", + "from whylabs_toolkit.monitor.diagnoser.helpers.utils import env_setup\n", + "\n", + "org_id = 'org-0'\n", + "dataset_id = 'model-0'\n", + "api_key = getpass.getpass()\n", + "api_endpoint = 'https://songbird.development.whylabsdev.com'\n", + "\n", + "env_setup(\n", + " org_id=org_id,\n", + " dataset_id=dataset_id,\n", + " api_key=api_key,\n", + " whylabs_endpoint=api_endpoint\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Initialize the Monitor Diagnoser with the org_id and dataset_id." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T14:58:19.609165Z", + "start_time": "2024-04-16T14:58:19.213684Z" + } + }, + "outputs": [], + "source": [ + "from whylabs_toolkit.monitor.diagnoser.monitor_diagnoser import MonitorDiagnoser\n", + "diagnoser = MonitorDiagnoser(org_id, dataset_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run the default diagnosis\n", + "\n", + "With no further input, the diagnoser makes a series of calls to identify the noisiest monitor, segment and columns, and then performs a diagnosis." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T14:59:50.931331Z", + "start_time": "2024-04-16T14:59:44.553343Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "MonitorDiagnosisReport(orgId='org-0', datasetId='model-0', analyzerId='adorable-goldenrod-lion-9438-analyzer', interval='2024-03-16T00:00:00.000Z/2024-04-15T00:00:00.000Z', expectedBatchCount=30, diagnosticData=DiagnosticDataSummary(diagnosticSegment=Segment(tags=[]), diagnosticProfile=ProfileSummary(minRowName='issue_d', minRowCount=2494691, maxRowName='issue_d', maxRowCount=2494691), diagnosticBatches=BatchesSummary(minBatchName='issue_d', minBatchCount=30, maxBatchName='issue_d', maxBatchCount=30), analysisResults=AnalysisResultsSummary(results=ResultRecord(diagnosedColumnCount=27, batchCount=30), failures=FailureRecord(totalFailuresCount=0, maxFailuresCount=0, meanFailuresCount=0, byColumnCount=[], byTypeCount=[]), anomalies=AnomalyRecord(totalAnomalyCount=31, maxAnomalyCount=30, meanAnomalyCount=15, batchCount=30, byColumnCount=[('issue_d', 30), ('url', 1)], byColumnBatchCount=[('addr_state', 19), ('application_type', 19), ('debt_settlement_flag', 30), ('desc', 1), ('disbursement_method', 19), ('earliest_cr_line', 19), ('emp_length', 30), ('emp_title', 19), ('grade', 19), ('hardship_flag', 19), ('home_ownership', 19), ('id', 9), ('initial_list_status', 30), ('issue_d', 30), ('last_credit_pull_d', 30), ('last_pymnt_d', 19), ('loan_status', 19), ('next_pymnt_d', 19), ('purpose', 19), ('pymnt_plan', 19), ('sub_grade', 19), ('term', 19), ('title', 19), ('url', 19), ('verification_status', 19), ('verification_status_joint', 30), ('zip_code', 30)])), targetedColumnCount=125), qualityIssues=[], conditions=[ConditionRecord(columns=['issue_d', 'url'], info=None, summary='many values are unique across batches', name='changing_discrete')], monitor=Monitor(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1676498472577, 
author='system', description=None), id='adorable-goldenrod-lion-9438', displayName='wrong-drift-crowded-orchid-coyote-2773', tags=None, analyzerIds=['adorable-goldenrod-lion-9438-analyzer'], schedule=ImmediateSchedule(type='immediate'), disabled=None, severity=3, mode=DigestMode(type='DIGEST', filter=None, creationTimeOffset=None, datasetTimestampOffset='P7D', groupBy=None), actions=[]), analyzer=Analyzer(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1676498472065, author='system', description=None), id='adorable-goldenrod-lion-9438-analyzer', displayName=None, tags=['featureSelection:all'], schedule=FixedCadenceSchedule(type='fixed', cadence=, exclusionRanges=None), disabled=None, disableTargetRollup=None, targetMatrix=ColumnMatrix(segments=[], type=, include=['*'], exclude=[], profileId=None), dataReadinessDuration=None, batchCoolDownPeriod=None, backfillGracePeriodDuration=None, config=DriftConfig(schemaVersion=None, params=None, metric=, type=, algorithm='hellinger', threshold=0.7, minBatchSize=1, baseline=TrailingWindowBaseline(datasetId=None, inheritSegment=None, type=, size=7, offset=None, exclusionRanges=None))), analyzedColumnCount=27)" + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# for now, we need to enforce this to run using local server\n", + "import os\n", + "os.environ['USE_LOCAL_SERVER'] = 'server'\n", + "monitor_report = diagnoser.diagnose()\n", + "monitor_report" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T14:59:50.950021Z", + "start_time": "2024-04-16T14:59:50.932643Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Diagnosis is for monitor \"wrong-drift-crowded-orchid-coyote-2773\" [adorable-goldenrod-lion-9438] in model-0 org-0, over interval 2024-03-16T00:00:00.000Z/2024-04-15T00:00:00.000Z.\n", + "\n", + "Analyzer is drift configuration for 
frequent_items metric with TrailingWindow baseline.\n", + "Analyzer \"adorable-goldenrod-lion-9438-analyzer\" targets 125 columns and ran on 27 columns in the diagnosed segment.\n", + "\n", + "\n", + "Diagnostic segment is \"overall\".\n", + "Diagnostic interval contains 30 batches.\n", + "\n", + "Diagnostic interval rollup contains 2494691 rows for the diagnosed columns.\n", + "\n", + "Analysis results summary:\n", + "Found non-failed results for 27 columns and 30 batches.\n", + "Found 31 anomalies in 2 columns, with up to 100.0% (30) batches having anomalies per column and 50.0% (15.0) on average.\n", + "Columns with anomalies are:\n", + "| | 0 |\n", + "|:--------|----:|\n", + "| issue_d | 30 |\n", + "| url | 1 |\n", + "\n", + "No failures were detected.\n", + "\n", + "No issues impacting diagnosis quality were detected\n", + "Conditions that may contribute to noise include:\n", + "\t* Condition changing_discrete (many values are unique across batches) for 2 columns: ['issue_d', 'url']\n", + "\n", + "Anomalies for columns with these conditions:\n", + "| | 0 |\n", + "|:--------|----:|\n", + "| issue_d | 30 |\n", + "| url | 1 |\n", + "Accounting for 31 anomalies out of 31\n" + ] + } + ], + "source": [ + "print(monitor_report.describe())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2024-03-09T21:59:48.180867Z", + "start_time": "2024-03-09T21:59:48.177537Z" + } + }, + "source": [ + "The monitor report can be serialized to a JSON file for later use." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T14:59:54.558717Z", + "start_time": "2024-04-16T14:59:54.552542Z" + } + }, + "outputs": [], + "source": [ + "with open('monitor_report.json', 'w') as f:\n", + " f.write(monitor_report.json())" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T14:59:56.056487Z", + "start_time": "2024-04-16T14:59:56.051250Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"orgId\": \"org-0\",\n", + " \"datasetId\": \"model-0\",\n", + " \"analyzerId\": \"adorable-goldenrod-lion-9438-analyzer\",\n", + " \"interval\": \"2024-03-16T00:00:00.000Z/2024-04-15T00:00:00.000Z\",\n", + " \"expectedBatchCount\": 30,\n", + " \"diagnosticData\": {\n", + " \"diagnosticSegment\": {\n", + " \"tags\": []\n", + " },\n", + " \"diagnosticProfile\": {\n", + " \"minRowName\": \"issue_d\",\n", + " \"minRowCount\": 2494691,\n", + " \"maxRowName\": \"issue_d\",\n", + " \"maxRowCount\": 2494691\n", + " },\n", + " \"diagnosticBatches\": {\n", + " \"minBatchName\": \"issue_d\",\n", + " \"minBatchCount\": 30,\n", + " \"maxBatchName\": \"issue_d\",\n", + " \"maxBatchCount\": 30\n", + " },\n", + " \"analysisResults\": {\n", + " \"results\": {\n", + " \"diagnosedColumnCount\": 27,\n", + " \"batchCount\": 30\n", + " },\n", + " \"failures\": {\n", + " \"totalFailuresCount\": 0,\n", + " \"maxFailuresCount\": 0,\n", + " \"meanFailuresCount\": 0,\n", + " \"byColumnCount\": [],\n", + " \"byTypeCount\": []\n", + " },\n", + " \"anomalies\": {\n", + " \"totalAnomalyCount\": 31,\n", + " \"maxAnomalyCount\": 30,\n", + " \"meanAnomalyCount\": 15,\n", + " \"batchCount\": 30,\n", + " \"byColumnCount\": [\n", + " [\n", + " \"issue_d\",\n", + " 30\n", + " ],\n", + " [\n", + " \"url\",\n", + " 1\n", + " ]\n", + " ],\n", + " \"byColumnBatchCount\": [\n", + " [\n", + " \"addr_state\",\n", + " 
19\n", + " ],\n", + " [\n", + " \"application_type\",\n", + " 19\n", + " ],\n", + " [\n", + " \"debt_settlement_flag\",\n", + " 30\n", + " ],\n", + " [\n", + " \"desc\",\n", + " 1\n", + " ],\n", + " [\n", + " \"disbursement_method\",\n", + " 19\n", + " ],\n", + " [\n", + " \"earliest_cr_line\",\n", + " 19\n", + " ],\n", + " [\n", + " \"emp_length\",\n", + " 30\n", + " ],\n", + " [\n", + " \"emp_title\",\n", + " 19\n", + " ],\n", + " [\n", + " \"grade\",\n", + " 19\n", + " ],\n", + " [\n", + " \"hardship_flag\",\n", + " 19\n", + " ],\n", + " [\n", + " \"home_ownership\",\n", + " 19\n", + " ],\n", + " [\n", + " \"id\",\n", + " 9\n", + " ],\n", + " [\n", + " \"initial_list_status\",\n", + " 30\n", + " ],\n", + " [\n", + " \"issue_d\",\n", + " 30\n", + " ],\n", + " [\n", + " \"last_credit_pull_d\",\n", + " 30\n", + " ],\n", + " [\n", + " \"last_pymnt_d\",\n", + " 19\n", + " ],\n", + " [\n", + " \"loan_status\",\n", + " 19\n", + " ],\n", + " [\n", + " \"next_pymnt_d\",\n", + " 19\n", + " ],\n", + " [\n", + " \"purpose\",\n", + " 19\n", + " ],\n", + " [\n", + " \"pymnt_plan\",\n", + " 19\n", + " ],\n", + " [\n", + " \"sub_grade\",\n", + " 19\n", + " ],\n", + " [\n", + " \"term\",\n", + " 19\n", + " ],\n", + " [\n", + " \"title\",\n", + " 19\n", + " ],\n", + " [\n", + " \"url\",\n", + " 19\n", + " ],\n", + " [\n", + " \"verification_status\",\n", + " 19\n", + " ],\n", + " [\n", + " \"verification_status_joint\",\n", + " 30\n", + " ],\n", + " [\n", + " \"zip_code\",\n", + " 30\n", + " ]\n", + " ]\n", + " }\n", + " },\n", + " \"targetedColumnCount\": 125\n", + " },\n", + " \"qualityIssues\": [],\n", + " \"conditions\": [\n", + " {\n", + " \"columns\": [\n", + " \"issue_d\",\n", + " \"url\"\n", + " ],\n", + " \"info\": null,\n", + " \"summary\": \"many values are unique across batches\",\n", + " \"name\": \"changing_discrete\"\n", + " }\n", + " ],\n", + " \"monitor\": {\n", + " \"metadata\": {\n", + " \"version\": 1,\n", + " \"schemaVersion\": 1,\n", + " 
\"updatedTimestamp\": 1676498472577,\n", + " \"author\": \"system\",\n", + " \"description\": null\n", + " },\n", + " \"id\": \"adorable-goldenrod-lion-9438\",\n", + " \"displayName\": \"wrong-drift-crowded-orchid-coyote-2773\",\n", + " \"tags\": null,\n", + " \"analyzerIds\": [\n", + " \"adorable-goldenrod-lion-9438-analyzer\"\n", + " ],\n", + " \"schedule\": {\n", + " \"type\": \"immediate\"\n", + " },\n", + " \"disabled\": null,\n", + " \"severity\": 3,\n", + " \"mode\": {\n", + " \"type\": \"DIGEST\",\n", + " \"filter\": null,\n", + " \"creationTimeOffset\": null,\n", + " \"datasetTimestampOffset\": \"P7D\",\n", + " \"groupBy\": null\n", + " },\n", + " \"actions\": []\n", + " },\n", + " \"analyzer\": {\n", + " \"metadata\": {\n", + " \"version\": 1,\n", + " \"schemaVersion\": 1,\n", + " \"updatedTimestamp\": 1676498472065,\n", + " \"author\": \"system\",\n", + " \"description\": null\n", + " },\n", + " \"id\": \"adorable-goldenrod-lion-9438-analyzer\",\n", + " \"displayName\": null,\n", + " \"tags\": [\n", + " \"featureSelection:all\"\n", + " ],\n", + " \"schedule\": {\n", + " \"type\": \"fixed\",\n", + " \"cadence\": \"daily\",\n", + " \"exclusionRanges\": null\n", + " },\n", + " \"disabled\": null,\n", + " \"disableTargetRollup\": null,\n", + " \"targetMatrix\": {\n", + " \"segments\": [],\n", + " \"type\": \"column\",\n", + " \"include\": [\n", + " \"*\"\n", + " ],\n", + " \"exclude\": [\n", + " \"group:output\"\n", + " ],\n", + " \"profileId\": null\n", + " },\n", + " \"dataReadinessDuration\": null,\n", + " \"batchCoolDownPeriod\": null,\n", + " \"backfillGracePeriodDuration\": null,\n", + " \"config\": {\n", + " \"schemaVersion\": null,\n", + " \"params\": null,\n", + " \"metric\": \"frequent_items\",\n", + " \"type\": \"drift\",\n", + " \"algorithm\": \"hellinger\",\n", + " \"threshold\": 0.7,\n", + " \"minBatchSize\": 1,\n", + " \"baseline\": {\n", + " \"datasetId\": null,\n", + " \"inheritSegment\": null,\n", + " \"type\": \"TrailingWindow\",\n", + " 
\"size\": 7,\n", + " \"offset\": null,\n", + " \"exclusionRanges\": null\n", + " }\n", + " }\n", + " },\n", + " \"analyzedColumnCount\": 27\n", + "}\n" + ] + } + ], + "source": [ + "from whylabs_toolkit.monitor.diagnoser.models import MonitorDiagnosisReport\n", + "\n", + "with open('monitor_report.json', 'r') as f:\n", + " monitor_report = MonitorDiagnosisReport.parse_raw(f.read())\n", + "print(monitor_report.json(indent=2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Ask for recommended changes\n", + "\n", + "Given the diagnosis report for the monitor, the ChangeRecommender will recommend changes to make to the monitor. By default it will make recommendations for all columns where it has detected noise-related conditions. Set the `min_anomaly_count` property to restrict this to only columns that caused a certain number of anomalies.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T14:59:59.258082Z", + "start_time": "2024-04-16T14:59:59.248989Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1. Remove columns from the analyzer for ['issue_d', 'url']\n" + ] + } + ], + "source": [ + "from whylabs_toolkit.monitor.diagnoser.recommendation.change_recommender import ChangeRecommender\n", + "\n", + "recommender = ChangeRecommender(monitor_report)\n", + "recommender.min_anomaly_count = 1\n", + "changes = recommender.recommend()\n", + "print('\\n'.join([f'{i+1}. {c.describe()}' for i, c in enumerate(changes)]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2024-03-09T21:56:56.392829Z", + "start_time": "2024-03-09T21:56:56.250185Z" + } + }, + "source": [ + "## Execute automatable changes\n", + "\n", + "A subset of recommended changes can be executed automatically by the recommender. 
Pass the ones you want to make into the `make_changes` call, or pass all changes if you want it to make all of the automatable changes." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T15:00:01.766477Z", + "start_time": "2024-04-16T15:00:01.763192Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Remove columns from the analyzer for ['issue_d', 'url']\n" + ] + } + ], + "source": [ + "automatable_changes = [c for c in changes if c.can_automate()]\n", + "print('\\n'.join([c.describe() for c in automatable_changes]))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T15:00:04.589600Z", + "start_time": "2024-04-16T15:00:02.766087Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Successfully made the following changes:\n", + "\t* Remove columns from the analyzer for ['issue_d', 'url']\n" + ] + } + ], + "source": [ + "change_results = recommender.make_changes(automatable_changes)\n", + "print(change_results.describe())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "start_time": "2024-03-09T21:56:56.394187Z" + } + }, + "source": [ + "Note that the monitor will still appear to the diagnoser as the noisiest monitor until enough time has passed for the impact of the monitor changes to be observed. You may want to use the WhyLabs preview UI to view what impacts may be expected from the change.\n", + "\n", + "## Reviewing other noisy monitors\n", + "\n", + "The diagnoser can be used to review other noisy monitors in the dataset. The `noisy_monitors` property will return a list of the noisiest monitors, and the `monitor_id_to_diagnose` property can be set to the monitor_id of the monitor to diagnose." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T15:00:06.815149Z", + "start_time": "2024-04-16T15:00:06.798273Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": " monitor_id analyzer_id \n0 adorable-goldenrod-lion-9438 adorable-goldenrod-lion-9438-analyzer \\\n1 unsightly-orchid-gorilla-4971 unsightly-orchid-gorilla-4971-analyzer \n2 concerned-skyblue-penguin-6734 concerned-skyblue-penguin-6734-analyzer \n3 proud-seagreen-carabeef-65 proud-seagreen-carabeef-65-analyzer \n4 kind-cyan-kangaroo-1253 kind-cyan-kangaroo-1253-analyzer \n.. ... ... \n93 numerical-drift-monitor-60dfcc numerical-drift-analyzer-60dfcc \n94 stormy-olive-butterfly-8693 stormy-olive-butterfly-8693-analyzer \n95 fine-magenta-nightingale-9708 fine-magenta-nightingale-9708-analyzer \n96 None eager-violet-newt-4599-analyzer \n97 unsightly-bisque-lemur-1917 unsightly-bisque-lemur-1917-analyzer \n\n metric column_count segment_count anomaly_count \n0 frequent_items 2 1 31 \\\n1 frequent_items 3 1 33 \n2 frequent_items 3 1 32 \n3 histogram 1 1 28 \n4 histogram 1 1 28 \n.. ... ... ... ... \n93 histogram 1 1 2 \n94 histogram 1 1 2 \n95 unique_est_ratio 26 1 39 \n96 count_null_ratio 21 1 28 \n97 frequent_items 1 1 1 \n\n max_anomaly_per_column min_anomaly_per_column avg_anomaly_per_column \n0 30 1 15 \\\n1 30 1 11 \n2 30 1 10 \n3 28 28 28 \n4 28 28 28 \n.. ... ... ... \n93 2 2 2 \n94 2 2 2 \n95 2 1 1 \n96 2 1 1 \n97 1 1 1 \n\n action_count action_targets \n0 0 [] \n1 0 [] \n2 0 [] \n3 0 [] \n4 0 [] \n.. ... ... \n93 2 [email, slack] \n94 0 [] \n95 0 [] \n96 0 [] \n97 0 [] \n\n[98 rows x 11 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
monitor_idanalyzer_idmetriccolumn_countsegment_countanomaly_countmax_anomaly_per_columnmin_anomaly_per_columnavg_anomaly_per_columnaction_countaction_targets
0adorable-goldenrod-lion-9438adorable-goldenrod-lion-9438-analyzerfrequent_items2131301150[]
1unsightly-orchid-gorilla-4971unsightly-orchid-gorilla-4971-analyzerfrequent_items3133301110[]
2concerned-skyblue-penguin-6734concerned-skyblue-penguin-6734-analyzerfrequent_items3132301100[]
3proud-seagreen-carabeef-65proud-seagreen-carabeef-65-analyzerhistogram11282828280[]
4kind-cyan-kangaroo-1253kind-cyan-kangaroo-1253-analyzerhistogram11282828280[]
....................................
93numerical-drift-monitor-60dfccnumerical-drift-analyzer-60dfcchistogram1122222[email, slack]
94stormy-olive-butterfly-8693stormy-olive-butterfly-8693-analyzerhistogram1122220[]
95fine-magenta-nightingale-9708fine-magenta-nightingale-9708-analyzerunique_est_ratio261392110[]
96Noneeager-violet-newt-4599-analyzercount_null_ratio211282110[]
97unsightly-bisque-lemur-1917unsightly-bisque-lemur-1917-analyzerfrequent_items1111110[]
\n

98 rows × 11 columns

\n
" + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "noisy_monitors_df = pd.DataFrame.from_records([m.dict() for m in diagnoser.noisy_monitors])\n", + "noisy_monitors_df" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2024-03-09T21:56:56.396893Z", + "start_time": "2024-03-09T21:56:56.395306Z" + } + }, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T15:00:17.594027Z", + "start_time": "2024-04-16T15:00:09.536137Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Diagnosis is for monitor \"unsightly-orchid-gorilla-4971\" [unsightly-orchid-gorilla-4971] in model-0 org-0, over interval 2024-03-16T00:00:00.000Z/2024-04-15T00:00:00.000Z.\n", + "\n", + "Analyzer is drift configuration for frequent_items metric with TrailingWindow baseline.\n", + "Analyzer \"unsightly-orchid-gorilla-4971-analyzer\" targets 30 columns and ran on 26 columns in the diagnosed segment.\n", + "\n", + "\n", + "Diagnostic segment is \"overall\".\n", + "Diagnostic interval contains 30 batches.\n", + "\n", + "Diagnostic interval rollup contains 2494691 rows for the diagnosed columns.\n", + "\n", + "Analysis results summary:\n", + "Found non-failed results for 26 columns and 30 batches.\n", + "Found 33 anomalies in 3 columns, with up to 100.0% (30) batches having anomalies per column and 36.7% (11.0) on average.\n", + "Columns with anomalies are:\n", + "| | 0 |\n", + "|:--------|----:|\n", + "| issue_d | 30 |\n", + "| desc | 2 |\n", + "| url | 1 |\n", + "\n", + "No failures were detected.\n", + "\n", + "No issues impacting diagnosis quality were detected\n", + "Conditions that may contribute to noise include:\n", + "\t* Condition changing_discrete (many values are unique across batches) for 3 columns: ['desc', 'issue_d', 'url']\n", + "\t* Condition 
small_nonnull_batches (less than 500 non-null records in 50% or more of the batches) for 4 columns: ['desc', 'issue_d', 'url', 'desc']\n", + "\n", + "Anomalies for columns with these conditions:\n", + "| | 0 |\n", + "|:--------|----:|\n", + "| issue_d | 30 |\n", + "| desc | 2 |\n", + "| url | 1 |\n", + "Accounting for 33 anomalies out of 33\n" + ] + } + ], + "source": [ + "diagnoser.monitor_id_to_diagnose = noisy_monitors_df.iloc[1]['monitor_id']\n", + "monitor_report = diagnoser.diagnose()\n", + "print(monitor_report.describe())" + ] + }, + { + "cell_type": "markdown", + "source": [ + "You can also use the `noisy_monitors_with_actions` property to prioritize noise in monitors with actions, as these are most likely to cause alert fatigue." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 14, + "outputs": [ + { + "data": { + "text/plain": " monitor_id \n0 energetic-black-cobra-7838 \\\n1 frequent-items-drift-monitor-uu0ax8 \n2 old-crimson-starling-2516 \n3 frequent-items-drift-monitor-48ukw1 \n4 frequent-items-drift-monitor-jepz7t \n5 frequent-items-drift-monitor-pxexvn \n6 frequent-items-drift-monitor-u31vmb \n7 elated-gray-baboon-4620 \n8 nice-burlywood-tarsier-4771 \n9 unique-estimate-ratio-monitor-ccf7cl \n10 numerical-drift-monitor-zy4q8v \n11 uninterested-blueviolet-reindeer-9950 \n12 numerical-drift-monitor-jpodsg \n13 numerical-drift-monitor-60dfcc \n\n analyzer_id metric \n0 energetic-black-cobra-7838-analyzer unique_est \\\n1 frequent-items-drift-analyzer-uu0ax8 frequent_items \n2 old-crimson-starling-2516-analyzer frequent_items \n3 frequent-items-drift-analyzer-48ukw1 frequent_items \n4 frequent-items-drift-analyzer-jepz7t frequent_items \n5 frequent-items-drift-analyzer-pxexvn frequent_items \n6 frequent-items-drift-analyzer-u31vmb frequent_items \n7 elated-gray-baboon-4620-analyzer count_null_ratio \n8 nice-burlywood-tarsier-4771-analyzer unique_est \n9 unique-estimate-ratio-analyzer-ccf7cl 
unique_est_ratio \n10 numerical-drift-analyzer-zy4q8v histogram \n11 uninterested-blueviolet-reindeer-9950-analyzer count \n12 numerical-drift-analyzer-jpodsg histogram \n13 numerical-drift-analyzer-60dfcc histogram \n\n column_count segment_count anomaly_count max_anomaly_per_column \n0 8 1 100 28 \\\n1 3 1 31 28 \n2 3 1 31 28 \n3 3 1 31 28 \n4 3 1 31 28 \n5 3 1 31 28 \n6 3 1 31 28 \n7 15 1 70 28 \n8 7 1 97 26 \n9 104 1 358 7 \n10 3 1 14 6 \n11 101 1 246 6 \n12 1 1 2 2 \n13 1 1 2 2 \n\n min_anomaly_per_column avg_anomaly_per_column action_count \n0 3 12 1 \\\n1 1 10 3 \n2 1 10 1 \n3 1 10 2 \n4 1 10 2 \n5 1 10 2 \n6 1 10 2 \n7 1 4 1 \n8 3 13 2 \n9 1 3 2 \n10 2 4 1 \n11 1 2 1 \n12 2 2 2 \n13 2 2 2 \n\n action_targets \n0 [email] \n1 [email, slack, email-victor-at-whylabs] \n2 [email] \n3 [email, slack] \n4 [email, slack] \n5 [email, slack] \n6 [email, slack] \n7 [email] \n8 [slack, email] \n9 [email, slack] \n10 [email] \n11 [christine-test-email] \n12 [email, slack] \n13 [email, slack] ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
monitor_idanalyzer_idmetriccolumn_countsegment_countanomaly_countmax_anomaly_per_columnmin_anomaly_per_columnavg_anomaly_per_columnaction_countaction_targets
0energetic-black-cobra-7838energetic-black-cobra-7838-analyzerunique_est81100283121[email]
1frequent-items-drift-monitor-uu0ax8frequent-items-drift-analyzer-uu0ax8frequent_items3131281103[email, slack, email-victor-at-whylabs]
2old-crimson-starling-2516old-crimson-starling-2516-analyzerfrequent_items3131281101[email]
3frequent-items-drift-monitor-48ukw1frequent-items-drift-analyzer-48ukw1frequent_items3131281102[email, slack]
4frequent-items-drift-monitor-jepz7tfrequent-items-drift-analyzer-jepz7tfrequent_items3131281102[email, slack]
5frequent-items-drift-monitor-pxexvnfrequent-items-drift-analyzer-pxexvnfrequent_items3131281102[email, slack]
6frequent-items-drift-monitor-u31vmbfrequent-items-drift-analyzer-u31vmbfrequent_items3131281102[email, slack]
7elated-gray-baboon-4620elated-gray-baboon-4620-analyzercount_null_ratio1517028141[email]
8nice-burlywood-tarsier-4771nice-burlywood-tarsier-4771-analyzerunique_est7197263132[slack, email]
9unique-estimate-ratio-monitor-ccf7clunique-estimate-ratio-analyzer-ccf7clunique_est_ratio10413587132[email, slack]
10numerical-drift-monitor-zy4q8vnumerical-drift-analyzer-zy4q8vhistogram31146241[email]
11uninterested-blueviolet-reindeer-9950uninterested-blueviolet-reindeer-9950-analyzercount10112466121[christine-test-email]
12numerical-drift-monitor-jpodsgnumerical-drift-analyzer-jpodsghistogram1122222[email, slack]
13numerical-drift-monitor-60dfccnumerical-drift-analyzer-60dfcchistogram1122222[email, slack]
\n
" + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame.from_records([m.dict() for m in diagnoser.noisy_monitors_with_actions])\n" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-04-16T15:00:17.603562Z", + "start_time": "2024-04-16T15:00:17.595665Z" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/pyproject.toml b/pyproject.toml index 57d45e2..e9dbcc7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,9 @@ [tool.poetry] name = "whylabs-toolkit" version = "0.0.18" -description = "Whylabs CLI and Helpers package." -authors = ["Murilo Mendonca ", "Anthony Naddeo "] +description = "Whylabs Toolkit package." +authors = ["Murilo Mendonca ", "Anthony Naddeo ", + "Christine Draper "] license = "Apache-2.0 license" readme = "README.md" packages = [{include = "whylabs_toolkit/**/*.py"}] diff --git a/whylabs_toolkit/monitor/diagnoser/README.md b/whylabs_toolkit/monitor/diagnoser/README.md new file mode 100644 index 0000000..e534e26 --- /dev/null +++ b/whylabs_toolkit/monitor/diagnoser/README.md @@ -0,0 +1,21 @@ +# Noisy monitor diagnosis + +This package helps users diagnose and fix noisy monitors in WhyLabs. This workflow has the following steps: +* Identify the noisiest monitors for a selected organization and dataset, and choose one to diagnose. +* Identify the noisiest segment of the monitor to be the diagnostic segment. 
+* Within that segment, identify the noisiest columns. +* Identify the conditions contributing to the noise in the diagnostic segment and noisiest columns. +* Determine the appropriate action to take to fix the conditions contributing to the noise. +* Apply the actions to the monitor. + +Most of the above steps are automated by the monitor diagnoser for common noise conditions, although in some cases the +diagnoser may not match the dataset to any known conditions. Users will also usually need to manually consider the +most appropriate action to take to fix the monitor. A recommender is provided to suggest reasonable actions +and to automate some of the basic actions. We are happy to work with you to improve the diagnoser in such cases. + +See [diagnoser.ipynb](/examples/example_notebooks/diagnoser.ipynb) for an end-to-end example of identifying noisy +monitors, diagnosing the conditions contributing to noise, and getting recommendations for fixing them. + +See [customized_diagnoser.ipynb](/examples/example_notebooks/customized_diagnoser.ipynb) for an example of how to +customize the diagnosis for your specific needs. 
def calculate_num_batches(interval: str, granularity: str) -> int:
    """
    Count the complete batches of the given granularity that fit in an ISO 8601 interval.

    The interval has the form "<start>/<end>". The start is a date or a datetime; the end is a
    date, a datetime, or a duration relative to the start (e.g. "2022-01-01T00:00:00Z/P3D").
    Partial batches are not counted.
    :param interval: ISO 8601 interval string
    :param granularity: one of 'hourly', 'daily', 'weekly' or 'monthly'
    :return: number of complete batches between the start and the end of the interval
    :raises ValueError: if the granularity is not one of the supported values
    """
    from datetime import datetime, time, timedelta

    def parse_point(text: str):
        # Decide between date and datetime from the value being parsed, not from its neighbour
        return parse_datetime(text) if 'T' in text else parse_date(text)

    start, end = interval.split('/')
    start_date = parse_point(start)
    try:
        end_date = parse_point(end)
    except ValueError:
        # Not a date or datetime, so the end is a duration relative to the start
        end_date = start_date + parse_duration(end)

    # When a bare date is mixed with a datetime, treat the date as midnight in the
    # datetime's timezone so that the two values can be subtracted.
    start_is_datetime = isinstance(start_date, datetime)
    end_is_datetime = isinstance(end_date, datetime)
    if start_is_datetime and not end_is_datetime:
        end_date = datetime.combine(end_date, time.min, tzinfo=start_date.tzinfo)
    elif end_is_datetime and not start_is_datetime:
        start_date = datetime.combine(start_date, time.min, tzinfo=end_date.tzinfo)

    # relativedelta normalises the difference into years/months/days/hours, so a single
    # component is only the remainder. Use absolute elapsed time for fixed-length batches
    # and fold the years back into the months for calendar months.
    if granularity == 'hourly':
        difference = (end_date - start_date) // timedelta(hours=1)
    elif granularity == 'daily':
        difference = (end_date - start_date).days
    elif granularity == 'weekly':
        difference = (end_date - start_date).days // 7
    elif granularity == 'monthly':
        calendar_delta = relativedelta(end_date, start_date)
        difference = calendar_delta.years * 12 + calendar_delta.months
    else:
        raise ValueError(f"Unsupported granularity: {granularity}")

    return difference
def describe_truncated_list(vals: List[str], num=10) -> str:
    """
    Render a list as text, showing at most the first `num` items.
    :param vals: the values to describe
    :param num: maximum number of values to show in full
    :return: the full list as text, or the first `num` items followed by a count of the rest
    """
    hidden = len(vals) - num
    if hidden > 0:
        shown = vals[0:num]
        return f'{shown} and {hidden} more'
    return str(vals)
def env_setup(org_id: str, dataset_id: str, api_key: str = None, whylabs_endpoint: str = None):
    """
    Set environment variables to work with both whylabs-toolkit and whylogs. Will pick up the API
    key from the environment if not provided as a parameter.
    :param org_id: written to WHYLABS_DEFAULT_ORG_ID and ORG_ID
    :param dataset_id: written to WHYLABS_DEFAULT_DATASET_ID
    :param api_key: written to WHYLABS_API_KEY; when empty, the existing WHYLABS_API_KEY is kept
    :param whylabs_endpoint: when given, written to WHYLABS_API_ENDPOINT and WHYLABS_HOST
    :return: None
    :raises Exception: if no API key is passed and WHYLABS_API_KEY is unset or empty
    """
    # Use .get so that a missing variable reaches the explicit check below instead of
    # surfacing as a bare KeyError.
    resolved_key = api_key if api_key else os.environ.get('WHYLABS_API_KEY')
    if not resolved_key:
        raise Exception('Please provide an API key')
    os.environ['WHYLABS_API_KEY'] = resolved_key
    os.environ['WHYLABS_DEFAULT_ORG_ID'] = org_id
    os.environ['ORG_ID'] = org_id
    os.environ['WHYLABS_DEFAULT_DATASET_ID'] = dataset_id
    if whylabs_endpoint:
        os.environ['WHYLABS_API_ENDPOINT'] = whylabs_endpoint
        os.environ['WHYLABS_HOST'] = whylabs_endpoint
class ProfileSummary(BaseModel):
    """Smallest and largest row counts reported for the diagnostic interval rollup."""
    # name and value of the entry with the fewest rows
    minRowName: str
    minRowCount: int
    # name and value of the entry with the most rows
    maxRowName: str
    maxRowCount: int

    def describe(self) -> str:
        """
        Describe the row count as a single number when min and max agree, otherwise as a range.
        :return: one line of text ending in a newline
        """
        if self.minRowCount == self.maxRowCount:
            rows = str(self.minRowCount)
        else:
            rows = f'{self.minRowCount} - {self.maxRowCount}'
        return f'Diagnostic interval rollup contains {rows} rows for the diagnosed columns.\n'
class AnomalyRecord(BaseModel):
    """Anomaly counts for the diagnosed columns over the diagnostic interval."""
    totalAnomalyCount: int
    maxAnomalyCount: int
    meanAnomalyCount: int
    batchCount: int
    byColumnCount: List[NamedCount]
    byColumnBatchCount: List[NamedCount]

    def describe(self) -> str:
        """
        Summarise the anomalies: totals, per-column maximum and mean as a share of the
        batches, and a (truncated) table of anomaly counts by column.
        :return: multi-line text ending in a newline
        """
        # Index the counts by column name; a Series built from (name, count) tuples would
        # render each row as a tuple instead of a name/count pair.
        counts = pd.Series(
            [c.count for c in self.byColumnCount],
            index=[c.name for c in self.byColumnCount],
            dtype='int64',
        )
        max_count = int(self.maxAnomalyCount)
        mean_count = float(self.meanAnomalyCount)
        # Guard against an empty interval so the summary can still be produced
        if self.batchCount:
            max_pct = max_count * 100 / self.batchCount
            mean_pct = mean_count * 100 / self.batchCount
        else:
            max_pct = 0.0
            mean_pct = 0.0
        return (
            f'Found {self.totalAnomalyCount} anomalies in {len(self.byColumnCount)} columns, with up to '
            f'{max_pct:.1f}% ({max_count}) batches having anomalies per column and '
            f'{mean_pct:.1f}% ({mean_count:.1f}) on average.\n'
            f'Columns with anomalies are:\n{describe_truncated_table(counts)}\n')
class AnalyzerDiagnosisReport(BaseModel):
    """Diagnosis of one analyzer: the data it was based on, quality issues and noise conditions."""
    orgId: str
    datasetId: str
    analyzerId: str
    interval: str
    expectedBatchCount: int
    diagnosticData: DiagnosticDataSummary
    qualityIssues: List[QualityIssueRecord]
    conditions: List[ConditionRecord]

    def describe(self) -> str:
        """
        Describe the diagnostic data, then the quality issues, then the noise conditions.
        :return: the three descriptions joined by newlines
        """
        text = '\n'.join(
            [self.diagnosticData.describe(),
             self.describe_quality_issues(), self.describe_conditions()])
        return text

    def describe_quality_issues(self) -> str:
        """
        List the issues that may affect the quality of the diagnosis.
        :return: one bullet per issue, or a fixed message when there are none
        """
        if len(self.qualityIssues) == 0:
            return 'No issues impacting diagnosis quality were detected'
        text = 'Conditions that may impact diagnosis quality include:\n'
        for issue in self.qualityIssues:
            text += f'\t* {issue.name}: {issue.description} - detectors {issue.detectors}\n'
        return text

    def describe_conditions(self) -> str:
        """
        List the detected noise conditions with the columns each one applies to, followed by
        the anomaly counts for all columns that have at least one condition.
        :return: multi-line text, or a fixed message when there are no conditions
        """
        if len(self.conditions) == 0:
            return 'No conditions related to noise were detected.'
        text = 'Conditions that may contribute to noise include:\n'
        all_cols = []
        for condition in self.conditions:
            text += f'\t* Condition {condition.name} ({condition.summary})'
            if condition.columns is not None:
                # Report only this condition's columns; the running list is kept separately
                col_text = describe_truncated_list(condition.columns, 10)
                text += f' for {len(condition.columns)} columns: {col_text}'
                all_cols.extend(condition.columns)
            text += '\n'

        # De-duplicate while keeping first-seen order
        unique_cols = list(dict.fromkeys(all_cols))
        if len(unique_cols) > 0:
            text += f'\nAnomalies for columns with these conditions:\n'
            analysis = self.diagnosticData.analysisResults
            by_column = analysis.anomalies.byColumnCount if analysis is not None else []
            # Built from two lists so that an empty set of counts is still a valid Series
            count_by_col = pd.Series(
                [c.count for c in by_column],
                index=[c.name for c in by_column],
                dtype='int64',
            )
            cols_with_count = filter_by_index(unique_cols, count_by_col).sort_values(
                ascending=False)
            # rename returns a new Series, so the result has to be kept
            cols_with_count = cols_with_count.rename('anomalies')
            text += describe_truncated_table(cols_with_count)
            text += (f'\nAccounting for {cols_with_count.sum()} anomalies out of '
                     f'{count_by_col.sum()}\n')

        return text
def describe_analyzer(self) -> str:
    """
    Describe the analyzer's configuration type, metric and baseline, and, for column-targeted
    analyzers, how many columns it targets and ran on.
    :return: text ending in a newline; a fixed message when no analyzer is attached
    """
    # analyzer is declared Optional, so the report must still be describable without one
    if self.analyzer is None:
        return 'Analyzer configuration is not available.\n'
    config = self.analyzer.config
    # Not every configuration carries a baseline or a metric, so read both defensively
    baseline_config = getattr(config, 'baseline', None)
    if isinstance(config, FixedThresholdsConfig) or baseline_config is None:
        baseline = 'no baseline'
    else:
        baseline = f'{baseline_config.type} baseline'
    # need to add better support for composite analyzers
    targeting_desc = ''
    if self.analyzer.targetMatrix is not None and self.analyzer.targetMatrix.type == TargetLevel.column:
        targeting_desc = (f'\nAnalyzer "{self.analyzer.id}" targets {self.diagnosticData.targetedColumnCount} '
                          f'columns and ran on {self.analyzedColumnCount} columns in the diagnosed segment.\n')
    metric = getattr(config, 'metric', None)
    if metric is None:
        text = f'Analyzer is {config.type} configuration with {baseline}.'
    else:
        text = f'Analyzer is {config.type} configuration for {metric} metric with {baseline}.'
    text += targeting_desc
    text += '\n'
    return text
whylabs_client.model.analyzer_segments_diagnostic_request import AnalyzerSegmentsDiagnosticRequest +from whylabs_client.model.analyzer_segments_diagnostic_response import AnalyzerSegmentsDiagnosticResponse +from whylabs_client.model.analyzers_diagnostic_response import AnalyzersDiagnosticResponse +from whylabs_client.model.diagnostic_interval_request import DiagnosticIntervalRequest +from whylabs_client.model.diagnostic_interval_response import DiagnosticIntervalResponse +from whylabs_client.model.analyzers_diagnostic_request import AnalyzersDiagnosticRequest +from whylabs_client.model.segment import Segment as WhyLabsSegment +from whylabs_client.model.segment_tag import SegmentTag as WhyLabsSegmentTag +from whylabs_toolkit.helpers.utils import get_monitor_api, get_models_api +from whylabs_toolkit.monitor.models import TimeRange, Monitor, Segment, Analyzer, EntitySchema +from whylabs_toolkit.utils.granularity import Granularity + +from whylabs_toolkit.monitor.diagnoser.helpers.utils import get_monitor_diagnostics_api, segment_as_readable_text +from whylabs_toolkit.monitor.diagnoser.converters.granularity import time_period_to_granularity +from whylabs_toolkit.monitor.diagnoser.constants import DEFAULT_BATCHES +from whylabs_toolkit.monitor.diagnoser.models import NoisyMonitorStats, FailedMonitorStats, FailedSegmentStats, \ + NoisySegmentStats, NoisyColumnStats, MonitorDiagnosisReport +from whylabs_toolkit.monitor.diagnoser.targeting import targeted_columns + + +class MonitorDiagnoser: + def __init__(self, org_id: str, dataset_id: str): + self.org_id: str = org_id + self.dataset_id: str = dataset_id + self.desired_batches: int = DEFAULT_BATCHES + self.granularity: Optional[Granularity] = None + self._diagnostics_api = get_monitor_diagnostics_api() + self._monitor_api = get_monitor_api() + self._models_api = get_models_api() + self._monitor_configs = None + self._noisy_monitors: Optional[List[NoisyMonitorStats]] = None + self._failed_monitors: 
@property
def noisy_segments(self) -> List[NoisySegmentStats]:
    """
    The noisy segment statistics stored by "detect_noisy_segments".
    :return: list of noisy segment statistics
    :raises Exception: if "detect_noisy_segments" has not stored a result yet
    """
    if self._noisy_segments is None:
        # The message names segments; it previously referred to monitors
        raise Exception('Run "detect_noisy_segments" first to get the noisy segments.')
    return self._noisy_segments
@diagnostic_segment.setter
def diagnostic_segment(self, segment: Segment):
    # Setting a different segment discards the noisy columns and diagnosis held for the
    # previous one; setting the same segment again leaves them untouched.
    changed = self._diagnostic_segment != segment
    if not changed:
        return
    self._diagnostic_segment = segment
    self._noisy_columns = None
    self._diagnosis = None
@property
def monitor_to_diagnose(self) -> Monitor:
    """
    The configuration of the monitor selected for diagnosis.
    :return: the entry of monitor_configs whose id matches the selected monitor id
    :raises Exception: if no monitor configuration has that id
    """
    # Give next() a default so that a missing monitor is reported explicitly rather than
    # leaking StopIteration out of a property.
    monitor = next((m for m in self.monitor_configs if m.id == self._monitor_id), None)
    if monitor is None:
        raise Exception(f'No monitor configuration found for monitor {self._monitor_id}')
    return monitor
+ :return: tuple of lineage, granularity, interval + """ + # get recommended diagnostic interval and the dataset's batch frequency + resp: DiagnosticIntervalResponse = self._diagnostics_api.recommend_diagnostic_interval( + self.org_id, + DiagnosticIntervalRequest(dataset_id=self.dataset_id, batches=self.desired_batches) + ) + time_period = resp.time_period + self._diagnostic_interval = resp.interval + if resp.start_timestamp is None or resp.end_timestamp is None: + raise Exception('No existing batch data') + + lineage = TimeRange(start=resp.start_timestamp, end=resp.end_timestamp) + self.granularity = time_period_to_granularity(time_period) + + return lineage, self.granularity, self._diagnostic_interval + + def detect_noisy_monitors(self) -> List[NoisyMonitorStats]: + """ + Detects noisy monitors for the dataset. The summary statistics are returned and made available in the + noisy_monitors property. + :return: List of noisy monitor statistics, ordered with the noisiest first + """ + + def merge_monitor_actions(item: Dict, mon_acts: List[Dict]) -> Dict: + monitor_action = next((m for m in mon_acts if m['analyzer_id'] == item['analyzer_id']), None) + if monitor_action: + item.update(monitor_action) + else: + item['action_count'] = 0 + item['action_targets'] = [] + return item + + if self._diagnostic_interval is None: + self.choose_dataset_batches() + resp: AnalyzersDiagnosticResponse = self._diagnostics_api.detect_noisy_analyzers( + self.org_id, AnalyzersDiagnosticRequest(dataset_id=self.dataset_id, interval=self._diagnostic_interval)) + monitor_actions = [{ + 'monitor_id': m.id, + 'analyzer_id': m.analyzerIds[0] if len(m.analyzerIds) > 0 else None, + 'action_count': len(m.actions), + 'action_targets': [a.target for a in m.actions] + } for m in self.monitor_configs] + self._noisy_monitors = [NoisyMonitorStats.parse_obj(merge_monitor_actions(item.to_dict(), monitor_actions)) + for item in resp.noisy_analyzers] + self._failed_monitors = 
def get_analyzer_id_for_monitor(self) -> str:
    """
    Find the analyzer attached to the monitor that is being diagnosed.

    :return: the first entry of ``analyzerIds`` of the monitor config whose id is ``monitor_id_to_diagnose``
    :raises Exception: if no monitor config has that id, or the matching monitor has no analyzers
    """
    monitor_id = self.monitor_id_to_diagnose
    # A monitor may have an empty analyzerIds list (detect_noisy_monitors guards for this too);
    # skip it here so the caller gets the descriptive error below rather than a bare IndexError.
    analyzer_id = next(
        (m.analyzerIds[0] for m in self.monitor_configs if m.id == monitor_id and m.analyzerIds),
        None,
    )
    if analyzer_id is None:
        raise Exception(f'No analyzer found for monitor {monitor_id}')
    return analyzer_id
def diagnose(self, columns: Optional[List[str]] = None) -> MonitorDiagnosisReport:
    """
    Diagnose the selected monitor for the chosen interval, segment and columns.

    Whichever of the diagnostic interval, monitor and segment has not been selected yet is chosen
    first by running the corresponding detection step.

    :param columns: columns to diagnose; when None the detected noisy columns are used. In both
        cases only the first 100 columns are diagnosed.
    :return: the diagnosis report (also kept in ``self._diagnosis``)
    :raises NotImplementedError: if the USE_LOCAL_SERVER environment variable is not set
    :raises Exception: if USE_LOCAL_SERVER is set but the server library cannot be imported
    """
    if self._diagnostic_interval is None:
        self.choose_dataset_batches()
    if self._monitor_id is None:
        self.detect_noisy_monitors()
    if self._diagnostic_segment is None:
        self.detect_noisy_segments()
    if columns is None:
        if self._noisy_columns is None:
            self.detect_noisy_columns()
        self._diagnosed_columns = [c.column for c in self._noisy_columns[:100]]
    else:
        self._diagnosed_columns = columns[:100]
    use_local_server = os.environ.get('USE_LOCAL_SERVER', False)
    if use_local_server:
        # Call the server function directly if configured to do so (for testing)
        try:
            from smart_config.server.server import DiagnosisRequest
            from smart_config.server.diagnosis.analyzer_diagnoser import AnalyzerDiagnoser
            if use_local_server == 'library':
                # Call server code directly
                analyzer_diagnoser = AnalyzerDiagnoser(
                    self.org_id, self.dataset_id, self.get_analyzer_id_for_monitor(), self.diagnostic_interval,
                    os.environ['WHYLABS_API_KEY']
                )
                analyzer_diagnoser.assemble_data([t for t in self.diagnostic_segment.tags], self._diagnosed_columns)
                analyzer_diagnoser.run_detectors()
                report = analyzer_diagnoser.summarize_diagnosis()
                report_dict = report.dict()
            else:
                # Call local instance of server
                from smart_config.server.service.diagnosis_service import DiagnosisService
                diagnosis_service = DiagnosisService(options={
                    'headers': {'Accept': 'application/json', 'Content-Type': 'application/json',
                                'X-API-KEY': os.environ['WHYLABS_API_KEY']}})
                report_dict = diagnosis_service.diagnose_sync(
                    DiagnosisRequest(
                        orgId=self.org_id,
                        datasetId=self.dataset_id,
                        analyzerId=self.get_analyzer_id_for_monitor(),
                        interval=self.diagnostic_interval,
                        columns=self._diagnosed_columns,
                        segment=self.diagnostic_segment,
                        granularity=self.granularity,
                    ))
        except ImportError as err:
            # Chain the original error so the name of the missing module is not lost
            raise Exception('USE_LOCAL_SERVER is set but server library is not available.') from err
    else:
        # TODO implement call through songbird/whylabs-client instead of direct
        # Call the diagnosis API via whyLabs client
        raise NotImplementedError('Diagnosis API call not implemented')

    # _noisy_columns is only populated when column detection has run. When the caller supplied the
    # columns it can still be None, so fall back to the number of columns actually diagnosed.
    # NOTE(review): confirm this is the intended count when detection was skipped.
    noisy_columns = self._noisy_columns
    analyzed_column_count = len(self._diagnosed_columns) if noisy_columns is None else len(noisy_columns)

    self._diagnosis = MonitorDiagnosisReport(
        **report_dict,
        analyzer=self.analyzer_to_diagnose,
        monitor=self.monitor_to_diagnose,
        analyzedColumnCount=analyzed_column_count
    )
    return self._diagnosis
class ChangeResults(NamedTuple):
    """Outcome of applying a list of recommended changes, grouped by what happened to each."""

    succeeded: List[RecommendedChange]
    failed: List[RecommendedChange]
    errors: List[str]
    manual: List[RecommendedChange]

    def describe(self) -> str:
        """
        Render the results as human-readable text.

        Each non-empty group gets a heading followed by one tab-indented bullet per entry; failed
        changes are followed by the list of error messages.

        :return: the description, or an empty string when every group is empty
        """

        def bullets(lines) -> str:
            # One bullet per line at the same indent. Joining pre-indented items with '\n\t'
            # pushed every bullet after the first one tab further in.
            return ''.join(f'\t* {line}\n' for line in lines)

        description = ''
        if self.succeeded:
            description += 'Successfully made the following changes:\n'
            description += bullets(c.describe() for c in self.succeeded)
        if self.failed:
            description += 'Failed to make the following changes:\n'
            description += bullets(c.describe() for c in self.failed)
            description += 'Errors:\n'
            description += bullets(self.errors)
        if self.manual:
            description += 'The following changes require manual intervention:\n'
            description += bullets(c.describe() for c in self.manual)
        return description
def recommend(self) -> List[RecommendedChange]:
    """
    Recommend one change per diagnosed condition, limited to columns with enough anomalies.

    A column qualifies when its anomaly count in the report is at least ``min_anomaly_count``.
    Conditions are visited in the order given by ``_sort_conditions``; a condition with no
    qualifying columns produces no change.

    NOTE: the filtered column list is written back to each condition of the report.

    :return: the recommended changes; empty when the report has no per-column anomaly counts
    """
    count_tuples = [c.to_tuple() for c in self.report.diagnosticData.analysisResults.anomalies.byColumnCount]
    min_count = self.min_anomaly_count
    # Plain set lookup: unlike unpacking zip(*count_tuples), this is safe when there are no counts.
    cols_to_address = {col for col, count in count_tuples if count >= min_count}
    changes = []
    # find the best actions for the cols that pass min anomaly criteria
    for c in self._sort_conditions(self.report.conditions):
        # keep the condition's own column order, dropping duplicates and non-qualifying columns
        c.columns = list(dict.fromkeys(col for col in (c.columns or []) if col in cols_to_address))
        if c.columns:
            changes.append(self._best_change_for_condition(c))
    return changes
def _add_new_monitor(self, new_analyzer: Analyzer):
    """
    Create a monitor for a newly generated analyzer and save both through the monitor API.

    The new monitor is a copy of the diagnosed monitor (when there is one) whose id is replaced
    by the analyzer's id; the monitor is saved first, then the analyzer.

    :param new_analyzer: analyzer config to save; its id is also used as the monitor id
    """
    if self.monitor:
        # dict() already contains 'id', so override it inside the mapping; passing id= as a
        # separate keyword as well raises "got multiple values for keyword argument 'id'".
        # NOTE(review): the copy keeps the original monitor's analyzerIds - confirm whether they
        # should point at new_analyzer.id instead.
        new_monitor = Monitor(**{**self.monitor.dict(), 'id': new_analyzer.id})
    else:
        new_monitor = Monitor(id=new_analyzer.id)
    self.monitor_api.put_monitor(
        org_id=self.org_id,
        dataset_id=self.dataset_id,
        monitor_id=new_analyzer.id,  # use same id as the analyzer
        body=new_monitor.json(exclude_none=True),
    )
    # Go through the lazy monitor_api property, like every other API call in this class
    self.monitor_api.put_analyzer(
        org_id=self.org_id,
        dataset_id=self.dataset_id,
        analyzer_id=new_analyzer.id,
        body=new_analyzer.json(exclude_none=True),
    )
class RecommendedChange:
    """
    Base class for a change recommended to address a diagnosed condition.

    Subclasses set ``name`` and ``summary`` (a format string filled from ``info``), list the
    ``info`` keys they need in ``required_info``, and set ``manual = False`` when
    ``generate_config`` can apply the change automatically.
    """
    name = ''
    summary = ''
    manual = True
    required_info = []

    @classmethod
    def from_condition(cls, condition: ConditionRecord):
        """Build a change from the columns and info of a diagnosed condition."""
        return cls(condition.columns, condition.info)

    def __init__(self, columns: List[str], info: Optional[dict] = None):
        """
        :param columns: columns the change applies to
        :param info: optional extra details used to summarize or automate the change
        """
        self.columns = columns
        self.info = info

    def merge(self, change: RecommendedChange) -> RecommendedChange:
        """
        Combine this change with another change of the same kind into a new change.

        :param change: change to merge in; on conflicting info keys its values win
        :return: a new change of the same class with the union of both column lists
        :raises ValueError: if the two changes have different names
        """
        if change.name != self.name:
            raise ValueError(f'Cannot merge {self.name} and {change.name}')
        # Build the result with type(self) so the merged change keeps the subclass's name,
        # summary and automation behaviour instead of degrading to the base class.
        merged = type(self)(list(set(self.columns) | set(change.columns)), self.info)
        merged.merge_info(change.info)
        return merged

    def merge_info(self, info: Optional[dict]) -> Optional[dict]:
        """Add ``info`` to this change's info (new values win) and return the result."""
        if self.info is None:
            self.info = info
        elif info is not None:
            self.info = {**self.info, **info}
        return self.info

    def summarize(self) -> str:
        """Return ``summary`` formatted with the entries of ``info``."""
        info = self.info if self.info else {}
        return self.summary.format(**info)

    def describe(self) -> str:
        """Return the summary followed by a truncated list of the affected columns."""
        return f'{self.summarize()} for {describe_truncated_list(self.columns)}'

    def can_automate(self) -> bool:
        """Return True if the change is not manual and ``info`` has every required key."""
        if self.manual:
            return False
        # info is a dict (or None), so test key membership; getattr() on a dict never finds keys
        info = self.info if self.info is not None else {}
        return all(f in info for f in self.required_info)

    def _check_can_do(self, analyzer: Analyzer) -> bool:
        """
        Check that the change can be applied automatically.

        :return: True when it can
        :raises Exception: if the change is manual or required info is missing
        """
        if self.manual:
            raise Exception(f'{self.name} has not been automated')
        if not self.can_automate():
            raise Exception(f'{self.name} requires extra information '
                            f'{[f for f in self.required_info if self.info is None or f not in self.info.keys()]}')
        return True

    def generate_config(self, analyzer: Analyzer) -> List[Analyzer]:
        """
        Produce the analyzer config(s) resulting from this change.

        The base implementation only validates and returns the analyzer unchanged.
        """
        self._check_can_do(analyzer)
        return [analyzer]
list(set(analyzer.targetMatrix.exclude) | (to_remove - remove_includes)) + # if nothing's left to target, just remove the analyzer + if len(analyzer.targetMatrix.include) == 0: + return [] + return [analyzer] diff --git a/whylabs_toolkit/monitor/diagnoser/recommendation/test_changes.py b/whylabs_toolkit/monitor/diagnoser/recommendation/test_changes.py new file mode 100644 index 0000000..6f77d6f --- /dev/null +++ b/whylabs_toolkit/monitor/diagnoser/recommendation/test_changes.py @@ -0,0 +1,21 @@ +from whylabs_toolkit.monitor.diagnoser.models import ConditionRecord +from whylabs_toolkit.monitor.diagnoser.recommendation.recommended_change import RecommendedChange + + +def test_from_condition(): + info = {'k1': 3} + condition = ConditionRecord(name="fixed_threshold_mismatch", summary='a mismatch', columns=['col1', 'col3', 'col4'], info=info) + change = RecommendedChange.from_condition(condition) + assert change.columns == condition.columns + assert change.info == condition.info + + +def test_merge_changes(): + change1 = RecommendedChange(columns=['c1', 'c2'], info={'f1': 1, 'f2': 2}) + change2 = RecommendedChange(columns=['c1', 'c3'], info={'f1': 0, 'f3': 3}) + merged = change1.merge(change2) + assert change1.columns == ['c1', 'c2'] + assert change2.columns == ['c1', 'c3'] + assert set(merged.columns) == {'c1', 'c2', 'c3'} + assert merged.info == {'f1': 0, 'f2': 2, 'f3': 3} + diff --git a/whylabs_toolkit/monitor/diagnoser/recommendation/test_remove_columns.py b/whylabs_toolkit/monitor/diagnoser/recommendation/test_remove_columns.py new file mode 100644 index 0000000..20788c7 --- /dev/null +++ b/whylabs_toolkit/monitor/diagnoser/recommendation/test_remove_columns.py @@ -0,0 +1,38 @@ +from typing import Optional + +from whylabs_toolkit.monitor.models import Analyzer + +from whylabs_toolkit.monitor.diagnoser.models import ConditionRecord +from whylabs_toolkit.monitor.diagnoser.recommendation.remove_columns import RemoveColumns + + +def gen_analyzer(metric='mean', config: 
def test_remove_columns():
    """Removing columns drops them from ``include`` and adds the rest to ``exclude``."""
    analyzer = gen_analyzer(target_matrix={'type': 'column', 'include': ['col1', 'col2'], 'exclude': ['col3']})
    condition = ConditionRecord(name='fixed_threshold', summary='', columns=['col1', 'col3', 'col4'])
    change = RemoveColumns.from_condition(condition)
    result = change.generate_config(analyzer)
    assert len(result) == 1
    updated = result[0]
    assert updated.targetMatrix.include == ['col2']
    # list.sort() sorts in place and returns None, so comparing two .sort() results is always
    # None == None. Compare a sorted copy so the assertion really checks the excludes.
    assert sorted(updated.targetMatrix.exclude) == ['col3', 'col4']
def targeted_columns(target_matrix: Union[ColumnMatrix, DatasetMatrix], schema: EntitySchema) -> List[str]:
    """
    Resolve a target matrix to the concrete column names it targets.

    Each include and exclude entry is expanded with ``expand_target``; the result is every
    included column that is not excluded.

    :param target_matrix: the analyzer's target matrix, or None
    :param schema: entity schema used to expand wildcard and group targets
    :return: sorted list of targeted column names; an empty list when ``target_matrix`` is None,
        and the single internal dataset-metrics name for a dataset level matrix
    """
    if target_matrix is None:
        return []
    if target_matrix.type == TargetLevel.dataset:
        return ['__internal__datasetMetrics']
    columns = set()
    # 'or []' gives include the same None tolerance that exclude already had
    for include in target_matrix.include or []:
        columns.update(expand_target(include, schema))
    for exclude in target_matrix.exclude or []:
        columns.difference_update(expand_target(exclude, schema))
    # Sort so the result does not depend on set iteration order, which varies between runs
    return sorted(columns)
examples/example_notebooks/diagnoser.ipynb | 513 +------ poetry.lock | 283 ++-- pyproject.toml | 16 +- .../monitor/diagnoser}/__init__.py | 0 .../monitor/diagnoser/converters/__init__.py | 0 .../diagnoser/converters/test_granularity.py | 0 .../diagnoser/recommendation/__init__.py | 0 .../diagnoser/recommendation/test_changes.py | 0 .../recommendation/test_remove_columns.py | 0 .../monitor/diagnoser}/test_helpers.py | 0 12 files changed, 1357 insertions(+), 659 deletions(-) rename {whylabs_toolkit/monitor/diagnoser/test => tests/monitor/diagnoser}/__init__.py (100%) create mode 100644 tests/monitor/diagnoser/converters/__init__.py rename {whylabs_toolkit => tests}/monitor/diagnoser/converters/test_granularity.py (100%) create mode 100644 tests/monitor/diagnoser/recommendation/__init__.py rename {whylabs_toolkit => tests}/monitor/diagnoser/recommendation/test_changes.py (100%) rename {whylabs_toolkit => tests}/monitor/diagnoser/recommendation/test_remove_columns.py (100%) rename {whylabs_toolkit/monitor/diagnoser/test => tests/monitor/diagnoser}/test_helpers.py (100%) diff --git a/Makefile b/Makefile index 5370fbb..c20d5f3 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ format-fix: poetry run autoflake --in-place --remove-unused-variables $(PY_SOURCE) setup: - poetry install + poetry install -E diagnoser test: poetry run pytest diff --git a/examples/example_notebooks/customized_diagnoser.ipynb b/examples/example_notebooks/customized_diagnoser.ipynb index 49e6fe4..ade9188 100644 --- a/examples/example_notebooks/customized_diagnoser.ipynb +++ b/examples/example_notebooks/customized_diagnoser.ipynb @@ -19,12 +19,12 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "pycharm": { - "name": "#%%\n" - }, "ExecuteTime": { "end_time": "2024-04-16T15:01:13.012745Z", "start_time": "2024-04-16T15:01:09.165663Z" + }, + "pycharm": { + "name": "#%%\n" } }, "outputs": [ @@ -33,12 +33,12 @@ "output_type": "stream", "text": [ "Obtaining 
file:///Volumes/Workspace/hack/smart-config\r\n", - " Installing build dependencies ... \u001B[?25ldone\r\n", - "\u001B[?25h Checking if build backend supports build_editable ... \u001B[?25ldone\r\n", - "\u001B[?25h Getting requirements to build editable ... \u001B[?25ldone\r\n", - "\u001B[?25h Installing backend dependencies ... \u001B[?25ldone\r\n", - "\u001B[?25h Preparing editable metadata (pyproject.toml) ... \u001B[?25ldone\r\n", - "\u001B[?25hRequirement already satisfied: tabulate in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (0.9.0)\r\n", + " Installing build dependencies ... \u001b[?25ldone\r\n", + "\u001b[?25h Checking if build backend supports build_editable ... \u001b[?25ldone\r\n", + "\u001b[?25h Getting requirements to build editable ... \u001b[?25ldone\r\n", + "\u001b[?25h Installing backend dependencies ... \u001b[?25ldone\r\n", + "\u001b[?25h Preparing editable metadata (pyproject.toml) ... \u001b[?25ldone\r\n", + "\u001b[?25hRequirement already satisfied: tabulate in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (0.9.0)\r\n", "Requirement already satisfied: pandas in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (2.0.1)\r\n", "Requirement already satisfied: numpy in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (1.24.3)\r\n", "Requirement already satisfied: whylabs-client in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (0.6.2)\r\n", @@ -79,8 +79,8 @@ "Requirement already satisfied: whylogs-sketching>=3.4.1.dev3 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from whylogs<2.0.0,>=1.1.26->whylabs-toolkit->WhyLabs-Monitor-Diagnoser==0.0.1) (3.4.1.dev3)\r\n", "Requirement already satisfied: sniffio>=1.1 in 
/Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from anyio<5,>=3.4.0->starlette<0.37.0,>=0.36.3->fastapi->WhyLabs-Monitor-Diagnoser==0.0.1) (1.3.0)\r\n", "Building wheels for collected packages: WhyLabs-Monitor-Diagnoser\r\n", - " Building editable for WhyLabs-Monitor-Diagnoser (pyproject.toml) ... \u001B[?25ldone\r\n", - "\u001B[?25h Created wheel for WhyLabs-Monitor-Diagnoser: filename=WhyLabs_Monitor_Diagnoser-0.0.1-0.editable-py3-none-any.whl size=3253 sha256=7b4cbfe8c7d43b46817562de75e01238943321354a771ca71eae6da224702c26\r\n", + " Building editable for WhyLabs-Monitor-Diagnoser (pyproject.toml) ... \u001b[?25ldone\r\n", + "\u001b[?25h Created wheel for WhyLabs-Monitor-Diagnoser: filename=WhyLabs_Monitor_Diagnoser-0.0.1-0.editable-py3-none-any.whl size=3253 sha256=7b4cbfe8c7d43b46817562de75e01238943321354a771ca71eae6da224702c26\r\n", " Stored in directory: /private/var/folders/kg/k2sb6xms2650ty85vy98q5qr0000gn/T/pip-ephem-wheel-cache-mw1sol4x/wheels/3b/90/fd/b769d4b005362ce18dbd94fe781f74806d1a79ffbe447812d7\r\n", "Successfully built WhyLabs-Monitor-Diagnoser\r\n", "Installing collected packages: WhyLabs-Monitor-Diagnoser\r\n", @@ -94,7 +94,7 @@ } ], "source": [ - "%pip install -e .\n" + "#%pip install whylabs-toolkit[diagnoser]\n" ] }, { @@ -112,8 +112,26 @@ }, { "cell_type": "code", - "execution_count": 2, - "outputs": [], + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T15:01:16.123058Z", + "start_time": "2024-04-16T15:01:13.014131Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ········\n" + ] + } + ], "source": [ "import getpass\n", "from whylabs_toolkit.monitor.diagnoser.helpers.utils import env_setup\n", @@ -129,39 +147,38 @@ " api_key=api_key,\n", " whylabs_endpoint=api_endpoint\n", ")" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": 
"2024-04-16T15:01:16.123058Z", - "start_time": "2024-04-16T15:01:13.014131Z" - } - } + ] }, { "cell_type": "markdown", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "Then initialize the Monitor Diagnoser with the org_id and dataset_id." - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", - "execution_count": 3, - "outputs": [], - "source": [ - "from whylabs_toolkit.monitor.diagnoser.monitor_diagnoser import MonitorDiagnoser\n", - "diagnoser = MonitorDiagnoser(org_id, dataset_id)" - ], + "execution_count": 2, "metadata": { - "collapsed": false, "ExecuteTime": { "end_time": "2024-04-16T15:01:16.451964Z", "start_time": "2024-04-16T15:01:16.124858Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } - } + }, + "outputs": [], + "source": [ + "from whylabs_toolkit.monitor.diagnoser.monitor_diagnoser import MonitorDiagnoser\n", + "diagnoser = MonitorDiagnoser(org_id, dataset_id)" + ] }, { "cell_type": "markdown", @@ -175,7 +192,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2024-04-16T15:01:17.021550Z", @@ -185,9 +202,13 @@ "outputs": [ { "data": { - "text/plain": "(TimeRange(start=datetime.datetime(2020, 10, 8, 0, 0, tzinfo=datetime.timezone.utc), end=datetime.datetime(2024, 4, 15, 21, 0, tzinfo=datetime.timezone.utc)),\n ,\n '2024-03-16T00:00:00.000Z/2024-04-15T00:00:00.000Z')" + "text/plain": [ + "(TimeRange(start=datetime.datetime(2020, 10, 8, 0, 0, tzinfo=datetime.timezone.utc), end=datetime.datetime(2024, 4, 21, 21, 0, tzinfo=datetime.timezone.utc)),\n", + " ,\n", + " '2024-03-22T00:00:00.000Z/2024-04-21T00:00:00.000Z')" + ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -208,14 +229,269 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": 
"2024-04-16T15:01:18.724361Z", + "start_time": "2024-04-16T15:01:17.024927Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": " monitor_id analyzer_id \n0 adorable-goldenrod-lion-9438 adorable-goldenrod-lion-9438-analyzer \\\n1 unsightly-orchid-gorilla-4971 unsightly-orchid-gorilla-4971-analyzer \n2 concerned-skyblue-penguin-6734 concerned-skyblue-penguin-6734-analyzer \n3 proud-seagreen-carabeef-65 proud-seagreen-carabeef-65-analyzer \n4 kind-cyan-kangaroo-1253 kind-cyan-kangaroo-1253-analyzer \n.. ... ... \n93 numerical-drift-monitor-60dfcc numerical-drift-analyzer-60dfcc \n94 stormy-olive-butterfly-8693 stormy-olive-butterfly-8693-analyzer \n95 fine-magenta-nightingale-9708 fine-magenta-nightingale-9708-analyzer \n96 None eager-violet-newt-4599-analyzer \n97 unsightly-bisque-lemur-1917 unsightly-bisque-lemur-1917-analyzer \n\n metric column_count segment_count anomaly_count \n0 frequent_items 2 1 31 \\\n1 frequent_items 3 1 33 \n2 frequent_items 3 1 32 \n3 histogram 1 1 28 \n4 histogram 1 1 28 \n.. ... ... ... ... \n93 histogram 1 1 2 \n94 histogram 1 1 2 \n95 unique_est_ratio 26 1 39 \n96 count_null_ratio 21 1 28 \n97 frequent_items 1 1 1 \n\n max_anomaly_per_column min_anomaly_per_column avg_anomaly_per_column \n0 30 1 15 \\\n1 30 1 11 \n2 30 1 10 \n3 28 28 28 \n4 28 28 28 \n.. ... ... ... \n93 2 2 2 \n94 2 2 2 \n95 2 1 1 \n96 2 1 1 \n97 1 1 1 \n\n action_count action_targets \n0 0 [] \n1 0 [] \n2 0 [] \n3 0 [] \n4 0 [] \n.. ... ... \n93 2 [email, slack] \n94 0 [] \n95 0 [] \n96 0 [] \n97 0 [] \n\n[98 rows x 11 columns]", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
monitor_idanalyzer_idmetriccolumn_countsegment_countanomaly_countmax_anomaly_per_columnmin_anomaly_per_columnavg_anomaly_per_columnaction_countaction_targets
0adorable-goldenrod-lion-9438adorable-goldenrod-lion-9438-analyzerfrequent_items2131301150[]
1unsightly-orchid-gorilla-4971unsightly-orchid-gorilla-4971-analyzerfrequent_items3133301110[]
2concerned-skyblue-penguin-6734concerned-skyblue-penguin-6734-analyzerfrequent_items3132301100[]
3proud-seagreen-carabeef-65proud-seagreen-carabeef-65-analyzerhistogram11282828280[]
4kind-cyan-kangaroo-1253kind-cyan-kangaroo-1253-analyzerhistogram11282828280[]
....................................
93numerical-drift-monitor-60dfccnumerical-drift-analyzer-60dfcchistogram1122222[email, slack]
94stormy-olive-butterfly-8693stormy-olive-butterfly-8693-analyzerhistogram1122220[]
95fine-magenta-nightingale-9708fine-magenta-nightingale-9708-analyzerunique_est_ratio261392110[]
96Noneeager-violet-newt-4599-analyzercount_null_ratio211282110[]
97unsightly-bisque-lemur-1917unsightly-bisque-lemur-1917-analyzerfrequent_items1111110[]
\n

98 rows × 11 columns

\n
" + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
monitor_idanalyzer_idmetriccolumn_countsegment_countanomaly_countmax_anomaly_per_columnmin_anomaly_per_columnavg_anomaly_per_columnaction_countaction_targets
0kind-cyan-kangaroo-1253kind-cyan-kangaroo-1253-analyzerhistogram11303030300[]
1cooperative-maroon-parrot-8886discrete-drift-jensenshannon-analyzerfrequent_items11303030300[]
2famous-salmon-cobra-8902famous-salmon-cobra-8902-analyzermin11303030300[]
3proud-seagreen-carabeef-65proud-seagreen-carabeef-65-analyzerhistogram11303030300[]
4Nonecooperative-maroon-parrot-8886-analyzerfrequent_items11303030300[]
....................................
94glamorous-orchid-turtle-6425glamorous-orchid-turtle-6425-analyzerhistogram1122220[]
95Noneshy-black-raccoon-3594-analyzerhistogram1122220[]
96Noneeager-violet-newt-4599-analyzercount_null_ratio221262110[]
97unsightly-bisque-lemur-1917unsightly-bisque-lemur-1917-analyzerfrequent_items1111110[]
98expensive-tomato-moose-6522csw-analyzer-2median1111110[]
\n", + "

99 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " monitor_id analyzer_id \\\n", + "0 kind-cyan-kangaroo-1253 kind-cyan-kangaroo-1253-analyzer \n", + "1 cooperative-maroon-parrot-8886 discrete-drift-jensenshannon-analyzer \n", + "2 famous-salmon-cobra-8902 famous-salmon-cobra-8902-analyzer \n", + "3 proud-seagreen-carabeef-65 proud-seagreen-carabeef-65-analyzer \n", + "4 None cooperative-maroon-parrot-8886-analyzer \n", + ".. ... ... \n", + "94 glamorous-orchid-turtle-6425 glamorous-orchid-turtle-6425-analyzer \n", + "95 None shy-black-raccoon-3594-analyzer \n", + "96 None eager-violet-newt-4599-analyzer \n", + "97 unsightly-bisque-lemur-1917 unsightly-bisque-lemur-1917-analyzer \n", + "98 expensive-tomato-moose-6522 csw-analyzer-2 \n", + "\n", + " metric column_count segment_count anomaly_count \\\n", + "0 histogram 1 1 30 \n", + "1 frequent_items 1 1 30 \n", + "2 min 1 1 30 \n", + "3 histogram 1 1 30 \n", + "4 frequent_items 1 1 30 \n", + ".. ... ... ... ... \n", + "94 histogram 1 1 2 \n", + "95 histogram 1 1 2 \n", + "96 count_null_ratio 22 1 26 \n", + "97 frequent_items 1 1 1 \n", + "98 median 1 1 1 \n", + "\n", + " max_anomaly_per_column min_anomaly_per_column avg_anomaly_per_column \\\n", + "0 30 30 30 \n", + "1 30 30 30 \n", + "2 30 30 30 \n", + "3 30 30 30 \n", + "4 30 30 30 \n", + ".. ... ... ... \n", + "94 2 2 2 \n", + "95 2 2 2 \n", + "96 2 1 1 \n", + "97 1 1 1 \n", + "98 1 1 1 \n", + "\n", + " action_count action_targets \n", + "0 0 [] \n", + "1 0 [] \n", + "2 0 [] \n", + "3 0 [] \n", + "4 0 [] \n", + ".. ... ... 
\n", + "94 0 [] \n", + "95 0 [] \n", + "96 0 [] \n", + "97 0 [] \n", + "98 0 [] \n", + "\n", + "[99 rows x 11 columns]" + ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -225,35 +501,563 @@ "noisy_monitors = diagnoser.detect_noisy_monitors()\n", "noisy_monitors_df = pd.DataFrame.from_records([m.dict() for m in noisy_monitors])\n", "noisy_monitors_df" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-16T15:01:18.724361Z", - "start_time": "2024-04-16T15:01:17.024927Z" - } - } + ] }, { "cell_type": "markdown", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "Once you have run `detect_noisy_monitors`, you can retrieve the result at any time via the `noisy_monitors` property. You can also retrieve\n", " information about monitors with analysis failures using `failed_monitors`. " - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T15:01:18.736520Z", + "start_time": "2024-04-16T15:01:18.725303Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": " monitor_id \n0 energetic-black-cobra-7838 \\\n1 None \n2 good-cornsilk-bear-9359 \n3 elated-gray-baboon-4620 \n4 expensive-tomato-moose-6522 \n5 missing-values-ratio-monitor-v9uywi \n6 curious-lemonchiffon-rabbit-7000 \n7 clear-azure-starling-8883 \n8 light-mintcream-rhinoceros-3655 \n9 handsome-lemonchiffon-eel-4222 \n10 witty-blue-koala-8098 \n11 dark-blanchedalmond-ferret-7729 \n12 eager-limegreen-hedgehog-1312 \n13 famous-yellow-baboon-2243 \n14 gifted-coral-bison-842 \n15 glamorous-orchid-turtle-6425 \n16 inexpensive-maroon-donkey-7562 \n17 inferred-data-type-monitor-vjwbpo \n18 fancy-chocolate-wasp-8247 \n19 None \n20 plain-fuchsia-stinkbug-4064 \n21 stormy-olive-butterfly-8693 \n22 
tame-beige-sardine-3501 \n23 tough-green-hare-1322 \n24 uninterested-blueviolet-reindeer-9950 \n25 uninterested-red-alpaca-2523 \n26 unique-estimate-ratio-monitor-ccf7cl \n27 unique-ratio-29f3ef1c-monitor \n28 busy-hotpink-gaur-9703 \n29 happy-snow-grouse-452 \n\n analyzer_id metric \n0 energetic-black-cobra-7838-analyzer unique_est \\\n1 expensive-tomato-moose-6522-analyzer median \n2 good-cornsilk-bear-9359-analyzer count_null \n3 elated-gray-baboon-4620-analyzer count_null_ratio \n4 csw-analyzer-2 median \n5 missing-values-ratio-analyzer-v9uywi count_null_ratio \n6 curious-lemonchiffon-rabbit-7000-analyzer frequent_items \n7 clear-azure-starling-8883-analyzer frequent_items \n8 light-mintcream-rhinoceros-3655-analyzer frequent_items \n9 handsome-lemonchiffon-eel-4222-analyzer frequent_items \n10 witty-blue-koala-8098-analyzer histogram \n11 dark-blanchedalmond-ferret-7729-analyzer frequent_items \n12 eager-limegreen-hedgehog-1312-analyzer histogram \n13 famous-yellow-baboon-2243-analyzer histogram \n14 gifted-coral-bison-842-analyzer histogram \n15 glamorous-orchid-turtle-6425-analyzer histogram \n16 inexpensive-maroon-donkey-7562-analyzer histogram \n17 inferred-data-type-analyzer-vjwbpo inferred_data_type \n18 fancy-chocolate-wasp-8247-analyzer count \n19 eager-violet-newt-4599-analyzer count_null_ratio \n20 plain-fuchsia-stinkbug-4064-analyzer count_null_ratio \n21 stormy-olive-butterfly-8693-analyzer histogram \n22 tame-beige-sardine-3501-analyzer count_null_ratio \n23 tough-green-hare-1322-analyzer count_null_ratio \n24 uninterested-blueviolet-reindeer-9950-analyzer count \n25 uninterested-red-alpaca-2523-analyzer count_null_ratio \n26 unique-estimate-ratio-analyzer-ccf7cl unique_est_ratio \n27 unique-ratio-29f3ef1c unique_est_ratio \n28 busy-hotpink-gaur-9703-analyzer count_null_ratio \n29 happy-snow-grouse-452-analyzer count_null_ratio \n\n failed_count max_failed_per_column min_failed_per_column \n0 56 28 28 \\\n1 2191 28 7 \n2 2163 28 7 \n3 58 28 2 \n4 
1190 28 7 \n5 2609 25 2 \n6 15 15 15 \n7 15 15 15 \n8 70 15 1 \n9 17 15 2 \n10 7 7 7 \n11 7 7 7 \n12 7 7 7 \n13 7 7 7 \n14 7 7 7 \n15 7 7 7 \n16 7 7 7 \n17 2 2 2 \n18 1 1 1 \n19 1 1 1 \n20 1 1 1 \n21 1 1 1 \n22 1 1 1 \n23 1 1 1 \n24 1 1 1 \n25 1 1 1 \n26 1 1 1 \n27 1 1 1 \n28 1 1 1 \n29 1 1 1 \n\n avg_failed_per_column action_count action_targets \n0 28 1 [email] \n1 27 0 [] \n2 27 0 [] \n3 19 1 [email] \n4 15 0 [] \n5 24 1 [email] \n6 15 1 [test-sort] \n7 15 1 [test-sort] \n8 8 0 [] \n9 8 0 [] \n10 7 0 [] \n11 7 0 [] \n12 7 0 [] \n13 7 0 [] \n14 7 0 [] \n15 7 0 [] \n16 7 0 [] \n17 2 0 [] \n18 1 0 [] \n19 1 0 [] \n20 1 0 [] \n21 1 0 [] \n22 1 0 [] \n23 1 0 [] \n24 1 1 [christine-test-email] \n25 1 0 [] \n26 1 2 [email, slack] \n27 1 0 [] \n28 1 0 [] \n29 1 0 [] ", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
monitor_idanalyzer_idmetricfailed_countmax_failed_per_columnmin_failed_per_columnavg_failed_per_columnaction_countaction_targets
0energetic-black-cobra-7838energetic-black-cobra-7838-analyzerunique_est562828281[email]
1Noneexpensive-tomato-moose-6522-analyzermedian2191287270[]
2good-cornsilk-bear-9359good-cornsilk-bear-9359-analyzercount_null2163287270[]
3elated-gray-baboon-4620elated-gray-baboon-4620-analyzercount_null_ratio58282191[email]
4expensive-tomato-moose-6522csw-analyzer-2median1190287150[]
5missing-values-ratio-monitor-v9uywimissing-values-ratio-analyzer-v9uywicount_null_ratio2609252241[email]
6curious-lemonchiffon-rabbit-7000curious-lemonchiffon-rabbit-7000-analyzerfrequent_items151515151[test-sort]
7clear-azure-starling-8883clear-azure-starling-8883-analyzerfrequent_items151515151[test-sort]
8light-mintcream-rhinoceros-3655light-mintcream-rhinoceros-3655-analyzerfrequent_items7015180[]
9handsome-lemonchiffon-eel-4222handsome-lemonchiffon-eel-4222-analyzerfrequent_items1715280[]
10witty-blue-koala-8098witty-blue-koala-8098-analyzerhistogram77770[]
11dark-blanchedalmond-ferret-7729dark-blanchedalmond-ferret-7729-analyzerfrequent_items77770[]
12eager-limegreen-hedgehog-1312eager-limegreen-hedgehog-1312-analyzerhistogram77770[]
13famous-yellow-baboon-2243famous-yellow-baboon-2243-analyzerhistogram77770[]
14gifted-coral-bison-842gifted-coral-bison-842-analyzerhistogram77770[]
15glamorous-orchid-turtle-6425glamorous-orchid-turtle-6425-analyzerhistogram77770[]
16inexpensive-maroon-donkey-7562inexpensive-maroon-donkey-7562-analyzerhistogram77770[]
17inferred-data-type-monitor-vjwbpoinferred-data-type-analyzer-vjwbpoinferred_data_type22220[]
18fancy-chocolate-wasp-8247fancy-chocolate-wasp-8247-analyzercount11110[]
19Noneeager-violet-newt-4599-analyzercount_null_ratio11110[]
20plain-fuchsia-stinkbug-4064plain-fuchsia-stinkbug-4064-analyzercount_null_ratio11110[]
21stormy-olive-butterfly-8693stormy-olive-butterfly-8693-analyzerhistogram11110[]
22tame-beige-sardine-3501tame-beige-sardine-3501-analyzercount_null_ratio11110[]
23tough-green-hare-1322tough-green-hare-1322-analyzercount_null_ratio11110[]
24uninterested-blueviolet-reindeer-9950uninterested-blueviolet-reindeer-9950-analyzercount11111[christine-test-email]
25uninterested-red-alpaca-2523uninterested-red-alpaca-2523-analyzercount_null_ratio11110[]
26unique-estimate-ratio-monitor-ccf7clunique-estimate-ratio-analyzer-ccf7clunique_est_ratio11112[email, slack]
27unique-ratio-29f3ef1c-monitorunique-ratio-29f3ef1cunique_est_ratio11110[]
28busy-hotpink-gaur-9703busy-hotpink-gaur-9703-analyzercount_null_ratio11110[]
29happy-snow-grouse-452happy-snow-grouse-452-analyzercount_null_ratio11110[]
\n
" + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
monitor_idanalyzer_idmetricfailed_countmax_failed_per_columnmin_failed_per_columnavg_failed_per_columnaction_countaction_targets
0energetic-black-cobra-7838energetic-black-cobra-7838-analyzerunique_est603030301[email]
1good-cornsilk-bear-9359good-cornsilk-bear-9359-analyzercount_null2313303290[]
2elated-gray-baboon-4620elated-gray-baboon-4620-analyzercount_null_ratio68308221[email]
3Noneexpensive-tomato-moose-6522-analyzermedian2109273260[]
4expensive-tomato-moose-6522csw-analyzer-2median877273110[]
5missing-values-ratio-monitor-v9uywimissing-values-ratio-analyzer-v9uywicount_null_ratio2714263251[email]
6curious-lemonchiffon-rabbit-7000curious-lemonchiffon-rabbit-7000-analyzerfrequent_items111111111[test-sort]
7clear-azure-starling-8883clear-azure-starling-8883-analyzerfrequent_items111111111[test-sort]
8light-mintcream-rhinoceros-3655light-mintcream-rhinoceros-3655-analyzerfrequent_items5011280[]
9handsome-lemonchiffon-eel-4222handsome-lemonchiffon-eel-4222-analyzerfrequent_items1311260[]
10witty-blue-koala-8098witty-blue-koala-8098-analyzerhistogram33330[]
11dark-blanchedalmond-ferret-7729dark-blanchedalmond-ferret-7729-analyzerfrequent_items33330[]
12eager-limegreen-hedgehog-1312eager-limegreen-hedgehog-1312-analyzerhistogram33330[]
13famous-yellow-baboon-2243famous-yellow-baboon-2243-analyzerhistogram33330[]
14gifted-coral-bison-842gifted-coral-bison-842-analyzerhistogram33330[]
15glamorous-orchid-turtle-6425glamorous-orchid-turtle-6425-analyzerhistogram33330[]
16inexpensive-maroon-donkey-7562inexpensive-maroon-donkey-7562-analyzerhistogram33330[]
17inferred-data-type-monitor-vjwbpoinferred-data-type-analyzer-vjwbpoinferred_data_type33330[]
18busy-hotpink-gaur-9703busy-hotpink-gaur-9703-analyzercount_null_ratio11110[]
19fancy-chocolate-wasp-8247fancy-chocolate-wasp-8247-analyzercount11110[]
20Noneeager-violet-newt-4599-analyzercount_null_ratio11110[]
21plain-fuchsia-stinkbug-4064plain-fuchsia-stinkbug-4064-analyzercount_null_ratio11110[]
22stormy-olive-butterfly-8693stormy-olive-butterfly-8693-analyzerhistogram11110[]
23tame-beige-sardine-3501tame-beige-sardine-3501-analyzercount_null_ratio11110[]
24tough-green-hare-1322tough-green-hare-1322-analyzercount_null_ratio11110[]
25uninterested-blueviolet-reindeer-9950uninterested-blueviolet-reindeer-9950-analyzercount11111[christine-test-email]
26uninterested-red-alpaca-2523uninterested-red-alpaca-2523-analyzercount_null_ratio11110[]
27unique-estimate-ratio-monitor-ccf7clunique-estimate-ratio-analyzer-ccf7clunique_est_ratio11112[email, slack]
28happy-snow-grouse-452happy-snow-grouse-452-analyzercount_null_ratio11110[]
29unique-ratio-29f3ef1c-monitorunique-ratio-29f3ef1cunique_est_ratio11110[]
\n", + "
" + ], + "text/plain": [ + " monitor_id \\\n", + "0 energetic-black-cobra-7838 \n", + "1 good-cornsilk-bear-9359 \n", + "2 elated-gray-baboon-4620 \n", + "3 None \n", + "4 expensive-tomato-moose-6522 \n", + "5 missing-values-ratio-monitor-v9uywi \n", + "6 curious-lemonchiffon-rabbit-7000 \n", + "7 clear-azure-starling-8883 \n", + "8 light-mintcream-rhinoceros-3655 \n", + "9 handsome-lemonchiffon-eel-4222 \n", + "10 witty-blue-koala-8098 \n", + "11 dark-blanchedalmond-ferret-7729 \n", + "12 eager-limegreen-hedgehog-1312 \n", + "13 famous-yellow-baboon-2243 \n", + "14 gifted-coral-bison-842 \n", + "15 glamorous-orchid-turtle-6425 \n", + "16 inexpensive-maroon-donkey-7562 \n", + "17 inferred-data-type-monitor-vjwbpo \n", + "18 busy-hotpink-gaur-9703 \n", + "19 fancy-chocolate-wasp-8247 \n", + "20 None \n", + "21 plain-fuchsia-stinkbug-4064 \n", + "22 stormy-olive-butterfly-8693 \n", + "23 tame-beige-sardine-3501 \n", + "24 tough-green-hare-1322 \n", + "25 uninterested-blueviolet-reindeer-9950 \n", + "26 uninterested-red-alpaca-2523 \n", + "27 unique-estimate-ratio-monitor-ccf7cl \n", + "28 happy-snow-grouse-452 \n", + "29 unique-ratio-29f3ef1c-monitor \n", + "\n", + " analyzer_id metric \\\n", + "0 energetic-black-cobra-7838-analyzer unique_est \n", + "1 good-cornsilk-bear-9359-analyzer count_null \n", + "2 elated-gray-baboon-4620-analyzer count_null_ratio \n", + "3 expensive-tomato-moose-6522-analyzer median \n", + "4 csw-analyzer-2 median \n", + "5 missing-values-ratio-analyzer-v9uywi count_null_ratio \n", + "6 curious-lemonchiffon-rabbit-7000-analyzer frequent_items \n", + "7 clear-azure-starling-8883-analyzer frequent_items \n", + "8 light-mintcream-rhinoceros-3655-analyzer frequent_items \n", + "9 handsome-lemonchiffon-eel-4222-analyzer frequent_items \n", + "10 witty-blue-koala-8098-analyzer histogram \n", + "11 dark-blanchedalmond-ferret-7729-analyzer frequent_items \n", + "12 eager-limegreen-hedgehog-1312-analyzer histogram \n", + "13 
famous-yellow-baboon-2243-analyzer histogram \n", + "14 gifted-coral-bison-842-analyzer histogram \n", + "15 glamorous-orchid-turtle-6425-analyzer histogram \n", + "16 inexpensive-maroon-donkey-7562-analyzer histogram \n", + "17 inferred-data-type-analyzer-vjwbpo inferred_data_type \n", + "18 busy-hotpink-gaur-9703-analyzer count_null_ratio \n", + "19 fancy-chocolate-wasp-8247-analyzer count \n", + "20 eager-violet-newt-4599-analyzer count_null_ratio \n", + "21 plain-fuchsia-stinkbug-4064-analyzer count_null_ratio \n", + "22 stormy-olive-butterfly-8693-analyzer histogram \n", + "23 tame-beige-sardine-3501-analyzer count_null_ratio \n", + "24 tough-green-hare-1322-analyzer count_null_ratio \n", + "25 uninterested-blueviolet-reindeer-9950-analyzer count \n", + "26 uninterested-red-alpaca-2523-analyzer count_null_ratio \n", + "27 unique-estimate-ratio-analyzer-ccf7cl unique_est_ratio \n", + "28 happy-snow-grouse-452-analyzer count_null_ratio \n", + "29 unique-ratio-29f3ef1c unique_est_ratio \n", + "\n", + " failed_count max_failed_per_column min_failed_per_column \\\n", + "0 60 30 30 \n", + "1 2313 30 3 \n", + "2 68 30 8 \n", + "3 2109 27 3 \n", + "4 877 27 3 \n", + "5 2714 26 3 \n", + "6 11 11 11 \n", + "7 11 11 11 \n", + "8 50 11 2 \n", + "9 13 11 2 \n", + "10 3 3 3 \n", + "11 3 3 3 \n", + "12 3 3 3 \n", + "13 3 3 3 \n", + "14 3 3 3 \n", + "15 3 3 3 \n", + "16 3 3 3 \n", + "17 3 3 3 \n", + "18 1 1 1 \n", + "19 1 1 1 \n", + "20 1 1 1 \n", + "21 1 1 1 \n", + "22 1 1 1 \n", + "23 1 1 1 \n", + "24 1 1 1 \n", + "25 1 1 1 \n", + "26 1 1 1 \n", + "27 1 1 1 \n", + "28 1 1 1 \n", + "29 1 1 1 \n", + "\n", + " avg_failed_per_column action_count action_targets \n", + "0 30 1 [email] \n", + "1 29 0 [] \n", + "2 22 1 [email] \n", + "3 26 0 [] \n", + "4 11 0 [] \n", + "5 25 1 [email] \n", + "6 11 1 [test-sort] \n", + "7 11 1 [test-sort] \n", + "8 8 0 [] \n", + "9 6 0 [] \n", + "10 3 0 [] \n", + "11 3 0 [] \n", + "12 3 0 [] \n", + "13 3 0 [] \n", + "14 3 0 [] \n", + "15 3 0 [] \n", 
+ "16 3 0 [] \n", + "17 3 0 [] \n", + "18 1 0 [] \n", + "19 1 0 [] \n", + "20 1 0 [] \n", + "21 1 0 [] \n", + "22 1 0 [] \n", + "23 1 0 [] \n", + "24 1 0 [] \n", + "25 1 1 [christine-test-email] \n", + "26 1 0 [] \n", + "27 1 2 [email, slack] \n", + "28 1 0 [] \n", + "29 1 0 [] " + ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -261,14 +1065,7 @@ "source": [ "failed_monitors_df = pd.DataFrame.from_records([n.dict() for n in diagnoser.failed_monitors])\n", "failed_monitors_df" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-16T15:01:18.736520Z", - "start_time": "2024-04-16T15:01:18.725303Z" - } - } + ] }, { "cell_type": "markdown", @@ -279,7 +1076,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2024-04-16T15:01:18.740411Z", @@ -289,9 +1086,11 @@ "outputs": [ { "data": { - "text/plain": "'adorable-goldenrod-lion-9438'" + "text/plain": [ + "'kind-cyan-kangaroo-1253'" + ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -302,69 +1101,85 @@ }, { "cell_type": "markdown", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "We can get the monitor object from the diagnoser, to see its display name and any other useful information." 
- ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T15:01:18.743620Z", + "start_time": "2024-04-16T15:01:18.741222Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": "Monitor(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1676498472577, author='system', description=None), id='adorable-goldenrod-lion-9438', displayName='wrong-drift-crowded-orchid-coyote-2773', tags=None, analyzerIds=['adorable-goldenrod-lion-9438-analyzer'], schedule=ImmediateSchedule(type='immediate'), disabled=None, severity=3, mode=DigestMode(type='DIGEST', filter=None, creationTimeOffset=None, datasetTimestampOffset='P7D', groupBy=None), actions=[])" + "text/plain": [ + "Monitor(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1703279098033, author='user_1759fb08_1a01_4852_9ed4_91c6fceede45', description=None), id='kind-cyan-kangaroo-1253', displayName='kind-cyan-kangaroo-1253', tags=None, analyzerIds=['kind-cyan-kangaroo-1253-analyzer'], schedule=ImmediateSchedule(type='immediate'), disabled=None, severity=3, mode=DigestMode(type='DIGEST', filter=None, creationTimeOffset=None, datasetTimestampOffset='P7D', groupBy=None), actions=[])" + ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "diagnoser.monitor_to_diagnose" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-16T15:01:18.743620Z", - "start_time": "2024-04-16T15:01:18.741222Z" - } - } + ] }, { "cell_type": "markdown", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "We can similarly see the configuration of the analyzer that is being diagnosed.\n" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", - "execution_count": 9, + 
"execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T15:01:19.544392Z", + "start_time": "2024-04-16T15:01:18.744499Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": "Analyzer(metadata=Metadata(version=2, schemaVersion=1, updatedTimestamp=1713279603124, author='user_c9292ec40407f7b580f0a2c90745ebfba2b9e6ea81c848ef944d31e48a45f98', description=None), id='adorable-goldenrod-lion-9438-analyzer', displayName=None, tags=['featureSelection:all'], schedule=FixedCadenceSchedule(type='fixed', cadence=, exclusionRanges=None), disabled=None, disableTargetRollup=None, targetMatrix=ColumnMatrix(segments=[], type=, include=['*'], exclude=['issue_d', , 'url'], profileId=None), dataReadinessDuration=None, batchCoolDownPeriod=None, backfillGracePeriodDuration=None, config=DriftConfig(schemaVersion=None, params=None, metric=, type=, algorithm='hellinger', threshold=0.7, minBatchSize=1, baseline=TrailingWindowBaseline(datasetId=None, inheritSegment=None, type=, size=7, offset=None, exclusionRanges=None)))" + "text/plain": [ + "Analyzer(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1703279095485, author='user_1759fb08_1a01_4852_9ed4_91c6fceede45', description=None), id='kind-cyan-kangaroo-1253-analyzer', displayName=None, tags=['featureSelection:all', 'discreteness:non-discrete'], schedule=FixedCadenceSchedule(type='fixed', cadence=, exclusionRanges=None), disabled=None, disableTargetRollup=None, targetMatrix=ColumnMatrix(segments=[Segment(tags=[SegmentTag(key='purpose', value='car'), SegmentTag(key='verification_status', value='Source Verified')])], type=, include=[], exclude=[], profileId=None), dataReadinessDuration=None, batchCoolDownPeriod=None, backfillGracePeriodDuration=None, config=DriftConfig(schemaVersion=None, params=None, metric=, type=, algorithm='hellinger', threshold=0.02, minBatchSize=1, baseline=TrailingWindowBaseline(datasetId=None, 
inheritSegment=None, type=, size=7, offset=None, exclusionRanges=None)))" + ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "diagnoser.analyzer_to_diagnose" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-16T15:01:19.544392Z", - "start_time": "2024-04-16T15:01:18.744499Z" - } - } + ] }, { "cell_type": "markdown", @@ -377,14 +1192,64 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T15:01:19.860254Z", + "start_time": "2024-04-16T15:01:19.545452Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": " segment total_anomalies batch_count\n0 overall 31 30", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
segmenttotal_anomaliesbatch_count
0overall3130
\n
" + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
segmenttotal_anomaliesbatch_count
0purpose=car&verification_status=Source Verified3030
\n", + "
" + ], + "text/plain": [ + " segment total_anomalies \\\n", + "0 purpose=car&verification_status=Source Verified 30 \n", + "\n", + " batch_count \n", + "0 30 " + ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -396,27 +1261,23 @@ "noisy_segments_df = pd.DataFrame.from_records([n.dict() for n in noisy_segments])\n", "noisy_segments_df['segment'] = [segment_as_readable_text(n.segment.tags) for n in noisy_segments]\n", "noisy_segments_df" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-16T15:01:19.860254Z", - "start_time": "2024-04-16T15:01:19.545452Z" - } - } + ] }, { "cell_type": "markdown", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "The diagnoser chooses the noisiest segment to diagnose. This can be changed by setting the `diagnostic_segment` property." - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2024-04-16T15:01:19.863407Z", @@ -426,9 +1287,11 @@ "outputs": [ { "data": { - "text/plain": "'overall'" + "text/plain": [ + "'purpose=car&verification_status=Source Verified'" + ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -448,7 +1311,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2024-04-16T15:01:20.057746Z", @@ -458,10 +1321,45 @@ "outputs": [ { "data": { - "text/plain": " column total_anomalies\n0 issue_d 30\n1 url 1\n2 debt_settlement_flag 0\n3 desc 0\n4 disbursement_method 0\n5 earliest_cr_line 0\n6 emp_length 0\n7 emp_title 0\n8 grade 0\n9 hardship_flag 0\n10 home_ownership 0\n11 id 0\n12 initial_list_status 0\n13 last_credit_pull_d 0\n14 last_pymnt_d 0\n15 loan_status 0\n16 next_pymnt_d 0\n17 purpose 0\n18 pymnt_plan 0\n19 sub_grade 0\n20 term 0\n21 
title 0\n22 verification_status 0\n23 verification_status_joint 0\n24 addr_state 0\n25 zip_code 0\n26 application_type 0", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
columntotal_anomalies
0issue_d30
1url1
2debt_settlement_flag0
3desc0
4disbursement_method0
5earliest_cr_line0
6emp_length0
7emp_title0
8grade0
9hardship_flag0
10home_ownership0
11id0
12initial_list_status0
13last_credit_pull_d0
14last_pymnt_d0
15loan_status0
16next_pymnt_d0
17purpose0
18pymnt_plan0
19sub_grade0
20term0
21title0
22verification_status0
23verification_status_joint0
24addr_state0
25zip_code0
26application_type0
\n
" + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
columntotal_anomalies
0pred_credit_risk (output)30
\n", + "
" + ], + "text/plain": [ + " column total_anomalies\n", + "0 pred_credit_risk (output) 30" + ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -481,7 +1379,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2024-04-16T15:01:20.061262Z", @@ -491,9 +1389,11 @@ "outputs": [ { "data": { - "text/plain": "['issue_d',\n 'url',\n 'debt_settlement_flag',\n 'desc',\n 'disbursement_method',\n 'earliest_cr_line',\n 'emp_length',\n 'emp_title',\n 'grade',\n 'hardship_flag',\n 'home_ownership',\n 'id',\n 'initial_list_status',\n 'last_credit_pull_d',\n 'last_pymnt_d',\n 'loan_status',\n 'next_pymnt_d',\n 'purpose',\n 'pymnt_plan',\n 'sub_grade',\n 'term',\n 'title',\n 'verification_status',\n 'verification_status_joint',\n 'addr_state',\n 'zip_code',\n 'application_type']" + "text/plain": [ + "['pred_credit_risk (output)']" + ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -512,14 +1412,31 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2024-04-16T15:03:35.516085Z", "start_time": "2024-04-16T15:03:30.514723Z" } }, - "outputs": [], + "outputs": [ + { + "ename": "Exception", + "evalue": "USE_LOCAL_SERVER is set but server library is not available.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m/Volumes/Workspace/whylabs-toolkit/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py:285\u001b[0m, in \u001b[0;36mMonitorDiagnoser.diagnose\u001b[0;34m(self, columns)\u001b[0m\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 285\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m 
\u001b[38;5;21;01msmart_config\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mserver\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mserver\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DiagnosisRequest\n\u001b[1;32m 286\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msmart_config\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mserver\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdiagnosis\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01manalyzer_diagnoser\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AnalyzerDiagnoser\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'smart_config'", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mException\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[13], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m 3\u001b[0m os\u001b[38;5;241m.\u001b[39menviron[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mUSE_LOCAL_SERVER\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mserver\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m----> 4\u001b[0m monitor_report \u001b[38;5;241m=\u001b[39m \u001b[43mdiagnoser\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdiagnose\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/Volumes/Workspace/whylabs-toolkit/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py:314\u001b[0m, in \u001b[0;36mMonitorDiagnoser.diagnose\u001b[0;34m(self, columns)\u001b[0m\n\u001b[1;32m 303\u001b[0m report_dict \u001b[38;5;241m=\u001b[39m diagnosis_service\u001b[38;5;241m.\u001b[39mdiagnose_sync(\n\u001b[1;32m 304\u001b[0m DiagnosisRequest(\n\u001b[1;32m 305\u001b[0m 
orgId\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39morg_id,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 311\u001b[0m granularity\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgranularity,\n\u001b[1;32m 312\u001b[0m ))\n\u001b[1;32m 313\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m:\n\u001b[0;32m--> 314\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mUSE_LOCAL_SERVER is set but server library is not available.\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 315\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 316\u001b[0m \u001b[38;5;66;03m# TODO implement call through songbird/whylabs-client instead of direct\u001b[39;00m\n\u001b[1;32m 317\u001b[0m \u001b[38;5;66;03m# Call the diagnosis API via whyLabs client\u001b[39;00m\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mDiagnosis API call not implemented\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", + "\u001b[0;31mException\u001b[0m: USE_LOCAL_SERVER is set but server library is not available." 
+ ] + } + ], "source": [ "# for now, we need to enforce this to run using local server\n", "import os\n", @@ -529,6 +1446,17 @@ }, { "cell_type": "code", + "execution_count": 18, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-16T15:03:35.522329Z", + "start_time": "2024-04-16T15:03:35.518688Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "name": "stdout", @@ -573,23 +1501,19 @@ ], "source": [ "print(monitor_report.describe())" - ], + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-16T15:03:35.522329Z", - "start_time": "2024-04-16T15:03:35.518688Z" + "jupyter": { + "outputs_hidden": false } }, - "execution_count": 18 - }, - { - "cell_type": "code", "outputs": [], - "source": [], - "metadata": { - "collapsed": false - } + "source": [] } ], "metadata": { @@ -612,5 +1536,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } diff --git a/examples/example_notebooks/diagnoser.ipynb b/examples/example_notebooks/diagnoser.ipynb index ab86e52..22d577c 100644 --- a/examples/example_notebooks/diagnoser.ipynb +++ b/examples/example_notebooks/diagnoser.ipynb @@ -20,17 +20,17 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "pycharm": { - "name": "#%%\n" - }, "ExecuteTime": { "end_time": "2024-04-16T14:58:15.366726Z", "start_time": "2024-04-16T14:58:15.361250Z" + }, + "pycharm": { + "name": "#%%\n" } }, "outputs": [], "source": [ - "%pip install ." 
+ "# %pip install whylabs-toolkit[diagnoser]" ] }, { @@ -55,7 +55,34 @@ "start_time": "2024-04-16T14:58:15.369321Z" } }, - "outputs": [], + "outputs": [ + { + "ename": "TypeError", + "evalue": "issubclass() arg 1 must be a class", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mgetpass\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdiagnoser\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhelpers\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m env_setup\n\u001b[1;32m 4\u001b[0m org_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124morg-0\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 5\u001b[0m dataset_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmodel-0\u001b[39m\u001b[38;5;124m'\u001b[39m\n", + "File \u001b[0;32m/Volumes/Workspace/whylabs-toolkit/whylabs_toolkit/monitor/__init__.py:1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmanager\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MonitorSetup, MonitorManager\n\u001b[1;32m 4\u001b[0m ALL \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 5\u001b[0m MonitorManager,\n\u001b[1;32m 6\u001b[0m MonitorSetup,\n\u001b[1;32m 7\u001b[0m ]\n", + "File \u001b[0;32m/Volumes/Workspace/whylabs-toolkit/whylabs_toolkit/monitor/manager/__init__.py:1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m 
\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmanager\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MonitorManager\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcredentials\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MonitorCredentials\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor_setup\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MonitorSetup\n", + "File \u001b[0;32m/Volumes/Workspace/whylabs-toolkit/whylabs_toolkit/monitor/manager/manager.py:10\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_client\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mnotification_settings_api\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m NotificationSettingsApi\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_client\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels_api\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ModelsApi\n\u001b[0;32m---> 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmanager\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor_setup\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MonitorSetup\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m 
\u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhelpers\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor_helpers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m get_model_granularity\n", + "File \u001b[0;32m/Volumes/Workspace/whylabs-toolkit/whylabs_toolkit/monitor/manager/monitor_setup.py:9\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_client\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mexceptions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m NotFoundException\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhelpers\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m get_models_api\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01manalyzer\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtargets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ColumnGroups\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmanager\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcredentials\u001b[39;00m 
\u001b[38;5;28;01mimport\u001b[39;00m MonitorCredentials\n", + "File \u001b[0;32m/Volumes/Workspace/whylabs-toolkit/whylabs_toolkit/monitor/models/__init__.py:3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;124;03m\"\"\"Console script for monitor_schema.\"\"\"\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01manalyzer\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcolumn_schema\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcommons\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n", + "File \u001b[0;32m/Volumes/Workspace/whylabs-toolkit/whylabs_toolkit/monitor/models/analyzer/__init__.py:3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;124;03m\"\"\"Analyzer module.\"\"\"\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01malgorithms\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01manalyzer\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Analyzer\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbaseline\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n", + "File \u001b[0;32m/Volumes/Workspace/whylabs-toolkit/whylabs_toolkit/monitor/models/analyzer/algorithms.py:7\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m 
\u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Any, Dict, List, Literal, Optional, Union\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpydantic\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseModel, Field, constr\n\u001b[0;32m----> 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01manalyzer\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbaseline\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 8\u001b[0m ReferenceProfileId,\n\u001b[1;32m 9\u001b[0m SingleBatchBaseline,\n\u001b[1;32m 10\u001b[0m TimeRangeBaseline,\n\u001b[1;32m 11\u001b[0m TrailingWindowBaseline,\n\u001b[1;32m 12\u001b[0m )\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcommons\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m NoExtrasBaseModel, TimeRange\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m COLUMN_NAME_TYPE, anyOf_to_oneOf\n", + "File \u001b[0;32m/Volumes/Workspace/whylabs-toolkit/whylabs_toolkit/monitor/models/analyzer/baseline.py:7\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m List, Literal, Optional\n\u001b[1;32m 
5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpydantic\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Field\n\u001b[0;32m----> 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcommons\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DATASET_ID_DEF, NoExtrasBaseModel, TimeRange\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mBaselineType\u001b[39;00m(\u001b[38;5;28mstr\u001b[39m, Enum):\n\u001b[1;32m 11\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Supported baseline types.\"\"\"\u001b[39;00m\n", + "File \u001b[0;32m/Volumes/Workspace/whylabs-toolkit/whylabs_toolkit/monitor/models/commons.py:31\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mNoExtrasBaseModel\u001b[39;00m(BaseModel, extra\u001b[38;5;241m=\u001b[39mExtra\u001b[38;5;241m.\u001b[39mforbid): \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"No extras base model.\u001b[39;00m\n\u001b[1;32m 26\u001b[0m \n\u001b[1;32m 27\u001b[0m \u001b[38;5;124;03m Inherit to prevent accidental extra fields.\u001b[39;00m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 31\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mImmediateSchedule\u001b[39;00m(NoExtrasBaseModel):\n\u001b[1;32m 32\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Schedule the monitor to run immediately.\"\"\"\u001b[39;00m\n\u001b[1;32m 34\u001b[0m \u001b[38;5;28mtype\u001b[39m: Literal[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mimmediate\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mimmediate\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/whylabs-toolkit-w9gS_gKh-py3.9/lib/python3.9/site-packages/pydantic/main.py:197\u001b[0m, in \u001b[0;36mpydantic.main.ModelMetaclass.__new__\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/whylabs-toolkit-w9gS_gKh-py3.9/lib/python3.9/site-packages/pydantic/fields.py:506\u001b[0m, in \u001b[0;36mpydantic.fields.ModelField.infer\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/whylabs-toolkit-w9gS_gKh-py3.9/lib/python3.9/site-packages/pydantic/fields.py:436\u001b[0m, in \u001b[0;36mpydantic.fields.ModelField.__init__\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/whylabs-toolkit-w9gS_gKh-py3.9/lib/python3.9/site-packages/pydantic/fields.py:552\u001b[0m, in \u001b[0;36mpydantic.fields.ModelField.prepare\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/whylabs-toolkit-w9gS_gKh-py3.9/lib/python3.9/site-packages/pydantic/fields.py:668\u001b[0m, in \u001b[0;36mpydantic.fields.ModelField._type_analysis\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32m~/miniconda3/envs/hackthis/lib/python3.9/typing.py:852\u001b[0m, in \u001b[0;36m_SpecialGenericAlias.__subclasscheck__\u001b[0;34m(self, cls)\u001b[0m\n\u001b[1;32m 850\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m__origin__, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m__origin__)\n\u001b[1;32m 851\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mcls\u001b[39m, _GenericAlias):\n\u001b[0;32m--> 852\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43missubclass\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__origin__\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 853\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__subclasscheck__\u001b[39m(\u001b[38;5;28mcls\u001b[39m)\n", + "\u001b[0;31mTypeError\u001b[0m: issubclass() arg 1 must be a class" + ] + } + ], "source": [ "import getpass\n", "from whylabs_toolkit.monitor.diagnoser.helpers.utils import env_setup\n", @@ -82,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-04-16T14:58:19.609165Z", @@ -106,23 +133,14 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-04-16T14:59:50.931331Z", "start_time": "2024-04-16T14:59:44.553343Z" } }, - "outputs": [ - { - "data": { - "text/plain": "MonitorDiagnosisReport(orgId='org-0', datasetId='model-0', analyzerId='adorable-goldenrod-lion-9438-analyzer', interval='2024-03-16T00:00:00.000Z/2024-04-15T00:00:00.000Z', expectedBatchCount=30, diagnosticData=DiagnosticDataSummary(diagnosticSegment=Segment(tags=[]), diagnosticProfile=ProfileSummary(minRowName='issue_d', minRowCount=2494691, maxRowName='issue_d', maxRowCount=2494691), diagnosticBatches=BatchesSummary(minBatchName='issue_d', minBatchCount=30, maxBatchName='issue_d', maxBatchCount=30), analysisResults=AnalysisResultsSummary(results=ResultRecord(diagnosedColumnCount=27, batchCount=30), failures=FailureRecord(totalFailuresCount=0, maxFailuresCount=0, meanFailuresCount=0, byColumnCount=[], byTypeCount=[]), anomalies=AnomalyRecord(totalAnomalyCount=31, maxAnomalyCount=30, meanAnomalyCount=15, batchCount=30, byColumnCount=[('issue_d', 30), ('url', 1)], 
byColumnBatchCount=[('addr_state', 19), ('application_type', 19), ('debt_settlement_flag', 30), ('desc', 1), ('disbursement_method', 19), ('earliest_cr_line', 19), ('emp_length', 30), ('emp_title', 19), ('grade', 19), ('hardship_flag', 19), ('home_ownership', 19), ('id', 9), ('initial_list_status', 30), ('issue_d', 30), ('last_credit_pull_d', 30), ('last_pymnt_d', 19), ('loan_status', 19), ('next_pymnt_d', 19), ('purpose', 19), ('pymnt_plan', 19), ('sub_grade', 19), ('term', 19), ('title', 19), ('url', 19), ('verification_status', 19), ('verification_status_joint', 30), ('zip_code', 30)])), targetedColumnCount=125), qualityIssues=[], conditions=[ConditionRecord(columns=['issue_d', 'url'], info=None, summary='many values are unique across batches', name='changing_discrete')], monitor=Monitor(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1676498472577, author='system', description=None), id='adorable-goldenrod-lion-9438', displayName='wrong-drift-crowded-orchid-coyote-2773', tags=None, analyzerIds=['adorable-goldenrod-lion-9438-analyzer'], schedule=ImmediateSchedule(type='immediate'), disabled=None, severity=3, mode=DigestMode(type='DIGEST', filter=None, creationTimeOffset=None, datasetTimestampOffset='P7D', groupBy=None), actions=[]), analyzer=Analyzer(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1676498472065, author='system', description=None), id='adorable-goldenrod-lion-9438-analyzer', displayName=None, tags=['featureSelection:all'], schedule=FixedCadenceSchedule(type='fixed', cadence=, exclusionRanges=None), disabled=None, disableTargetRollup=None, targetMatrix=ColumnMatrix(segments=[], type=, include=['*'], exclude=[], profileId=None), dataReadinessDuration=None, batchCoolDownPeriod=None, backfillGracePeriodDuration=None, config=DriftConfig(schemaVersion=None, params=None, metric=, type=, algorithm='hellinger', threshold=0.7, minBatchSize=1, baseline=TrailingWindowBaseline(datasetId=None, inheritSegment=None, type=, size=7, 
offset=None, exclusionRanges=None))), analyzedColumnCount=27)" - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# for now, we need to enforce this to run using local server\n", "import os\n", @@ -133,53 +151,14 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-04-16T14:59:50.950021Z", "start_time": "2024-04-16T14:59:50.932643Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Diagnosis is for monitor \"wrong-drift-crowded-orchid-coyote-2773\" [adorable-goldenrod-lion-9438] in model-0 org-0, over interval 2024-03-16T00:00:00.000Z/2024-04-15T00:00:00.000Z.\n", - "\n", - "Analyzer is drift configuration for frequent_items metric with TrailingWindow baseline.\n", - "Analyzer \"adorable-goldenrod-lion-9438-analyzer\" targets 125 columns and ran on 27 columns in the diagnosed segment.\n", - "\n", - "\n", - "Diagnostic segment is \"overall\".\n", - "Diagnostic interval contains 30 batches.\n", - "\n", - "Diagnostic interval rollup contains 2494691 rows for the diagnosed columns.\n", - "\n", - "Analysis results summary:\n", - "Found non-failed results for 27 columns and 30 batches.\n", - "Found 31 anomalies in 2 columns, with up to 100.0% (30) batches having anomalies per column and 50.0% (15.0) on average.\n", - "Columns with anomalies are:\n", - "| | 0 |\n", - "|:--------|----:|\n", - "| issue_d | 30 |\n", - "| url | 1 |\n", - "\n", - "No failures were detected.\n", - "\n", - "No issues impacting diagnosis quality were detected\n", - "Conditions that may contribute to noise include:\n", - "\t* Condition changing_discrete (many values are unique across batches) for 2 columns: ['issue_d', 'url']\n", - "\n", - "Anomalies for columns with these conditions:\n", - "| | 0 |\n", - "|:--------|----:|\n", - "| issue_d | 30 |\n", - "| url | 1 |\n", - "Accounting for 31 anomalies out of 31\n" - ] - 
} - ], + "outputs": [], "source": [ "print(monitor_report.describe())" ] @@ -198,7 +177,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-04-16T14:59:54.558717Z", @@ -213,278 +192,14 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-04-16T14:59:56.056487Z", "start_time": "2024-04-16T14:59:56.051250Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"orgId\": \"org-0\",\n", - " \"datasetId\": \"model-0\",\n", - " \"analyzerId\": \"adorable-goldenrod-lion-9438-analyzer\",\n", - " \"interval\": \"2024-03-16T00:00:00.000Z/2024-04-15T00:00:00.000Z\",\n", - " \"expectedBatchCount\": 30,\n", - " \"diagnosticData\": {\n", - " \"diagnosticSegment\": {\n", - " \"tags\": []\n", - " },\n", - " \"diagnosticProfile\": {\n", - " \"minRowName\": \"issue_d\",\n", - " \"minRowCount\": 2494691,\n", - " \"maxRowName\": \"issue_d\",\n", - " \"maxRowCount\": 2494691\n", - " },\n", - " \"diagnosticBatches\": {\n", - " \"minBatchName\": \"issue_d\",\n", - " \"minBatchCount\": 30,\n", - " \"maxBatchName\": \"issue_d\",\n", - " \"maxBatchCount\": 30\n", - " },\n", - " \"analysisResults\": {\n", - " \"results\": {\n", - " \"diagnosedColumnCount\": 27,\n", - " \"batchCount\": 30\n", - " },\n", - " \"failures\": {\n", - " \"totalFailuresCount\": 0,\n", - " \"maxFailuresCount\": 0,\n", - " \"meanFailuresCount\": 0,\n", - " \"byColumnCount\": [],\n", - " \"byTypeCount\": []\n", - " },\n", - " \"anomalies\": {\n", - " \"totalAnomalyCount\": 31,\n", - " \"maxAnomalyCount\": 30,\n", - " \"meanAnomalyCount\": 15,\n", - " \"batchCount\": 30,\n", - " \"byColumnCount\": [\n", - " [\n", - " \"issue_d\",\n", - " 30\n", - " ],\n", - " [\n", - " \"url\",\n", - " 1\n", - " ]\n", - " ],\n", - " \"byColumnBatchCount\": [\n", - " [\n", - " \"addr_state\",\n", - " 19\n", - " ],\n", - " [\n", - " 
\"application_type\",\n", - " 19\n", - " ],\n", - " [\n", - " \"debt_settlement_flag\",\n", - " 30\n", - " ],\n", - " [\n", - " \"desc\",\n", - " 1\n", - " ],\n", - " [\n", - " \"disbursement_method\",\n", - " 19\n", - " ],\n", - " [\n", - " \"earliest_cr_line\",\n", - " 19\n", - " ],\n", - " [\n", - " \"emp_length\",\n", - " 30\n", - " ],\n", - " [\n", - " \"emp_title\",\n", - " 19\n", - " ],\n", - " [\n", - " \"grade\",\n", - " 19\n", - " ],\n", - " [\n", - " \"hardship_flag\",\n", - " 19\n", - " ],\n", - " [\n", - " \"home_ownership\",\n", - " 19\n", - " ],\n", - " [\n", - " \"id\",\n", - " 9\n", - " ],\n", - " [\n", - " \"initial_list_status\",\n", - " 30\n", - " ],\n", - " [\n", - " \"issue_d\",\n", - " 30\n", - " ],\n", - " [\n", - " \"last_credit_pull_d\",\n", - " 30\n", - " ],\n", - " [\n", - " \"last_pymnt_d\",\n", - " 19\n", - " ],\n", - " [\n", - " \"loan_status\",\n", - " 19\n", - " ],\n", - " [\n", - " \"next_pymnt_d\",\n", - " 19\n", - " ],\n", - " [\n", - " \"purpose\",\n", - " 19\n", - " ],\n", - " [\n", - " \"pymnt_plan\",\n", - " 19\n", - " ],\n", - " [\n", - " \"sub_grade\",\n", - " 19\n", - " ],\n", - " [\n", - " \"term\",\n", - " 19\n", - " ],\n", - " [\n", - " \"title\",\n", - " 19\n", - " ],\n", - " [\n", - " \"url\",\n", - " 19\n", - " ],\n", - " [\n", - " \"verification_status\",\n", - " 19\n", - " ],\n", - " [\n", - " \"verification_status_joint\",\n", - " 30\n", - " ],\n", - " [\n", - " \"zip_code\",\n", - " 30\n", - " ]\n", - " ]\n", - " }\n", - " },\n", - " \"targetedColumnCount\": 125\n", - " },\n", - " \"qualityIssues\": [],\n", - " \"conditions\": [\n", - " {\n", - " \"columns\": [\n", - " \"issue_d\",\n", - " \"url\"\n", - " ],\n", - " \"info\": null,\n", - " \"summary\": \"many values are unique across batches\",\n", - " \"name\": \"changing_discrete\"\n", - " }\n", - " ],\n", - " \"monitor\": {\n", - " \"metadata\": {\n", - " \"version\": 1,\n", - " \"schemaVersion\": 1,\n", - " \"updatedTimestamp\": 1676498472577,\n", - " 
\"author\": \"system\",\n", - " \"description\": null\n", - " },\n", - " \"id\": \"adorable-goldenrod-lion-9438\",\n", - " \"displayName\": \"wrong-drift-crowded-orchid-coyote-2773\",\n", - " \"tags\": null,\n", - " \"analyzerIds\": [\n", - " \"adorable-goldenrod-lion-9438-analyzer\"\n", - " ],\n", - " \"schedule\": {\n", - " \"type\": \"immediate\"\n", - " },\n", - " \"disabled\": null,\n", - " \"severity\": 3,\n", - " \"mode\": {\n", - " \"type\": \"DIGEST\",\n", - " \"filter\": null,\n", - " \"creationTimeOffset\": null,\n", - " \"datasetTimestampOffset\": \"P7D\",\n", - " \"groupBy\": null\n", - " },\n", - " \"actions\": []\n", - " },\n", - " \"analyzer\": {\n", - " \"metadata\": {\n", - " \"version\": 1,\n", - " \"schemaVersion\": 1,\n", - " \"updatedTimestamp\": 1676498472065,\n", - " \"author\": \"system\",\n", - " \"description\": null\n", - " },\n", - " \"id\": \"adorable-goldenrod-lion-9438-analyzer\",\n", - " \"displayName\": null,\n", - " \"tags\": [\n", - " \"featureSelection:all\"\n", - " ],\n", - " \"schedule\": {\n", - " \"type\": \"fixed\",\n", - " \"cadence\": \"daily\",\n", - " \"exclusionRanges\": null\n", - " },\n", - " \"disabled\": null,\n", - " \"disableTargetRollup\": null,\n", - " \"targetMatrix\": {\n", - " \"segments\": [],\n", - " \"type\": \"column\",\n", - " \"include\": [\n", - " \"*\"\n", - " ],\n", - " \"exclude\": [\n", - " \"group:output\"\n", - " ],\n", - " \"profileId\": null\n", - " },\n", - " \"dataReadinessDuration\": null,\n", - " \"batchCoolDownPeriod\": null,\n", - " \"backfillGracePeriodDuration\": null,\n", - " \"config\": {\n", - " \"schemaVersion\": null,\n", - " \"params\": null,\n", - " \"metric\": \"frequent_items\",\n", - " \"type\": \"drift\",\n", - " \"algorithm\": \"hellinger\",\n", - " \"threshold\": 0.7,\n", - " \"minBatchSize\": 1,\n", - " \"baseline\": {\n", - " \"datasetId\": null,\n", - " \"inheritSegment\": null,\n", - " \"type\": \"TrailingWindow\",\n", - " \"size\": 7,\n", - " \"offset\": null,\n", - " 
\"exclusionRanges\": null\n", - " }\n", - " }\n", - " },\n", - " \"analyzedColumnCount\": 27\n", - "}\n" - ] - } - ], + "outputs": [], "source": [ "from whylabs_toolkit.monitor.diagnoser.models import MonitorDiagnosisReport\n", "\n", @@ -504,22 +219,14 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-04-16T14:59:59.258082Z", "start_time": "2024-04-16T14:59:59.248989Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1. Remove columns from the analyzer for ['issue_d', 'url']\n" - ] - } - ], + "outputs": [], "source": [ "from whylabs_toolkit.monitor.diagnoser.recommendation.change_recommender import ChangeRecommender\n", "\n", @@ -545,22 +252,14 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-04-16T15:00:01.766477Z", "start_time": "2024-04-16T15:00:01.763192Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Remove columns from the analyzer for ['issue_d', 'url']\n" - ] - } - ], + "outputs": [], "source": [ "automatable_changes = [c for c in changes if c.can_automate()]\n", "print('\\n'.join([c.describe() for c in automatable_changes]))" @@ -568,23 +267,14 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-04-16T15:00:04.589600Z", "start_time": "2024-04-16T15:00:02.766087Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Successfully made the following changes:\n", - "\t* Remove columns from the analyzer for ['issue_d', 'url']\n" - ] - } - ], + "outputs": [], "source": [ "change_results = recommender.make_changes(automatable_changes)\n", "print(change_results.describe())" @@ -607,24 +297,14 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": 
"2024-04-16T15:00:06.815149Z", "start_time": "2024-04-16T15:00:06.798273Z" } }, - "outputs": [ - { - "data": { - "text/plain": " monitor_id analyzer_id \n0 adorable-goldenrod-lion-9438 adorable-goldenrod-lion-9438-analyzer \\\n1 unsightly-orchid-gorilla-4971 unsightly-orchid-gorilla-4971-analyzer \n2 concerned-skyblue-penguin-6734 concerned-skyblue-penguin-6734-analyzer \n3 proud-seagreen-carabeef-65 proud-seagreen-carabeef-65-analyzer \n4 kind-cyan-kangaroo-1253 kind-cyan-kangaroo-1253-analyzer \n.. ... ... \n93 numerical-drift-monitor-60dfcc numerical-drift-analyzer-60dfcc \n94 stormy-olive-butterfly-8693 stormy-olive-butterfly-8693-analyzer \n95 fine-magenta-nightingale-9708 fine-magenta-nightingale-9708-analyzer \n96 None eager-violet-newt-4599-analyzer \n97 unsightly-bisque-lemur-1917 unsightly-bisque-lemur-1917-analyzer \n\n metric column_count segment_count anomaly_count \n0 frequent_items 2 1 31 \\\n1 frequent_items 3 1 33 \n2 frequent_items 3 1 32 \n3 histogram 1 1 28 \n4 histogram 1 1 28 \n.. ... ... ... ... \n93 histogram 1 1 2 \n94 histogram 1 1 2 \n95 unique_est_ratio 26 1 39 \n96 count_null_ratio 21 1 28 \n97 frequent_items 1 1 1 \n\n max_anomaly_per_column min_anomaly_per_column avg_anomaly_per_column \n0 30 1 15 \\\n1 30 1 11 \n2 30 1 10 \n3 28 28 28 \n4 28 28 28 \n.. ... ... ... \n93 2 2 2 \n94 2 2 2 \n95 2 1 1 \n96 2 1 1 \n97 1 1 1 \n\n action_count action_targets \n0 0 [] \n1 0 [] \n2 0 [] \n3 0 [] \n4 0 [] \n.. ... ... \n93 2 [email, slack] \n94 0 [] \n95 0 [] \n96 0 [] \n97 0 [] \n\n[98 rows x 11 columns]", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
monitor_idanalyzer_idmetriccolumn_countsegment_countanomaly_countmax_anomaly_per_columnmin_anomaly_per_columnavg_anomaly_per_columnaction_countaction_targets
0adorable-goldenrod-lion-9438adorable-goldenrod-lion-9438-analyzerfrequent_items2131301150[]
1unsightly-orchid-gorilla-4971unsightly-orchid-gorilla-4971-analyzerfrequent_items3133301110[]
2concerned-skyblue-penguin-6734concerned-skyblue-penguin-6734-analyzerfrequent_items3132301100[]
3proud-seagreen-carabeef-65proud-seagreen-carabeef-65-analyzerhistogram11282828280[]
4kind-cyan-kangaroo-1253kind-cyan-kangaroo-1253-analyzerhistogram11282828280[]
....................................
93numerical-drift-monitor-60dfccnumerical-drift-analyzer-60dfcchistogram1122222[email, slack]
94stormy-olive-butterfly-8693stormy-olive-butterfly-8693-analyzerhistogram1122220[]
95fine-magenta-nightingale-9708fine-magenta-nightingale-9708-analyzerunique_est_ratio261392110[]
96Noneeager-violet-newt-4599-analyzercount_null_ratio211282110[]
97unsightly-bisque-lemur-1917unsightly-bisque-lemur-1917-analyzerfrequent_items1111110[]
\n

98 rows × 11 columns

\n
" - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import pandas as pd\n", "noisy_monitors_df = pd.DataFrame.from_records([m.dict() for m in diagnoser.noisy_monitors])\n", @@ -643,56 +323,14 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-04-16T15:00:17.594027Z", "start_time": "2024-04-16T15:00:09.536137Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Diagnosis is for monitor \"unsightly-orchid-gorilla-4971\" [unsightly-orchid-gorilla-4971] in model-0 org-0, over interval 2024-03-16T00:00:00.000Z/2024-04-15T00:00:00.000Z.\n", - "\n", - "Analyzer is drift configuration for frequent_items metric with TrailingWindow baseline.\n", - "Analyzer \"unsightly-orchid-gorilla-4971-analyzer\" targets 30 columns and ran on 26 columns in the diagnosed segment.\n", - "\n", - "\n", - "Diagnostic segment is \"overall\".\n", - "Diagnostic interval contains 30 batches.\n", - "\n", - "Diagnostic interval rollup contains 2494691 rows for the diagnosed columns.\n", - "\n", - "Analysis results summary:\n", - "Found non-failed results for 26 columns and 30 batches.\n", - "Found 33 anomalies in 3 columns, with up to 100.0% (30) batches having anomalies per column and 36.7% (11.0) on average.\n", - "Columns with anomalies are:\n", - "| | 0 |\n", - "|:--------|----:|\n", - "| issue_d | 30 |\n", - "| desc | 2 |\n", - "| url | 1 |\n", - "\n", - "No failures were detected.\n", - "\n", - "No issues impacting diagnosis quality were detected\n", - "Conditions that may contribute to noise include:\n", - "\t* Condition changing_discrete (many values are unique across batches) for 3 columns: ['desc', 'issue_d', 'url']\n", - "\t* Condition small_nonnull_batches (less than 500 non-null records in 50% or more of the batches) for 4 columns: ['desc', 'issue_d', 'url', 'desc']\n", - "\n", - "Anomalies for columns 
with these conditions:\n", - "| | 0 |\n", - "|:--------|----:|\n", - "| issue_d | 30 |\n", - "| desc | 2 |\n", - "| url | 1 |\n", - "Accounting for 33 anomalies out of 33\n" - ] - } - ], + "outputs": [], "source": [ "diagnoser.monitor_id_to_diagnose = noisy_monitors_df.iloc[1]['monitor_id']\n", "monitor_report = diagnoser.diagnose()\n", @@ -701,46 +339,45 @@ }, { "cell_type": "markdown", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "You can also use the `noisy_monitors_with_actions` property to prioritize noise in monitors with actions, as these are most likely to cause alert fatigue." - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", - "execution_count": 14, - "outputs": [ - { - "data": { - "text/plain": " monitor_id \n0 energetic-black-cobra-7838 \\\n1 frequent-items-drift-monitor-uu0ax8 \n2 old-crimson-starling-2516 \n3 frequent-items-drift-monitor-48ukw1 \n4 frequent-items-drift-monitor-jepz7t \n5 frequent-items-drift-monitor-pxexvn \n6 frequent-items-drift-monitor-u31vmb \n7 elated-gray-baboon-4620 \n8 nice-burlywood-tarsier-4771 \n9 unique-estimate-ratio-monitor-ccf7cl \n10 numerical-drift-monitor-zy4q8v \n11 uninterested-blueviolet-reindeer-9950 \n12 numerical-drift-monitor-jpodsg \n13 numerical-drift-monitor-60dfcc \n\n analyzer_id metric \n0 energetic-black-cobra-7838-analyzer unique_est \\\n1 frequent-items-drift-analyzer-uu0ax8 frequent_items \n2 old-crimson-starling-2516-analyzer frequent_items \n3 frequent-items-drift-analyzer-48ukw1 frequent_items \n4 frequent-items-drift-analyzer-jepz7t frequent_items \n5 frequent-items-drift-analyzer-pxexvn frequent_items \n6 frequent-items-drift-analyzer-u31vmb frequent_items \n7 elated-gray-baboon-4620-analyzer count_null_ratio \n8 nice-burlywood-tarsier-4771-analyzer unique_est \n9 unique-estimate-ratio-analyzer-ccf7cl unique_est_ratio \n10 numerical-drift-analyzer-zy4q8v histogram \n11 uninterested-blueviolet-reindeer-9950-analyzer 
count \n12 numerical-drift-analyzer-jpodsg histogram \n13 numerical-drift-analyzer-60dfcc histogram \n\n column_count segment_count anomaly_count max_anomaly_per_column \n0 8 1 100 28 \\\n1 3 1 31 28 \n2 3 1 31 28 \n3 3 1 31 28 \n4 3 1 31 28 \n5 3 1 31 28 \n6 3 1 31 28 \n7 15 1 70 28 \n8 7 1 97 26 \n9 104 1 358 7 \n10 3 1 14 6 \n11 101 1 246 6 \n12 1 1 2 2 \n13 1 1 2 2 \n\n min_anomaly_per_column avg_anomaly_per_column action_count \n0 3 12 1 \\\n1 1 10 3 \n2 1 10 1 \n3 1 10 2 \n4 1 10 2 \n5 1 10 2 \n6 1 10 2 \n7 1 4 1 \n8 3 13 2 \n9 1 3 2 \n10 2 4 1 \n11 1 2 1 \n12 2 2 2 \n13 2 2 2 \n\n action_targets \n0 [email] \n1 [email, slack, email-victor-at-whylabs] \n2 [email] \n3 [email, slack] \n4 [email, slack] \n5 [email, slack] \n6 [email, slack] \n7 [email] \n8 [slack, email] \n9 [email, slack] \n10 [email] \n11 [christine-test-email] \n12 [email, slack] \n13 [email, slack] ", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
monitor_idanalyzer_idmetriccolumn_countsegment_countanomaly_countmax_anomaly_per_columnmin_anomaly_per_columnavg_anomaly_per_columnaction_countaction_targets
0energetic-black-cobra-7838energetic-black-cobra-7838-analyzerunique_est81100283121[email]
1frequent-items-drift-monitor-uu0ax8frequent-items-drift-analyzer-uu0ax8frequent_items3131281103[email, slack, email-victor-at-whylabs]
2old-crimson-starling-2516old-crimson-starling-2516-analyzerfrequent_items3131281101[email]
3frequent-items-drift-monitor-48ukw1frequent-items-drift-analyzer-48ukw1frequent_items3131281102[email, slack]
4frequent-items-drift-monitor-jepz7tfrequent-items-drift-analyzer-jepz7tfrequent_items3131281102[email, slack]
5frequent-items-drift-monitor-pxexvnfrequent-items-drift-analyzer-pxexvnfrequent_items3131281102[email, slack]
6frequent-items-drift-monitor-u31vmbfrequent-items-drift-analyzer-u31vmbfrequent_items3131281102[email, slack]
7elated-gray-baboon-4620elated-gray-baboon-4620-analyzercount_null_ratio1517028141[email]
8nice-burlywood-tarsier-4771nice-burlywood-tarsier-4771-analyzerunique_est7197263132[slack, email]
9unique-estimate-ratio-monitor-ccf7clunique-estimate-ratio-analyzer-ccf7clunique_est_ratio10413587132[email, slack]
10numerical-drift-monitor-zy4q8vnumerical-drift-analyzer-zy4q8vhistogram31146241[email]
11uninterested-blueviolet-reindeer-9950uninterested-blueviolet-reindeer-9950-analyzercount10112466121[christine-test-email]
12numerical-drift-monitor-jpodsgnumerical-drift-analyzer-jpodsghistogram1122222[email, slack]
13numerical-drift-monitor-60dfccnumerical-drift-analyzer-60dfcchistogram1122222[email, slack]
\n
" - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pd.DataFrame.from_records([m.dict() for m in diagnoser.noisy_monitors_with_actions])\n" - ], + "execution_count": null, "metadata": { - "collapsed": false, "ExecuteTime": { "end_time": "2024-04-16T15:00:17.603562Z", "start_time": "2024-04-16T15:00:17.595665Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } - } + }, + "outputs": [], + "source": [ + "pd.DataFrame.from_records([m.dict() for m in diagnoser.noisy_monitors_with_actions])\n" + ] }, { "cell_type": "code", "execution_count": null, - "outputs": [], - "source": [], "metadata": { - "collapsed": false - } + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [] } ], "metadata": { @@ -763,5 +400,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } diff --git a/poetry.lock b/poetry.lock index 7823881..181be90 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,10 +1,9 @@ -# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. [[package]] name = "attrs" version = "23.1.0" description = "Classes Without Boilerplate" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -23,7 +22,6 @@ tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pyte name = "autoflake" version = "2.1.1" description = "Removes unused imports and unused variables" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -39,7 +37,6 @@ tomli = {version = ">=2.0.1", markers = "python_version < \"3.11\""} name = "black" version = "22.12.0" description = "The uncompromising code formatter." 
-category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -75,7 +72,6 @@ uvloop = ["uvloop (>=0.15.2)"] name = "bump2version" version = "1.0.1" description = "Version-bump your software with a single command!" -category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -87,7 +83,6 @@ files = [ name = "bumpversion" version = "0.6.0" description = "Version-bump your software with a single command!" -category = "dev" optional = false python-versions = "*" files = [ @@ -102,7 +97,6 @@ bump2version = "*" name = "click" version = "8.1.3" description = "Composable command line interface toolkit" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -117,7 +111,6 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." -category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -129,7 +122,6 @@ files = [ name = "exceptiongroup" version = "1.1.1" description = "Backport of PEP 654 (exception groups)" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -144,7 +136,6 @@ test = ["pytest (>=6)"] name = "importlib-resources" version = "5.12.0" description = "Read resources from Python packages" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -163,7 +154,6 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -171,11 +161,24 @@ files = [ {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] +[[package]] +name = "isodate" +version = "0.6.1" +description = "An ISO 8601 date/time/duration parser and formatter" +optional = true +python-versions = "*" +files = 
[ + {file = "isodate-0.6.1-py2.py3-none-any.whl", hash = "sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96"}, + {file = "isodate-0.6.1.tar.gz", hash = "sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9"}, +] + +[package.dependencies] +six = "*" + [[package]] name = "jsonschema" version = "4.17.3" description = "An implementation of JSON Schema validation for Python" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -197,7 +200,6 @@ format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339- name = "mypy" version = "1.0.1" description = "Optional static typing for Python" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -244,7 +246,6 @@ reports = ["lxml"] name = "mypy-extensions" version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." -category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -252,11 +253,47 @@ files = [ {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] +[[package]] +name = "numpy" +version = "1.24.4" +description = "Fundamental package for array computing in Python" +optional = true +python-versions = ">=3.8" +files = [ + {file = "numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64"}, + {file = "numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1"}, + {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4"}, + {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ffe43c74893dbf38c2b0a1f5428760a1a9c98285553c89e12d70a96a7f3a4d6"}, + {file = "numpy-1.24.4-cp310-cp310-win32.whl", hash = 
"sha256:4c21decb6ea94057331e111a5bed9a79d335658c27ce2adb580fb4d54f2ad9bc"}, + {file = "numpy-1.24.4-cp310-cp310-win_amd64.whl", hash = "sha256:b4bea75e47d9586d31e892a7401f76e909712a0fd510f58f5337bea9572c571e"}, + {file = "numpy-1.24.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f136bab9c2cfd8da131132c2cf6cc27331dd6fae65f95f69dcd4ae3c3639c810"}, + {file = "numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2926dac25b313635e4d6cf4dc4e51c8c0ebfed60b801c799ffc4c32bf3d1254"}, + {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:222e40d0e2548690405b0b3c7b21d1169117391c2e82c378467ef9ab4c8f0da7"}, + {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7215847ce88a85ce39baf9e89070cb860c98fdddacbaa6c0da3ffb31b3350bd5"}, + {file = "numpy-1.24.4-cp311-cp311-win32.whl", hash = "sha256:4979217d7de511a8d57f4b4b5b2b965f707768440c17cb70fbf254c4b225238d"}, + {file = "numpy-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:b7b1fc9864d7d39e28f41d089bfd6353cb5f27ecd9905348c24187a768c79694"}, + {file = "numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1452241c290f3e2a312c137a9999cdbf63f78864d63c79039bda65ee86943f61"}, + {file = "numpy-1.24.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:04640dab83f7c6c85abf9cd729c5b65f1ebd0ccf9de90b270cd61935eef0197f"}, + {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5425b114831d1e77e4b5d812b69d11d962e104095a5b9c3b641a218abcc050e"}, + {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd80e219fd4c71fc3699fc1dadac5dcf4fd882bfc6f7ec53d30fa197b8ee22dc"}, + {file = "numpy-1.24.4-cp38-cp38-win32.whl", hash = "sha256:4602244f345453db537be5314d3983dbf5834a9701b7723ec28923e2889e0bb2"}, + {file = "numpy-1.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:692f2e0f55794943c5bfff12b3f56f99af76f902fc47487bdfe97856de51a706"}, + {file = 
"numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2541312fbf09977f3b3ad449c4e5f4bb55d0dbf79226d7724211acc905049400"}, + {file = "numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9667575fb6d13c95f1b36aca12c5ee3356bf001b714fc354eb5465ce1609e62f"}, + {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a86ed21e4f87050382c7bc96571755193c4c1392490744ac73d660e8f564a9"}, + {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d11efb4dbecbdf22508d55e48d9c8384db795e1b7b51ea735289ff96613ff74d"}, + {file = "numpy-1.24.4-cp39-cp39-win32.whl", hash = "sha256:6620c0acd41dbcb368610bb2f4d83145674040025e5536954782467100aa8835"}, + {file = "numpy-1.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:befe2bf740fd8373cf56149a5c23a0f601e82869598d41f8e188a0e9869926f8"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:31f13e25b4e304632a4619d0e0777662c2ffea99fcae2029556b17d8ff958aef"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2"}, + {file = "numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463"}, +] + [[package]] name = "packaging" version = "23.1" description = "Core utilities for Python packages" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -264,11 +301,77 @@ files = [ {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, ] +[[package]] +name = "pandas" +version = "2.0.3" +description = "Powerful data structures for data analysis, time series, and statistics" +optional = true +python-versions = ">=3.8" +files = [ + {file = 
"pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8"}, + {file = "pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f"}, + {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183"}, + {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0"}, + {file = "pandas-2.0.3-cp310-cp310-win32.whl", hash = "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210"}, + {file = "pandas-2.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e"}, + {file = "pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8"}, + {file = "pandas-2.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26"}, + {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d"}, + {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df"}, + {file = "pandas-2.0.3-cp311-cp311-win32.whl", hash = "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd"}, + {file = "pandas-2.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b"}, + {file = "pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061"}, + {file = "pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = 
"sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5"}, + {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089"}, + {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0"}, + {file = "pandas-2.0.3-cp38-cp38-win32.whl", hash = "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02"}, + {file = "pandas-2.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78"}, + {file = "pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b"}, + {file = "pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e"}, + {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b"}, + {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641"}, + {file = "pandas-2.0.3-cp39-cp39-win32.whl", hash = "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682"}, + {file = "pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc"}, + {file = "pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.20.3", markers = "python_version < \"3.10\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, + {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, +] +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.1" + +[package.extras] 
+all = ["PyQt5 (>=5.15.1)", "SQLAlchemy (>=1.4.16)", "beautifulsoup4 (>=4.9.3)", "bottleneck (>=1.3.2)", "brotlipy (>=0.7.0)", "fastparquet (>=0.6.3)", "fsspec (>=2021.07.0)", "gcsfs (>=2021.07.0)", "html5lib (>=1.1)", "hypothesis (>=6.34.2)", "jinja2 (>=3.0.0)", "lxml (>=4.6.3)", "matplotlib (>=3.6.1)", "numba (>=0.53.1)", "numexpr (>=2.7.3)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pandas-gbq (>=0.15.0)", "psycopg2 (>=2.8.6)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "python-snappy (>=0.6.0)", "pyxlsb (>=1.0.8)", "qtpy (>=2.2.0)", "s3fs (>=2021.08.0)", "scipy (>=1.7.1)", "tables (>=3.6.1)", "tabulate (>=0.8.9)", "xarray (>=0.21.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)", "zstandard (>=0.15.2)"] +aws = ["s3fs (>=2021.08.0)"] +clipboard = ["PyQt5 (>=5.15.1)", "qtpy (>=2.2.0)"] +compression = ["brotlipy (>=0.7.0)", "python-snappy (>=0.6.0)", "zstandard (>=0.15.2)"] +computation = ["scipy (>=1.7.1)", "xarray (>=0.21.0)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pyxlsb (>=1.0.8)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)"] +feather = ["pyarrow (>=7.0.0)"] +fss = ["fsspec (>=2021.07.0)"] +gcp = ["gcsfs (>=2021.07.0)", "pandas-gbq (>=0.15.0)"] +hdf5 = ["tables (>=3.6.1)"] +html = ["beautifulsoup4 (>=4.9.3)", "html5lib (>=1.1)", "lxml (>=4.6.3)"] +mysql = ["SQLAlchemy (>=1.4.16)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.0.0)", "tabulate (>=0.8.9)"] +parquet = ["pyarrow (>=7.0.0)"] +performance = ["bottleneck (>=1.3.2)", "numba (>=0.53.1)", "numexpr (>=2.7.1)"] +plot = ["matplotlib (>=3.6.1)"] +postgresql = ["SQLAlchemy (>=1.4.16)", "psycopg2 (>=2.8.6)"] +spss = ["pyreadstat (>=1.1.2)"] +sql-other = ["SQLAlchemy (>=1.4.16)"] +test = ["hypothesis (>=6.34.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.6.3)"] + [[package]] name = "pathspec" version = "0.11.1" description = "Utility library for 
gitignore style pattern matching of file paths." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -280,7 +383,6 @@ files = [ name = "pkgutil-resolve-name" version = "1.3.10" description = "Resolve a name to an object." -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -292,7 +394,6 @@ files = [ name = "platformdirs" version = "3.5.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -308,7 +409,6 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest- name = "pluggy" version = "1.0.0" description = "plugin and hook calling mechanisms for python" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -324,7 +424,6 @@ testing = ["pytest", "pytest-benchmark"] name = "protobuf" version = "4.22.3" description = "" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -345,48 +444,47 @@ files = [ [[package]] name = "pydantic" -version = "1.10.7" +version = "1.10.15" description = "Data validation and settings management using python type hints" -category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "pydantic-1.10.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e79e999e539872e903767c417c897e729e015872040e56b96e67968c3b918b2d"}, - {file = "pydantic-1.10.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:01aea3a42c13f2602b7ecbbea484a98169fb568ebd9e247593ea05f01b884b2e"}, - {file = "pydantic-1.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:516f1ed9bc2406a0467dd777afc636c7091d71f214d5e413d64fef45174cfc7a"}, - {file = "pydantic-1.10.7-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae150a63564929c675d7f2303008d88426a0add46efd76c3fc797cd71cb1b46f"}, - {file = 
"pydantic-1.10.7-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ecbbc51391248116c0a055899e6c3e7ffbb11fb5e2a4cd6f2d0b93272118a209"}, - {file = "pydantic-1.10.7-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f4a2b50e2b03d5776e7f21af73e2070e1b5c0d0df255a827e7c632962f8315af"}, - {file = "pydantic-1.10.7-cp310-cp310-win_amd64.whl", hash = "sha256:a7cd2251439988b413cb0a985c4ed82b6c6aac382dbaff53ae03c4b23a70e80a"}, - {file = "pydantic-1.10.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:68792151e174a4aa9e9fc1b4e653e65a354a2fa0fed169f7b3d09902ad2cb6f1"}, - {file = "pydantic-1.10.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dfe2507b8ef209da71b6fb5f4e597b50c5a34b78d7e857c4f8f3115effaef5fe"}, - {file = "pydantic-1.10.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10a86d8c8db68086f1e30a530f7d5f83eb0685e632e411dbbcf2d5c0150e8dcd"}, - {file = "pydantic-1.10.7-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d75ae19d2a3dbb146b6f324031c24f8a3f52ff5d6a9f22f0683694b3afcb16fb"}, - {file = "pydantic-1.10.7-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:464855a7ff7f2cc2cf537ecc421291b9132aa9c79aef44e917ad711b4a93163b"}, - {file = "pydantic-1.10.7-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:193924c563fae6ddcb71d3f06fa153866423ac1b793a47936656e806b64e24ca"}, - {file = "pydantic-1.10.7-cp311-cp311-win_amd64.whl", hash = "sha256:b4a849d10f211389502059c33332e91327bc154acc1845f375a99eca3afa802d"}, - {file = "pydantic-1.10.7-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:cc1dde4e50a5fc1336ee0581c1612215bc64ed6d28d2c7c6f25d2fe3e7c3e918"}, - {file = "pydantic-1.10.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0cfe895a504c060e5d36b287ee696e2fdad02d89e0d895f83037245218a87fe"}, - {file = "pydantic-1.10.7-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:670bb4683ad1e48b0ecb06f0cfe2178dcf74ff27921cdf1606e527d2617a81ee"}, - {file = "pydantic-1.10.7-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:950ce33857841f9a337ce07ddf46bc84e1c4946d2a3bba18f8280297157a3fd1"}, - {file = "pydantic-1.10.7-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:c15582f9055fbc1bfe50266a19771bbbef33dd28c45e78afbe1996fd70966c2a"}, - {file = "pydantic-1.10.7-cp37-cp37m-win_amd64.whl", hash = "sha256:82dffb306dd20bd5268fd6379bc4bfe75242a9c2b79fec58e1041fbbdb1f7914"}, - {file = "pydantic-1.10.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8c7f51861d73e8b9ddcb9916ae7ac39fb52761d9ea0df41128e81e2ba42886cd"}, - {file = "pydantic-1.10.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6434b49c0b03a51021ade5c4daa7d70c98f7a79e95b551201fff682fc1661245"}, - {file = "pydantic-1.10.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64d34ab766fa056df49013bb6e79921a0265204c071984e75a09cbceacbbdd5d"}, - {file = "pydantic-1.10.7-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:701daea9ffe9d26f97b52f1d157e0d4121644f0fcf80b443248434958fd03dc3"}, - {file = "pydantic-1.10.7-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:cf135c46099ff3f919d2150a948ce94b9ce545598ef2c6c7bf55dca98a304b52"}, - {file = "pydantic-1.10.7-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b0f85904f73161817b80781cc150f8b906d521fa11e3cdabae19a581c3606209"}, - {file = "pydantic-1.10.7-cp38-cp38-win_amd64.whl", hash = "sha256:9f6f0fd68d73257ad6685419478c5aece46432f4bdd8d32c7345f1986496171e"}, - {file = "pydantic-1.10.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c230c0d8a322276d6e7b88c3f7ce885f9ed16e0910354510e0bae84d54991143"}, - {file = "pydantic-1.10.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:976cae77ba6a49d80f461fd8bba183ff7ba79f44aa5cfa82f1346b5626542f8e"}, - {file = "pydantic-1.10.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:7d45fc99d64af9aaf7e308054a0067fdcd87ffe974f2442312372dfa66e1001d"}, - {file = "pydantic-1.10.7-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d2a5ebb48958754d386195fe9e9c5106f11275867051bf017a8059410e9abf1f"}, - {file = "pydantic-1.10.7-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:abfb7d4a7cd5cc4e1d1887c43503a7c5dd608eadf8bc615413fc498d3e4645cd"}, - {file = "pydantic-1.10.7-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:80b1fab4deb08a8292d15e43a6edccdffa5377a36a4597bb545b93e79c5ff0a5"}, - {file = "pydantic-1.10.7-cp39-cp39-win_amd64.whl", hash = "sha256:d71e69699498b020ea198468e2480a2f1e7433e32a3a99760058c6520e2bea7e"}, - {file = "pydantic-1.10.7-py3-none-any.whl", hash = "sha256:0cd181f1d0b1d00e2b705f1bf1ac7799a2d938cce3376b8007df62b29be3c2c6"}, - {file = "pydantic-1.10.7.tar.gz", hash = "sha256:cfc83c0678b6ba51b0532bea66860617c4cd4251ecf76e9846fa5a9f3454e97e"}, + {file = "pydantic-1.10.15-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:22ed12ee588b1df028a2aa5d66f07bf8f8b4c8579c2e96d5a9c1f96b77f3bb55"}, + {file = "pydantic-1.10.15-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:75279d3cac98186b6ebc2597b06bcbc7244744f6b0b44a23e4ef01e5683cc0d2"}, + {file = "pydantic-1.10.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50f1666a9940d3d68683c9d96e39640f709d7a72ff8702987dab1761036206bb"}, + {file = "pydantic-1.10.15-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82790d4753ee5d00739d6cb5cf56bceb186d9d6ce134aca3ba7befb1eedbc2c8"}, + {file = "pydantic-1.10.15-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:d207d5b87f6cbefbdb1198154292faee8017d7495a54ae58db06762004500d00"}, + {file = "pydantic-1.10.15-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e49db944fad339b2ccb80128ffd3f8af076f9f287197a480bf1e4ca053a866f0"}, + {file = "pydantic-1.10.15-cp310-cp310-win_amd64.whl", hash = 
"sha256:d3b5c4cbd0c9cb61bbbb19ce335e1f8ab87a811f6d589ed52b0254cf585d709c"}, + {file = "pydantic-1.10.15-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c3d5731a120752248844676bf92f25a12f6e45425e63ce22e0849297a093b5b0"}, + {file = "pydantic-1.10.15-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c365ad9c394f9eeffcb30a82f4246c0006417f03a7c0f8315d6211f25f7cb654"}, + {file = "pydantic-1.10.15-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3287e1614393119c67bd4404f46e33ae3be3ed4cd10360b48d0a4459f420c6a3"}, + {file = "pydantic-1.10.15-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:be51dd2c8596b25fe43c0a4a59c2bee4f18d88efb8031188f9e7ddc6b469cf44"}, + {file = "pydantic-1.10.15-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6a51a1dd4aa7b3f1317f65493a182d3cff708385327c1c82c81e4a9d6d65b2e4"}, + {file = "pydantic-1.10.15-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4e316e54b5775d1eb59187f9290aeb38acf620e10f7fd2f776d97bb788199e53"}, + {file = "pydantic-1.10.15-cp311-cp311-win_amd64.whl", hash = "sha256:0d142fa1b8f2f0ae11ddd5e3e317dcac060b951d605fda26ca9b234b92214986"}, + {file = "pydantic-1.10.15-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:7ea210336b891f5ea334f8fc9f8f862b87acd5d4a0cbc9e3e208e7aa1775dabf"}, + {file = "pydantic-1.10.15-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3453685ccd7140715e05f2193d64030101eaad26076fad4e246c1cc97e1bb30d"}, + {file = "pydantic-1.10.15-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9bea1f03b8d4e8e86702c918ccfd5d947ac268f0f0cc6ed71782e4b09353b26f"}, + {file = "pydantic-1.10.15-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:005655cabc29081de8243126e036f2065bd7ea5b9dff95fde6d2c642d39755de"}, + {file = "pydantic-1.10.15-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:af9850d98fc21e5bc24ea9e35dd80a29faf6462c608728a110c0a30b595e58b7"}, 
+ {file = "pydantic-1.10.15-cp37-cp37m-win_amd64.whl", hash = "sha256:d31ee5b14a82c9afe2bd26aaa405293d4237d0591527d9129ce36e58f19f95c1"}, + {file = "pydantic-1.10.15-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5e09c19df304b8123938dc3c53d3d3be6ec74b9d7d0d80f4f4b5432ae16c2022"}, + {file = "pydantic-1.10.15-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7ac9237cd62947db00a0d16acf2f3e00d1ae9d3bd602b9c415f93e7a9fc10528"}, + {file = "pydantic-1.10.15-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:584f2d4c98ffec420e02305cf675857bae03c9d617fcfdc34946b1160213a948"}, + {file = "pydantic-1.10.15-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bbc6989fad0c030bd70a0b6f626f98a862224bc2b1e36bfc531ea2facc0a340c"}, + {file = "pydantic-1.10.15-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d573082c6ef99336f2cb5b667b781d2f776d4af311574fb53d908517ba523c22"}, + {file = "pydantic-1.10.15-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6bd7030c9abc80134087d8b6e7aa957e43d35714daa116aced57269a445b8f7b"}, + {file = "pydantic-1.10.15-cp38-cp38-win_amd64.whl", hash = "sha256:3350f527bb04138f8aff932dc828f154847fbdc7a1a44c240fbfff1b57f49a12"}, + {file = "pydantic-1.10.15-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:51d405b42f1b86703555797270e4970a9f9bd7953f3990142e69d1037f9d9e51"}, + {file = "pydantic-1.10.15-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a980a77c52723b0dc56640ced396b73a024d4b74f02bcb2d21dbbac1debbe9d0"}, + {file = "pydantic-1.10.15-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67f1a1fb467d3f49e1708a3f632b11c69fccb4e748a325d5a491ddc7b5d22383"}, + {file = "pydantic-1.10.15-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:676ed48f2c5bbad835f1a8ed8a6d44c1cd5a21121116d2ac40bd1cd3619746ed"}, + {file = "pydantic-1.10.15-cp39-cp39-musllinux_1_1_i686.whl", hash = 
"sha256:92229f73400b80c13afcd050687f4d7e88de9234d74b27e6728aa689abcf58cc"}, + {file = "pydantic-1.10.15-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2746189100c646682eff0bce95efa7d2e203420d8e1c613dc0c6b4c1d9c1fde4"}, + {file = "pydantic-1.10.15-cp39-cp39-win_amd64.whl", hash = "sha256:394f08750bd8eaad714718812e7fab615f873b3cdd0b9d84e76e51ef3b50b6b7"}, + {file = "pydantic-1.10.15-py3-none-any.whl", hash = "sha256:28e552a060ba2740d0d2aabe35162652c1459a0b9069fe0db7f4ee0e18e74d58"}, + {file = "pydantic-1.10.15.tar.gz", hash = "sha256:ca832e124eda231a60a041da4f013e3ff24949d94a01154b137fc2f2a43c3ffb"}, ] [package.dependencies] @@ -400,7 +498,6 @@ email = ["email-validator (>=1.0.3)"] name = "pyflakes" version = "3.0.1" description = "passive checker of Python programs" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -412,7 +509,6 @@ files = [ name = "pyrsistent" version = "0.19.3" description = "Persistent/Functional/Immutable data structures" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -449,7 +545,6 @@ files = [ name = "pytest" version = "7.3.1" description = "pytest: simple powerful testing with Python" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -472,7 +567,6 @@ testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "no name = "python-dateutil" version = "2.8.2" description = "Extensions to the standard Python datetime module" -category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ @@ -483,11 +577,21 @@ files = [ [package.dependencies] six = ">=1.5" +[[package]] +name = "pytz" +version = "2024.1" +description = "World timezone definitions, modern and historical" +optional = true +python-versions = "*" +files = [ + {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"}, + {file = "pytz-2024.1.tar.gz", hash = 
"sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, +] + [[package]] name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -495,11 +599,24 @@ files = [ {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] +[[package]] +name = "tabulate" +version = "0.8.9" +description = "Pretty-print tabular data" +optional = true +python-versions = "*" +files = [ + {file = "tabulate-0.8.9-py3-none-any.whl", hash = "sha256:d7c013fe7abbc5e491394e10fa845f8f32fe54f8dc60c6622c6cf482d25d47e4"}, + {file = "tabulate-0.8.9.tar.gz", hash = "sha256:eb1d13f25760052e8931f2ef80aaf6045a6cceb47514db8beab24cded16f13a7"}, +] + +[package.extras] +widechars = ["wcwidth"] + [[package]] name = "tomli" version = "2.0.1" description = "A lil' TOML parser" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -509,26 +626,35 @@ files = [ [[package]] name = "typing-extensions" -version = "4.5.0" -description = "Backported and Experimental Type Hints for Python 3.7+" -category = "main" +version = "4.11.0" +description = "Backported and Experimental Type Hints for Python 3.8+" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "typing_extensions-4.5.0-py3-none-any.whl", hash = "sha256:fb33085c39dd998ac16d1431ebc293a8b3eedd00fd4a32de0ff79002c19511b4"}, - {file = "typing_extensions-4.5.0.tar.gz", hash = "sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb"}, + {file = "typing_extensions-4.11.0-py3-none-any.whl", hash = "sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a"}, + {file = "typing_extensions-4.11.0.tar.gz", hash = "sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0"}, +] + +[[package]] +name = "tzdata" +version = "2024.1" +description = "Provider of IANA time zone 
data" +optional = true +python-versions = ">=2" +files = [ + {file = "tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252"}, + {file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"}, ] [[package]] name = "urllib3" -version = "2.0.1" +version = "2.0.7" description = "HTTP library with thread-safe connection pooling, file post, and more." -category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "urllib3-2.0.1-py3-none-any.whl", hash = "sha256:d75e5ece05ff170e323303fd924edf29e705f5ae057c489f453a686b639bb68a"}, - {file = "urllib3-2.0.1.tar.gz", hash = "sha256:2ce66a68134be469f5df5d46d724237489b3cd85b2bba2223dbbee1746548826"}, + {file = "urllib3-2.0.7-py3-none-any.whl", hash = "sha256:fdb6d215c776278489906c2f8916e6e7d4f5a9b602ccbcfdf7f016fc8da0596e"}, + {file = "urllib3-2.0.7.tar.gz", hash = "sha256:c97dfde1f7bd43a71c8d2a58e369e9b2bf692d1334ea9f9cae55add7d0dd0f84"}, ] [package.extras] @@ -541,7 +667,6 @@ zstd = ["zstandard (>=0.18.0)"] name = "whylabs-client" version = "0.6.2" description = "WhyLabs API client" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -557,7 +682,6 @@ urllib3 = ">=1.25.3" name = "whylogs" version = "1.1.39" description = "Profile and monitor your ML data pipeline end-to-end" -category = "main" optional = false python-versions = ">=3.7.1,<4" files = [ @@ -589,7 +713,6 @@ whylabs = ["requests (>=2.27,<3.0)"] name = "whylogs-sketching" version = "3.4.1.dev3" description = "sketching library of whylogs" -category = "main" optional = false python-versions = "*" files = [ @@ -630,7 +753,6 @@ files = [ name = "zipp" version = "3.15.0" description = "Backport of pathlib-compatible object wrapper for zip files" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -642,7 +764,10 @@ files = [ docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", 
"rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] +[extras] +diagnoser = ["isodate", "numpy", "pandas", "python-dateutil", "tabulate"] + [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "33f20a2d8395aad5fc1d6edd38492ef18ef8cfb74b5bbe4d019395f967b899b7" +content-hash = "f620e2af4fec27c6f6f97a9fc08add252e9a0ae55909e671c7d28266505d1fcc" diff --git a/pyproject.toml b/pyproject.toml index e9dbcc7..0cdf524 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "whylabs-toolkit" -version = "0.0.18" +version = "0.0.19" description = "Whylabs Toolkit package." authors = ["Murilo Mendonca ", "Anthony Naddeo ", "Christine Draper "] @@ -12,9 +12,18 @@ include = ["whylabs_toolkit/monitor/schema/schema.json"] [tool.poetry.dependencies] python = "^3.8" whylabs-client = "^0.6.0" -pydantic = "^1.10.4" +pydantic = "1.10.15" whylogs = "^1.1.26" jsonschema = "^4.17.3" +typing-extensions = "^4.11.0" +urllib3 = ">=2.0.2,<2.1.0" + +# diagnoser extra dependencies +pandas = { version="^2.0.3", optional=true } +numpy = { version="^1.24.1", optional=true } +tabulate = { version="0.8.9", optional=true } +isodate = { version="^0.6.1", optional=true } +python-dateutil = { version="^2.8.2", optional=true } [tool.poetry.group.dev.dependencies] autoflake = "^2.0.1" @@ -32,3 +41,6 @@ build-backend = "poetry.core.masonry.api" [tool.flake8] max-line-length = 140 + +[tool.poetry.extras] +diagnoser = ["pandas", "numpy", "tabulate", "isodate", "python-dateutil"] \ No newline at end of file diff --git a/whylabs_toolkit/monitor/diagnoser/test/__init__.py b/tests/monitor/diagnoser/__init__.py similarity index 100% rename from whylabs_toolkit/monitor/diagnoser/test/__init__.py rename to 
tests/monitor/diagnoser/__init__.py diff --git a/tests/monitor/diagnoser/converters/__init__.py b/tests/monitor/diagnoser/converters/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/whylabs_toolkit/monitor/diagnoser/converters/test_granularity.py b/tests/monitor/diagnoser/converters/test_granularity.py similarity index 100% rename from whylabs_toolkit/monitor/diagnoser/converters/test_granularity.py rename to tests/monitor/diagnoser/converters/test_granularity.py diff --git a/tests/monitor/diagnoser/recommendation/__init__.py b/tests/monitor/diagnoser/recommendation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/whylabs_toolkit/monitor/diagnoser/recommendation/test_changes.py b/tests/monitor/diagnoser/recommendation/test_changes.py similarity index 100% rename from whylabs_toolkit/monitor/diagnoser/recommendation/test_changes.py rename to tests/monitor/diagnoser/recommendation/test_changes.py diff --git a/whylabs_toolkit/monitor/diagnoser/recommendation/test_remove_columns.py b/tests/monitor/diagnoser/recommendation/test_remove_columns.py similarity index 100% rename from whylabs_toolkit/monitor/diagnoser/recommendation/test_remove_columns.py rename to tests/monitor/diagnoser/recommendation/test_remove_columns.py diff --git a/whylabs_toolkit/monitor/diagnoser/test/test_helpers.py b/tests/monitor/diagnoser/test_helpers.py similarity index 100% rename from whylabs_toolkit/monitor/diagnoser/test/test_helpers.py rename to tests/monitor/diagnoser/test_helpers.py From ebb373f62d613fca2039722634f7c9a495434ce5 Mon Sep 17 00:00:00 2001 From: Christine Draper Date: Mon, 22 Apr 2024 20:33:58 -0400 Subject: [PATCH 03/14] Tweak dependencies --- poetry.lock | 2 +- pyproject.toml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 181be90..12bbdc7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -770,4 +770,4 @@ diagnoser = ["isodate", "numpy", "pandas", "python-dateutil", "tabulate"] 
[metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "f620e2af4fec27c6f6f97a9fc08add252e9a0ae55909e671c7d28266505d1fcc" +content-hash = "dd7b6df400a44979482c1d8cddf045ac2297ac954a7e18a40172a26a4f0c36fe" diff --git a/pyproject.toml b/pyproject.toml index 0cdf524..ef44a95 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,16 +12,16 @@ include = ["whylabs_toolkit/monitor/schema/schema.json"] [tool.poetry.dependencies] python = "^3.8" whylabs-client = "^0.6.0" -pydantic = "1.10.15" +pydantic = "^1.10.15" whylogs = "^1.1.26" jsonschema = "^4.17.3" typing-extensions = "^4.11.0" -urllib3 = ">=2.0.2,<2.1.0" +urllib3 = "^2.0.2, <2.1" # diagnoser extra dependencies pandas = { version="^2.0.3", optional=true } numpy = { version="^1.24.1", optional=true } -tabulate = { version="0.8.9", optional=true } +tabulate = { version="^0.8.9", optional=true } isodate = { version="^0.6.1", optional=true } python-dateutil = { version="^2.8.2", optional=true } From 4c93105b5f3a98cac4e8cd1bcdbcaad6cbb3c5a5 Mon Sep 17 00:00:00 2001 From: Christine Draper Date: Tue, 23 Apr 2024 09:02:52 -0400 Subject: [PATCH 04/14] Fix all lint issues --- poetry.lock | 13 ++++++- pyproject.toml | 1 + .../monitor/diagnoser/helpers/describe.py | 7 ++-- .../monitor/diagnoser/helpers/utils.py | 5 ++- .../diagnoser/models/diagnosis_report.py | 37 ++++++++++++------- .../monitor/diagnoser/monitor_diagnoser.py | 35 ++++++++++-------- .../recommendation/change_recommender.py | 24 +++++++----- .../diagnoser/recommendation/manual_change.py | 2 +- .../recommendation/recommended_change.py | 6 +-- .../recommendation/remove_columns.py | 20 +++++++--- .../monitor/diagnoser/targeting.py | 11 +++--- 11 files changed, 102 insertions(+), 59 deletions(-) diff --git a/poetry.lock b/poetry.lock index 12bbdc7..8b42e46 100644 --- a/poetry.lock +++ b/poetry.lock @@ -624,6 +624,17 @@ files = [ {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] 
+[[package]] +name = "types-python-dateutil" +version = "2.9.0.20240316" +description = "Typing stubs for python-dateutil" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-python-dateutil-2.9.0.20240316.tar.gz", hash = "sha256:5d2f2e240b86905e40944dd787db6da9263f0deabef1076ddaed797351ec0202"}, + {file = "types_python_dateutil-2.9.0.20240316-py3-none-any.whl", hash = "sha256:6b8cb66d960771ce5ff974e9dd45e38facb81718cc1e208b10b1baccbfdbee3b"}, +] + [[package]] name = "typing-extensions" version = "4.11.0" @@ -770,4 +781,4 @@ diagnoser = ["isodate", "numpy", "pandas", "python-dateutil", "tabulate"] [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "dd7b6df400a44979482c1d8cddf045ac2297ac954a7e18a40172a26a4f0c36fe" +content-hash = "14e09374513c67afeab23e3431078c556d5cbe4c83190c7cd9c2dd44f189fb40" diff --git a/pyproject.toml b/pyproject.toml index ef44a95..cf9cb02 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ pytest = "^7.2.0" black = "^22.10.0" mypy = "~1.0.1" bumpversion = "^0.6.0" +types-python-dateutil = "^2.9.0.20240316" [tool.black] line-length = 140 diff --git a/whylabs_toolkit/monitor/diagnoser/helpers/describe.py b/whylabs_toolkit/monitor/diagnoser/helpers/describe.py index e726f60..c5da2e6 100644 --- a/whylabs_toolkit/monitor/diagnoser/helpers/describe.py +++ b/whylabs_toolkit/monitor/diagnoser/helpers/describe.py @@ -3,15 +3,16 @@ import pandas as pd -def describe_truncated_list(vals: List[str], num=10) -> str: +def describe_truncated_list(vals: List[str], num: int = 10) -> str: if len(vals) <= num: return str(vals) return f'{vals[0:num]} and {len(vals) - num} more' -def describe_truncated_table(df: Union[pd.DataFrame, pd.Series], num=10) -> str: +def describe_truncated_table(df: Union[pd.DataFrame, pd.Series], num: int = 10) -> str: if len(df) <= num: - return df.to_markdown() + table = df.to_markdown() + return str(table) if table is not None else 'No data to display.' 
return f'{df[0:num].to_markdown()}\n and {len(df) - num} more' diff --git a/whylabs_toolkit/monitor/diagnoser/helpers/utils.py b/whylabs_toolkit/monitor/diagnoser/helpers/utils.py index a3d73c0..2aed030 100644 --- a/whylabs_toolkit/monitor/diagnoser/helpers/utils.py +++ b/whylabs_toolkit/monitor/diagnoser/helpers/utils.py @@ -1,5 +1,5 @@ import os -from typing import List +from typing import List, Optional from whylabs_client.api.monitor_diagnostics_api import MonitorDiagnosticsApi @@ -18,7 +18,8 @@ def get_monitor_diagnostics_api(config: Config = Config()) -> MonitorDiagnostics return MonitorDiagnosticsApi(api_client=create_client(config=config)) -def env_setup(org_id: str, dataset_id: str, api_key: str = None, whylabs_endpoint: str = None): +def env_setup(org_id: str, dataset_id: str, api_key: Optional[str] = None, + whylabs_endpoint: Optional[str] = None) -> None: """ Set environment variables to work with both whylabs-toolkit and whylogs. Will pick up the API key from the environment if not provided as a parameter. 
diff --git a/whylabs_toolkit/monitor/diagnoser/models/diagnosis_report.py b/whylabs_toolkit/monitor/diagnoser/models/diagnosis_report.py index 8f2118c..43f522e 100644 --- a/whylabs_toolkit/monitor/diagnoser/models/diagnosis_report.py +++ b/whylabs_toolkit/monitor/diagnoser/models/diagnosis_report.py @@ -1,7 +1,8 @@ import pandas as pd from typing import Dict, List, Optional, Tuple from pydantic import BaseModel -from whylabs_toolkit.monitor.models import Analyzer, Monitor, Segment, TargetLevel, FixedThresholdsConfig +from whylabs_toolkit.monitor.models import Analyzer, Monitor, Segment, TargetLevel, FixedThresholdsConfig, \ + ConjunctionConfig, DisjunctionConfig, GlobalAction from whylabs_toolkit.monitor.diagnoser.helpers.describe import describe_truncated_table, filter_by_index, describe_truncated_list from whylabs_toolkit.monitor.diagnoser.helpers.utils import segment_as_readable_text @@ -132,9 +133,9 @@ class DiagnosticDataSummary(BaseModel): def describe(self) -> str: return '\n'.join([ f'Diagnostic segment is "{segment_as_readable_text(self.diagnosticSegment.tags)}".', - self.diagnosticBatches.describe(), + self.diagnosticBatches.describe() if self.diagnosticBatches is not None else '', self.diagnosticProfile.describe() if self.diagnosticProfile is not None else '', - self.analysisResults.describe() + self.analysisResults.describe() if self.analysisResults is not None else '' ]) @@ -166,19 +167,21 @@ def describe_conditions(self) -> str: if len(self.conditions) == 0: return 'No conditions related to noise were detected.' 
text = 'Conditions that may contribute to noise include:\n' - cols = [] + condition_cols: List[str] = [] for condition in self.conditions: text += f'\t* Condition {condition.name} ({condition.summary})' if condition.columns is not None: - cols += condition.columns - col_text = describe_truncated_list(cols, 10) - text += f' for {len(cols)} columns: {col_text}' + condition_cols += condition.columns + col_text = describe_truncated_list(condition_cols, 10) + text += f' for {len(condition_cols)} columns: {col_text}' text += '\n' - cols = pd.Series(cols).unique() + cols = pd.Series(condition_cols).unique() if len(cols) > 0: text += f'\nAnomalies for columns with these conditions:\n' - count_tuples = [c.to_tuple() for c in self.diagnosticData.analysisResults.anomalies.byColumnCount] + by_col_count = self.diagnosticData.analysisResults.anomalies.byColumnCount if ( + self.diagnosticData.analysisResults is not None) else [] + count_tuples = [c.to_tuple() for c in by_col_count] idx, values = zip(*count_tuples) count_by_col = pd.Series(values, idx) cols_with_count = filter_by_index(cols.tolist(), count_by_col).sort_values( @@ -207,18 +210,26 @@ def describe_monitor(self) -> str: text = (f'Diagnosis is for monitor "{self.monitor.displayName if self.monitor.displayName else self.monitor.id}" ' f'[{self.monitor.id}] in {self.datasetId} {self.orgId}, over interval {self.interval}.\n') if len(self.monitor.actions) > 0: - text += f'Monitor has {len(self.monitor.actions)} notification actions {[a.target for a in self.monitor.actions]}.\n' + text += f'Monitor has {len(self.monitor.actions)} notification actions ' + text += f'{[a.target for a in self.monitor.actions if isinstance(a, GlobalAction)]}.\n' return text def describe_analyzer(self) -> str: - baseline = 'no baseline' if isinstance(self.analyzer.config, FixedThresholdsConfig) else \ + if self.analyzer is None: + return 'No analyzer found.\n' + if isinstance(self.analyzer.config, ConjunctionConfig) or 
isinstance(self.analyzer.config, DisjunctionConfig): + return f'\nAnalyzer is a composite {self.analyzer.config.type}.' + baseline = 'no baseline' if (isinstance(self.analyzer.config, FixedThresholdsConfig) or + self.analyzer.config.baseline is None) else \ f'{self.analyzer.config.baseline.type} baseline' - # need to add better support for composite analyzers targeting_desc = '' + if self.analyzer is None: + return '' + metric = self.analyzer.config.metric if self.analyzer.targetMatrix is not None and self.analyzer.targetMatrix.type == TargetLevel.column: targeting_desc = (f'\nAnalyzer "{self.analyzer.id}" targets {self.diagnosticData.targetedColumnCount} ' f'columns and ran on {self.analyzedColumnCount} columns in the diagnosed segment.\n') - text = f'Analyzer is {self.analyzer.config.type} configuration for {self.analyzer.config.metric} metric with {baseline}.' + text = f'Analyzer is {self.analyzer.config.type} configuration for {metric} metric with {baseline}.' text += targeting_desc text += '\n' return text diff --git a/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py b/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py index 962666d..36c36ca 100644 --- a/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py +++ b/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py @@ -35,7 +35,7 @@ def __init__(self, org_id: str, dataset_id: str): self._diagnostics_api = get_monitor_diagnostics_api() self._monitor_api = get_monitor_api() self._models_api = get_models_api() - self._monitor_configs = None + self._monitor_configs: Optional[List[Monitor]] = None self._noisy_monitors: Optional[List[NoisyMonitorStats]] = None self._failed_monitors: Optional[List[FailedMonitorStats]] = None self._noisy_segments: Optional[List[NoisySegmentStats]] = None @@ -45,7 +45,7 @@ def __init__(self, org_id: str, dataset_id: str): self._monitor_id: Optional[str] = None self._diagnostic_segment: Optional[Segment] = None self._analyzer: Optional[Analyzer] = None - 
self._diagnosed_columns: Optional[str] = None + self._diagnosed_columns: Optional[List[str]] = None self._diagnosis: Optional[MonitorDiagnosisReport] = None self.schema: Optional[EntitySchema] = None @@ -88,7 +88,7 @@ def noisy_columns(self) -> List[NoisyColumnStats]: return self._noisy_columns @property - def monitor_configs(self): + def monitor_configs(self) -> List[Monitor]: if self._monitor_configs is None: config = self._monitor_api.get_monitor_config_v3(self.org_id, self.dataset_id) self._monitor_configs = [] @@ -106,8 +106,9 @@ def diagnostic_interval(self) -> str: return self._diagnostic_interval @diagnostic_interval.setter - def diagnostic_interval(self, interval: str): + def diagnostic_interval(self, interval: str) -> str: self._diagnostic_interval = interval + return self._diagnostic_interval @property def diagnostic_segment(self) -> Segment: @@ -116,11 +117,12 @@ def diagnostic_segment(self) -> Segment: return self._diagnostic_segment @diagnostic_segment.setter - def diagnostic_segment(self, segment: Segment): + def diagnostic_segment(self, segment: Segment) -> Segment: if self._diagnostic_segment != segment: self._diagnostic_segment = segment self._noisy_columns = None self._diagnosis = None + return segment @property def monitor_id_to_diagnose(self) -> str: @@ -129,7 +131,7 @@ def monitor_id_to_diagnose(self) -> str: return self._monitor_id @monitor_id_to_diagnose.setter - def monitor_id_to_diagnose(self, monitor_id: str): + def monitor_id_to_diagnose(self, monitor_id: str) -> str: if self._monitor_id != monitor_id: self._monitor_id = monitor_id # Reset anything specific to the monitor @@ -139,12 +141,13 @@ def monitor_id_to_diagnose(self, monitor_id: str): self._noisy_columns = None self._diagnosis = None self._diagnostic_segment = None + return self._monitor_id @property - def monitor_to_diagnose(self) -> Monitor: - return next(m for m in self.monitor_configs if m.id == self._monitor_id) + def monitor_to_diagnose(self) -> Optional[Monitor]: + return 
next((m for m in self.monitor_configs if m.id == self._monitor_id), None) - def targeted_columns(self): + def targeted_columns(self) -> List[str]: if self.schema is None: self.schema = self._models_api.get_entity_schema(self.org_id, self.dataset_id) return targeted_columns(self.analyzer_to_diagnose.targetMatrix, self.schema) @@ -176,7 +179,7 @@ def choose_dataset_batches(self) -> Tuple[TimeRange, Granularity, str]: lineage = TimeRange(start=resp.start_timestamp, end=resp.end_timestamp) self.granularity = time_period_to_granularity(time_period) - return lineage, self.granularity, self._diagnostic_interval + return lineage, self.granularity, resp.interval def detect_noisy_monitors(self) -> List[NoisyMonitorStats]: """ @@ -202,7 +205,7 @@ def merge_monitor_actions(item: Dict, mon_acts: List[Dict]) -> Dict: 'monitor_id': m.id, 'analyzer_id': m.analyzerIds[0] if len(m.analyzerIds) > 0 else None, 'action_count': len(m.actions), - 'action_targets': [a.target for a in m.actions] + 'action_targets': [a.target for a in m.actions if a.type == 'global'] } for m in self.monitor_configs] self._noisy_monitors = [NoisyMonitorStats.parse_obj(merge_monitor_actions(item.to_dict(), monitor_actions)) for item in resp.noisy_analyzers] @@ -215,13 +218,13 @@ def merge_monitor_actions(item: Dict, mon_acts: List[Dict]) -> Dict: return self._noisy_monitors def get_analyzer_id_for_monitor(self) -> str: - analyzer_id = next((m.analyzerIds[0] for m in self.monitor_configs if m.id == self.monitor_id_to_diagnose), + analyzer_id: Optional[str] = next((m.analyzerIds[0] for m in self.monitor_configs if m.id == self.monitor_id_to_diagnose), None) if analyzer_id is None: raise Exception(f'No analyzer found for monitor {self.monitor_id_to_diagnose}') return analyzer_id - def detect_noisy_segments(self): + def detect_noisy_segments(self) -> List[NoisySegmentStats]: analyzer_id = self.get_analyzer_id_for_monitor() resp: AnalyzerSegmentsDiagnosticResponse = self._diagnostics_api.detect_noisy_segments( 
self.org_id, @@ -232,7 +235,7 @@ def detect_noisy_segments(self): self.diagnostic_segment = self._noisy_segments[0].segment return self._noisy_segments - def detect_noisy_columns(self): + def detect_noisy_columns(self) -> List[NoisyColumnStats]: analyzer_id = self.get_analyzer_id_for_monitor() resp: AnalyzerSegmentColumnsDiagnosticResponse = self._diagnostics_api.detect_noisy_columns( self.org_id, @@ -275,7 +278,7 @@ def diagnose(self, columns: Optional[List[str]] = None) -> MonitorDiagnosisRepor if columns is None: if self._noisy_columns is None: self.detect_noisy_columns() - self._diagnosed_columns = [c.column for c in self._noisy_columns[:100]] + self._diagnosed_columns = [c.column for c in self.noisy_columns[:100]] else: self._diagnosed_columns = columns[:100] use_local_server = os.environ.get('USE_LOCAL_SERVER', False) @@ -321,6 +324,6 @@ def diagnose(self, columns: Optional[List[str]] = None) -> MonitorDiagnosisRepor **report_dict, analyzer=self.analyzer_to_diagnose, monitor=self.monitor_to_diagnose, - analyzedColumnCount=len(self._noisy_columns) + analyzedColumnCount=len(self.noisy_columns) ) return self._diagnosis diff --git a/whylabs_toolkit/monitor/diagnoser/recommendation/change_recommender.py b/whylabs_toolkit/monitor/diagnoser/recommendation/change_recommender.py index fd75262..d8fb91d 100644 --- a/whylabs_toolkit/monitor/diagnoser/recommendation/change_recommender.py +++ b/whylabs_toolkit/monitor/diagnoser/recommendation/change_recommender.py @@ -74,6 +74,8 @@ def _sort_conditions(self, conditions: List[ConditionRecord]) -> List[ConditionR @staticmethod def _best_change_for_condition(condition: ConditionRecord) -> RecommendedChange: + if condition.columns is None: + raise ValueError('Condition must have columns to recommend a change') if condition.name in ['changing_discrete', 'changing_continuous']: return RemoveColumns(columns=condition.columns, info=condition.info) info = condition.info if condition.info else {} @@ -86,11 +88,14 @@ def 
min_anomaly_count(self) -> int: return self._min_anomaly_count @min_anomaly_count.setter - def min_anomaly_count(self, count: int): + def min_anomaly_count(self, count: int) -> int: self._min_anomaly_count = count + return self._min_anomaly_count def recommend(self) -> List[RecommendedChange]: - count_tuples = [c.to_tuple() for c in self.report.diagnosticData.analysisResults.anomalies.byColumnCount] + by_col_count = self.report.diagnosticData.analysisResults.anomalies.byColumnCount if ( + self.report.diagnosticData.analysisResults is not None) else [] + count_tuples = [c.to_tuple() for c in by_col_count] cols, counts = zip(*count_tuples) anom_count = pd.Series(counts, index=cols) cols_to_address = anom_count[anom_count >= self.min_anomaly_count] @@ -102,7 +107,7 @@ def recommend(self) -> List[RecommendedChange]: changes.append(self._best_change_for_condition(c)) return changes - def _update_analyzer(self, updated: Analyzer): + def _update_analyzer(self, updated: Analyzer) -> None: self.monitor_api.put_analyzer( org_id=self.org_id, dataset_id=self.dataset_id, @@ -110,8 +115,9 @@ def _update_analyzer(self, updated: Analyzer): body=updated.dict(exclude_none=True), ) - def _delete_monitor(self): - if self.monitor is not None: + def _delete_monitor(self) -> None: + if self.monitor is not None and self.analyzer is not None: + analyzer: Analyzer = self.analyzer self.monitor_api.delete_monitor( org_id=self.org_id, dataset_id=self.dataset_id, @@ -120,10 +126,10 @@ def _delete_monitor(self): self.monitor_api.delete_analyzer( org_id=self.org_id, dataset_id=self.dataset_id, - analyzer_id=self.analyzer.id + analyzer_id=analyzer.id ) - def _add_new_monitor(self, new_analyzer: Analyzer): + def _add_new_monitor(self, new_analyzer: Analyzer) -> None: new_monitor = Monitor(**self.monitor.dict(), id=new_analyzer.id) if self.monitor else Monitor(id=new_analyzer.id) self.monitor_api.put_monitor( org_id=self.org_id, @@ -131,7 +137,7 @@ def _add_new_monitor(self, new_analyzer: Analyzer): 
monitor_id=new_analyzer.id, # use same id as the analyzer body=new_monitor.json(exclude_none=True), ) - self._monitor_api.put_analyzer( + self.monitor_api.put_analyzer( org_id=self.org_id, dataset_id=self.dataset_id, analyzer_id=new_analyzer.id, @@ -144,7 +150,7 @@ def make_changes(self, changes: Optional[List[RecommendedChange]] = None) -> Cha failed: List[RecommendedChange] = [] errors: List[str] = [] for c in changes: - if c.can_automate(): + if c.can_automate() and self.analyzer: try: changed_analyzers = c.generate_config(self.analyzer) if next((a.id for a in changed_analyzers), None) is None: diff --git a/whylabs_toolkit/monitor/diagnoser/recommendation/manual_change.py b/whylabs_toolkit/monitor/diagnoser/recommendation/manual_change.py index 395a62f..4b034a4 100644 --- a/whylabs_toolkit/monitor/diagnoser/recommendation/manual_change.py +++ b/whylabs_toolkit/monitor/diagnoser/recommendation/manual_change.py @@ -8,7 +8,7 @@ class ManualChange(RecommendedChange): manual = True def summarize(self) -> str: - condition = self.info.get('condition') + condition = self.info.get('condition', '') if self.info else '' if condition == 'narrow_threshold_band': # percent diff of 0 would be bad... 
need to add info to differentiate return 'Move columns to a new analyzer that uses absolute diff, percent diff or fixed thresholds' diff --git a/whylabs_toolkit/monitor/diagnoser/recommendation/recommended_change.py b/whylabs_toolkit/monitor/diagnoser/recommendation/recommended_change.py index 00cb568..b8fb9c4 100644 --- a/whylabs_toolkit/monitor/diagnoser/recommendation/recommended_change.py +++ b/whylabs_toolkit/monitor/diagnoser/recommendation/recommended_change.py @@ -11,11 +11,11 @@ class RecommendedChange: name = '' summary = '' manual = True - required_info = [] + required_info: List[str] = [] @classmethod - def from_condition(cls, condition: ConditionRecord): - return cls(condition.columns, condition.info) + def from_condition(cls, condition: ConditionRecord) -> RecommendedChange: + return cls(condition.columns if condition.columns is not None else [], condition.info) def __init__(self, columns: List[str], info: Optional[dict] = None): self.columns = columns diff --git a/whylabs_toolkit/monitor/diagnoser/recommendation/remove_columns.py b/whylabs_toolkit/monitor/diagnoser/recommendation/remove_columns.py index fced1f1..fb0e473 100644 --- a/whylabs_toolkit/monitor/diagnoser/recommendation/remove_columns.py +++ b/whylabs_toolkit/monitor/diagnoser/recommendation/remove_columns.py @@ -1,14 +1,15 @@ -from typing import List +from typing import List, Union -from whylabs_toolkit.monitor.models import Analyzer, TargetLevel +from whylabs_toolkit.monitor.models import Analyzer, TargetLevel, ColumnMatrix, DatasetMatrix from whylabs_toolkit.monitor.diagnoser.recommendation.recommended_change import RecommendedChange +from whylabs_toolkit.monitor.models.analyzer import ColumnGroups class RemoveColumns(RecommendedChange): name = 'remove_columns' summary = 'Remove columns from the analyzer' - required_info = [] + required_info: List[str] = [] manual = False def _check_can_do(self, analyzer: Analyzer) -> bool: @@ -18,11 +19,18 @@ def _check_can_do(self, analyzer: Analyzer) 
-> bool: def generate_config(self, analyzer: Analyzer) -> List[Analyzer]: self._check_can_do(analyzer) + if isinstance(analyzer.targetMatrix, DatasetMatrix): + return [analyzer] + target_matrix: ColumnMatrix = analyzer.targetMatrix + include: List[str] = analyzer.targetMatrix.include if analyzer.targetMatrix.include is not None else [] + exclude: List[Union[ColumnGroups, str]] = analyzer.targetMatrix.exclude if analyzer.targetMatrix.exclude is not None else [] to_remove = set(self.columns) # remove from includes if possible, otherwise exclude - remove_includes = set(analyzer.targetMatrix.include).intersection(to_remove) - analyzer.targetMatrix.include = list(set(analyzer.targetMatrix.include) - to_remove) - analyzer.targetMatrix.exclude = list(set(analyzer.targetMatrix.exclude) | (to_remove - remove_includes)) + remove_includes = set(include).intersection(to_remove) + new_includes = list(set(include) - to_remove) + analyzer.targetMatrix.include = new_includes + new_excludes = list(set(exclude).union(to_remove - remove_includes)) + analyzer.targetMatrix.exclude = new_excludes # if nothing's left to target, just remove the analyzer if len(analyzer.targetMatrix.include) == 0: return [] diff --git a/whylabs_toolkit/monitor/diagnoser/targeting.py b/whylabs_toolkit/monitor/diagnoser/targeting.py index 7d1e628..e2b9063 100644 --- a/whylabs_toolkit/monitor/diagnoser/targeting.py +++ b/whylabs_toolkit/monitor/diagnoser/targeting.py @@ -1,4 +1,4 @@ -from typing import List, Union +from typing import List, Union, Set from whylabs_toolkit.monitor.models import EntitySchema, ColumnMatrix, DatasetMatrix, TargetLevel @@ -21,11 +21,12 @@ def expand_target(target: str, schema: EntitySchema) -> List[str]: def targeted_columns(target_matrix: Union[ColumnMatrix, DatasetMatrix], schema: EntitySchema) -> List[str]: if target_matrix is None: return [] - if target_matrix.type == TargetLevel.dataset: + if isinstance(target_matrix, DatasetMatrix): return ['__internal__datasetMetrics'] - 
columns = set() - for include in target_matrix.include: - columns.update(expand_target(include, schema)) + columns: Set[str] = set() + if target_matrix.include is not None: + for include in target_matrix.include: + columns.update(expand_target(include, schema)) if target_matrix.exclude is not None: for exclude in target_matrix.exclude: columns = columns - set(expand_target(exclude, schema)) From 1bff986b4d1a2eb71c6d0d9581e4e1402bc26748 Mon Sep 17 00:00:00 2001 From: Christine Draper Date: Tue, 23 Apr 2024 09:18:36 -0400 Subject: [PATCH 05/14] Apply format fixes --- .bumpversion.cfg | 2 +- pyproject.toml | 2 +- .../diagnoser/converters/granularity.py | 26 +-- .../monitor/diagnoser/helpers/describe.py | 6 +- .../monitor/diagnoser/helpers/utils.py | 37 ++-- .../diagnoser/models/diagnosis_report.py | 164 ++++++++++-------- .../monitor/diagnoser/monitor_diagnoser.py | 126 +++++++++----- .../recommendation/change_recommender.py | 76 ++++---- .../diagnoser/recommendation/manual_change.py | 12 +- .../recommendation/recommended_change.py | 19 +- .../recommendation/remove_columns.py | 10 +- .../monitor/diagnoser/targeting.py | 23 ++- 12 files changed, 283 insertions(+), 220 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 8535547..74205e4 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.0.18 +current_version = 0.1.19 tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? serialize = diff --git a/pyproject.toml b/pyproject.toml index cf9cb02..3e7660e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "whylabs-toolkit" -version = "0.0.19" +version = "0.1.19" description = "Whylabs Toolkit package." 
authors = ["Murilo Mendonca ", "Anthony Naddeo ", "Christine Draper "] diff --git a/whylabs_toolkit/monitor/diagnoser/converters/granularity.py b/whylabs_toolkit/monitor/diagnoser/converters/granularity.py index b62c3cb..2d0cc76 100644 --- a/whylabs_toolkit/monitor/diagnoser/converters/granularity.py +++ b/whylabs_toolkit/monitor/diagnoser/converters/granularity.py @@ -4,26 +4,26 @@ def batches_to_timedelta(time_period: str, batches: int) -> relativedelta: - if time_period == 'PT1H': + if time_period == "PT1H": return relativedelta(hours=batches) - if time_period == 'P1W': + if time_period == "P1W": return relativedelta(weeks=batches) - if time_period == 'P1M': + if time_period == "P1M": return relativedelta(months=batches) return relativedelta(days=batches) def time_period_to_granularity(time_period: str) -> Granularity: - if time_period == 'PT1H': + if time_period == "PT1H": return Granularity.hourly - if time_period == 'P1W': + if time_period == "P1W": return Granularity.weekly - if time_period == 'P1M': + if time_period == "P1M": return Granularity.monthly return Granularity.daily @@ -31,21 +31,21 @@ def time_period_to_granularity(time_period: str) -> Granularity: def calculate_num_batches(interval: str, granularity: str) -> int: # Parse the ISO8601 interval string into a start and end datetime - start, end = interval.split('/') - start_date = parse_datetime(start) if 'T' in start else parse_date(start) + start, end = interval.split("/") + start_date = parse_datetime(start) if "T" in start else parse_date(start) try: - end_date = parse_datetime(end) if 'T' in start else parse_date(end) + end_date = parse_datetime(end) if "T" in start else parse_date(end) except ValueError: end_date = start_date + parse_duration(end) # Calculate the difference based on the granularity - if granularity == 'hourly': + if granularity == "hourly": difference = relativedelta(end_date, start_date).days * 24 + relativedelta(end_date, start_date).hours - elif granularity == 'daily': + 
elif granularity == "daily": difference = relativedelta(end_date, start_date).days - elif granularity == 'weekly': + elif granularity == "weekly": difference = relativedelta(end_date, start_date).weeks - elif granularity == 'monthly': + elif granularity == "monthly": difference = relativedelta(end_date, start_date).months else: raise ValueError(f"Unsupported granularity: {granularity}") diff --git a/whylabs_toolkit/monitor/diagnoser/helpers/describe.py b/whylabs_toolkit/monitor/diagnoser/helpers/describe.py index c5da2e6..7d52d2d 100644 --- a/whylabs_toolkit/monitor/diagnoser/helpers/describe.py +++ b/whylabs_toolkit/monitor/diagnoser/helpers/describe.py @@ -6,14 +6,14 @@ def describe_truncated_list(vals: List[str], num: int = 10) -> str: if len(vals) <= num: return str(vals) - return f'{vals[0:num]} and {len(vals) - num} more' + return f"{vals[0:num]} and {len(vals) - num} more" def describe_truncated_table(df: Union[pd.DataFrame, pd.Series], num: int = 10) -> str: if len(df) <= num: table = df.to_markdown() - return str(table) if table is not None else 'No data to display.' - return f'{df[0:num].to_markdown()}\n and {len(df) - num} more' + return str(table) if table is not None else "No data to display." 
+ return f"{df[0:num].to_markdown()}\n and {len(df) - num} more" def filter_by_index(items: Union[pd.Index, list], ref: pd.Series) -> pd.Series: diff --git a/whylabs_toolkit/monitor/diagnoser/helpers/utils.py b/whylabs_toolkit/monitor/diagnoser/helpers/utils.py index 2aed030..7591b8e 100644 --- a/whylabs_toolkit/monitor/diagnoser/helpers/utils.py +++ b/whylabs_toolkit/monitor/diagnoser/helpers/utils.py @@ -18,8 +18,9 @@ def get_monitor_diagnostics_api(config: Config = Config()) -> MonitorDiagnostics return MonitorDiagnosticsApi(api_client=create_client(config=config)) -def env_setup(org_id: str, dataset_id: str, api_key: Optional[str] = None, - whylabs_endpoint: Optional[str] = None) -> None: +def env_setup( + org_id: str, dataset_id: str, api_key: Optional[str] = None, whylabs_endpoint: Optional[str] = None +) -> None: """ Set environment variables to work with both whylabs-toolkit and whylogs. Will pick up the API key from the environment if not provided as a parameter. @@ -29,39 +30,39 @@ def env_setup(org_id: str, dataset_id: str, api_key: Optional[str] = None, :param whylabs_endpoint: :return: """ - os.environ['WHYLABS_API_KEY'] = api_key if api_key else os.environ['WHYLABS_API_KEY'] - if not os.environ['WHYLABS_API_KEY']: - raise Exception('Please provide an API key') - os.environ['WHYLABS_DEFAULT_ORG_ID'] = org_id - os.environ['ORG_ID'] = org_id - os.environ['WHYLABS_DEFAULT_DATASET_ID'] = dataset_id + os.environ["WHYLABS_API_KEY"] = api_key if api_key else os.environ["WHYLABS_API_KEY"] + if not os.environ["WHYLABS_API_KEY"]: + raise Exception("Please provide an API key") + os.environ["WHYLABS_DEFAULT_ORG_ID"] = org_id + os.environ["ORG_ID"] = org_id + os.environ["WHYLABS_DEFAULT_DATASET_ID"] = dataset_id if whylabs_endpoint: - os.environ['WHYLABS_API_ENDPOINT'] = whylabs_endpoint - os.environ['WHYLABS_HOST'] = whylabs_endpoint + os.environ["WHYLABS_API_ENDPOINT"] = whylabs_endpoint + os.environ["WHYLABS_HOST"] = whylabs_endpoint def segment_to_text(segment: 
List[SegmentTag]) -> str: if segment is None or len(segment) == 0: - return '' - text = '' + return "" + text = "" for tag in segment: if len(text) > 0: - text += '&' - text += f'{tag.key}={tag.value}' + text += "&" + text += f"{tag.key}={tag.value}" return text def segment_as_readable_text(segment: List[SegmentTag]) -> str: text = segment_to_text(segment) - return 'overall' if text == '' else text + return "overall" if text == "" else text def text_to_segment(text: str) -> List[SegmentTag]: - if text == '': + if text == "": return [] tags = [] - parts = text.split('&') + parts = text.split("&") for part in parts: - [key, value] = part.split('=', 2) + [key, value] = part.split("=", 2) tags.append(SegmentTag(key=key, value=value)) return tags diff --git a/whylabs_toolkit/monitor/diagnoser/models/diagnosis_report.py b/whylabs_toolkit/monitor/diagnoser/models/diagnosis_report.py index 43f522e..6226842 100644 --- a/whylabs_toolkit/monitor/diagnoser/models/diagnosis_report.py +++ b/whylabs_toolkit/monitor/diagnoser/models/diagnosis_report.py @@ -1,10 +1,22 @@ import pandas as pd from typing import Dict, List, Optional, Tuple from pydantic import BaseModel -from whylabs_toolkit.monitor.models import Analyzer, Monitor, Segment, TargetLevel, FixedThresholdsConfig, \ - ConjunctionConfig, DisjunctionConfig, GlobalAction - -from whylabs_toolkit.monitor.diagnoser.helpers.describe import describe_truncated_table, filter_by_index, describe_truncated_list +from whylabs_toolkit.monitor.models import ( + Analyzer, + Monitor, + Segment, + TargetLevel, + FixedThresholdsConfig, + ConjunctionConfig, + DisjunctionConfig, + GlobalAction, +) + +from whylabs_toolkit.monitor.diagnoser.helpers.describe import ( + describe_truncated_table, + filter_by_index, + describe_truncated_list, +) from whylabs_toolkit.monitor.diagnoser.helpers.utils import segment_as_readable_text @@ -44,10 +56,12 @@ class ProfileSummary(BaseModel): maxRowCount: int def describe(self) -> str: - count_desc = str( - 
self.minRowCount) if self.minRowCount == self.maxRowCount else \ - f'{self.minRowCount} - {self.maxRowCount}' - return f'Diagnostic interval rollup contains {count_desc} rows for the diagnosed columns.\n' + count_desc = ( + str(self.minRowCount) + if self.minRowCount == self.maxRowCount + else f"{self.minRowCount} - {self.maxRowCount}" + ) + return f"Diagnostic interval rollup contains {count_desc} rows for the diagnosed columns.\n" class BatchesSummary(BaseModel): @@ -57,10 +71,12 @@ class BatchesSummary(BaseModel): maxBatchCount: int def describe(self) -> str: - count_desc = str( - self.minBatchCount) if self.minBatchCount == self.maxBatchCount else \ - f'{self.minBatchCount} - {self.maxBatchCount}' - return f'Diagnostic interval contains {count_desc} batches.\n' + count_desc = ( + str(self.minBatchCount) + if self.minBatchCount == self.maxBatchCount + else f"{self.minBatchCount} - {self.maxBatchCount}" + ) + return f"Diagnostic interval contains {count_desc} batches.\n" class ResultRecord(BaseModel): @@ -68,7 +84,7 @@ class ResultRecord(BaseModel): batchCount: int def describe(self) -> str: - return f'Found non-failed results for {self.diagnosedColumnCount} columns and {self.batchCount} batches.' + return f"Found non-failed results for {self.diagnosedColumnCount} columns and {self.batchCount} batches." class FailureRecord(BaseModel): @@ -82,12 +98,13 @@ def describe(self) -> str: failures = pd.DataFrame([c.to_tuple() for c in self.byColumnCount]) failure_types = [t.name for t in self.byTypeCount] if len(failures) == 0: - return 'No failures were detected.' + return "No failures were detected." 
return ( - f'Found {self.totalFailuresCount} failed results, with up to {self.maxFailuresCount} ' - f'failures per column and {self.meanFailuresCount} failures on average.\n' - f'Failure types are {describe_truncated_list(failure_types)}\n' - f'Columns with failures are: \n{describe_truncated_table(failures)}\n') + f"Found {self.totalFailuresCount} failed results, with up to {self.maxFailuresCount} " + f"failures per column and {self.meanFailuresCount} failures on average.\n" + f"Failure types are {describe_truncated_list(failure_types)}\n" + f"Columns with failures are: \n{describe_truncated_table(failures)}\n" + ) class AnomalyRecord(BaseModel): @@ -105,10 +122,11 @@ def describe(self) -> str: mean_count = float(self.meanAnomalyCount) mean_pct = mean_count * 100 / self.batchCount return ( - f'Found {self.totalAnomalyCount} anomalies in {len(self.byColumnCount)} columns, with up to ' - f'{max_pct:.1f}% ({max_count}) batches having anomalies per column and ' - f'{mean_pct:.1f}% ({mean_count:.1f}) on average.\n' - f'Columns with anomalies are:\n{describe_truncated_table(counts)}\n') + f"Found {self.totalAnomalyCount} anomalies in {len(self.byColumnCount)} columns, with up to " + f"{max_pct:.1f}% ({max_count}) batches having anomalies per column and " + f"{mean_pct:.1f}% ({mean_count:.1f}) on average.\n" + f"Columns with anomalies are:\n{describe_truncated_table(counts)}\n" + ) class AnalysisResultsSummary(BaseModel): @@ -117,10 +135,12 @@ class AnalysisResultsSummary(BaseModel): anomalies: AnomalyRecord def describe(self) -> str: - return (f'Analysis results summary:\n' - f'{self.results.describe()}\n' - f'{self.anomalies.describe()}\n' - f'{self.failures.describe()}\n') + return ( + f"Analysis results summary:\n" + f"{self.results.describe()}\n" + f"{self.anomalies.describe()}\n" + f"{self.failures.describe()}\n" + ) class DiagnosticDataSummary(BaseModel): @@ -131,12 +151,14 @@ class DiagnosticDataSummary(BaseModel): targetedColumnCount: int def describe(self) -> 
str: - return '\n'.join([ - f'Diagnostic segment is "{segment_as_readable_text(self.diagnosticSegment.tags)}".', - self.diagnosticBatches.describe() if self.diagnosticBatches is not None else '', - self.diagnosticProfile.describe() if self.diagnosticProfile is not None else '', - self.analysisResults.describe() if self.analysisResults is not None else '' - ]) + return "\n".join( + [ + f'Diagnostic segment is "{segment_as_readable_text(self.diagnosticSegment.tags)}".', + self.diagnosticBatches.describe() if self.diagnosticBatches is not None else "", + self.diagnosticProfile.describe() if self.diagnosticProfile is not None else "", + self.analysisResults.describe() if self.analysisResults is not None else "", + ] + ) class AnalyzerDiagnosisReport(BaseModel): @@ -150,46 +172,45 @@ class AnalyzerDiagnosisReport(BaseModel): conditions: List[ConditionRecord] def describe(self) -> str: - text = '\n'.join( - [self.diagnosticData.describe(), - self.describe_quality_issues(), self.describe_conditions()]) + text = "\n".join([self.diagnosticData.describe(), self.describe_quality_issues(), self.describe_conditions()]) return text def describe_quality_issues(self) -> str: if len(self.qualityIssues) == 0: - return 'No issues impacting diagnosis quality were detected' - text = 'Conditions that may impact diagnosis quality include:\n' + return "No issues impacting diagnosis quality were detected" + text = "Conditions that may impact diagnosis quality include:\n" for issue in self.qualityIssues: - text += f'\t* {issue.name}: {issue.description} - detectors {issue.detectors}\n' + text += f"\t* {issue.name}: {issue.description} - detectors {issue.detectors}\n" return text def describe_conditions(self) -> str: if len(self.conditions) == 0: - return 'No conditions related to noise were detected.' - text = 'Conditions that may contribute to noise include:\n' + return "No conditions related to noise were detected." 
+ text = "Conditions that may contribute to noise include:\n" condition_cols: List[str] = [] for condition in self.conditions: - text += f'\t* Condition {condition.name} ({condition.summary})' + text += f"\t* Condition {condition.name} ({condition.summary})" if condition.columns is not None: condition_cols += condition.columns col_text = describe_truncated_list(condition_cols, 10) - text += f' for {len(condition_cols)} columns: {col_text}' - text += '\n' + text += f" for {len(condition_cols)} columns: {col_text}" + text += "\n" cols = pd.Series(condition_cols).unique() if len(cols) > 0: - text += f'\nAnomalies for columns with these conditions:\n' - by_col_count = self.diagnosticData.analysisResults.anomalies.byColumnCount if ( - self.diagnosticData.analysisResults is not None) else [] + text += f"\nAnomalies for columns with these conditions:\n" + by_col_count = ( + self.diagnosticData.analysisResults.anomalies.byColumnCount + if (self.diagnosticData.analysisResults is not None) + else [] + ) count_tuples = [c.to_tuple() for c in by_col_count] idx, values = zip(*count_tuples) count_by_col = pd.Series(values, idx) - cols_with_count = filter_by_index(cols.tolist(), count_by_col).sort_values( - ascending=False) - cols_with_count.rename('anomalies') + cols_with_count = filter_by_index(cols.tolist(), count_by_col).sort_values(ascending=False) + cols_with_count.rename("anomalies") text += describe_truncated_table(cols_with_count) - text += (f'\nAccounting for {cols_with_count.sum()} anomalies out of ' - f'{count_by_col.sum()}\n') + text += f"\nAccounting for {cols_with_count.sum()} anomalies out of " f"{count_by_col.sum()}\n" return text @@ -200,38 +221,43 @@ class MonitorDiagnosisReport(AnalyzerDiagnosisReport): analyzedColumnCount: int def describe(self) -> str: - text = '\n'.join( - [self.describe_monitor(), self.describe_analyzer(), super().describe()]) + text = "\n".join([self.describe_monitor(), self.describe_analyzer(), super().describe()]) return text def 
describe_monitor(self) -> str: if self.monitor is None: - return 'Monitor has been deleted.\n' - text = (f'Diagnosis is for monitor "{self.monitor.displayName if self.monitor.displayName else self.monitor.id}" ' - f'[{self.monitor.id}] in {self.datasetId} {self.orgId}, over interval {self.interval}.\n') + return "Monitor has been deleted.\n" + text = ( + f'Diagnosis is for monitor "{self.monitor.displayName if self.monitor.displayName else self.monitor.id}" ' + f"[{self.monitor.id}] in {self.datasetId} {self.orgId}, over interval {self.interval}.\n" + ) if len(self.monitor.actions) > 0: - text += f'Monitor has {len(self.monitor.actions)} notification actions ' - text += f'{[a.target for a in self.monitor.actions if isinstance(a, GlobalAction)]}.\n' + text += f"Monitor has {len(self.monitor.actions)} notification actions " + text += f"{[a.target for a in self.monitor.actions if isinstance(a, GlobalAction)]}.\n" return text def describe_analyzer(self) -> str: if self.analyzer is None: - return 'No analyzer found.\n' + return "No analyzer found.\n" if isinstance(self.analyzer.config, ConjunctionConfig) or isinstance(self.analyzer.config, DisjunctionConfig): - return f'\nAnalyzer is a composite {self.analyzer.config.type}.' - baseline = 'no baseline' if (isinstance(self.analyzer.config, FixedThresholdsConfig) or - self.analyzer.config.baseline is None) else \ - f'{self.analyzer.config.baseline.type} baseline' - targeting_desc = '' + return f"\nAnalyzer is a composite {self.analyzer.config.type}." 
+ baseline = ( + "no baseline" + if (isinstance(self.analyzer.config, FixedThresholdsConfig) or self.analyzer.config.baseline is None) + else f"{self.analyzer.config.baseline.type} baseline" + ) + targeting_desc = "" if self.analyzer is None: - return '' + return "" metric = self.analyzer.config.metric if self.analyzer.targetMatrix is not None and self.analyzer.targetMatrix.type == TargetLevel.column: - targeting_desc = (f'\nAnalyzer "{self.analyzer.id}" targets {self.diagnosticData.targetedColumnCount} ' - f'columns and ran on {self.analyzedColumnCount} columns in the diagnosed segment.\n') - text = f'Analyzer is {self.analyzer.config.type} configuration for {metric} metric with {baseline}.' + targeting_desc = ( + f'\nAnalyzer "{self.analyzer.id}" targets {self.diagnosticData.targetedColumnCount} ' + f"columns and ran on {self.analyzedColumnCount} columns in the diagnosed segment.\n" + ) + text = f"Analyzer is {self.analyzer.config.type} configuration for {metric} metric with {baseline}." 
text += targeting_desc - text += '\n' + text += "\n" return text diff --git a/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py b/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py index 36c36ca..bc71c0a 100644 --- a/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py +++ b/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py @@ -21,8 +21,14 @@ from whylabs_toolkit.monitor.diagnoser.helpers.utils import get_monitor_diagnostics_api, segment_as_readable_text from whylabs_toolkit.monitor.diagnoser.converters.granularity import time_period_to_granularity from whylabs_toolkit.monitor.diagnoser.constants import DEFAULT_BATCHES -from whylabs_toolkit.monitor.diagnoser.models import NoisyMonitorStats, FailedMonitorStats, FailedSegmentStats, \ - NoisySegmentStats, NoisyColumnStats, MonitorDiagnosisReport +from whylabs_toolkit.monitor.diagnoser.models import ( + NoisyMonitorStats, + FailedMonitorStats, + FailedSegmentStats, + NoisySegmentStats, + NoisyColumnStats, + MonitorDiagnosisReport, +) from whylabs_toolkit.monitor.diagnoser.targeting import targeted_columns @@ -92,7 +98,7 @@ def monitor_configs(self) -> List[Monitor]: if self._monitor_configs is None: config = self._monitor_api.get_monitor_config_v3(self.org_id, self.dataset_id) self._monitor_configs = [] - for m in config.get('monitors', []): + for m in config.get("monitors", []): try: self._monitor_configs.append(Monitor.parse_obj(m)) except ValidationError: @@ -168,13 +174,12 @@ def choose_dataset_batches(self) -> Tuple[TimeRange, Granularity, str]: """ # get recommended diagnostic interval and the dataset's batch frequency resp: DiagnosticIntervalResponse = self._diagnostics_api.recommend_diagnostic_interval( - self.org_id, - DiagnosticIntervalRequest(dataset_id=self.dataset_id, batches=self.desired_batches) + self.org_id, DiagnosticIntervalRequest(dataset_id=self.dataset_id, batches=self.desired_batches) ) time_period = resp.time_period self._diagnostic_interval = resp.interval if 
resp.start_timestamp is None or resp.end_timestamp is None: - raise Exception('No existing batch data') + raise Exception("No existing batch data") lineage = TimeRange(start=resp.start_timestamp, end=resp.end_timestamp) self.granularity = time_period_to_granularity(time_period) @@ -189,39 +194,48 @@ def detect_noisy_monitors(self) -> List[NoisyMonitorStats]: """ def merge_monitor_actions(item: Dict, mon_acts: List[Dict]) -> Dict: - monitor_action = next((m for m in mon_acts if m['analyzer_id'] == item['analyzer_id']), None) + monitor_action = next((m for m in mon_acts if m["analyzer_id"] == item["analyzer_id"]), None) if monitor_action: item.update(monitor_action) else: - item['action_count'] = 0 - item['action_targets'] = [] + item["action_count"] = 0 + item["action_targets"] = [] return item if self._diagnostic_interval is None: self.choose_dataset_batches() resp: AnalyzersDiagnosticResponse = self._diagnostics_api.detect_noisy_analyzers( - self.org_id, AnalyzersDiagnosticRequest(dataset_id=self.dataset_id, interval=self._diagnostic_interval)) - monitor_actions = [{ - 'monitor_id': m.id, - 'analyzer_id': m.analyzerIds[0] if len(m.analyzerIds) > 0 else None, - 'action_count': len(m.actions), - 'action_targets': [a.target for a in m.actions if a.type == 'global'] - } for m in self.monitor_configs] - self._noisy_monitors = [NoisyMonitorStats.parse_obj(merge_monitor_actions(item.to_dict(), monitor_actions)) - for item in resp.noisy_analyzers] - self._failed_monitors = [FailedMonitorStats.parse_obj(merge_monitor_actions(item.to_dict(), monitor_actions)) - for item in resp.failed_analyzers] + self.org_id, AnalyzersDiagnosticRequest(dataset_id=self.dataset_id, interval=self._diagnostic_interval) + ) + monitor_actions = [ + { + "monitor_id": m.id, + "analyzer_id": m.analyzerIds[0] if len(m.analyzerIds) > 0 else None, + "action_count": len(m.actions), + "action_targets": [a.target for a in m.actions if a.type == "global"], + } + for m in self.monitor_configs + ] + 
self._noisy_monitors = [ + NoisyMonitorStats.parse_obj(merge_monitor_actions(item.to_dict(), monitor_actions)) + for item in resp.noisy_analyzers + ] + self._failed_monitors = [ + FailedMonitorStats.parse_obj(merge_monitor_actions(item.to_dict(), monitor_actions)) + for item in resp.failed_analyzers + ] if len(self._noisy_monitors) == 0: - raise Exception('No noisy monitors found') + raise Exception("No noisy monitors found") if self._monitor_id is None: self._monitor_id = self._noisy_monitors[0].monitor_id return self._noisy_monitors def get_analyzer_id_for_monitor(self) -> str: - analyzer_id: Optional[str] = next((m.analyzerIds[0] for m in self.monitor_configs if m.id == self.monitor_id_to_diagnose), - None) + analyzer_id: Optional[str] = next( + (m.analyzerIds[0] for m in self.monitor_configs if m.id == self.monitor_id_to_diagnose), None + ) if analyzer_id is None: - raise Exception(f'No analyzer found for monitor {self.monitor_id_to_diagnose}') + raise Exception(f"No analyzer found for monitor {self.monitor_id_to_diagnose}") return analyzer_id def detect_noisy_segments(self) -> List[NoisySegmentStats]: @@ -229,7 +243,9 @@ def detect_noisy_segments(self) -> List[NoisySegmentStats]: resp: AnalyzerSegmentsDiagnosticResponse = self._diagnostics_api.detect_noisy_segments( self.org_id, AnalyzerSegmentsDiagnosticRequest( - dataset_id=self.dataset_id, analyzer_id=analyzer_id, interval=self._diagnostic_interval)) + dataset_id=self.dataset_id, analyzer_id=analyzer_id, interval=self._diagnostic_interval + ), + ) self._noisy_segments = [NoisySegmentStats.parse_obj(n.to_dict()) for n in resp.noisy_segments] self._failed_segments = [FailedSegmentStats.parse_obj(n.to_dict()) for n in resp.failed_segments] self.diagnostic_segment = self._noisy_segments[0].segment @@ -240,31 +256,37 @@ def detect_noisy_columns(self) -> List[NoisyColumnStats]: resp: AnalyzerSegmentColumnsDiagnosticResponse = self._diagnostics_api.detect_noisy_columns( self.org_id, 
AnalyzerSegmentColumnsDiagnosticRequest( - dataset_id=self.dataset_id, analyzer_id=analyzer_id, interval=self._diagnostic_interval, - segment=WhyLabsSegment(tags=[WhyLabsSegmentTag(t.key, t.value) for t in self.diagnostic_segment.tags]))) + dataset_id=self.dataset_id, + analyzer_id=analyzer_id, + interval=self._diagnostic_interval, + segment=WhyLabsSegment(tags=[WhyLabsSegmentTag(t.key, t.value) for t in self.diagnostic_segment.tags]), + ), + ) self._noisy_columns = [NoisyColumnStats.parse_obj(n.to_dict()) for n in resp.noisy_columns] return self._noisy_columns def describe_segments(self) -> str: with_anomalies = [s for s in self.noisy_segments if s.total_anomalies > 0] with_failures = [s for s in self.failed_segments if s.total_failed > 0] - text = (f'{len(with_anomalies)} of {len(self.noisy_segments)} analyzed segments have anomalies ' - f'and {len(with_failures)} have failures\n\n') + text = ( + f"{len(with_anomalies)} of {len(self.noisy_segments)} analyzed segments have anomalies " + f"and {len(with_failures)} have failures\n\n" + ) if len(with_anomalies): - text += 'Segments with anomalies:\n' + text += "Segments with anomalies:\n" text += pd.DataFrame.from_records(with_anomalies).to_markdown() - text += '\n' + text += "\n" if len(with_failures): - text += 'Segments with failures:\n' + text += "Segments with failures:\n" text += pd.DataFrame.from_records(with_failures).to_markdown() - text += '\n' + text += "\n" noisiest = segment_as_readable_text(self.noisy_segments[0].segment.tags) - text += f'Noisiest segment selected for diagnosis: {noisiest}\n' + text += f"Noisiest segment selected for diagnosis: {noisiest}\n" return text def describe_columns(self) -> str: cols = self.noisy_columns - text = f'Analysis ran on {len(cols)} columns in the diagnosed segment.\n' + text = f"Analysis ran on {len(cols)} columns in the diagnosed segment.\n" text += pd.DataFrame.from_records(cols).to_markdown() return text @@ -281,17 +303,21 @@ def diagnose(self, columns: 
Optional[List[str]] = None) -> MonitorDiagnosisRepor self._diagnosed_columns = [c.column for c in self.noisy_columns[:100]] else: self._diagnosed_columns = columns[:100] - use_local_server = os.environ.get('USE_LOCAL_SERVER', False) + use_local_server = os.environ.get("USE_LOCAL_SERVER", False) if use_local_server: # Call the server function directly if configured to do so (for testing) try: from smart_config.server.server import DiagnosisRequest from smart_config.server.diagnosis.analyzer_diagnoser import AnalyzerDiagnoser - if use_local_server == 'library': + + if use_local_server == "library": # Call server code directly analyzer_diagnoser = AnalyzerDiagnoser( - self.org_id, self.dataset_id, self.get_analyzer_id_for_monitor(), self.diagnostic_interval, - os.environ['WHYLABS_API_KEY'] + self.org_id, + self.dataset_id, + self.get_analyzer_id_for_monitor(), + self.diagnostic_interval, + os.environ["WHYLABS_API_KEY"], ) analyzer_diagnoser.assemble_data([t for t in self.diagnostic_segment.tags], self._diagnosed_columns) analyzer_diagnoser.run_detectors() @@ -300,9 +326,16 @@ def diagnose(self, columns: Optional[List[str]] = None) -> MonitorDiagnosisRepor else: # Call local instance of server from smart_config.server.service.diagnosis_service import DiagnosisService - diagnosis_service = DiagnosisService(options={ - 'headers': {'Accept': 'application/json', 'Content-Type': 'application/json', - 'X-API-KEY': os.environ['WHYLABS_API_KEY']}}) + + diagnosis_service = DiagnosisService( + options={ + "headers": { + "Accept": "application/json", + "Content-Type": "application/json", + "X-API-KEY": os.environ["WHYLABS_API_KEY"], + } + } + ) report_dict = diagnosis_service.diagnose_sync( DiagnosisRequest( orgId=self.org_id, @@ -312,18 +345,19 @@ def diagnose(self, columns: Optional[List[str]] = None) -> MonitorDiagnosisRepor columns=self._diagnosed_columns, segment=self.diagnostic_segment, granularity=self.granularity, - )) + ) + ) except ImportError: - raise 
Exception('USE_LOCAL_SERVER is set but server library is not available.') + raise Exception("USE_LOCAL_SERVER is set but server library is not available.") else: # TODO implement call through songbird/whylabs-client instead of direct # Call the diagnosis API via whyLabs client - raise NotImplementedError('Diagnosis API call not implemented') + raise NotImplementedError("Diagnosis API call not implemented") self._diagnosis = MonitorDiagnosisReport( **report_dict, analyzer=self.analyzer_to_diagnose, monitor=self.monitor_to_diagnose, - analyzedColumnCount=len(self.noisy_columns) + analyzedColumnCount=len(self.noisy_columns), ) return self._diagnosis diff --git a/whylabs_toolkit/monitor/diagnoser/recommendation/change_recommender.py b/whylabs_toolkit/monitor/diagnoser/recommendation/change_recommender.py index d8fb91d..40660c2 100644 --- a/whylabs_toolkit/monitor/diagnoser/recommendation/change_recommender.py +++ b/whylabs_toolkit/monitor/diagnoser/recommendation/change_recommender.py @@ -21,18 +21,18 @@ class ChangeResults(NamedTuple): manual: List[RecommendedChange] def describe(self) -> str: - description = '' + description = "" if len(self.succeeded): - description += 'Successfully made the following changes:\n' - description += '\n\t'.join(['\t* ' + c.describe() for c in self.succeeded]) + '\n' + description += "Successfully made the following changes:\n" + description += "\n\t".join(["\t* " + c.describe() for c in self.succeeded]) + "\n" if len(self.failed): - description += 'Failed to make the following changes:\n' - description += '\n\t'.join(['\t* ' + c.describe() for c in self.failed]) - description += '\nErrors:\n' - description += '\n\t'.join(['\t* ' + e for e in self.errors]) + '\n' + description += "Failed to make the following changes:\n" + description += "\n\t".join(["\t* " + c.describe() for c in self.failed]) + description += "\nErrors:\n" + description += "\n\t".join(["\t* " + e for e in self.errors]) + "\n" if len(self.manual): - description += 'The 
following changes require manual intervention:\n' - description += '\n\t'.join(['\t* ' + c.describe() for c in self.manual]) + '\n' + description += "The following changes require manual intervention:\n" + description += "\n\t".join(["\t* " + c.describe() for c in self.manual]) + "\n" return description @@ -40,18 +40,21 @@ class ChangeRecommender: _condition_order = [ # specific conditions unlikely to be rectified by other actions - 'changing_discrete', 'changing_continuous', - 'few_unique', 'many_unique', 'very_few_unique', - 'late_upload_mismatch', - 'narrow_threshold_band', - 'small_nonnull_batches', + "changing_discrete", + "changing_continuous", + "few_unique", + "many_unique", + "very_few_unique", + "late_upload_mismatch", + "narrow_threshold_band", + "small_nonnull_batches", # most general conditions - 'stale_analysis', - 'low_drift_threshold', - 'fixed_threshold_mismatch', - 'stddev_insufficient_baseline', - 'missing_baseline_batches', - 'fixed_baseline_mismatch' + "stale_analysis", + "low_drift_threshold", + "fixed_threshold_mismatch", + "stddev_insufficient_baseline", + "missing_baseline_batches", + "fixed_baseline_mismatch", ] def __init__(self, report: MonitorDiagnosisReport): @@ -75,12 +78,12 @@ def _sort_conditions(self, conditions: List[ConditionRecord]) -> List[ConditionR @staticmethod def _best_change_for_condition(condition: ConditionRecord) -> RecommendedChange: if condition.columns is None: - raise ValueError('Condition must have columns to recommend a change') - if condition.name in ['changing_discrete', 'changing_continuous']: + raise ValueError("Condition must have columns to recommend a change") + if condition.name in ["changing_discrete", "changing_continuous"]: return RemoveColumns(columns=condition.columns, info=condition.info) info = condition.info if condition.info else {} - info['condition'] = condition.name - info['summary'] = condition.summary + info["condition"] = condition.name + info["summary"] = condition.summary return 
ManualChange(columns=condition.columns, info=info) @property @@ -93,8 +96,11 @@ def min_anomaly_count(self, count: int) -> int: return self._min_anomaly_count def recommend(self) -> List[RecommendedChange]: - by_col_count = self.report.diagnosticData.analysisResults.anomalies.byColumnCount if ( - self.report.diagnosticData.analysisResults is not None) else [] + by_col_count = ( + self.report.diagnosticData.analysisResults.anomalies.byColumnCount + if (self.report.diagnosticData.analysisResults is not None) + else [] + ) count_tuples = [c.to_tuple() for c in by_col_count] cols, counts = zip(*count_tuples) anom_count = pd.Series(counts, index=cols) @@ -118,19 +124,13 @@ def _update_analyzer(self, updated: Analyzer) -> None: def _delete_monitor(self) -> None: if self.monitor is not None and self.analyzer is not None: analyzer: Analyzer = self.analyzer - self.monitor_api.delete_monitor( - org_id=self.org_id, - dataset_id=self.dataset_id, - monitor_id=self.monitor.id - ) - self.monitor_api.delete_analyzer( - org_id=self.org_id, - dataset_id=self.dataset_id, - analyzer_id=analyzer.id - ) + self.monitor_api.delete_monitor(org_id=self.org_id, dataset_id=self.dataset_id, monitor_id=self.monitor.id) + self.monitor_api.delete_analyzer(org_id=self.org_id, dataset_id=self.dataset_id, analyzer_id=analyzer.id) def _add_new_monitor(self, new_analyzer: Analyzer) -> None: - new_monitor = Monitor(**self.monitor.dict(), id=new_analyzer.id) if self.monitor else Monitor(id=new_analyzer.id) + new_monitor = ( + Monitor(**self.monitor.dict(), id=new_analyzer.id) if self.monitor else Monitor(id=new_analyzer.id) + ) self.monitor_api.put_monitor( org_id=self.org_id, dataset_id=self.dataset_id, @@ -165,5 +165,5 @@ def make_changes(self, changes: Optional[List[RecommendedChange]] = None) -> Cha succeeded.append(c) except Exception as e: failed.append(c) - errors.append(f'{c.name} failed with {e}') + errors.append(f"{c.name} failed with {e}") return ChangeResults(succeeded, failed, errors, [c 
for c in changes if not c.can_automate()]) diff --git a/whylabs_toolkit/monitor/diagnoser/recommendation/manual_change.py b/whylabs_toolkit/monitor/diagnoser/recommendation/manual_change.py index 4b034a4..b885fe3 100644 --- a/whylabs_toolkit/monitor/diagnoser/recommendation/manual_change.py +++ b/whylabs_toolkit/monitor/diagnoser/recommendation/manual_change.py @@ -2,14 +2,14 @@ class ManualChange(RecommendedChange): - name = 'manual_change' - summary = 'Make a manual change to the analyzer to address {condition}: {summary}' - required_info = ['condition'] + name = "manual_change" + summary = "Make a manual change to the analyzer to address {condition}: {summary}" + required_info = ["condition"] manual = True def summarize(self) -> str: - condition = self.info.get('condition', '') if self.info else '' - if condition == 'narrow_threshold_band': + condition = self.info.get("condition", "") if self.info else "" + if condition == "narrow_threshold_band": # percent diff of 0 would be bad... need to add info to differentiate - return 'Move columns to a new analyzer that uses absolute diff, percent diff or fixed thresholds' + return "Move columns to a new analyzer that uses absolute diff, percent diff or fixed thresholds" return super().summarize() diff --git a/whylabs_toolkit/monitor/diagnoser/recommendation/recommended_change.py b/whylabs_toolkit/monitor/diagnoser/recommendation/recommended_change.py index b8fb9c4..b900974 100644 --- a/whylabs_toolkit/monitor/diagnoser/recommendation/recommended_change.py +++ b/whylabs_toolkit/monitor/diagnoser/recommendation/recommended_change.py @@ -8,8 +8,8 @@ class RecommendedChange: - name = '' - summary = '' + name = "" + summary = "" manual = True required_info: List[str] = [] @@ -23,12 +23,12 @@ def __init__(self, columns: List[str], info: Optional[dict] = None): def merge(self, change: RecommendedChange) -> RecommendedChange: if change.name != self.name: - raise ValueError(f'Cannot merge {self.name} and {change.name}') + raise 
ValueError(f"Cannot merge {self.name} and {change.name}") merged = RecommendedChange(list(set(self.columns) | set(change.columns)), self.info) merged.merge_info(change.info) return merged - def merge_info(self, info: Optional[dict]) -> Optional[dict]: + def merge_info(self, info: Optional[dict]) -> Optional[dict]: if self.info is None: self.info = info elif info is not None: @@ -40,20 +40,21 @@ def summarize(self) -> str: return self.summary.format(**info) def describe(self) -> str: - return f'{self.summarize()} for {describe_truncated_list(self.columns)}' + return f"{self.summarize()} for {describe_truncated_list(self.columns)}" def can_automate(self) -> bool: return all(getattr(self.info, f, False) for f in self.required_info) and not self.manual def _check_can_do(self, analyzer: Analyzer) -> bool: if self.manual: - raise Exception(f'{self.name} has not been automated') + raise Exception(f"{self.name} has not been automated") if not self.can_automate(): - raise Exception(f'{self.name} requires extra information ' - f'{[f for f in self.required_info if self.info is None or f not in self.info.keys()]}') + raise Exception( + f"{self.name} requires extra information " + f"{[f for f in self.required_info if self.info is None or f not in self.info.keys()]}" + ) return True def generate_config(self, analyzer: Analyzer) -> List[Analyzer]: self._check_can_do(analyzer) return [analyzer] - diff --git a/whylabs_toolkit/monitor/diagnoser/recommendation/remove_columns.py b/whylabs_toolkit/monitor/diagnoser/recommendation/remove_columns.py index fb0e473..531a40a 100644 --- a/whylabs_toolkit/monitor/diagnoser/recommendation/remove_columns.py +++ b/whylabs_toolkit/monitor/diagnoser/recommendation/remove_columns.py @@ -7,14 +7,14 @@ class RemoveColumns(RecommendedChange): - name = 'remove_columns' - summary = 'Remove columns from the analyzer' + name = "remove_columns" + summary = "Remove columns from the analyzer" required_info: List[str] = [] manual = False def 
_check_can_do(self, analyzer: Analyzer) -> bool: if analyzer.targetMatrix.type == TargetLevel.dataset: - raise ValueError('Cannot remove columns from a dataset level target matrix') + raise ValueError("Cannot remove columns from a dataset level target matrix") return super()._check_can_do(analyzer) def generate_config(self, analyzer: Analyzer) -> List[Analyzer]: @@ -23,7 +23,9 @@ def generate_config(self, analyzer: Analyzer) -> List[Analyzer]: return [analyzer] target_matrix: ColumnMatrix = analyzer.targetMatrix include: List[str] = analyzer.targetMatrix.include if analyzer.targetMatrix.include is not None else [] - exclude: List[Union[ColumnGroups, str]] = analyzer.targetMatrix.exclude if analyzer.targetMatrix.exclude is not None else [] + exclude: List[Union[ColumnGroups, str]] = ( + analyzer.targetMatrix.exclude if analyzer.targetMatrix.exclude is not None else [] + ) to_remove = set(self.columns) # remove from includes if possible, otherwise exclude remove_includes = set(include).intersection(to_remove) diff --git a/whylabs_toolkit/monitor/diagnoser/targeting.py b/whylabs_toolkit/monitor/diagnoser/targeting.py index e2b9063..e999b88 100644 --- a/whylabs_toolkit/monitor/diagnoser/targeting.py +++ b/whylabs_toolkit/monitor/diagnoser/targeting.py @@ -4,17 +4,17 @@ def expand_target(target: str, schema: EntitySchema) -> List[str]: - if target == '*': + if target == "*": return [str(k) for k in schema.columns.keys()] col_items = schema.columns.items() - if target == 'group:discrete': - return [name for (name, c) in col_items if c.discreteness == 'discrete'] - if target == 'group:continuous': - return [name for (name, c) in col_items if c.discreteness != 'discrete'] - if target == 'group:input': - return [name for (name, c) in col_items if c.classifier == 'input'] - if target == 'group:output': - return [name for (name, c) in col_items if c.classifier == 'output'] + if target == "group:discrete": + return [name for (name, c) in col_items if c.discreteness == 
"discrete"] + if target == "group:continuous": + return [name for (name, c) in col_items if c.discreteness != "discrete"] + if target == "group:input": + return [name for (name, c) in col_items if c.classifier == "input"] + if target == "group:output": + return [name for (name, c) in col_items if c.classifier == "output"] return [target] @@ -22,13 +22,12 @@ def targeted_columns(target_matrix: Union[ColumnMatrix, DatasetMatrix], schema: if target_matrix is None: return [] if isinstance(target_matrix, DatasetMatrix): - return ['__internal__datasetMetrics'] + return ["__internal__datasetMetrics"] columns: Set[str] = set() if target_matrix.include is not None: for include in target_matrix.include: - columns.update(expand_target(include, schema)) + columns.update(expand_target(include, schema)) if target_matrix.exclude is not None: for exclude in target_matrix.exclude: columns = columns - set(expand_target(exclude, schema)) return list(columns) - From cab753cb114e0ec08d58bda3c1c3a98508f321d4 Mon Sep 17 00:00:00 2001 From: Christine Draper Date: Tue, 23 Apr 2024 09:21:03 -0400 Subject: [PATCH 06/14] Update diagnoser readme --- whylabs_toolkit/monitor/diagnoser/README.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/whylabs_toolkit/monitor/diagnoser/README.md b/whylabs_toolkit/monitor/diagnoser/README.md index e534e26..26de78c 100644 --- a/whylabs_toolkit/monitor/diagnoser/README.md +++ b/whylabs_toolkit/monitor/diagnoser/README.md @@ -13,9 +13,14 @@ diagnoser may not match the dataset to any known conditions. Users will also usu most appropriate action to take to fix the monitor. A recommender is provided to suggest reasonable actions and to automate some of the basic actions. We are happy to work with you to improve the diagnoser in such cases. 
+## Usage +To start using the diagnoser, install whylabs_toolkit including the diagnoser extra from PyPI with: +```bash +pip install 'whylabs_toolkit[diagnoser]' +``` + See [diagnoser.ipynb](/examples/example_notebooks/diagnoser.ipynb) for an end-to-end example of identifying noisy monitors, diagnosing the conditions contributing to noise, and getting recommendations for fixing them. -See [customized_diagnoser.ipynb](/examples/example_notebooks/customized_diagnoser.ipynb) for an example of how to -customize the diagnosis for your specific needs. - +See [customized_diagnoser.ipynb](/examples/example_notebooks/customized_diagnoser.ipynb) for an example of how to +customize the diagnosis for your specific needs. \ No newline at end of file From 4a107cae7e620200111191aaa635387bf4519025 Mon Sep 17 00:00:00 2001 From: Christine Draper Date: Fri, 26 Apr 2024 16:13:57 -0400 Subject: [PATCH 07/14] Call diagnoser service via API --- .../customized_diagnoser.ipynb | 1202 ++--------------- examples/example_notebooks/diagnoser.ipynb | 474 +++++-- poetry.lock | 8 +- pyproject.toml | 2 +- whylabs_toolkit/helpers/utils.py | 5 + .../monitor/diagnoser/monitor_diagnoser.py | 46 +- 6 files changed, 533 insertions(+), 1204 deletions(-) diff --git a/examples/example_notebooks/customized_diagnoser.ipynb b/examples/example_notebooks/customized_diagnoser.ipynb index ade9188..a88123f 100644 --- a/examples/example_notebooks/customized_diagnoser.ipynb +++ b/examples/example_notebooks/customized_diagnoser.ipynb @@ -19,80 +19,15 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "ExecuteTime": { - "end_time": "2024-04-16T15:01:13.012745Z", - "start_time": "2024-04-16T15:01:09.165663Z" - }, "pycharm": { "name": "#%%\n" + }, + "ExecuteTime": { + "end_time": "2024-04-26T20:11:58.764825Z", + "start_time": "2024-04-26T20:11:58.762547Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Obtaining file:///Volumes/Workspace/hack/smart-config\r\n", - " 
Installing build dependencies ... \u001b[?25ldone\r\n", - "\u001b[?25h Checking if build backend supports build_editable ... \u001b[?25ldone\r\n", - "\u001b[?25h Getting requirements to build editable ... \u001b[?25ldone\r\n", - "\u001b[?25h Installing backend dependencies ... \u001b[?25ldone\r\n", - "\u001b[?25h Preparing editable metadata (pyproject.toml) ... \u001b[?25ldone\r\n", - "\u001b[?25hRequirement already satisfied: tabulate in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (0.9.0)\r\n", - "Requirement already satisfied: pandas in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (2.0.1)\r\n", - "Requirement already satisfied: numpy in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (1.24.3)\r\n", - "Requirement already satisfied: whylabs-client in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (0.6.2)\r\n", - "Requirement already satisfied: whylabs-toolkit in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (0.0.18)\r\n", - "Requirement already satisfied: pydantic<2 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (1.10.14)\r\n", - "Requirement already satisfied: isodate in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (0.6.1)\r\n", - "Requirement already satisfied: python-dateutil in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (2.8.2)\r\n", - "Requirement already satisfied: fastapi in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (0.110.0)\r\n", - "Requirement already satisfied: uvicorn[standard] in 
/Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (0.28.0)\r\n", - "Requirement already satisfied: requests in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from WhyLabs-Monitor-Diagnoser==0.0.1) (2.31.0)\r\n", - "Requirement already satisfied: typing-extensions>=4.2.0 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from pydantic<2->WhyLabs-Monitor-Diagnoser==0.0.1) (4.9.0)\r\n", - "Requirement already satisfied: starlette<0.37.0,>=0.36.3 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from fastapi->WhyLabs-Monitor-Diagnoser==0.0.1) (0.36.3)\r\n", - "Requirement already satisfied: six in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from isodate->WhyLabs-Monitor-Diagnoser==0.0.1) (1.16.0)\r\n", - "Requirement already satisfied: pytz>=2020.1 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from pandas->WhyLabs-Monitor-Diagnoser==0.0.1) (2022.7.1)\r\n", - "Requirement already satisfied: tzdata>=2022.1 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from pandas->WhyLabs-Monitor-Diagnoser==0.0.1) (2023.3)\r\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from requests->WhyLabs-Monitor-Diagnoser==0.0.1) (3.3.2)\r\n", - "Requirement already satisfied: idna<4,>=2.5 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from requests->WhyLabs-Monitor-Diagnoser==0.0.1) (3.6)\r\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from requests->WhyLabs-Monitor-Diagnoser==0.0.1) (2.2.0)\r\n", - "Requirement already satisfied: certifi>=2017.4.17 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from requests->WhyLabs-Monitor-Diagnoser==0.0.1) (2024.2.2)\r\n", - "Requirement already 
satisfied: click>=7.0 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from uvicorn[standard]->WhyLabs-Monitor-Diagnoser==0.0.1) (8.0.4)\r\n", - "Requirement already satisfied: h11>=0.8 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from uvicorn[standard]->WhyLabs-Monitor-Diagnoser==0.0.1) (0.14.0)\r\n", - "Requirement already satisfied: httptools>=0.5.0 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from uvicorn[standard]->WhyLabs-Monitor-Diagnoser==0.0.1) (0.6.1)\r\n", - "Requirement already satisfied: python-dotenv>=0.13 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from uvicorn[standard]->WhyLabs-Monitor-Diagnoser==0.0.1) (1.0.1)\r\n", - "Requirement already satisfied: pyyaml>=5.1 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from uvicorn[standard]->WhyLabs-Monitor-Diagnoser==0.0.1) (6.0)\r\n", - "Requirement already satisfied: uvloop!=0.15.0,!=0.15.1,>=0.14.0 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from uvicorn[standard]->WhyLabs-Monitor-Diagnoser==0.0.1) (0.19.0)\r\n", - "Requirement already satisfied: watchfiles>=0.13 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from uvicorn[standard]->WhyLabs-Monitor-Diagnoser==0.0.1) (0.21.0)\r\n", - "Requirement already satisfied: websockets>=10.4 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from uvicorn[standard]->WhyLabs-Monitor-Diagnoser==0.0.1) (12.0)\r\n", - "Requirement already satisfied: jsonschema<5.0.0,>=4.17.3 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from whylabs-toolkit->WhyLabs-Monitor-Diagnoser==0.0.1) (4.21.1)\r\n", - "Requirement already satisfied: whylogs<2.0.0,>=1.1.26 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from whylabs-toolkit->WhyLabs-Monitor-Diagnoser==0.0.1) (1.2.8)\r\n", - "Requirement already satisfied: attrs>=22.2.0 in 
/Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from jsonschema<5.0.0,>=4.17.3->whylabs-toolkit->WhyLabs-Monitor-Diagnoser==0.0.1) (23.2.0)\r\n", - "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from jsonschema<5.0.0,>=4.17.3->whylabs-toolkit->WhyLabs-Monitor-Diagnoser==0.0.1) (2023.12.1)\r\n", - "Requirement already satisfied: referencing>=0.28.4 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from jsonschema<5.0.0,>=4.17.3->whylabs-toolkit->WhyLabs-Monitor-Diagnoser==0.0.1) (0.33.0)\r\n", - "Requirement already satisfied: rpds-py>=0.7.1 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from jsonschema<5.0.0,>=4.17.3->whylabs-toolkit->WhyLabs-Monitor-Diagnoser==0.0.1) (0.18.0)\r\n", - "Requirement already satisfied: anyio<5,>=3.4.0 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from starlette<0.37.0,>=0.36.3->fastapi->WhyLabs-Monitor-Diagnoser==0.0.1) (3.6.2)\r\n", - "Requirement already satisfied: platformdirs<4.0.0,>=3.5.0 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from whylogs<2.0.0,>=1.1.26->whylabs-toolkit->WhyLabs-Monitor-Diagnoser==0.0.1) (3.11.0)\r\n", - "Requirement already satisfied: protobuf>=3.19.4 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from whylogs<2.0.0,>=1.1.26->whylabs-toolkit->WhyLabs-Monitor-Diagnoser==0.0.1) (4.25.2)\r\n", - "Requirement already satisfied: types-requests<3.0.0.0,>=2.30.0.0 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from whylogs<2.0.0,>=1.1.26->whylabs-toolkit->WhyLabs-Monitor-Diagnoser==0.0.1) (2.31.0.20240125)\r\n", - "Requirement already satisfied: whylogs-sketching>=3.4.1.dev3 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from whylogs<2.0.0,>=1.1.26->whylabs-toolkit->WhyLabs-Monitor-Diagnoser==0.0.1) (3.4.1.dev3)\r\n", - 
"Requirement already satisfied: sniffio>=1.1 in /Users/cdraper/miniconda3/envs/hackthis/lib/python3.9/site-packages (from anyio<5,>=3.4.0->starlette<0.37.0,>=0.36.3->fastapi->WhyLabs-Monitor-Diagnoser==0.0.1) (1.3.0)\r\n", - "Building wheels for collected packages: WhyLabs-Monitor-Diagnoser\r\n", - " Building editable for WhyLabs-Monitor-Diagnoser (pyproject.toml) ... \u001b[?25ldone\r\n", - "\u001b[?25h Created wheel for WhyLabs-Monitor-Diagnoser: filename=WhyLabs_Monitor_Diagnoser-0.0.1-0.editable-py3-none-any.whl size=3253 sha256=7b4cbfe8c7d43b46817562de75e01238943321354a771ca71eae6da224702c26\r\n", - " Stored in directory: /private/var/folders/kg/k2sb6xms2650ty85vy98q5qr0000gn/T/pip-ephem-wheel-cache-mw1sol4x/wheels/3b/90/fd/b769d4b005362ce18dbd94fe781f74806d1a79ffbe447812d7\r\n", - "Successfully built WhyLabs-Monitor-Diagnoser\r\n", - "Installing collected packages: WhyLabs-Monitor-Diagnoser\r\n", - " Attempting uninstall: WhyLabs-Monitor-Diagnoser\r\n", - " Found existing installation: WhyLabs-Monitor-Diagnoser 0.0.1\r\n", - " Uninstalling WhyLabs-Monitor-Diagnoser-0.0.1:\r\n", - " Successfully uninstalled WhyLabs-Monitor-Diagnoser-0.0.1\r\n", - "Successfully installed WhyLabs-Monitor-Diagnoser-0.0.1\r\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], + "outputs": [], "source": [ "#%pip install whylabs-toolkit[diagnoser]\n" ] @@ -112,26 +47,15 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": { - "ExecuteTime": { - "end_time": "2024-04-16T15:01:16.123058Z", - "start_time": "2024-04-16T15:01:13.014131Z" - }, "collapsed": false, - "jupyter": { - "outputs_hidden": false + "ExecuteTime": { + "end_time": "2024-04-26T20:12:02.212387Z", + "start_time": "2024-04-26T20:11:58.779115Z" } }, - "outputs": [ - { - "name": "stdin", - "output_type": "stream", - "text": [ - " ········\n" - ] - } - ], + "outputs": [], "source": [ "import getpass\n", "from 
whylabs_toolkit.monitor.diagnoser.helpers.utils import env_setup\n", @@ -152,10 +76,7 @@ { "cell_type": "markdown", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "Then initialize the Monitor Diagnoser with the org_id and dataset_id." @@ -163,15 +84,12 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": { - "ExecuteTime": { - "end_time": "2024-04-16T15:01:16.451964Z", - "start_time": "2024-04-16T15:01:16.124858Z" - }, "collapsed": false, - "jupyter": { - "outputs_hidden": false + "ExecuteTime": { + "end_time": "2024-04-26T20:12:02.537020Z", + "start_time": "2024-04-26T20:12:02.214892Z" } }, "outputs": [], @@ -192,23 +110,19 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2024-04-16T15:01:17.021550Z", - "start_time": "2024-04-16T15:01:16.452760Z" + "end_time": "2024-04-26T20:12:02.959292Z", + "start_time": "2024-04-26T20:12:02.538022Z" } }, "outputs": [ { "data": { - "text/plain": [ - "(TimeRange(start=datetime.datetime(2020, 10, 8, 0, 0, tzinfo=datetime.timezone.utc), end=datetime.datetime(2024, 4, 21, 21, 0, tzinfo=datetime.timezone.utc)),\n", - " ,\n", - " '2024-03-22T00:00:00.000Z/2024-04-21T00:00:00.000Z')" - ] + "text/plain": "(TimeRange(start=datetime.datetime(2020, 10, 8, 0, 0, tzinfo=datetime.timezone.utc), end=datetime.datetime(2024, 4, 25, 21, 0, tzinfo=datetime.timezone.utc)),\n ,\n '2024-03-26T00:00:00.000Z/2024-04-25T00:00:00.000Z')" }, - "execution_count": 3, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -229,269 +143,21 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": { - "ExecuteTime": { - "end_time": "2024-04-16T15:01:18.724361Z", - "start_time": "2024-04-16T15:01:17.024927Z" - }, "collapsed": false, - "jupyter": { - "outputs_hidden": false + "ExecuteTime": { + "end_time": 
"2024-04-26T20:12:04.625414Z", + "start_time": "2024-04-26T20:12:02.961490Z" } }, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
monitor_idanalyzer_idmetriccolumn_countsegment_countanomaly_countmax_anomaly_per_columnmin_anomaly_per_columnavg_anomaly_per_columnaction_countaction_targets
0kind-cyan-kangaroo-1253kind-cyan-kangaroo-1253-analyzerhistogram11303030300[]
1cooperative-maroon-parrot-8886discrete-drift-jensenshannon-analyzerfrequent_items11303030300[]
2famous-salmon-cobra-8902famous-salmon-cobra-8902-analyzermin11303030300[]
3proud-seagreen-carabeef-65proud-seagreen-carabeef-65-analyzerhistogram11303030300[]
4Nonecooperative-maroon-parrot-8886-analyzerfrequent_items11303030300[]
....................................
94glamorous-orchid-turtle-6425glamorous-orchid-turtle-6425-analyzerhistogram1122220[]
95Noneshy-black-raccoon-3594-analyzerhistogram1122220[]
96Noneeager-violet-newt-4599-analyzercount_null_ratio221262110[]
97unsightly-bisque-lemur-1917unsightly-bisque-lemur-1917-analyzerfrequent_items1111110[]
98expensive-tomato-moose-6522csw-analyzer-2median1111110[]
\n", - "

99 rows × 11 columns

\n", - "
" - ], - "text/plain": [ - " monitor_id analyzer_id \\\n", - "0 kind-cyan-kangaroo-1253 kind-cyan-kangaroo-1253-analyzer \n", - "1 cooperative-maroon-parrot-8886 discrete-drift-jensenshannon-analyzer \n", - "2 famous-salmon-cobra-8902 famous-salmon-cobra-8902-analyzer \n", - "3 proud-seagreen-carabeef-65 proud-seagreen-carabeef-65-analyzer \n", - "4 None cooperative-maroon-parrot-8886-analyzer \n", - ".. ... ... \n", - "94 glamorous-orchid-turtle-6425 glamorous-orchid-turtle-6425-analyzer \n", - "95 None shy-black-raccoon-3594-analyzer \n", - "96 None eager-violet-newt-4599-analyzer \n", - "97 unsightly-bisque-lemur-1917 unsightly-bisque-lemur-1917-analyzer \n", - "98 expensive-tomato-moose-6522 csw-analyzer-2 \n", - "\n", - " metric column_count segment_count anomaly_count \\\n", - "0 histogram 1 1 30 \n", - "1 frequent_items 1 1 30 \n", - "2 min 1 1 30 \n", - "3 histogram 1 1 30 \n", - "4 frequent_items 1 1 30 \n", - ".. ... ... ... ... \n", - "94 histogram 1 1 2 \n", - "95 histogram 1 1 2 \n", - "96 count_null_ratio 22 1 26 \n", - "97 frequent_items 1 1 1 \n", - "98 median 1 1 1 \n", - "\n", - " max_anomaly_per_column min_anomaly_per_column avg_anomaly_per_column \\\n", - "0 30 30 30 \n", - "1 30 30 30 \n", - "2 30 30 30 \n", - "3 30 30 30 \n", - "4 30 30 30 \n", - ".. ... ... ... \n", - "94 2 2 2 \n", - "95 2 2 2 \n", - "96 2 1 1 \n", - "97 1 1 1 \n", - "98 1 1 1 \n", - "\n", - " action_count action_targets \n", - "0 0 [] \n", - "1 0 [] \n", - "2 0 [] \n", - "3 0 [] \n", - "4 0 [] \n", - ".. ... ... \n", - "94 0 [] \n", - "95 0 [] \n", - "96 0 [] \n", - "97 0 [] \n", - "98 0 [] \n", - "\n", - "[99 rows x 11 columns]" - ] + "text/plain": " monitor_id \\\n0 kind-cyan-kangaroo-1253 \n1 cooperative-maroon-parrot-8886 \n2 famous-salmon-cobra-8902 \n3 proud-seagreen-carabeef-65 \n4 None \n.. ... 
\n94 glamorous-orchid-turtle-6425 \n95 breakable-limegreen-shrew-7623 \n96 hilarious-powderblue-chamois-8115 \n97 horrible-magenta-sandpiper-8117 \n98 unsightly-bisque-lemur-1917 \n\n analyzer_id metric column_count \\\n0 kind-cyan-kangaroo-1253-analyzer histogram 1 \n1 discrete-drift-jensenshannon-analyzer frequent_items 1 \n2 famous-salmon-cobra-8902-analyzer min 1 \n3 proud-seagreen-carabeef-65-analyzer histogram 1 \n4 cooperative-maroon-parrot-8886-analyzer frequent_items 1 \n.. ... ... ... \n94 glamorous-orchid-turtle-6425-analyzer histogram 1 \n95 breakable-limegreen-shrew-7623-analyzer histogram 1 \n96 hilarious-powderblue-chamois-8115-analyzer histogram 1 \n97 horrible-magenta-sandpiper-8117-analyzer frequent_items 1 \n98 unsightly-bisque-lemur-1917-analyzer frequent_items 1 \n\n segment_count anomaly_count max_anomaly_per_column \\\n0 1 30 30 \n1 1 30 30 \n2 1 30 30 \n3 1 30 30 \n4 1 30 30 \n.. ... ... ... \n94 1 2 2 \n95 1 2 2 \n96 1 2 2 \n97 1 2 2 \n98 1 1 1 \n\n min_anomaly_per_column avg_anomaly_per_column action_count \\\n0 30 30 0 \n1 30 30 0 \n2 30 30 0 \n3 30 30 0 \n4 30 30 0 \n.. ... ... ... \n94 2 2 0 \n95 2 2 0 \n96 2 2 0 \n97 2 2 0 \n98 1 1 0 \n\n action_targets \n0 [] \n1 [] \n2 [] \n3 [] \n4 [] \n.. ... \n94 [] \n95 [] \n96 [] \n97 [] \n98 [] \n\n[99 rows x 11 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
monitor_idanalyzer_idmetriccolumn_countsegment_countanomaly_countmax_anomaly_per_columnmin_anomaly_per_columnavg_anomaly_per_columnaction_countaction_targets
0kind-cyan-kangaroo-1253kind-cyan-kangaroo-1253-analyzerhistogram11303030300[]
1cooperative-maroon-parrot-8886discrete-drift-jensenshannon-analyzerfrequent_items11303030300[]
2famous-salmon-cobra-8902famous-salmon-cobra-8902-analyzermin11303030300[]
3proud-seagreen-carabeef-65proud-seagreen-carabeef-65-analyzerhistogram11303030300[]
4Nonecooperative-maroon-parrot-8886-analyzerfrequent_items11303030300[]
....................................
94glamorous-orchid-turtle-6425glamorous-orchid-turtle-6425-analyzerhistogram1122220[]
95breakable-limegreen-shrew-7623breakable-limegreen-shrew-7623-analyzerhistogram1122220[]
96hilarious-powderblue-chamois-8115hilarious-powderblue-chamois-8115-analyzerhistogram1122220[]
97horrible-magenta-sandpiper-8117horrible-magenta-sandpiper-8117-analyzerfrequent_items1122220[]
98unsightly-bisque-lemur-1917unsightly-bisque-lemur-1917-analyzerfrequent_items1111110[]
\n

99 rows × 11 columns

\n
" }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -506,10 +172,7 @@ { "cell_type": "markdown", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "Once you have run `detect_noisy_monitors`, you can retrieve the result at any time via the `noisy_monitors` property. You can also retrieve\n", @@ -518,546 +181,21 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": { - "ExecuteTime": { - "end_time": "2024-04-16T15:01:18.736520Z", - "start_time": "2024-04-16T15:01:18.725303Z" - }, "collapsed": false, - "jupyter": { - "outputs_hidden": false + "ExecuteTime": { + "end_time": "2024-04-26T20:12:04.638177Z", + "start_time": "2024-04-26T20:12:04.626701Z" } }, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
monitor_idanalyzer_idmetricfailed_countmax_failed_per_columnmin_failed_per_columnavg_failed_per_columnaction_countaction_targets
0energetic-black-cobra-7838energetic-black-cobra-7838-analyzerunique_est603030301[email]
1good-cornsilk-bear-9359good-cornsilk-bear-9359-analyzercount_null2313303290[]
2elated-gray-baboon-4620elated-gray-baboon-4620-analyzercount_null_ratio68308221[email]
3Noneexpensive-tomato-moose-6522-analyzermedian2109273260[]
4expensive-tomato-moose-6522csw-analyzer-2median877273110[]
5missing-values-ratio-monitor-v9uywimissing-values-ratio-analyzer-v9uywicount_null_ratio2714263251[email]
6curious-lemonchiffon-rabbit-7000curious-lemonchiffon-rabbit-7000-analyzerfrequent_items111111111[test-sort]
7clear-azure-starling-8883clear-azure-starling-8883-analyzerfrequent_items111111111[test-sort]
8light-mintcream-rhinoceros-3655light-mintcream-rhinoceros-3655-analyzerfrequent_items5011280[]
9handsome-lemonchiffon-eel-4222handsome-lemonchiffon-eel-4222-analyzerfrequent_items1311260[]
10witty-blue-koala-8098witty-blue-koala-8098-analyzerhistogram33330[]
11dark-blanchedalmond-ferret-7729dark-blanchedalmond-ferret-7729-analyzerfrequent_items33330[]
12eager-limegreen-hedgehog-1312eager-limegreen-hedgehog-1312-analyzerhistogram33330[]
13famous-yellow-baboon-2243famous-yellow-baboon-2243-analyzerhistogram33330[]
14gifted-coral-bison-842gifted-coral-bison-842-analyzerhistogram33330[]
15glamorous-orchid-turtle-6425glamorous-orchid-turtle-6425-analyzerhistogram33330[]
16inexpensive-maroon-donkey-7562inexpensive-maroon-donkey-7562-analyzerhistogram33330[]
17inferred-data-type-monitor-vjwbpoinferred-data-type-analyzer-vjwbpoinferred_data_type33330[]
18busy-hotpink-gaur-9703busy-hotpink-gaur-9703-analyzercount_null_ratio11110[]
19fancy-chocolate-wasp-8247fancy-chocolate-wasp-8247-analyzercount11110[]
20Noneeager-violet-newt-4599-analyzercount_null_ratio11110[]
21plain-fuchsia-stinkbug-4064plain-fuchsia-stinkbug-4064-analyzercount_null_ratio11110[]
22stormy-olive-butterfly-8693stormy-olive-butterfly-8693-analyzerhistogram11110[]
23tame-beige-sardine-3501tame-beige-sardine-3501-analyzercount_null_ratio11110[]
24tough-green-hare-1322tough-green-hare-1322-analyzercount_null_ratio11110[]
25uninterested-blueviolet-reindeer-9950uninterested-blueviolet-reindeer-9950-analyzercount11111[christine-test-email]
26uninterested-red-alpaca-2523uninterested-red-alpaca-2523-analyzercount_null_ratio11110[]
27unique-estimate-ratio-monitor-ccf7clunique-estimate-ratio-analyzer-ccf7clunique_est_ratio11112[email, slack]
28happy-snow-grouse-452happy-snow-grouse-452-analyzercount_null_ratio11110[]
29unique-ratio-29f3ef1c-monitorunique-ratio-29f3ef1cunique_est_ratio11110[]
\n", - "
" - ], - "text/plain": [ - " monitor_id \\\n", - "0 energetic-black-cobra-7838 \n", - "1 good-cornsilk-bear-9359 \n", - "2 elated-gray-baboon-4620 \n", - "3 None \n", - "4 expensive-tomato-moose-6522 \n", - "5 missing-values-ratio-monitor-v9uywi \n", - "6 curious-lemonchiffon-rabbit-7000 \n", - "7 clear-azure-starling-8883 \n", - "8 light-mintcream-rhinoceros-3655 \n", - "9 handsome-lemonchiffon-eel-4222 \n", - "10 witty-blue-koala-8098 \n", - "11 dark-blanchedalmond-ferret-7729 \n", - "12 eager-limegreen-hedgehog-1312 \n", - "13 famous-yellow-baboon-2243 \n", - "14 gifted-coral-bison-842 \n", - "15 glamorous-orchid-turtle-6425 \n", - "16 inexpensive-maroon-donkey-7562 \n", - "17 inferred-data-type-monitor-vjwbpo \n", - "18 busy-hotpink-gaur-9703 \n", - "19 fancy-chocolate-wasp-8247 \n", - "20 None \n", - "21 plain-fuchsia-stinkbug-4064 \n", - "22 stormy-olive-butterfly-8693 \n", - "23 tame-beige-sardine-3501 \n", - "24 tough-green-hare-1322 \n", - "25 uninterested-blueviolet-reindeer-9950 \n", - "26 uninterested-red-alpaca-2523 \n", - "27 unique-estimate-ratio-monitor-ccf7cl \n", - "28 happy-snow-grouse-452 \n", - "29 unique-ratio-29f3ef1c-monitor \n", - "\n", - " analyzer_id metric \\\n", - "0 energetic-black-cobra-7838-analyzer unique_est \n", - "1 good-cornsilk-bear-9359-analyzer count_null \n", - "2 elated-gray-baboon-4620-analyzer count_null_ratio \n", - "3 expensive-tomato-moose-6522-analyzer median \n", - "4 csw-analyzer-2 median \n", - "5 missing-values-ratio-analyzer-v9uywi count_null_ratio \n", - "6 curious-lemonchiffon-rabbit-7000-analyzer frequent_items \n", - "7 clear-azure-starling-8883-analyzer frequent_items \n", - "8 light-mintcream-rhinoceros-3655-analyzer frequent_items \n", - "9 handsome-lemonchiffon-eel-4222-analyzer frequent_items \n", - "10 witty-blue-koala-8098-analyzer histogram \n", - "11 dark-blanchedalmond-ferret-7729-analyzer frequent_items \n", - "12 eager-limegreen-hedgehog-1312-analyzer histogram \n", - "13 
famous-yellow-baboon-2243-analyzer histogram \n", - "14 gifted-coral-bison-842-analyzer histogram \n", - "15 glamorous-orchid-turtle-6425-analyzer histogram \n", - "16 inexpensive-maroon-donkey-7562-analyzer histogram \n", - "17 inferred-data-type-analyzer-vjwbpo inferred_data_type \n", - "18 busy-hotpink-gaur-9703-analyzer count_null_ratio \n", - "19 fancy-chocolate-wasp-8247-analyzer count \n", - "20 eager-violet-newt-4599-analyzer count_null_ratio \n", - "21 plain-fuchsia-stinkbug-4064-analyzer count_null_ratio \n", - "22 stormy-olive-butterfly-8693-analyzer histogram \n", - "23 tame-beige-sardine-3501-analyzer count_null_ratio \n", - "24 tough-green-hare-1322-analyzer count_null_ratio \n", - "25 uninterested-blueviolet-reindeer-9950-analyzer count \n", - "26 uninterested-red-alpaca-2523-analyzer count_null_ratio \n", - "27 unique-estimate-ratio-analyzer-ccf7cl unique_est_ratio \n", - "28 happy-snow-grouse-452-analyzer count_null_ratio \n", - "29 unique-ratio-29f3ef1c unique_est_ratio \n", - "\n", - " failed_count max_failed_per_column min_failed_per_column \\\n", - "0 60 30 30 \n", - "1 2313 30 3 \n", - "2 68 30 8 \n", - "3 2109 27 3 \n", - "4 877 27 3 \n", - "5 2714 26 3 \n", - "6 11 11 11 \n", - "7 11 11 11 \n", - "8 50 11 2 \n", - "9 13 11 2 \n", - "10 3 3 3 \n", - "11 3 3 3 \n", - "12 3 3 3 \n", - "13 3 3 3 \n", - "14 3 3 3 \n", - "15 3 3 3 \n", - "16 3 3 3 \n", - "17 3 3 3 \n", - "18 1 1 1 \n", - "19 1 1 1 \n", - "20 1 1 1 \n", - "21 1 1 1 \n", - "22 1 1 1 \n", - "23 1 1 1 \n", - "24 1 1 1 \n", - "25 1 1 1 \n", - "26 1 1 1 \n", - "27 1 1 1 \n", - "28 1 1 1 \n", - "29 1 1 1 \n", - "\n", - " avg_failed_per_column action_count action_targets \n", - "0 30 1 [email] \n", - "1 29 0 [] \n", - "2 22 1 [email] \n", - "3 26 0 [] \n", - "4 11 0 [] \n", - "5 25 1 [email] \n", - "6 11 1 [test-sort] \n", - "7 11 1 [test-sort] \n", - "8 8 0 [] \n", - "9 6 0 [] \n", - "10 3 0 [] \n", - "11 3 0 [] \n", - "12 3 0 [] \n", - "13 3 0 [] \n", - "14 3 0 [] \n", - "15 3 0 [] \n", 
- "16 3 0 [] \n", - "17 3 0 [] \n", - "18 1 0 [] \n", - "19 1 0 [] \n", - "20 1 0 [] \n", - "21 1 0 [] \n", - "22 1 0 [] \n", - "23 1 0 [] \n", - "24 1 0 [] \n", - "25 1 1 [christine-test-email] \n", - "26 1 0 [] \n", - "27 1 2 [email, slack] \n", - "28 1 0 [] \n", - "29 1 0 [] " - ] + "text/plain": " monitor_id \\\n0 good-cornsilk-bear-9359 \n1 energetic-black-cobra-7838 \n2 elated-gray-baboon-4620 \n3 missing-values-ratio-monitor-v9uywi \n4 None \n5 expensive-tomato-moose-6522 \n6 curious-lemonchiffon-rabbit-7000 \n7 clear-azure-starling-8883 \n8 light-mintcream-rhinoceros-3655 \n9 handsome-lemonchiffon-eel-4222 \n10 inferred-data-type-monitor-vjwbpo \n11 busy-hotpink-gaur-9703 \n12 unique-ratio-29f3ef1c-monitor \n13 None \n14 fancy-chocolate-wasp-8247 \n15 happy-snow-grouse-452 \n16 plain-fuchsia-stinkbug-4064 \n17 stormy-olive-butterfly-8693 \n18 tame-beige-sardine-3501 \n19 tough-green-hare-1322 \n20 uninterested-blueviolet-reindeer-9950 \n21 uninterested-red-alpaca-2523 \n22 unique-estimate-ratio-monitor-ccf7cl \n\n analyzer_id metric \\\n0 good-cornsilk-bear-9359-analyzer count_null \n1 energetic-black-cobra-7838-analyzer unique_est \n2 elated-gray-baboon-4620-analyzer count_null_ratio \n3 missing-values-ratio-analyzer-v9uywi count_null_ratio \n4 expensive-tomato-moose-6522-analyzer median \n5 csw-analyzer-2 median \n6 curious-lemonchiffon-rabbit-7000-analyzer frequent_items \n7 clear-azure-starling-8883-analyzer frequent_items \n8 light-mintcream-rhinoceros-3655-analyzer frequent_items \n9 handsome-lemonchiffon-eel-4222-analyzer frequent_items \n10 inferred-data-type-analyzer-vjwbpo inferred_data_type \n11 busy-hotpink-gaur-9703-analyzer count_null_ratio \n12 unique-ratio-29f3ef1c unique_est_ratio \n13 eager-violet-newt-4599-analyzer count_null_ratio \n14 fancy-chocolate-wasp-8247-analyzer count \n15 happy-snow-grouse-452-analyzer count_null_ratio \n16 plain-fuchsia-stinkbug-4064-analyzer count_null_ratio \n17 stormy-olive-butterfly-8693-analyzer histogram 
\n18 tame-beige-sardine-3501-analyzer count_null_ratio \n19 tough-green-hare-1322-analyzer count_null_ratio \n20 uninterested-blueviolet-reindeer-9950-analyzer count \n21 uninterested-red-alpaca-2523-analyzer count_null_ratio \n22 unique-estimate-ratio-analyzer-ccf7cl unique_est_ratio \n\n failed_count max_failed_per_column min_failed_per_column \\\n0 2310 30 30 \n1 60 30 30 \n2 68 30 8 \n3 2607 25 7 \n4 1794 23 23 \n5 562 23 7 \n6 7 7 7 \n7 7 7 7 \n8 31 7 1 \n9 9 7 2 \n10 3 3 3 \n11 1 1 1 \n12 1 1 1 \n13 1 1 1 \n14 1 1 1 \n15 1 1 1 \n16 1 1 1 \n17 1 1 1 \n18 1 1 1 \n19 1 1 1 \n20 1 1 1 \n21 1 1 1 \n22 1 1 1 \n\n avg_failed_per_column action_count action_targets \n0 30 0 [] \n1 30 1 [email] \n2 22 1 [email] \n3 24 1 [email] \n4 23 0 [] \n5 7 0 [] \n6 7 1 [test-sort] \n7 7 1 [test-sort] \n8 5 0 [] \n9 4 0 [] \n10 3 0 [] \n11 1 0 [] \n12 1 0 [] \n13 1 0 [] \n14 1 0 [] \n15 1 0 [] \n16 1 0 [] \n17 1 0 [] \n18 1 0 [] \n19 1 0 [] \n20 1 1 [christine-test-email] \n21 1 0 [] \n22 1 2 [email, slack] ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
monitor_idanalyzer_idmetricfailed_countmax_failed_per_columnmin_failed_per_columnavg_failed_per_columnaction_countaction_targets
0good-cornsilk-bear-9359good-cornsilk-bear-9359-analyzercount_null23103030300[]
1energetic-black-cobra-7838energetic-black-cobra-7838-analyzerunique_est603030301[email]
2elated-gray-baboon-4620elated-gray-baboon-4620-analyzercount_null_ratio68308221[email]
3missing-values-ratio-monitor-v9uywimissing-values-ratio-analyzer-v9uywicount_null_ratio2607257241[email]
4Noneexpensive-tomato-moose-6522-analyzermedian17942323230[]
5expensive-tomato-moose-6522csw-analyzer-2median56223770[]
6curious-lemonchiffon-rabbit-7000curious-lemonchiffon-rabbit-7000-analyzerfrequent_items77771[test-sort]
7clear-azure-starling-8883clear-azure-starling-8883-analyzerfrequent_items77771[test-sort]
8light-mintcream-rhinoceros-3655light-mintcream-rhinoceros-3655-analyzerfrequent_items317150[]
9handsome-lemonchiffon-eel-4222handsome-lemonchiffon-eel-4222-analyzerfrequent_items97240[]
10inferred-data-type-monitor-vjwbpoinferred-data-type-analyzer-vjwbpoinferred_data_type33330[]
11busy-hotpink-gaur-9703busy-hotpink-gaur-9703-analyzercount_null_ratio11110[]
12unique-ratio-29f3ef1c-monitorunique-ratio-29f3ef1cunique_est_ratio11110[]
13Noneeager-violet-newt-4599-analyzercount_null_ratio11110[]
14fancy-chocolate-wasp-8247fancy-chocolate-wasp-8247-analyzercount11110[]
15happy-snow-grouse-452happy-snow-grouse-452-analyzercount_null_ratio11110[]
16plain-fuchsia-stinkbug-4064plain-fuchsia-stinkbug-4064-analyzercount_null_ratio11110[]
17stormy-olive-butterfly-8693stormy-olive-butterfly-8693-analyzerhistogram11110[]
18tame-beige-sardine-3501tame-beige-sardine-3501-analyzercount_null_ratio11110[]
19tough-green-hare-1322tough-green-hare-1322-analyzercount_null_ratio11110[]
20uninterested-blueviolet-reindeer-9950uninterested-blueviolet-reindeer-9950-analyzercount11111[christine-test-email]
21uninterested-red-alpaca-2523uninterested-red-alpaca-2523-analyzercount_null_ratio11110[]
22unique-estimate-ratio-monitor-ccf7clunique-estimate-ratio-analyzer-ccf7clunique_est_ratio11112[email, slack]
\n
" }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -1076,21 +214,19 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2024-04-16T15:01:18.740411Z", - "start_time": "2024-04-16T15:01:18.737542Z" + "end_time": "2024-04-26T20:12:04.642005Z", + "start_time": "2024-04-26T20:12:04.639206Z" } }, "outputs": [ { "data": { - "text/plain": [ - "'kind-cyan-kangaroo-1253'" - ] + "text/plain": "'kind-cyan-kangaroo-1253'" }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -1102,10 +238,7 @@ { "cell_type": "markdown", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "We can get the monitor object from the diagnoser, to see its display name and any other useful information." @@ -1113,25 +246,20 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": { - "ExecuteTime": { - "end_time": "2024-04-16T15:01:18.743620Z", - "start_time": "2024-04-16T15:01:18.741222Z" - }, "collapsed": false, - "jupyter": { - "outputs_hidden": false + "ExecuteTime": { + "end_time": "2024-04-26T20:12:04.645858Z", + "start_time": "2024-04-26T20:12:04.642843Z" } }, "outputs": [ { "data": { - "text/plain": [ - "Monitor(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1703279098033, author='user_1759fb08_1a01_4852_9ed4_91c6fceede45', description=None), id='kind-cyan-kangaroo-1253', displayName='kind-cyan-kangaroo-1253', tags=None, analyzerIds=['kind-cyan-kangaroo-1253-analyzer'], schedule=ImmediateSchedule(type='immediate'), disabled=None, severity=3, mode=DigestMode(type='DIGEST', filter=None, creationTimeOffset=None, datasetTimestampOffset='P7D', groupBy=None), actions=[])" - ] + "text/plain": "Monitor(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1703279098033, 
author='user_1759fb08_1a01_4852_9ed4_91c6fceede45', description=None), id='kind-cyan-kangaroo-1253', displayName='kind-cyan-kangaroo-1253', tags=None, analyzerIds=['kind-cyan-kangaroo-1253-analyzer'], schedule=ImmediateSchedule(type='immediate'), disabled=None, severity=3, mode=DigestMode(type='DIGEST', filter=None, creationTimeOffset=None, datasetTimestampOffset='P7D', groupBy=None), actions=[])" }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -1143,10 +271,7 @@ { "cell_type": "markdown", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "We can similarly see the configuration of the analyzer that is being diagnosed.\n" @@ -1154,25 +279,20 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": { - "ExecuteTime": { - "end_time": "2024-04-16T15:01:19.544392Z", - "start_time": "2024-04-16T15:01:18.744499Z" - }, "collapsed": false, - "jupyter": { - "outputs_hidden": false + "ExecuteTime": { + "end_time": "2024-04-26T20:12:05.457603Z", + "start_time": "2024-04-26T20:12:04.647006Z" } }, "outputs": [ { "data": { - "text/plain": [ - "Analyzer(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1703279095485, author='user_1759fb08_1a01_4852_9ed4_91c6fceede45', description=None), id='kind-cyan-kangaroo-1253-analyzer', displayName=None, tags=['featureSelection:all', 'discreteness:non-discrete'], schedule=FixedCadenceSchedule(type='fixed', cadence=, exclusionRanges=None), disabled=None, disableTargetRollup=None, targetMatrix=ColumnMatrix(segments=[Segment(tags=[SegmentTag(key='purpose', value='car'), SegmentTag(key='verification_status', value='Source Verified')])], type=, include=[], exclude=[], profileId=None), dataReadinessDuration=None, batchCoolDownPeriod=None, backfillGracePeriodDuration=None, config=DriftConfig(schemaVersion=None, params=None, metric=, type=, algorithm='hellinger', threshold=0.02, 
minBatchSize=1, baseline=TrailingWindowBaseline(datasetId=None, inheritSegment=None, type=, size=7, offset=None, exclusionRanges=None)))" - ] + "text/plain": "Analyzer(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1703279095485, author='user_1759fb08_1a01_4852_9ed4_91c6fceede45', description=None), id='kind-cyan-kangaroo-1253-analyzer', displayName=None, tags=['featureSelection:all', 'discreteness:non-discrete'], schedule=FixedCadenceSchedule(type='fixed', cadence=, exclusionRanges=None), disabled=None, disableTargetRollup=None, targetMatrix=ColumnMatrix(segments=[Segment(tags=[SegmentTag(key='purpose', value='car'), SegmentTag(key='verification_status', value='Source Verified')])], type=, include=[], exclude=[], profileId=None), dataReadinessDuration=None, batchCoolDownPeriod=None, backfillGracePeriodDuration=None, config=DriftConfig(schemaVersion=None, params=None, metric=, type=, algorithm='hellinger', threshold=0.02, minBatchSize=1, baseline=TrailingWindowBaseline(datasetId=None, inheritSegment=None, type=, size=7, offset=None, exclusionRanges=None)))" }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -1192,64 +312,21 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": { - "ExecuteTime": { - "end_time": "2024-04-16T15:01:19.860254Z", - "start_time": "2024-04-16T15:01:19.545452Z" - }, "collapsed": false, - "jupyter": { - "outputs_hidden": false + "ExecuteTime": { + "end_time": "2024-04-26T20:12:05.746559Z", + "start_time": "2024-04-26T20:12:05.458340Z" } }, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
segmenttotal_anomaliesbatch_count
0purpose=car&verification_status=Source Verified3030
\n", - "
" - ], - "text/plain": [ - " segment total_anomalies \\\n", - "0 purpose=car&verification_status=Source Verified 30 \n", - "\n", - " batch_count \n", - "0 30 " - ] + "text/plain": " segment total_anomalies \\\n0 purpose=car&verification_status=Source Verified 30 \n\n batch_count \n0 30 ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
segmenttotal_anomaliesbatch_count
0purpose=car&verification_status=Source Verified3030
\n
" }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1266,10 +343,7 @@ { "cell_type": "markdown", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "The diagnoser chooses the noisiest segment to diagnose. This can be changed by setting the `diagnostic_segment` property." @@ -1277,21 +351,19 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": { "ExecuteTime": { - "end_time": "2024-04-16T15:01:19.863407Z", - "start_time": "2024-04-16T15:01:19.861065Z" + "end_time": "2024-04-26T20:12:05.752954Z", + "start_time": "2024-04-26T20:12:05.749874Z" } }, "outputs": [ { "data": { - "text/plain": [ - "'purpose=car&verification_status=Source Verified'" - ] + "text/plain": "'purpose=car&verification_status=Source Verified'" }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1311,55 +383,20 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": { "ExecuteTime": { - "end_time": "2024-04-16T15:01:20.057746Z", - "start_time": "2024-04-16T15:01:19.864287Z" + "end_time": "2024-04-26T20:12:05.892565Z", + "start_time": "2024-04-26T20:12:05.754263Z" } }, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
columntotal_anomalies
0pred_credit_risk (output)30
\n", - "
" - ], - "text/plain": [ - " column total_anomalies\n", - "0 pred_credit_risk (output) 30" - ] + "text/plain": " column total_anomalies\n0 pred_credit_risk (output) 30", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
columntotal_anomalies
0pred_credit_risk (output)30
\n
" }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1379,21 +416,19 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": { "ExecuteTime": { - "end_time": "2024-04-16T15:01:20.061262Z", - "start_time": "2024-04-16T15:01:20.058650Z" + "end_time": "2024-04-26T20:12:05.896831Z", + "start_time": "2024-04-26T20:12:05.893571Z" } }, "outputs": [ { "data": { - "text/plain": [ - "['pred_credit_risk (output)']" - ] + "text/plain": "['pred_credit_risk (output)']" }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1412,49 +447,28 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": { "ExecuteTime": { - "end_time": "2024-04-16T15:03:35.516085Z", - "start_time": "2024-04-16T15:03:30.514723Z" + "end_time": "2024-04-26T20:12:07.205296Z", + "start_time": "2024-04-26T20:12:05.897820Z" } }, - "outputs": [ - { - "ename": "Exception", - "evalue": "USE_LOCAL_SERVER is set but server library is not available.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "File \u001b[0;32m/Volumes/Workspace/whylabs-toolkit/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py:285\u001b[0m, in \u001b[0;36mMonitorDiagnoser.diagnose\u001b[0;34m(self, columns)\u001b[0m\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 285\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msmart_config\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mserver\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mserver\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DiagnosisRequest\n\u001b[1;32m 286\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m 
\u001b[38;5;21;01msmart_config\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mserver\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdiagnosis\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01manalyzer_diagnoser\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AnalyzerDiagnoser\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'smart_config'", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[0;31mException\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[13], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m 3\u001b[0m os\u001b[38;5;241m.\u001b[39menviron[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mUSE_LOCAL_SERVER\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mserver\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m----> 4\u001b[0m monitor_report \u001b[38;5;241m=\u001b[39m \u001b[43mdiagnoser\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdiagnose\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/Volumes/Workspace/whylabs-toolkit/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py:314\u001b[0m, in \u001b[0;36mMonitorDiagnoser.diagnose\u001b[0;34m(self, columns)\u001b[0m\n\u001b[1;32m 303\u001b[0m report_dict \u001b[38;5;241m=\u001b[39m diagnosis_service\u001b[38;5;241m.\u001b[39mdiagnose_sync(\n\u001b[1;32m 304\u001b[0m DiagnosisRequest(\n\u001b[1;32m 305\u001b[0m orgId\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39morg_id,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 311\u001b[0m granularity\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgranularity,\n\u001b[1;32m 312\u001b[0m ))\n\u001b[1;32m 313\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m 
\u001b[38;5;167;01mImportError\u001b[39;00m:\n\u001b[0;32m--> 314\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mUSE_LOCAL_SERVER is set but server library is not available.\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 315\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 316\u001b[0m \u001b[38;5;66;03m# TODO implement call through songbird/whylabs-client instead of direct\u001b[39;00m\n\u001b[1;32m 317\u001b[0m \u001b[38;5;66;03m# Call the diagnosis API via whyLabs client\u001b[39;00m\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mDiagnosis API call not implemented\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", - "\u001b[0;31mException\u001b[0m: USE_LOCAL_SERVER is set but server library is not available." - ] - } - ], + "outputs": [], "source": [ "# for now, we need to enforce this to run using local server\n", "import os\n", - "os.environ['USE_LOCAL_SERVER'] = 'server'\n", "monitor_report = diagnoser.diagnose(columns)" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 15, "metadata": { - "ExecuteTime": { - "end_time": "2024-04-16T15:03:35.522329Z", - "start_time": "2024-04-16T15:03:35.518688Z" - }, "collapsed": false, - "jupyter": { - "outputs_hidden": false + "ExecuteTime": { + "end_time": "2024-04-26T20:12:07.214914Z", + "start_time": "2024-04-26T20:12:07.206661Z" } }, "outputs": [ @@ -1462,40 +476,37 @@ "name": "stdout", "output_type": "stream", "text": [ - "Diagnosis is for monitor \"wrong-drift-crowded-orchid-coyote-2773\" [adorable-goldenrod-lion-9438] in model-0 org-0, over interval 2024-03-16T00:00:00.000Z/2024-04-15T00:00:00.000Z.\n", + "Diagnosis is for monitor \"kind-cyan-kangaroo-1253\" [kind-cyan-kangaroo-1253] in model-0 org-0, over interval 2024-03-26T00:00:00.000Z/2024-04-25T00:00:00.000Z.\n", "\n", - "Analyzer is 
drift configuration for frequent_items metric with TrailingWindow baseline.\n", - "Analyzer \"adorable-goldenrod-lion-9438-analyzer\" targets 123 columns and ran on 27 columns in the diagnosed segment.\n", + "Analyzer is drift configuration for histogram metric with TrailingWindow baseline.\n", + "Analyzer \"kind-cyan-kangaroo-1253-analyzer\" targets 1 columns and ran on 1 columns in the diagnosed segment.\n", "\n", "\n", - "Diagnostic segment is \"overall\".\n", + "Diagnostic segment is \"purpose=car&verification_status=Source Verified\".\n", "Diagnostic interval contains 30 batches.\n", "\n", - "Diagnostic interval rollup contains 2494691 rows for the diagnosed columns.\n", + "Diagnostic interval rollup contains 10473 rows for the diagnosed columns.\n", "\n", "Analysis results summary:\n", - "Found non-failed results for 27 columns and 30 batches.\n", - "Found 31 anomalies in 2 columns, with up to 100.0% (30) batches having anomalies per column and 50.0% (15.0) on average.\n", + "Found non-failed results for 1 columns and 30 batches.\n", + "Found 30 anomalies in 1 columns, with up to 100.0% (30) batches having anomalies per column and 100.0% (30.0) on average.\n", "Columns with anomalies are:\n", - "| | 0 |\n", - "|:--------|----:|\n", - "| issue_d | 30 |\n", - "| url | 1 |\n", + "| | 0 |\n", + "|---:|:----------------------------------|\n", + "| 0 | ('pred_credit_risk (output)', 30) |\n", "\n", "No failures were detected.\n", "\n", - "Conditions that may impact diagnosis quality include:\n", - "\t* analyzer_changed: Analyzer changed within the diagnostic interval - detectors ['stale_analysis', 'changing_discrete', 'low_drift_threshold', 'missing_baseline_batches', 'small_nonnull_batches']\n", - "\n", + "No issues impacting diagnosis quality were detected\n", "Conditions that may contribute to noise include:\n", - "\t* Condition changing_discrete (many values are unique across batches) for 2 columns: ['issue_d', 'url']\n", + "\t* Condition low_drift_threshold 
(drift threshold of 0.02 is lower than typical value of 0.7 for the hellinger algorithm)\n", + "\t* Condition small_nonnull_batches (less than 500 non-null records in 50% or more of the batches) for 1 columns: ['pred_credit_risk (output)']\n", "\n", "Anomalies for columns with these conditions:\n", - "| | 0 |\n", - "|:--------|----:|\n", - "| issue_d | 30 |\n", - "| url | 1 |\n", - "Accounting for 31 anomalies out of 31\n" + "| | 0 |\n", + "|:--------------------------|----:|\n", + "| pred_credit_risk (output) | 30 |\n", + "Accounting for 30 anomalies out of 30\n" ] } ], @@ -1505,11 +516,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": { "collapsed": false, - "jupyter": { - "outputs_hidden": false + "ExecuteTime": { + "end_time": "2024-04-26T20:12:07.217841Z", + "start_time": "2024-04-26T20:12:07.216120Z" } }, "outputs": [], diff --git a/examples/example_notebooks/diagnoser.ipynb b/examples/example_notebooks/diagnoser.ipynb index 22d577c..c22cd3b 100644 --- a/examples/example_notebooks/diagnoser.ipynb +++ b/examples/example_notebooks/diagnoser.ipynb @@ -18,14 +18,14 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 31, "metadata": { - "ExecuteTime": { - "end_time": "2024-04-16T14:58:15.366726Z", - "start_time": "2024-04-16T14:58:15.361250Z" - }, "pycharm": { "name": "#%%\n" + }, + "ExecuteTime": { + "end_time": "2024-04-26T20:11:47.303557Z", + "start_time": "2024-04-26T20:11:47.300880Z" } }, "outputs": [], @@ -48,41 +48,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 32, "metadata": { "ExecuteTime": { - "end_time": "2024-04-16T14:58:19.211882Z", - "start_time": "2024-04-16T14:58:15.369321Z" + "end_time": "2024-04-26T20:11:49.832834Z", + "start_time": "2024-04-26T20:11:47.324580Z" } }, - "outputs": [ - { - "ename": "TypeError", - "evalue": "issubclass() arg 1 must be a class", - "output_type": "error", - "traceback": [ - 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[2], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mgetpass\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdiagnoser\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhelpers\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m env_setup\n\u001b[1;32m 4\u001b[0m org_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124morg-0\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 5\u001b[0m dataset_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmodel-0\u001b[39m\u001b[38;5;124m'\u001b[39m\n", - "File \u001b[0;32m/Volumes/Workspace/whylabs-toolkit/whylabs_toolkit/monitor/__init__.py:1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmanager\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MonitorSetup, MonitorManager\n\u001b[1;32m 4\u001b[0m ALL \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 5\u001b[0m MonitorManager,\n\u001b[1;32m 6\u001b[0m MonitorSetup,\n\u001b[1;32m 7\u001b[0m ]\n", - "File \u001b[0;32m/Volumes/Workspace/whylabs-toolkit/whylabs_toolkit/monitor/manager/__init__.py:1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmanager\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MonitorManager\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcredentials\u001b[39;00m 
\u001b[38;5;28;01mimport\u001b[39;00m MonitorCredentials\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor_setup\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MonitorSetup\n", - "File \u001b[0;32m/Volumes/Workspace/whylabs-toolkit/whylabs_toolkit/monitor/manager/manager.py:10\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_client\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mnotification_settings_api\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m NotificationSettingsApi\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_client\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels_api\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ModelsApi\n\u001b[0;32m---> 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmanager\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor_setup\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MonitorSetup\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhelpers\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor_helpers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m get_model_granularity\n", - "File 
\u001b[0;32m/Volumes/Workspace/whylabs-toolkit/whylabs_toolkit/monitor/manager/monitor_setup.py:9\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_client\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mexceptions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m NotFoundException\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhelpers\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m get_models_api\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01manalyzer\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtargets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ColumnGroups\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmanager\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcredentials\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MonitorCredentials\n", - "File \u001b[0;32m/Volumes/Workspace/whylabs-toolkit/whylabs_toolkit/monitor/models/__init__.py:3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[1;32m 2\u001b[0m 
\u001b[38;5;124;03m\"\"\"Console script for monitor_schema.\"\"\"\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01manalyzer\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcolumn_schema\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcommons\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n", - "File \u001b[0;32m/Volumes/Workspace/whylabs-toolkit/whylabs_toolkit/monitor/models/analyzer/__init__.py:3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;124;03m\"\"\"Analyzer module.\"\"\"\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01malgorithms\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01manalyzer\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Analyzer\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbaseline\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n", - "File \u001b[0;32m/Volumes/Workspace/whylabs-toolkit/whylabs_toolkit/monitor/models/analyzer/algorithms.py:7\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Any, Dict, List, Literal, Optional, Union\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpydantic\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m 
BaseModel, Field, constr\n\u001b[0;32m----> 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01manalyzer\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbaseline\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 8\u001b[0m ReferenceProfileId,\n\u001b[1;32m 9\u001b[0m SingleBatchBaseline,\n\u001b[1;32m 10\u001b[0m TimeRangeBaseline,\n\u001b[1;32m 11\u001b[0m TrailingWindowBaseline,\n\u001b[1;32m 12\u001b[0m )\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcommons\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m NoExtrasBaseModel, TimeRange\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m COLUMN_NAME_TYPE, anyOf_to_oneOf\n", - "File \u001b[0;32m/Volumes/Workspace/whylabs-toolkit/whylabs_toolkit/monitor/models/analyzer/baseline.py:7\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m List, Literal, Optional\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpydantic\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Field\n\u001b[0;32m----> 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m 
\u001b[38;5;21;01mwhylabs_toolkit\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonitor\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcommons\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DATASET_ID_DEF, NoExtrasBaseModel, TimeRange\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mBaselineType\u001b[39;00m(\u001b[38;5;28mstr\u001b[39m, Enum):\n\u001b[1;32m 11\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Supported baseline types.\"\"\"\u001b[39;00m\n", - "File \u001b[0;32m/Volumes/Workspace/whylabs-toolkit/whylabs_toolkit/monitor/models/commons.py:31\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mNoExtrasBaseModel\u001b[39;00m(BaseModel, extra\u001b[38;5;241m=\u001b[39mExtra\u001b[38;5;241m.\u001b[39mforbid): \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"No extras base model.\u001b[39;00m\n\u001b[1;32m 26\u001b[0m \n\u001b[1;32m 27\u001b[0m \u001b[38;5;124;03m Inherit to prevent accidental extra fields.\u001b[39;00m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 31\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mImmediateSchedule\u001b[39;00m(NoExtrasBaseModel):\n\u001b[1;32m 32\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Schedule the monitor to run immediately.\"\"\"\u001b[39;00m\n\u001b[1;32m 34\u001b[0m \u001b[38;5;28mtype\u001b[39m: Literal[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mimmediate\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mimmediate\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", - "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/whylabs-toolkit-w9gS_gKh-py3.9/lib/python3.9/site-packages/pydantic/main.py:197\u001b[0m, in 
\u001b[0;36mpydantic.main.ModelMetaclass.__new__\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/whylabs-toolkit-w9gS_gKh-py3.9/lib/python3.9/site-packages/pydantic/fields.py:506\u001b[0m, in \u001b[0;36mpydantic.fields.ModelField.infer\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/whylabs-toolkit-w9gS_gKh-py3.9/lib/python3.9/site-packages/pydantic/fields.py:436\u001b[0m, in \u001b[0;36mpydantic.fields.ModelField.__init__\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/whylabs-toolkit-w9gS_gKh-py3.9/lib/python3.9/site-packages/pydantic/fields.py:552\u001b[0m, in \u001b[0;36mpydantic.fields.ModelField.prepare\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/whylabs-toolkit-w9gS_gKh-py3.9/lib/python3.9/site-packages/pydantic/fields.py:668\u001b[0m, in \u001b[0;36mpydantic.fields.ModelField._type_analysis\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32m~/miniconda3/envs/hackthis/lib/python3.9/typing.py:852\u001b[0m, in \u001b[0;36m_SpecialGenericAlias.__subclasscheck__\u001b[0;34m(self, cls)\u001b[0m\n\u001b[1;32m 850\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m__origin__, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m__origin__)\n\u001b[1;32m 851\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mcls\u001b[39m, _GenericAlias):\n\u001b[0;32m--> 852\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43missubclass\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__origin__\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 853\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__subclasscheck__\u001b[39m(\u001b[38;5;28mcls\u001b[39m)\n", - "\u001b[0;31mTypeError\u001b[0m: issubclass() arg 1 must be a class" - ] - } - ], + "outputs": [], "source": [ "import getpass\n", "from whylabs_toolkit.monitor.diagnoser.helpers.utils import env_setup\n", @@ -109,11 +82,24 @@ }, { "cell_type": "code", - "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-04-26T20:11:49.837299Z", + "start_time": "2024-04-26T20:11:49.835362Z" + } + }, + "execution_count": 32 + }, + { + "cell_type": "code", + "execution_count": 33, "metadata": { "ExecuteTime": { - "end_time": "2024-04-16T14:58:19.609165Z", - "start_time": "2024-04-16T14:58:19.213684Z" + "end_time": "2024-04-26T20:11:49.847125Z", + "start_time": "2024-04-26T20:11:49.841234Z" } }, "outputs": [], @@ -133,32 +119,76 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": { "ExecuteTime": { - "end_time": "2024-04-16T14:59:50.931331Z", - "start_time": "2024-04-16T14:59:44.553343Z" + "end_time": "2024-04-26T20:11:54.604061Z", + "start_time": "2024-04-26T20:11:49.849546Z" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": "MonitorDiagnosisReport(orgId='org-0', datasetId='model-0', analyzerId='kind-cyan-kangaroo-1253-analyzer', interval='2024-03-26T00:00:00.000Z/2024-04-25T00:00:00.000Z', expectedBatchCount=30, diagnosticData=DiagnosticDataSummary(diagnosticSegment=Segment(tags=[SegmentTag(key='purpose', value='car'), SegmentTag(key='verification_status', value='Source Verified')]), diagnosticProfile=ProfileSummary(minRowName='pred_credit_risk (output)', minRowCount=10473, maxRowName='pred_credit_risk (output)', maxRowCount=10473), diagnosticBatches=BatchesSummary(minBatchName='pred_credit_risk (output)', minBatchCount=30, maxBatchName='pred_credit_risk (output)', maxBatchCount=30), 
analysisResults=AnalysisResultsSummary(results=ResultRecord(diagnosedColumnCount=1, batchCount=30), failures=FailureRecord(totalFailuresCount=0, maxFailuresCount=0, meanFailuresCount=0, byColumnCount=[], byTypeCount=[]), anomalies=AnomalyRecord(totalAnomalyCount=30, maxAnomalyCount=30, meanAnomalyCount=30, batchCount=30, byColumnCount=[NamedCount(name='pred_credit_risk (output)', count=30)], byColumnBatchCount=[NamedCount(name='pred_credit_risk (output)', count=30)])), targetedColumnCount=1), qualityIssues=[], conditions=[ConditionRecord(columns=None, info={'threshold': 0.02, 'expected': 0.7, 'algo': 'hellinger'}, summary='drift threshold of 0.02 is lower than typical value of 0.7 for the hellinger algorithm', name='low_drift_threshold'), ConditionRecord(columns=['pred_credit_risk (output)'], info=None, summary='less than 500 non-null records in 50% or more of the batches', name='small_nonnull_batches')], monitor=Monitor(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1703279098033, author='user_1759fb08_1a01_4852_9ed4_91c6fceede45', description=None), id='kind-cyan-kangaroo-1253', displayName='kind-cyan-kangaroo-1253', tags=None, analyzerIds=['kind-cyan-kangaroo-1253-analyzer'], schedule=ImmediateSchedule(type='immediate'), disabled=None, severity=3, mode=DigestMode(type='DIGEST', filter=None, creationTimeOffset=None, datasetTimestampOffset='P7D', groupBy=None), actions=[]), analyzer=Analyzer(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1703279095485, author='user_1759fb08_1a01_4852_9ed4_91c6fceede45', description=None), id='kind-cyan-kangaroo-1253-analyzer', displayName=None, tags=['featureSelection:all', 'discreteness:non-discrete'], schedule=FixedCadenceSchedule(type='fixed', cadence=, exclusionRanges=None), disabled=None, disableTargetRollup=None, targetMatrix=ColumnMatrix(segments=[Segment(tags=[SegmentTag(key='purpose', value='car'), SegmentTag(key='verification_status', value='Source Verified')])], type=, include=[], 
exclude=[], profileId=None), dataReadinessDuration=None, batchCoolDownPeriod=None, backfillGracePeriodDuration=None, config=DriftConfig(schemaVersion=None, params=None, metric=, type=, algorithm='hellinger', threshold=0.02, minBatchSize=1, baseline=TrailingWindowBaseline(datasetId=None, inheritSegment=None, type=, size=7, offset=None, exclusionRanges=None))), analyzedColumnCount=1)" + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# for now, we need to enforce this to run using local server\n", - "import os\n", - "os.environ['USE_LOCAL_SERVER'] = 'server'\n", "monitor_report = diagnoser.diagnose()\n", "monitor_report" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": { "ExecuteTime": { - "end_time": "2024-04-16T14:59:50.950021Z", - "start_time": "2024-04-16T14:59:50.932643Z" + "end_time": "2024-04-26T20:11:54.614329Z", + "start_time": "2024-04-26T20:11:54.607449Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Diagnosis is for monitor \"kind-cyan-kangaroo-1253\" [kind-cyan-kangaroo-1253] in model-0 org-0, over interval 2024-03-26T00:00:00.000Z/2024-04-25T00:00:00.000Z.\n", + "\n", + "Analyzer is drift configuration for histogram metric with TrailingWindow baseline.\n", + "Analyzer \"kind-cyan-kangaroo-1253-analyzer\" targets 1 columns and ran on 1 columns in the diagnosed segment.\n", + "\n", + "\n", + "Diagnostic segment is \"purpose=car&verification_status=Source Verified\".\n", + "Diagnostic interval contains 30 batches.\n", + "\n", + "Diagnostic interval rollup contains 10473 rows for the diagnosed columns.\n", + "\n", + "Analysis results summary:\n", + "Found non-failed results for 1 columns and 30 batches.\n", + "Found 30 anomalies in 1 columns, with up to 100.0% (30) batches having anomalies per column and 100.0% (30.0) on average.\n", + "Columns with anomalies are:\n", + "| | 0 |\n", + 
"|---:|:----------------------------------|\n", + "| 0 | ('pred_credit_risk (output)', 30) |\n", + "\n", + "No failures were detected.\n", + "\n", + "No issues impacting diagnosis quality were detected\n", + "Conditions that may contribute to noise include:\n", + "\t* Condition low_drift_threshold (drift threshold of 0.02 is lower than typical value of 0.7 for the hellinger algorithm)\n", + "\t* Condition small_nonnull_batches (less than 500 non-null records in 50% or more of the batches) for 1 columns: ['pred_credit_risk (output)']\n", + "\n", + "Anomalies for columns with these conditions:\n", + "| | 0 |\n", + "|:--------------------------|----:|\n", + "| pred_credit_risk (output) | 30 |\n", + "Accounting for 30 anomalies out of 30\n" + ] + } + ], "source": [ "print(monitor_report.describe())" ] @@ -177,11 +207,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "metadata": { "ExecuteTime": { - "end_time": "2024-04-16T14:59:54.558717Z", - "start_time": "2024-04-16T14:59:54.552542Z" + "end_time": "2024-04-26T20:11:54.618780Z", + "start_time": "2024-04-26T20:11:54.615546Z" } }, "outputs": [], @@ -192,14 +222,202 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "metadata": { "ExecuteTime": { - "end_time": "2024-04-16T14:59:56.056487Z", - "start_time": "2024-04-16T14:59:56.051250Z" + "end_time": "2024-04-26T20:11:54.624171Z", + "start_time": "2024-04-26T20:11:54.620454Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"orgId\": \"org-0\",\n", + " \"datasetId\": \"model-0\",\n", + " \"analyzerId\": \"kind-cyan-kangaroo-1253-analyzer\",\n", + " \"interval\": \"2024-03-26T00:00:00.000Z/2024-04-25T00:00:00.000Z\",\n", + " \"expectedBatchCount\": 30,\n", + " \"diagnosticData\": {\n", + " \"diagnosticSegment\": {\n", + " \"tags\": [\n", + " {\n", + " \"key\": \"purpose\",\n", + " \"value\": \"car\"\n", + " },\n", + " {\n", + " \"key\": 
\"verification_status\",\n", + " \"value\": \"Source Verified\"\n", + " }\n", + " ]\n", + " },\n", + " \"diagnosticProfile\": {\n", + " \"minRowName\": \"pred_credit_risk (output)\",\n", + " \"minRowCount\": 10473,\n", + " \"maxRowName\": \"pred_credit_risk (output)\",\n", + " \"maxRowCount\": 10473\n", + " },\n", + " \"diagnosticBatches\": {\n", + " \"minBatchName\": \"pred_credit_risk (output)\",\n", + " \"minBatchCount\": 30,\n", + " \"maxBatchName\": \"pred_credit_risk (output)\",\n", + " \"maxBatchCount\": 30\n", + " },\n", + " \"analysisResults\": {\n", + " \"results\": {\n", + " \"diagnosedColumnCount\": 1,\n", + " \"batchCount\": 30\n", + " },\n", + " \"failures\": {\n", + " \"totalFailuresCount\": 0,\n", + " \"maxFailuresCount\": 0,\n", + " \"meanFailuresCount\": 0,\n", + " \"byColumnCount\": [],\n", + " \"byTypeCount\": []\n", + " },\n", + " \"anomalies\": {\n", + " \"totalAnomalyCount\": 30,\n", + " \"maxAnomalyCount\": 30,\n", + " \"meanAnomalyCount\": 30,\n", + " \"batchCount\": 30,\n", + " \"byColumnCount\": [\n", + " {\n", + " \"name\": \"pred_credit_risk (output)\",\n", + " \"count\": 30\n", + " }\n", + " ],\n", + " \"byColumnBatchCount\": [\n", + " {\n", + " \"name\": \"pred_credit_risk (output)\",\n", + " \"count\": 30\n", + " }\n", + " ]\n", + " }\n", + " },\n", + " \"targetedColumnCount\": 1\n", + " },\n", + " \"qualityIssues\": [],\n", + " \"conditions\": [\n", + " {\n", + " \"columns\": null,\n", + " \"info\": {\n", + " \"threshold\": 0.02,\n", + " \"expected\": 0.7,\n", + " \"algo\": \"hellinger\"\n", + " },\n", + " \"summary\": \"drift threshold of 0.02 is lower than typical value of 0.7 for the hellinger algorithm\",\n", + " \"name\": \"low_drift_threshold\"\n", + " },\n", + " {\n", + " \"columns\": [\n", + " \"pred_credit_risk (output)\"\n", + " ],\n", + " \"info\": null,\n", + " \"summary\": \"less than 500 non-null records in 50% or more of the batches\",\n", + " \"name\": \"small_nonnull_batches\"\n", + " }\n", + " ],\n", + " 
\"monitor\": {\n", + " \"metadata\": {\n", + " \"version\": 1,\n", + " \"schemaVersion\": 1,\n", + " \"updatedTimestamp\": 1703279098033,\n", + " \"author\": \"user_1759fb08_1a01_4852_9ed4_91c6fceede45\",\n", + " \"description\": null\n", + " },\n", + " \"id\": \"kind-cyan-kangaroo-1253\",\n", + " \"displayName\": \"kind-cyan-kangaroo-1253\",\n", + " \"tags\": null,\n", + " \"analyzerIds\": [\n", + " \"kind-cyan-kangaroo-1253-analyzer\"\n", + " ],\n", + " \"schedule\": {\n", + " \"type\": \"immediate\"\n", + " },\n", + " \"disabled\": null,\n", + " \"severity\": 3,\n", + " \"mode\": {\n", + " \"type\": \"DIGEST\",\n", + " \"filter\": null,\n", + " \"creationTimeOffset\": null,\n", + " \"datasetTimestampOffset\": \"P7D\",\n", + " \"groupBy\": null\n", + " },\n", + " \"actions\": []\n", + " },\n", + " \"analyzer\": {\n", + " \"metadata\": {\n", + " \"version\": 1,\n", + " \"schemaVersion\": 1,\n", + " \"updatedTimestamp\": 1703279095485,\n", + " \"author\": \"user_1759fb08_1a01_4852_9ed4_91c6fceede45\",\n", + " \"description\": null\n", + " },\n", + " \"id\": \"kind-cyan-kangaroo-1253-analyzer\",\n", + " \"displayName\": null,\n", + " \"tags\": [\n", + " \"featureSelection:all\",\n", + " \"discreteness:non-discrete\"\n", + " ],\n", + " \"schedule\": {\n", + " \"type\": \"fixed\",\n", + " \"cadence\": \"daily\",\n", + " \"exclusionRanges\": null\n", + " },\n", + " \"disabled\": null,\n", + " \"disableTargetRollup\": null,\n", + " \"targetMatrix\": {\n", + " \"segments\": [\n", + " {\n", + " \"tags\": [\n", + " {\n", + " \"key\": \"purpose\",\n", + " \"value\": \"car\"\n", + " },\n", + " {\n", + " \"key\": \"verification_status\",\n", + " \"value\": \"Source Verified\"\n", + " }\n", + " ]\n", + " }\n", + " ],\n", + " \"type\": \"column\",\n", + " \"include\": [\n", + " \"group:continuous\"\n", + " ],\n", + " \"exclude\": [\n", + " \"group:input\"\n", + " ],\n", + " \"profileId\": null\n", + " },\n", + " \"dataReadinessDuration\": null,\n", + " \"batchCoolDownPeriod\": 
null,\n", + " \"backfillGracePeriodDuration\": null,\n", + " \"config\": {\n", + " \"schemaVersion\": null,\n", + " \"params\": null,\n", + " \"metric\": \"histogram\",\n", + " \"type\": \"drift\",\n", + " \"algorithm\": \"hellinger\",\n", + " \"threshold\": 0.02,\n", + " \"minBatchSize\": 1,\n", + " \"baseline\": {\n", + " \"datasetId\": null,\n", + " \"inheritSegment\": null,\n", + " \"type\": \"TrailingWindow\",\n", + " \"size\": 7,\n", + " \"offset\": null,\n", + " \"exclusionRanges\": null\n", + " }\n", + " }\n", + " },\n", + " \"analyzedColumnCount\": 1\n", + "}\n" + ] + } + ], "source": [ "from whylabs_toolkit.monitor.diagnoser.models import MonitorDiagnosisReport\n", "\n", @@ -219,14 +437,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": { "ExecuteTime": { - "end_time": "2024-04-16T14:59:59.258082Z", - "start_time": "2024-04-16T14:59:59.248989Z" + "end_time": "2024-04-26T20:11:54.628392Z", + "start_time": "2024-04-26T20:11:54.624956Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1. 
Make a manual change to the analyzer to address small_nonnull_batches: less than 500 non-null records in 50% or more of the batches for ['pred_credit_risk (output)']\n" + ] + } + ], "source": [ "from whylabs_toolkit.monitor.diagnoser.recommendation.change_recommender import ChangeRecommender\n", "\n", @@ -252,14 +478,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "metadata": { "ExecuteTime": { - "end_time": "2024-04-16T15:00:01.766477Z", - "start_time": "2024-04-16T15:00:01.763192Z" + "end_time": "2024-04-26T20:11:54.632913Z", + "start_time": "2024-04-26T20:11:54.629301Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [] + } + ], "source": [ "automatable_changes = [c for c in changes if c.can_automate()]\n", "print('\\n'.join([c.describe() for c in automatable_changes]))" @@ -267,14 +499,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 40, "metadata": { "ExecuteTime": { - "end_time": "2024-04-16T15:00:04.589600Z", - "start_time": "2024-04-16T15:00:02.766087Z" + "end_time": "2024-04-26T20:11:54.635587Z", + "start_time": "2024-04-26T20:11:54.633557Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [] + } + ], "source": [ "change_results = recommender.make_changes(automatable_changes)\n", "print(change_results.describe())" @@ -297,14 +535,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "metadata": { "ExecuteTime": { - "end_time": "2024-04-16T15:00:06.815149Z", - "start_time": "2024-04-16T15:00:06.798273Z" + "end_time": "2024-04-26T20:11:54.645911Z", + "start_time": "2024-04-26T20:11:54.636354Z" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": " monitor_id \\\n0 kind-cyan-kangaroo-1253 \n1 cooperative-maroon-parrot-8886 \n2 famous-salmon-cobra-8902 \n3 proud-seagreen-carabeef-65 \n4 None \n.. ... 
\n94 glamorous-orchid-turtle-6425 \n95 breakable-limegreen-shrew-7623 \n96 hilarious-powderblue-chamois-8115 \n97 horrible-magenta-sandpiper-8117 \n98 unsightly-bisque-lemur-1917 \n\n analyzer_id metric column_count \\\n0 kind-cyan-kangaroo-1253-analyzer histogram 1 \n1 discrete-drift-jensenshannon-analyzer frequent_items 1 \n2 famous-salmon-cobra-8902-analyzer min 1 \n3 proud-seagreen-carabeef-65-analyzer histogram 1 \n4 cooperative-maroon-parrot-8886-analyzer frequent_items 1 \n.. ... ... ... \n94 glamorous-orchid-turtle-6425-analyzer histogram 1 \n95 breakable-limegreen-shrew-7623-analyzer histogram 1 \n96 hilarious-powderblue-chamois-8115-analyzer histogram 1 \n97 horrible-magenta-sandpiper-8117-analyzer frequent_items 1 \n98 unsightly-bisque-lemur-1917-analyzer frequent_items 1 \n\n segment_count anomaly_count max_anomaly_per_column \\\n0 1 30 30 \n1 1 30 30 \n2 1 30 30 \n3 1 30 30 \n4 1 30 30 \n.. ... ... ... \n94 1 2 2 \n95 1 2 2 \n96 1 2 2 \n97 1 2 2 \n98 1 1 1 \n\n min_anomaly_per_column avg_anomaly_per_column action_count \\\n0 30 30 0 \n1 30 30 0 \n2 30 30 0 \n3 30 30 0 \n4 30 30 0 \n.. ... ... ... \n94 2 2 0 \n95 2 2 0 \n96 2 2 0 \n97 2 2 0 \n98 1 1 0 \n\n action_targets \n0 [] \n1 [] \n2 [] \n3 [] \n4 [] \n.. ... \n94 [] \n95 [] \n96 [] \n97 [] \n98 [] \n\n[99 rows x 11 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
monitor_idanalyzer_idmetriccolumn_countsegment_countanomaly_countmax_anomaly_per_columnmin_anomaly_per_columnavg_anomaly_per_columnaction_countaction_targets
0kind-cyan-kangaroo-1253kind-cyan-kangaroo-1253-analyzerhistogram11303030300[]
1cooperative-maroon-parrot-8886discrete-drift-jensenshannon-analyzerfrequent_items11303030300[]
2famous-salmon-cobra-8902famous-salmon-cobra-8902-analyzermin11303030300[]
3proud-seagreen-carabeef-65proud-seagreen-carabeef-65-analyzerhistogram11303030300[]
4Nonecooperative-maroon-parrot-8886-analyzerfrequent_items11303030300[]
....................................
94glamorous-orchid-turtle-6425glamorous-orchid-turtle-6425-analyzerhistogram1122220[]
95breakable-limegreen-shrew-7623breakable-limegreen-shrew-7623-analyzerhistogram1122220[]
96hilarious-powderblue-chamois-8115hilarious-powderblue-chamois-8115-analyzerhistogram1122220[]
97horrible-magenta-sandpiper-8117horrible-magenta-sandpiper-8117-analyzerfrequent_items1122220[]
98unsightly-bisque-lemur-1917unsightly-bisque-lemur-1917-analyzerfrequent_items1111110[]
\n

99 rows × 11 columns

\n
" + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import pandas as pd\n", "noisy_monitors_df = pd.DataFrame.from_records([m.dict() for m in diagnoser.noisy_monitors])\n", @@ -323,14 +571,51 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "metadata": { "ExecuteTime": { - "end_time": "2024-04-16T15:00:17.594027Z", - "start_time": "2024-04-16T15:00:09.536137Z" + "end_time": "2024-04-26T20:11:57.298847Z", + "start_time": "2024-04-26T20:11:54.646676Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Diagnosis is for monitor \"discrete-drift-jensenshannon\" [cooperative-maroon-parrot-8886] in model-0 org-0, over interval 2024-03-26T00:00:00.000Z/2024-04-25T00:00:00.000Z.\n", + "\n", + "Analyzer is drift configuration for frequent_items metric with TrailingWindow baseline.\n", + "Analyzer \"discrete-drift-jensenshannon-analyzer\" targets 30 columns and ran on 26 columns in the diagnosed segment.\n", + "\n", + "\n", + "Diagnostic segment is \"overall\".\n", + "Diagnostic interval contains 30 batches.\n", + "\n", + "Diagnostic interval rollup contains 1945487 rows for the diagnosed columns.\n", + "\n", + "Analysis results summary:\n", + "Found non-failed results for 26 columns and 30 batches.\n", + "Found 30 anomalies in 1 columns, with up to 100.0% (30) batches having anomalies per column and 100.0% (30.0) on average.\n", + "Columns with anomalies are:\n", + "| | 0 |\n", + "|---:|:----------------|\n", + "| 0 | ('issue_d', 30) |\n", + "\n", + "No failures were detected.\n", + "\n", + "No issues impacting diagnosis quality were detected\n", + "Conditions that may contribute to noise include:\n", + "\t* Condition changing_discrete (many values are unique across batches) for 1 columns: ['issue_d']\n", + "\n", + "Anomalies for columns with these conditions:\n", + "| | 0 |\n", + "|:--------|----:|\n", + "| issue_d | 30 |\n", + 
"Accounting for 30 anomalies out of 30\n" + ] + } + ], "source": [ "diagnoser.monitor_id_to_diagnose = noisy_monitors_df.iloc[1]['monitor_id']\n", "monitor_report = diagnoser.diagnose()\n", @@ -340,10 +625,7 @@ { "cell_type": "markdown", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "You can also use the `noisy_monitors_with_actions` property to prioritize noise in monitors with actions, as these are most likely to cause alert fatigue." @@ -351,29 +633,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "metadata": { - "ExecuteTime": { - "end_time": "2024-04-16T15:00:17.603562Z", - "start_time": "2024-04-16T15:00:17.595665Z" - }, "collapsed": false, - "jupyter": { - "outputs_hidden": false + "ExecuteTime": { + "end_time": "2024-04-26T20:11:57.307971Z", + "start_time": "2024-04-26T20:11:57.300890Z" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": " monitor_id \\\n0 frequent-items-drift-monitor-u31vmb \n1 frequent-items-drift-monitor-uu0ax8 \n2 frequent-items-drift-monitor-48ukw1 \n3 frequent-items-drift-monitor-jepz7t \n4 frequent-items-drift-monitor-pxexvn \n5 nice-burlywood-tarsier-4771 \n6 energetic-black-cobra-7838 \n7 elated-gray-baboon-4620 \n8 old-crimson-starling-2516 \n9 uninterested-blueviolet-reindeer-9950 \n10 numerical-drift-monitor-zy4q8v \n11 unique-estimate-ratio-monitor-ccf7cl \n12 numerical-drift-monitor-jpodsg \n13 numerical-drift-monitor-60dfcc \n\n analyzer_id metric \\\n0 frequent-items-drift-analyzer-u31vmb frequent_items \n1 frequent-items-drift-analyzer-uu0ax8 frequent_items \n2 frequent-items-drift-analyzer-48ukw1 frequent_items \n3 frequent-items-drift-analyzer-jepz7t frequent_items \n4 frequent-items-drift-analyzer-pxexvn frequent_items \n5 nice-burlywood-tarsier-4771-analyzer unique_est \n6 energetic-black-cobra-7838-analyzer unique_est \n7 elated-gray-baboon-4620-analyzer count_null_ratio \n8 
old-crimson-starling-2516-analyzer frequent_items \n9 uninterested-blueviolet-reindeer-9950-analyzer count \n10 numerical-drift-analyzer-zy4q8v histogram \n11 unique-estimate-ratio-analyzer-ccf7cl unique_est_ratio \n12 numerical-drift-analyzer-jpodsg histogram \n13 numerical-drift-analyzer-60dfcc histogram \n\n column_count segment_count anomaly_count max_anomaly_per_column \\\n0 2 1 31 30 \n1 2 1 31 30 \n2 2 1 31 30 \n3 2 1 31 30 \n4 2 1 31 30 \n5 7 1 106 30 \n6 7 1 80 30 \n7 13 1 64 30 \n8 2 1 24 23 \n9 77 1 152 9 \n10 3 1 18 8 \n11 104 1 394 7 \n12 1 1 2 2 \n13 1 1 2 2 \n\n min_anomaly_per_column avg_anomaly_per_column action_count \\\n0 1 15 2 \n1 1 15 3 \n2 1 15 2 \n3 1 15 2 \n4 1 15 2 \n5 2 15 2 \n6 2 11 1 \n7 1 4 1 \n8 1 12 1 \n9 1 1 1 \n10 2 6 1 \n11 1 3 2 \n12 2 2 2 \n13 2 2 2 \n\n action_targets \n0 [email, slack] \n1 [email, slack, email-victor-at-whylabs] \n2 [email, slack] \n3 [email, slack] \n4 [email, slack] \n5 [slack, email] \n6 [email] \n7 [email] \n8 [email] \n9 [christine-test-email] \n10 [email] \n11 [email, slack] \n12 [email, slack] \n13 [email, slack] ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
monitor_idanalyzer_idmetriccolumn_countsegment_countanomaly_countmax_anomaly_per_columnmin_anomaly_per_columnavg_anomaly_per_columnaction_countaction_targets
0frequent-items-drift-monitor-u31vmbfrequent-items-drift-analyzer-u31vmbfrequent_items2131301152[email, slack]
1frequent-items-drift-monitor-uu0ax8frequent-items-drift-analyzer-uu0ax8frequent_items2131301153[email, slack, email-victor-at-whylabs]
2frequent-items-drift-monitor-48ukw1frequent-items-drift-analyzer-48ukw1frequent_items2131301152[email, slack]
3frequent-items-drift-monitor-jepz7tfrequent-items-drift-analyzer-jepz7tfrequent_items2131301152[email, slack]
4frequent-items-drift-monitor-pxexvnfrequent-items-drift-analyzer-pxexvnfrequent_items2131301152[email, slack]
5nice-burlywood-tarsier-4771nice-burlywood-tarsier-4771-analyzerunique_est71106302152[slack, email]
6energetic-black-cobra-7838energetic-black-cobra-7838-analyzerunique_est7180302111[email]
7elated-gray-baboon-4620elated-gray-baboon-4620-analyzercount_null_ratio1316430141[email]
8old-crimson-starling-2516old-crimson-starling-2516-analyzerfrequent_items2124231121[email]
9uninterested-blueviolet-reindeer-9950uninterested-blueviolet-reindeer-9950-analyzercount7711529111[christine-test-email]
10numerical-drift-monitor-zy4q8vnumerical-drift-analyzer-zy4q8vhistogram31188261[email]
11unique-estimate-ratio-monitor-ccf7clunique-estimate-ratio-analyzer-ccf7clunique_est_ratio10413947132[email, slack]
12numerical-drift-monitor-jpodsgnumerical-drift-analyzer-jpodsghistogram1122222[email, slack]
13numerical-drift-monitor-60dfccnumerical-drift-analyzer-60dfcchistogram1122222[email, slack]
\n
" + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pd.DataFrame.from_records([m.dict() for m in diagnoser.noisy_monitors_with_actions])\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "metadata": { "collapsed": false, - "jupyter": { - "outputs_hidden": false + "ExecuteTime": { + "end_time": "2024-04-26T20:11:57.310423Z", + "start_time": "2024-04-26T20:11:57.308851Z" } }, "outputs": [], diff --git a/poetry.lock b/poetry.lock index 8b42e46..a0d8759 100644 --- a/poetry.lock +++ b/poetry.lock @@ -676,13 +676,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "whylabs-client" -version = "0.6.2" +version = "0.6.3" description = "WhyLabs API client" optional = false python-versions = ">=3.6" files = [ - {file = "whylabs-client-0.6.2.tar.gz", hash = "sha256:7d8181317cd75f643935b45e87377fd21e17dd9117674a8996cf9a469be65a90"}, - {file = "whylabs_client-0.6.2-py3-none-any.whl", hash = "sha256:3fe5edff4415ac15426a8aaed6adaf7e803818e997b825535e5aa5417757b7ae"}, + {file = "whylabs-client-0.6.3.tar.gz", hash = "sha256:4df4daa436f7899c60575c5a72641a2b3cbfe9d2f0cc0d6b4831746d13342088"}, + {file = "whylabs_client-0.6.3-py3-none-any.whl", hash = "sha256:050bcfd1493fbb303f38b02b750fb5321abeeed1e775f7dfd570998d3bf5719b"}, ] [package.dependencies] @@ -781,4 +781,4 @@ diagnoser = ["isodate", "numpy", "pandas", "python-dateutil", "tabulate"] [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "14e09374513c67afeab23e3431078c556d5cbe4c83190c7cd9c2dd44f189fb40" +content-hash = "20fc341db7d79a6f3190c5e3da4008c508584233d74c491ccbd91528c325d681" diff --git a/pyproject.toml b/pyproject.toml index 3e7660e..ebf8be0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ include = ["whylabs_toolkit/monitor/schema/schema.json"] [tool.poetry.dependencies] python = "^3.8" -whylabs-client = "^0.6.0" +whylabs-client = "^0.6.3" pydantic = "^1.10.15" whylogs = "^1.1.26" jsonschema 
= "^4.17.3" diff --git a/whylabs_toolkit/helpers/utils.py b/whylabs_toolkit/helpers/utils.py index 3d09266..b981501 100644 --- a/whylabs_toolkit/helpers/utils.py +++ b/whylabs_toolkit/helpers/utils.py @@ -1,4 +1,5 @@ from whylabs_client.api.dataset_profile_api import DatasetProfileApi +from whylabs_client.api.monitor_diagnostics_api import MonitorDiagnosticsApi from whylabs_client.api.models_api import ModelsApi from whylabs_client.api.notification_settings_api import NotificationSettingsApi from whylabs_client.api.monitor_api import MonitorApi @@ -21,3 +22,7 @@ def get_notification_api(config: Config = Config()) -> NotificationSettingsApi: def get_monitor_api(config: Config = Config()) -> MonitorApi: return MonitorApi(api_client=create_client(config=config)) + + +def get_monitor_diagnostics_api(config: Config = Config()) -> MonitorDiagnosticsApi: + return MonitorDiagnosticsApi(api_client=create_client(config=config)) diff --git a/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py b/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py index bc71c0a..459f3f5 100644 --- a/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py +++ b/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py @@ -1,4 +1,5 @@ import os +import json import pandas as pd from typing import Tuple, List, Optional, Dict @@ -9,12 +10,13 @@ from whylabs_client.model.analyzer_segments_diagnostic_request import AnalyzerSegmentsDiagnosticRequest from whylabs_client.model.analyzer_segments_diagnostic_response import AnalyzerSegmentsDiagnosticResponse from whylabs_client.model.analyzers_diagnostic_response import AnalyzersDiagnosticResponse +from whylabs_client.model.diagnosis_request import DiagnosisRequest from whylabs_client.model.diagnostic_interval_request import DiagnosticIntervalRequest from whylabs_client.model.diagnostic_interval_response import DiagnosticIntervalResponse from whylabs_client.model.analyzers_diagnostic_request import AnalyzersDiagnosticRequest from 
whylabs_client.model.segment import Segment as WhyLabsSegment from whylabs_client.model.segment_tag import SegmentTag as WhyLabsSegmentTag -from whylabs_toolkit.helpers.utils import get_monitor_api, get_models_api +from whylabs_toolkit.helpers.utils import get_monitor_api, get_models_api, get_monitor_diagnostics_api from whylabs_toolkit.monitor.models import TimeRange, Monitor, Segment, Analyzer, EntitySchema from whylabs_toolkit.utils.granularity import Granularity @@ -31,6 +33,18 @@ ) from whylabs_toolkit.monitor.diagnoser.targeting import targeted_columns +def to_mapped_dict(obj) -> dict: + """ + Convert a WhyLabs Client class instance into a JSON dictionary with keys mapped to the API schema. For example, + the pythonized 'org_id' attribute becomes 'orgId'. + :param obj: + :return: dict + """ + if hasattr(obj, 'to_dict') and hasattr(obj, 'attribute_map'): + return {obj.attribute_map[k]: to_mapped_dict(getattr(obj, k)) for k, _ in obj.to_dict().items()} + if isinstance(obj, list): + return [to_mapped_dict(i) for i in obj] + return obj class MonitorDiagnoser: def __init__(self, org_id: str, dataset_id: str): @@ -41,6 +55,7 @@ def __init__(self, org_id: str, dataset_id: str): self._diagnostics_api = get_monitor_diagnostics_api() self._monitor_api = get_monitor_api() self._models_api = get_models_api() + self._diagnostics_api = get_monitor_diagnostics_api() self._monitor_configs: Optional[List[Monitor]] = None self._noisy_monitors: Optional[List[NoisyMonitorStats]] = None self._failed_monitors: Optional[List[FailedMonitorStats]] = None @@ -112,9 +127,8 @@ def diagnostic_interval(self) -> str: return self._diagnostic_interval @diagnostic_interval.setter - def diagnostic_interval(self, interval: str) -> str: + def diagnostic_interval(self, interval: str): self._diagnostic_interval = interval - return self._diagnostic_interval @property def diagnostic_segment(self) -> Segment: @@ -123,12 +137,11 @@ def diagnostic_segment(self) -> Segment: return 
self._diagnostic_segment @diagnostic_segment.setter - def diagnostic_segment(self, segment: Segment) -> Segment: + def diagnostic_segment(self, segment: Segment): if self._diagnostic_segment != segment: self._diagnostic_segment = segment self._noisy_columns = None self._diagnosis = None - return segment @property def monitor_id_to_diagnose(self) -> str: @@ -137,7 +150,7 @@ def monitor_id_to_diagnose(self) -> str: return self._monitor_id @monitor_id_to_diagnose.setter - def monitor_id_to_diagnose(self, monitor_id: str) -> str: + def monitor_id_to_diagnose(self, monitor_id: str): if self._monitor_id != monitor_id: self._monitor_id = monitor_id # Reset anything specific to the monitor @@ -147,7 +160,6 @@ def monitor_id_to_diagnose(self, monitor_id: str) -> str: self._noisy_columns = None self._diagnosis = None self._diagnostic_segment = None - return self._monitor_id @property def monitor_to_diagnose(self) -> Optional[Monitor]: @@ -307,7 +319,7 @@ def diagnose(self, columns: Optional[List[str]] = None) -> MonitorDiagnosisRepor if use_local_server: # Call the server function directly if configured to do so (for testing) try: - from smart_config.server.server import DiagnosisRequest + from smart_config.server.server import DiagnosisRequest as DiagnoserDiagnosisRequest from smart_config.server.diagnosis.analyzer_diagnoser import AnalyzerDiagnoser if use_local_server == "library": @@ -337,22 +349,32 @@ def diagnose(self, columns: Optional[List[str]] = None) -> MonitorDiagnosisRepor } ) report_dict = diagnosis_service.diagnose_sync( - DiagnosisRequest( + DiagnoserDiagnosisRequest( orgId=self.org_id, datasetId=self.dataset_id, analyzerId=self.get_analyzer_id_for_monitor(), interval=self.diagnostic_interval, columns=self._diagnosed_columns, segment=self.diagnostic_segment, - granularity=self.granularity, ) ) except ImportError: raise Exception("USE_LOCAL_SERVER is set but server library is not available.") else: - # TODO implement call through songbird/whylabs-client instead 
of direct # Call the diagnosis API via whyLabs client - raise NotImplementedError("Diagnosis API call not implemented") + response = self._diagnostics_api.diagnose_analyzer_sync( + self.org_id, + DiagnosisRequest( + dataset_id=self.dataset_id, + analyzer_id=self.get_analyzer_id_for_monitor(), + interval=self.diagnostic_interval, + columns=self._diagnosed_columns, + segment=WhyLabsSegment( + tags=[WhyLabsSegmentTag(t.key, t.value) for t in self.diagnostic_segment.tags]), + ) + ) + + report_dict = to_mapped_dict(response) self._diagnosis = MonitorDiagnosisReport( **report_dict, From 82bc77a02eded1ae9479695c12c2d0b985789a5c Mon Sep 17 00:00:00 2001 From: Christine Draper Date: Fri, 26 Apr 2024 16:22:25 -0400 Subject: [PATCH 08/14] Fix lint --- .../monitor/diagnoser/monitor_diagnoser.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py b/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py index 459f3f5..114ea9c 100644 --- a/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py +++ b/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py @@ -1,5 +1,4 @@ import os -import json import pandas as pd from typing import Tuple, List, Optional, Dict @@ -33,19 +32,21 @@ ) from whylabs_toolkit.monitor.diagnoser.targeting import targeted_columns -def to_mapped_dict(obj) -> dict: + +def to_mapped_dict(obj: object) -> object: """ Convert a WhyLabs Client class instance into a JSON dictionary with keys mapped to the API schema. For example, the pythonized 'org_id' attribute becomes 'orgId'. 
:param obj: :return: dict """ - if hasattr(obj, 'to_dict') and hasattr(obj, 'attribute_map'): + if hasattr(obj, "to_dict") and hasattr(obj, "attribute_map"): return {obj.attribute_map[k]: to_mapped_dict(getattr(obj, k)) for k, _ in obj.to_dict().items()} if isinstance(obj, list): return [to_mapped_dict(i) for i in obj] return obj + class MonitorDiagnoser: def __init__(self, org_id: str, dataset_id: str): self.org_id: str = org_id @@ -127,7 +128,7 @@ def diagnostic_interval(self) -> str: return self._diagnostic_interval @diagnostic_interval.setter - def diagnostic_interval(self, interval: str): + def diagnostic_interval(self, interval: str) -> None: self._diagnostic_interval = interval @property @@ -137,7 +138,7 @@ def diagnostic_segment(self) -> Segment: return self._diagnostic_segment @diagnostic_segment.setter - def diagnostic_segment(self, segment: Segment): + def diagnostic_segment(self, segment: Segment) -> None: if self._diagnostic_segment != segment: self._diagnostic_segment = segment self._noisy_columns = None @@ -150,7 +151,7 @@ def monitor_id_to_diagnose(self) -> str: return self._monitor_id @monitor_id_to_diagnose.setter - def monitor_id_to_diagnose(self, monitor_id: str): + def monitor_id_to_diagnose(self, monitor_id: str) -> None: if self._monitor_id != monitor_id: self._monitor_id = monitor_id # Reset anything specific to the monitor @@ -370,8 +371,9 @@ def diagnose(self, columns: Optional[List[str]] = None) -> MonitorDiagnosisRepor interval=self.diagnostic_interval, columns=self._diagnosed_columns, segment=WhyLabsSegment( - tags=[WhyLabsSegmentTag(t.key, t.value) for t in self.diagnostic_segment.tags]), - ) + tags=[WhyLabsSegmentTag(t.key, t.value) for t in self.diagnostic_segment.tags] + ), + ), ) report_dict = to_mapped_dict(response) From b64cb6fb2de5f63d3f1ed2a808b7975d4986c6c0 Mon Sep 17 00:00:00 2001 From: Christine Draper Date: Fri, 26 Apr 2024 16:29:30 -0400 Subject: [PATCH 09/14] Fix custom metric test --- tests/helpers/test_model.py | 2 +- 
1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/helpers/test_model.py b/tests/helpers/test_model.py index ef9cd3f..7e3be36 100644 --- a/tests/helpers/test_model.py +++ b/tests/helpers/test_model.py @@ -55,4 +55,4 @@ def test_create_custom_metric(models_api: ModelsApi) -> None: assert entity["metrics"]["temperature.median"].to_dict() == {'column': 'temperature', 'default_metric': 'median','label': 'temperature.median'} - models_api.delete_entity_schema_metric(org_id=org_id, dataset_id="model-7", metric_label="temperature.median") \ No newline at end of file + models_api.delete_entity_schema_metric(org_id=org_id, dataset_id="model-7", metric_name="temperature.median") \ No newline at end of file From 7e01141f576f9a385e56c1edbe03e22341348208 Mon Sep 17 00:00:00 2001 From: Christine Draper Date: Fri, 26 Apr 2024 16:31:31 -0400 Subject: [PATCH 10/14] Bump minor version --- .bumpversion.cfg | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 74205e4..3ac74a9 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.19 +current_version = 0.2.0-dev0 tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? serialize = diff --git a/pyproject.toml b/pyproject.toml index ebf8be0..2423640 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "whylabs-toolkit" -version = "0.1.19" +version = "0.2.0-dev0" description = "Whylabs Toolkit package." 
authors = ["Murilo Mendonca ", "Anthony Naddeo ", "Christine Draper "] From b976eeed74c64153d326e3d9c7af20f152fdcaab Mon Sep 17 00:00:00 2001 From: Christine Draper Date: Tue, 30 Apr 2024 09:57:35 -0400 Subject: [PATCH 11/14] Review comments --- .bumpversion.cfg | 2 +- pyproject.toml | 2 +- whylabs_toolkit/helpers/monitor_helpers.py | 23 +++++++++++-------- .../diagnoser/converters/granularity.py | 13 ----------- .../monitor/diagnoser/monitor_diagnoser.py | 2 +- .../monitor/diagnoser/targeting.py | 4 ++-- 6 files changed, 19 insertions(+), 27 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 3ac74a9..033fb2e 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.0-dev0 +current_version = 0.1.0-dev0 tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? serialize = diff --git a/pyproject.toml b/pyproject.toml index 2423640..9743f24 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "whylabs-toolkit" -version = "0.2.0-dev0" +version = "0.1.0-dev0" description = "Whylabs Toolkit package." 
authors = ["Murilo Mendonca ", "Anthony Naddeo ", "Christine Draper "] diff --git a/whylabs_toolkit/helpers/monitor_helpers.py b/whylabs_toolkit/helpers/monitor_helpers.py index 2778eeb..fb32ee7 100644 --- a/whylabs_toolkit/helpers/monitor_helpers.py +++ b/whylabs_toolkit/helpers/monitor_helpers.py @@ -78,6 +78,19 @@ def get_analyzers( return None +def time_period_to_granularity(time_period: str) -> Granularity: + if time_period == "PT1H": + return Granularity.hourly + + if time_period == "P1W": + return Granularity.weekly + + if time_period == "P1M": + return Granularity.monthly + + return Granularity.daily + + def get_model_granularity( org_id: Optional[str] = None, dataset_id: Optional[str] = None, config: Config = Config() ) -> Optional[Granularity]: @@ -87,16 +100,8 @@ def get_model_granularity( api = get_models_api(config=config) model_meta = api.get_model(org_id=org_id, model_id=dataset_id) - time_period_to_gran = { - "H": Granularity.hourly, - "D": Granularity.daily, - "W": Granularity.weekly, - "M": Granularity.monthly, - } if model_meta: - for key, value in time_period_to_gran.items(): - if key in model_meta["time_period"]: - return value + return time_period_to_granularity(model_meta["time_period"]) return None diff --git a/whylabs_toolkit/monitor/diagnoser/converters/granularity.py b/whylabs_toolkit/monitor/diagnoser/converters/granularity.py index 2d0cc76..5be8d25 100644 --- a/whylabs_toolkit/monitor/diagnoser/converters/granularity.py +++ b/whylabs_toolkit/monitor/diagnoser/converters/granularity.py @@ -16,19 +16,6 @@ def batches_to_timedelta(time_period: str, batches: int) -> relativedelta: return relativedelta(days=batches) -def time_period_to_granularity(time_period: str) -> Granularity: - if time_period == "PT1H": - return Granularity.hourly - - if time_period == "P1W": - return Granularity.weekly - - if time_period == "P1M": - return Granularity.monthly - - return Granularity.daily - - def calculate_num_batches(interval: str, granularity: str) -> 
int: # Parse the ISO8601 interval string into a start and end datetime start, end = interval.split("/") diff --git a/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py b/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py index 114ea9c..193132f 100644 --- a/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py +++ b/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py @@ -20,7 +20,7 @@ from whylabs_toolkit.utils.granularity import Granularity from whylabs_toolkit.monitor.diagnoser.helpers.utils import get_monitor_diagnostics_api, segment_as_readable_text -from whylabs_toolkit.monitor.diagnoser.converters.granularity import time_period_to_granularity +from whylabs_toolkit.helpers.monitor_helpers import time_period_to_granularity from whylabs_toolkit.monitor.diagnoser.constants import DEFAULT_BATCHES from whylabs_toolkit.monitor.diagnoser.models import ( NoisyMonitorStats, diff --git a/whylabs_toolkit/monitor/diagnoser/targeting.py b/whylabs_toolkit/monitor/diagnoser/targeting.py index e999b88..9f95608 100644 --- a/whylabs_toolkit/monitor/diagnoser/targeting.py +++ b/whylabs_toolkit/monitor/diagnoser/targeting.py @@ -1,6 +1,6 @@ from typing import List, Union, Set -from whylabs_toolkit.monitor.models import EntitySchema, ColumnMatrix, DatasetMatrix, TargetLevel +from whylabs_toolkit.monitor.models import EntitySchema, ColumnMatrix, DatasetMatrix def expand_target(target: str, schema: EntitySchema) -> List[str]: @@ -10,7 +10,7 @@ def expand_target(target: str, schema: EntitySchema) -> List[str]: if target == "group:discrete": return [name for (name, c) in col_items if c.discreteness == "discrete"] if target == "group:continuous": - return [name for (name, c) in col_items if c.discreteness != "discrete"] + return [name for (name, c) in col_items if c.discreteness == "continuous"] if target == "group:input": return [name for (name, c) in col_items if c.classifier == "input"] if target == "group:output": From f7c344b7544f8fe2cc2f37690bd9816f2143a677 Mon Sep 17 
00:00:00 2001 From: Christine Draper Date: Tue, 30 Apr 2024 10:18:23 -0400 Subject: [PATCH 12/14] Remove empty cell from example notebook --- examples/example_notebooks/diagnoser.ipynb | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/examples/example_notebooks/diagnoser.ipynb b/examples/example_notebooks/diagnoser.ipynb index c22cd3b..140c2fb 100644 --- a/examples/example_notebooks/diagnoser.ipynb +++ b/examples/example_notebooks/diagnoser.ipynb @@ -80,19 +80,6 @@ "Initialize the Monitor Diagnoser with the org_id and dataset_id." ] }, - { - "cell_type": "code", - "outputs": [], - "source": [], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-26T20:11:49.837299Z", - "start_time": "2024-04-26T20:11:49.835362Z" - } - }, - "execution_count": 32 - }, { "cell_type": "code", "execution_count": 33, From bc0faeef869a60855424e4b03aca687cbf5ca01c Mon Sep 17 00:00:00 2001 From: Christine Draper Date: Thu, 2 May 2024 22:04:46 -0400 Subject: [PATCH 13/14] Update notebooks for prod --- .../customized_diagnoser.ipynb | 150 ++++--- examples/example_notebooks/diagnoser.ipynb | 405 +++++++++++------- 2 files changed, 324 insertions(+), 231 deletions(-) diff --git a/examples/example_notebooks/customized_diagnoser.ipynb b/examples/example_notebooks/customized_diagnoser.ipynb index a88123f..5f183a7 100644 --- a/examples/example_notebooks/customized_diagnoser.ipynb +++ b/examples/example_notebooks/customized_diagnoser.ipynb @@ -23,8 +23,8 @@ "name": "#%%\n" }, "ExecuteTime": { - "end_time": "2024-04-26T20:11:58.764825Z", - "start_time": "2024-04-26T20:11:58.762547Z" + "end_time": "2024-05-03T02:03:15.122705Z", + "start_time": "2024-05-03T02:03:15.119284Z" } }, "outputs": [], @@ -51,8 +51,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-04-26T20:12:02.212387Z", - "start_time": "2024-04-26T20:11:58.779115Z" + "end_time": "2024-05-03T02:03:31.740325Z", + "start_time": "2024-05-03T02:03:15.137102Z" } }, "outputs": [], 
@@ -60,10 +60,10 @@ "import getpass\n", "from whylabs_toolkit.monitor.diagnoser.helpers.utils import env_setup\n", "\n", - "org_id = 'org-0'\n", - "dataset_id = 'model-0'\n", - "api_key = getpass.getpass()\n", - "api_endpoint = 'https://songbird.development.whylabsdev.com'\n", + "org_id = input(\"Enter org ID\")\n", + "dataset_id = input(\"Enter model/dataset ID\")\n", + "api_key = getpass.getpass(\"Enter API key\")\n", + "api_endpoint = 'https://api.whylabsapp.com'\n", "\n", "env_setup(\n", " org_id=org_id,\n", @@ -88,8 +88,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-04-26T20:12:02.537020Z", - "start_time": "2024-04-26T20:12:02.214892Z" + "end_time": "2024-05-03T02:03:32.055442Z", + "start_time": "2024-05-03T02:03:31.743114Z" } }, "outputs": [], @@ -113,14 +113,14 @@ "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2024-04-26T20:12:02.959292Z", - "start_time": "2024-04-26T20:12:02.538022Z" + "end_time": "2024-05-03T02:03:32.690456Z", + "start_time": "2024-05-03T02:03:32.056136Z" } }, "outputs": [ { "data": { - "text/plain": "(TimeRange(start=datetime.datetime(2020, 10, 8, 0, 0, tzinfo=datetime.timezone.utc), end=datetime.datetime(2024, 4, 25, 21, 0, tzinfo=datetime.timezone.utc)),\n ,\n '2024-03-26T00:00:00.000Z/2024-04-25T00:00:00.000Z')" + "text/plain": "(TimeRange(start=datetime.datetime(2021, 5, 20, 0, 0, tzinfo=datetime.timezone.utc), end=datetime.datetime(2024, 5, 2, 21, 0, tzinfo=datetime.timezone.utc)),\n ,\n '2024-04-02T00:00:00.000Z/2024-05-02T00:00:00.000Z')" }, "execution_count": 4, "metadata": {}, @@ -147,15 +147,15 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-04-26T20:12:04.625414Z", - "start_time": "2024-04-26T20:12:02.961490Z" + "end_time": "2024-05-03T02:03:33.421897Z", + "start_time": "2024-05-03T02:03:32.692755Z" } }, "outputs": [ { "data": { - "text/plain": " monitor_id \\\n0 kind-cyan-kangaroo-1253 \n1 cooperative-maroon-parrot-8886 \n2 famous-salmon-cobra-8902 \n3 
proud-seagreen-carabeef-65 \n4 None \n.. ... \n94 glamorous-orchid-turtle-6425 \n95 breakable-limegreen-shrew-7623 \n96 hilarious-powderblue-chamois-8115 \n97 horrible-magenta-sandpiper-8117 \n98 unsightly-bisque-lemur-1917 \n\n analyzer_id metric column_count \\\n0 kind-cyan-kangaroo-1253-analyzer histogram 1 \n1 discrete-drift-jensenshannon-analyzer frequent_items 1 \n2 famous-salmon-cobra-8902-analyzer min 1 \n3 proud-seagreen-carabeef-65-analyzer histogram 1 \n4 cooperative-maroon-parrot-8886-analyzer frequent_items 1 \n.. ... ... ... \n94 glamorous-orchid-turtle-6425-analyzer histogram 1 \n95 breakable-limegreen-shrew-7623-analyzer histogram 1 \n96 hilarious-powderblue-chamois-8115-analyzer histogram 1 \n97 horrible-magenta-sandpiper-8117-analyzer frequent_items 1 \n98 unsightly-bisque-lemur-1917-analyzer frequent_items 1 \n\n segment_count anomaly_count max_anomaly_per_column \\\n0 1 30 30 \n1 1 30 30 \n2 1 30 30 \n3 1 30 30 \n4 1 30 30 \n.. ... ... ... \n94 1 2 2 \n95 1 2 2 \n96 1 2 2 \n97 1 2 2 \n98 1 1 1 \n\n min_anomaly_per_column avg_anomaly_per_column action_count \\\n0 30 30 0 \n1 30 30 0 \n2 30 30 0 \n3 30 30 0 \n4 30 30 0 \n.. ... ... ... \n94 2 2 0 \n95 2 2 0 \n96 2 2 0 \n97 2 2 0 \n98 1 1 0 \n\n action_targets \n0 [] \n1 [] \n2 [] \n3 [] \n4 [] \n.. ... \n94 [] \n95 [] \n96 [] \n97 [] \n98 [] \n\n[99 rows x 11 columns]", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
monitor_idanalyzer_idmetriccolumn_countsegment_countanomaly_countmax_anomaly_per_columnmin_anomaly_per_columnavg_anomaly_per_columnaction_countaction_targets
0kind-cyan-kangaroo-1253kind-cyan-kangaroo-1253-analyzerhistogram11303030300[]
1cooperative-maroon-parrot-8886discrete-drift-jensenshannon-analyzerfrequent_items11303030300[]
2famous-salmon-cobra-8902famous-salmon-cobra-8902-analyzermin11303030300[]
3proud-seagreen-carabeef-65proud-seagreen-carabeef-65-analyzerhistogram11303030300[]
4Nonecooperative-maroon-parrot-8886-analyzerfrequent_items11303030300[]
....................................
94glamorous-orchid-turtle-6425glamorous-orchid-turtle-6425-analyzerhistogram1122220[]
95breakable-limegreen-shrew-7623breakable-limegreen-shrew-7623-analyzerhistogram1122220[]
96hilarious-powderblue-chamois-8115hilarious-powderblue-chamois-8115-analyzerhistogram1122220[]
97horrible-magenta-sandpiper-8117horrible-magenta-sandpiper-8117-analyzerfrequent_items1122220[]
98unsightly-bisque-lemur-1917unsightly-bisque-lemur-1917-analyzerfrequent_items1111110[]
\n

99 rows × 11 columns

\n
" + "text/plain": " monitor_id \\\n0 frequent-items-drift-monitor-x2hr9z \n1 discrete-distribution-22ef37c9-monitor \n2 smoggy-chartreuse-owl-3387 \n3 frequent-items-drift-monitor-bx6m80 \n4 frequent-items-drift-monitor-mat0jo \n5 frequent-items-drift-monitor-01rbfl \n6 frequent-items-drift-monitor-0foigt \n7 frequent-items-drift-monitor-3c0hc2 \n8 frequent-items-drift-monitor-9gmtix \n9 elated-palegreen-jaguar-6432 \n10 inferred-data-type-fec5a735-monitor \n11 unique-ratio-b7b84aee-monitor \n12 missing-values-ratio-35881327-monitor \n13 numerical-drift-monitor-6oxi83 \n14 numerical-drift-monitor-8yugth \n15 continuous-distribution-956a280c-monitor \n16 dull-floralwhite-raven-5521 \n\n analyzer_id metric column_count \\\n0 frequent-items-drift-analyzer-x2hr9z frequent_items 3 \n1 discrete-distribution-22ef37c9 frequent_items 3 \n2 smoggy-chartreuse-owl-3387-analyzer frequent_items 3 \n3 frequent-items-drift-analyzer-bx6m80 frequent_items 3 \n4 frequent-items-drift-analyzer-mat0jo frequent_items 3 \n5 frequent-items-drift-analyzer-01rbfl frequent_items 3 \n6 frequent-items-drift-analyzer-0foigt frequent_items 3 \n7 frequent-items-drift-analyzer-3c0hc2 frequent_items 3 \n8 frequent-items-drift-analyzer-9gmtix frequent_items 3 \n9 elated-palegreen-jaguar-6432-analyzer histogram 9 \n10 inferred-data-type-fec5a735 inferred_data_type 1 \n11 unique-ratio-b7b84aee unique_est_ratio 69 \n12 missing-values-ratio-35881327 count_null_ratio 21 \n13 numerical-drift-analyzer-6oxi83 histogram 1 \n14 numerical-drift-analyzer-8yugth histogram 1 \n15 continuous-distribution-956a280c histogram 1 \n16 dull-floralwhite-raven-5521-analyzer count 2 \n\n segment_count anomaly_count max_anomaly_per_column \\\n0 1 34 30 \n1 1 34 30 \n2 1 34 30 \n3 1 34 30 \n4 1 34 30 \n5 1 34 30 \n6 1 34 30 \n7 1 34 30 \n8 1 34 30 \n9 1 75 19 \n10 1 14 14 \n11 1 104 4 \n12 1 27 3 \n13 1 2 2 \n14 1 2 2 \n15 1 2 2 \n16 1 3 2 \n\n min_anomaly_per_column avg_anomaly_per_column action_count \\\n0 1 11 1 \n1 1 11 0 
\n2 1 11 0 \n3 1 11 0 \n4 1 11 2 \n5 1 11 1 \n6 1 11 0 \n7 1 11 1 \n8 1 11 1 \n9 2 8 0 \n10 14 14 2 \n11 1 1 0 \n12 1 1 0 \n13 2 2 0 \n14 2 2 0 \n15 2 2 0 \n16 1 1 0 \n\n action_targets \n0 [email] \n1 [] \n2 [] \n3 [] \n4 [email, slack] \n5 [email] \n6 [] \n7 [email] \n8 [email] \n9 [] \n10 [email, slack] \n11 [] \n12 [] \n13 [] \n14 [] \n15 [] \n16 [] ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
monitor_idanalyzer_idmetriccolumn_countsegment_countanomaly_countmax_anomaly_per_columnmin_anomaly_per_columnavg_anomaly_per_columnaction_countaction_targets
0frequent-items-drift-monitor-x2hr9zfrequent-items-drift-analyzer-x2hr9zfrequent_items3134301111[email]
1discrete-distribution-22ef37c9-monitordiscrete-distribution-22ef37c9frequent_items3134301110[]
2smoggy-chartreuse-owl-3387smoggy-chartreuse-owl-3387-analyzerfrequent_items3134301110[]
3frequent-items-drift-monitor-bx6m80frequent-items-drift-analyzer-bx6m80frequent_items3134301110[]
4frequent-items-drift-monitor-mat0jofrequent-items-drift-analyzer-mat0jofrequent_items3134301112[email, slack]
5frequent-items-drift-monitor-01rbflfrequent-items-drift-analyzer-01rbflfrequent_items3134301111[email]
6frequent-items-drift-monitor-0foigtfrequent-items-drift-analyzer-0foigtfrequent_items3134301110[]
7frequent-items-drift-monitor-3c0hc2frequent-items-drift-analyzer-3c0hc2frequent_items3134301111[email]
8frequent-items-drift-monitor-9gmtixfrequent-items-drift-analyzer-9gmtixfrequent_items3134301111[email]
9elated-palegreen-jaguar-6432elated-palegreen-jaguar-6432-analyzerhistogram917519280[]
10inferred-data-type-fec5a735-monitorinferred-data-type-fec5a735inferred_data_type11141414142[email, slack]
11unique-ratio-b7b84aee-monitorunique-ratio-b7b84aeeunique_est_ratio6911044110[]
12missing-values-ratio-35881327-monitormissing-values-ratio-35881327count_null_ratio211273110[]
13numerical-drift-monitor-6oxi83numerical-drift-analyzer-6oxi83histogram1122220[]
14numerical-drift-monitor-8yugthnumerical-drift-analyzer-8yugthhistogram1122220[]
15continuous-distribution-956a280c-monitorcontinuous-distribution-956a280chistogram1122220[]
16dull-floralwhite-raven-5521dull-floralwhite-raven-5521-analyzercount2132110[]
\n
" }, "execution_count": 5, "metadata": {}, @@ -185,15 +185,15 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-04-26T20:12:04.638177Z", - "start_time": "2024-04-26T20:12:04.626701Z" + "end_time": "2024-05-03T02:03:33.427628Z", + "start_time": "2024-05-03T02:03:33.422561Z" } }, "outputs": [ { "data": { - "text/plain": " monitor_id \\\n0 good-cornsilk-bear-9359 \n1 energetic-black-cobra-7838 \n2 elated-gray-baboon-4620 \n3 missing-values-ratio-monitor-v9uywi \n4 None \n5 expensive-tomato-moose-6522 \n6 curious-lemonchiffon-rabbit-7000 \n7 clear-azure-starling-8883 \n8 light-mintcream-rhinoceros-3655 \n9 handsome-lemonchiffon-eel-4222 \n10 inferred-data-type-monitor-vjwbpo \n11 busy-hotpink-gaur-9703 \n12 unique-ratio-29f3ef1c-monitor \n13 None \n14 fancy-chocolate-wasp-8247 \n15 happy-snow-grouse-452 \n16 plain-fuchsia-stinkbug-4064 \n17 stormy-olive-butterfly-8693 \n18 tame-beige-sardine-3501 \n19 tough-green-hare-1322 \n20 uninterested-blueviolet-reindeer-9950 \n21 uninterested-red-alpaca-2523 \n22 unique-estimate-ratio-monitor-ccf7cl \n\n analyzer_id metric \\\n0 good-cornsilk-bear-9359-analyzer count_null \n1 energetic-black-cobra-7838-analyzer unique_est \n2 elated-gray-baboon-4620-analyzer count_null_ratio \n3 missing-values-ratio-analyzer-v9uywi count_null_ratio \n4 expensive-tomato-moose-6522-analyzer median \n5 csw-analyzer-2 median \n6 curious-lemonchiffon-rabbit-7000-analyzer frequent_items \n7 clear-azure-starling-8883-analyzer frequent_items \n8 light-mintcream-rhinoceros-3655-analyzer frequent_items \n9 handsome-lemonchiffon-eel-4222-analyzer frequent_items \n10 inferred-data-type-analyzer-vjwbpo inferred_data_type \n11 busy-hotpink-gaur-9703-analyzer count_null_ratio \n12 unique-ratio-29f3ef1c unique_est_ratio \n13 eager-violet-newt-4599-analyzer count_null_ratio \n14 fancy-chocolate-wasp-8247-analyzer count \n15 happy-snow-grouse-452-analyzer count_null_ratio \n16 plain-fuchsia-stinkbug-4064-analyzer count_null_ratio \n17 
stormy-olive-butterfly-8693-analyzer histogram \n18 tame-beige-sardine-3501-analyzer count_null_ratio \n19 tough-green-hare-1322-analyzer count_null_ratio \n20 uninterested-blueviolet-reindeer-9950-analyzer count \n21 uninterested-red-alpaca-2523-analyzer count_null_ratio \n22 unique-estimate-ratio-analyzer-ccf7cl unique_est_ratio \n\n failed_count max_failed_per_column min_failed_per_column \\\n0 2310 30 30 \n1 60 30 30 \n2 68 30 8 \n3 2607 25 7 \n4 1794 23 23 \n5 562 23 7 \n6 7 7 7 \n7 7 7 7 \n8 31 7 1 \n9 9 7 2 \n10 3 3 3 \n11 1 1 1 \n12 1 1 1 \n13 1 1 1 \n14 1 1 1 \n15 1 1 1 \n16 1 1 1 \n17 1 1 1 \n18 1 1 1 \n19 1 1 1 \n20 1 1 1 \n21 1 1 1 \n22 1 1 1 \n\n avg_failed_per_column action_count action_targets \n0 30 0 [] \n1 30 1 [email] \n2 22 1 [email] \n3 24 1 [email] \n4 23 0 [] \n5 7 0 [] \n6 7 1 [test-sort] \n7 7 1 [test-sort] \n8 5 0 [] \n9 4 0 [] \n10 3 0 [] \n11 1 0 [] \n12 1 0 [] \n13 1 0 [] \n14 1 0 [] \n15 1 0 [] \n16 1 0 [] \n17 1 0 [] \n18 1 0 [] \n19 1 0 [] \n20 1 1 [christine-test-email] \n21 1 0 [] \n22 1 2 [email, slack] ", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
monitor_idanalyzer_idmetricfailed_countmax_failed_per_columnmin_failed_per_columnavg_failed_per_columnaction_countaction_targets
0good-cornsilk-bear-9359good-cornsilk-bear-9359-analyzercount_null23103030300[]
1energetic-black-cobra-7838energetic-black-cobra-7838-analyzerunique_est603030301[email]
2elated-gray-baboon-4620elated-gray-baboon-4620-analyzercount_null_ratio68308221[email]
3missing-values-ratio-monitor-v9uywimissing-values-ratio-analyzer-v9uywicount_null_ratio2607257241[email]
4Noneexpensive-tomato-moose-6522-analyzermedian17942323230[]
5expensive-tomato-moose-6522csw-analyzer-2median56223770[]
6curious-lemonchiffon-rabbit-7000curious-lemonchiffon-rabbit-7000-analyzerfrequent_items77771[test-sort]
7clear-azure-starling-8883clear-azure-starling-8883-analyzerfrequent_items77771[test-sort]
8light-mintcream-rhinoceros-3655light-mintcream-rhinoceros-3655-analyzerfrequent_items317150[]
9handsome-lemonchiffon-eel-4222handsome-lemonchiffon-eel-4222-analyzerfrequent_items97240[]
10inferred-data-type-monitor-vjwbpoinferred-data-type-analyzer-vjwbpoinferred_data_type33330[]
11busy-hotpink-gaur-9703busy-hotpink-gaur-9703-analyzercount_null_ratio11110[]
12unique-ratio-29f3ef1c-monitorunique-ratio-29f3ef1cunique_est_ratio11110[]
13Noneeager-violet-newt-4599-analyzercount_null_ratio11110[]
14fancy-chocolate-wasp-8247fancy-chocolate-wasp-8247-analyzercount11110[]
15happy-snow-grouse-452happy-snow-grouse-452-analyzercount_null_ratio11110[]
16plain-fuchsia-stinkbug-4064plain-fuchsia-stinkbug-4064-analyzercount_null_ratio11110[]
17stormy-olive-butterfly-8693stormy-olive-butterfly-8693-analyzerhistogram11110[]
18tame-beige-sardine-3501tame-beige-sardine-3501-analyzercount_null_ratio11110[]
19tough-green-hare-1322tough-green-hare-1322-analyzercount_null_ratio11110[]
20uninterested-blueviolet-reindeer-9950uninterested-blueviolet-reindeer-9950-analyzercount11111[christine-test-email]
21uninterested-red-alpaca-2523uninterested-red-alpaca-2523-analyzercount_null_ratio11110[]
22unique-estimate-ratio-monitor-ccf7clunique-estimate-ratio-analyzer-ccf7clunique_est_ratio11112[email, slack]
\n
" + "text/plain": " monitor_id analyzer_id \\\n0 inferred-data-type-fec5a735-monitor inferred-data-type-fec5a735 \n1 missing-values-ratio-35881327-monitor missing-values-ratio-35881327 \n2 unique-ratio-b7b84aee-monitor unique-ratio-b7b84aee \n\n metric failed_count max_failed_per_column \\\n0 inferred_data_type 3 3 \n1 count_null_ratio 1 1 \n2 unique_est_ratio 1 1 \n\n min_failed_per_column avg_failed_per_column action_count action_targets \n0 3 3 2 [email, slack] \n1 1 1 0 [] \n2 1 1 0 [] ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
monitor_idanalyzer_idmetricfailed_countmax_failed_per_columnmin_failed_per_columnavg_failed_per_columnaction_countaction_targets
0inferred-data-type-fec5a735-monitorinferred-data-type-fec5a735inferred_data_type33332[email, slack]
1missing-values-ratio-35881327-monitormissing-values-ratio-35881327count_null_ratio11110[]
2unique-ratio-b7b84aee-monitorunique-ratio-b7b84aeeunique_est_ratio11110[]
\n
" }, "execution_count": 6, "metadata": {}, @@ -217,14 +217,14 @@ "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2024-04-26T20:12:04.642005Z", - "start_time": "2024-04-26T20:12:04.639206Z" + "end_time": "2024-05-03T02:03:33.431118Z", + "start_time": "2024-05-03T02:03:33.428470Z" } }, "outputs": [ { "data": { - "text/plain": "'kind-cyan-kangaroo-1253'" + "text/plain": "'frequent-items-drift-monitor-x2hr9z'" }, "execution_count": 7, "metadata": {}, @@ -250,14 +250,14 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-04-26T20:12:04.645858Z", - "start_time": "2024-04-26T20:12:04.642843Z" + "end_time": "2024-05-03T02:03:33.434461Z", + "start_time": "2024-05-03T02:03:33.432056Z" } }, "outputs": [ { "data": { - "text/plain": "Monitor(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1703279098033, author='user_1759fb08_1a01_4852_9ed4_91c6fceede45', description=None), id='kind-cyan-kangaroo-1253', displayName='kind-cyan-kangaroo-1253', tags=None, analyzerIds=['kind-cyan-kangaroo-1253-analyzer'], schedule=ImmediateSchedule(type='immediate'), disabled=None, severity=3, mode=DigestMode(type='DIGEST', filter=None, creationTimeOffset=None, datasetTimestampOffset='P7D', groupBy=None), actions=[])" + "text/plain": "Monitor(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1705536890090, author='user_809f777d_3741_4991_8ced_42f09b883ac7', description=None), id='frequent-items-drift-monitor-x2hr9z', displayName=None, tags=None, analyzerIds=['frequent-items-drift-analyzer-x2hr9z'], schedule=ImmediateSchedule(type='immediate'), disabled=False, severity=3, mode=DigestMode(type='DIGEST', filter=None, creationTimeOffset=None, datasetTimestampOffset=None, groupBy=None), actions=[GlobalAction(type='global', target='email')])" }, "execution_count": 8, "metadata": {}, @@ -283,14 +283,14 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-04-26T20:12:05.457603Z", - "start_time": 
"2024-04-26T20:12:04.647006Z" + "end_time": "2024-05-03T02:03:33.623760Z", + "start_time": "2024-05-03T02:03:33.435077Z" } }, "outputs": [ { "data": { - "text/plain": "Analyzer(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1703279095485, author='user_1759fb08_1a01_4852_9ed4_91c6fceede45', description=None), id='kind-cyan-kangaroo-1253-analyzer', displayName=None, tags=['featureSelection:all', 'discreteness:non-discrete'], schedule=FixedCadenceSchedule(type='fixed', cadence=, exclusionRanges=None), disabled=None, disableTargetRollup=None, targetMatrix=ColumnMatrix(segments=[Segment(tags=[SegmentTag(key='purpose', value='car'), SegmentTag(key='verification_status', value='Source Verified')])], type=, include=[], exclude=[], profileId=None), dataReadinessDuration=None, batchCoolDownPeriod=None, backfillGracePeriodDuration=None, config=DriftConfig(schemaVersion=None, params=None, metric=, type=, algorithm='hellinger', threshold=0.02, minBatchSize=1, baseline=TrailingWindowBaseline(datasetId=None, inheritSegment=None, type=, size=7, offset=None, exclusionRanges=None)))" + "text/plain": "Analyzer(metadata=Metadata(version=2, schemaVersion=1, updatedTimestamp=1714699900837, author='user_c9292ec40407f7b580f0a2c90745ebfba2b9e6ea81c848ef944d31e48a45f98', description=None), id='frequent-items-drift-analyzer-x2hr9z', displayName=None, tags=None, schedule=FixedCadenceSchedule(type='fixed', cadence=, exclusionRanges=None), disabled=None, disableTargetRollup=None, targetMatrix=ColumnMatrix(segments=[Segment(tags=[])], type=, include=[], exclude=[, 'desc', 'issue_d', 'url'], profileId=None), dataReadinessDuration=None, batchCoolDownPeriod=None, backfillGracePeriodDuration=None, config=DriftConfig(schemaVersion=None, params=None, metric=, type=, algorithm='hellinger', threshold=0.7, minBatchSize=1, baseline=TrailingWindowBaseline(datasetId=None, inheritSegment=None, type=, size=7, offset=None, exclusionRanges=None)))" }, "execution_count": 9, "metadata": {}, @@ 
-316,15 +316,15 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-04-26T20:12:05.746559Z", - "start_time": "2024-04-26T20:12:05.458340Z" + "end_time": "2024-05-03T02:03:33.839587Z", + "start_time": "2024-05-03T02:03:33.624489Z" } }, "outputs": [ { "data": { - "text/plain": " segment total_anomalies \\\n0 purpose=car&verification_status=Source Verified 30 \n\n batch_count \n0 30 ", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
segmenttotal_anomaliesbatch_count
0purpose=car&verification_status=Source Verified3030
\n
" + "text/plain": " segment total_anomalies batch_count\n0 overall 34 30", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
segmenttotal_anomaliesbatch_count
0overall3430
\n
" }, "execution_count": 10, "metadata": {}, @@ -354,14 +354,14 @@ "execution_count": 11, "metadata": { "ExecuteTime": { - "end_time": "2024-04-26T20:12:05.752954Z", - "start_time": "2024-04-26T20:12:05.749874Z" + "end_time": "2024-05-03T02:03:33.846498Z", + "start_time": "2024-05-03T02:03:33.842929Z" } }, "outputs": [ { "data": { - "text/plain": "'purpose=car&verification_status=Source Verified'" + "text/plain": "'overall'" }, "execution_count": 11, "metadata": {}, @@ -386,15 +386,15 @@ "execution_count": 12, "metadata": { "ExecuteTime": { - "end_time": "2024-04-26T20:12:05.892565Z", - "start_time": "2024-04-26T20:12:05.754263Z" + "end_time": "2024-05-03T02:03:33.988624Z", + "start_time": "2024-05-03T02:03:33.847364Z" } }, "outputs": [ { "data": { - "text/plain": " column total_anomalies\n0 pred_credit_risk (output) 30", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
columntotal_anomalies
0pred_credit_risk (output)30
\n
" + "text/plain": " column total_anomalies\n0 issue_d 30\n1 url 3\n2 desc 1\n3 disbursement_method 0\n4 earliest_cr_line 0\n5 emp_length 0\n6 emp_title 0\n7 grade 0\n8 hardship_flag 0\n9 home_ownership 0\n10 initial_list_status 0\n11 last_credit_pull_d 0\n12 last_pymnt_d 0\n13 loan_status 0\n14 next_pymnt_d 0\n15 purpose 0\n16 pymnt_plan 0\n17 sub_grade 0\n18 term 0\n19 title 0\n20 verification_status 0\n21 verification_status_joint 0\n22 addr_state 0\n23 zip_code 0\n24 application_type 0\n25 debt_settlement_flag 0", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
columntotal_anomalies
0issue_d30
1url3
2desc1
3disbursement_method0
4earliest_cr_line0
5emp_length0
6emp_title0
7grade0
8hardship_flag0
9home_ownership0
10initial_list_status0
11last_credit_pull_d0
12last_pymnt_d0
13loan_status0
14next_pymnt_d0
15purpose0
16pymnt_plan0
17sub_grade0
18term0
19title0
20verification_status0
21verification_status_joint0
22addr_state0
23zip_code0
24application_type0
25debt_settlement_flag0
\n
" }, "execution_count": 12, "metadata": {}, @@ -419,14 +419,14 @@ "execution_count": 13, "metadata": { "ExecuteTime": { - "end_time": "2024-04-26T20:12:05.896831Z", - "start_time": "2024-04-26T20:12:05.893571Z" + "end_time": "2024-05-03T02:03:33.992473Z", + "start_time": "2024-05-03T02:03:33.989400Z" } }, "outputs": [ { "data": { - "text/plain": "['pred_credit_risk (output)']" + "text/plain": "['issue_d',\n 'url',\n 'desc',\n 'disbursement_method',\n 'earliest_cr_line',\n 'emp_length',\n 'emp_title',\n 'grade',\n 'hardship_flag',\n 'home_ownership',\n 'initial_list_status',\n 'last_credit_pull_d',\n 'last_pymnt_d',\n 'loan_status',\n 'next_pymnt_d',\n 'purpose',\n 'pymnt_plan',\n 'sub_grade',\n 'term',\n 'title',\n 'verification_status',\n 'verification_status_joint',\n 'addr_state',\n 'zip_code',\n 'application_type',\n 'debt_settlement_flag']" }, "execution_count": 13, "metadata": {}, @@ -450,8 +450,8 @@ "execution_count": 14, "metadata": { "ExecuteTime": { - "end_time": "2024-04-26T20:12:07.205296Z", - "start_time": "2024-04-26T20:12:05.897820Z" + "end_time": "2024-05-03T02:03:38.883541Z", + "start_time": "2024-05-03T02:03:33.993121Z" } }, "outputs": [], @@ -467,8 +467,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-04-26T20:12:07.214914Z", - "start_time": "2024-04-26T20:12:07.206661Z" + "end_time": "2024-05-03T02:03:38.892248Z", + "start_time": "2024-05-03T02:03:38.884434Z" } }, "outputs": [ @@ -476,56 +476,50 @@ "name": "stdout", "output_type": "stream", "text": [ - "Diagnosis is for monitor \"kind-cyan-kangaroo-1253\" [kind-cyan-kangaroo-1253] in model-0 org-0, over interval 2024-03-26T00:00:00.000Z/2024-04-25T00:00:00.000Z.\n", + "Diagnosis is for monitor \"frequent-items-drift-monitor-x2hr9z\" [frequent-items-drift-monitor-x2hr9z] in model-0 org-0, over interval 2024-04-02T00:00:00.000Z/2024-05-02T00:00:00.000Z.\n", + "Monitor has 1 notification actions ['email'].\n", "\n", - "Analyzer is drift configuration for histogram metric 
with TrailingWindow baseline.\n", - "Analyzer \"kind-cyan-kangaroo-1253-analyzer\" targets 1 columns and ran on 1 columns in the diagnosed segment.\n", + "Analyzer is drift configuration for frequent_items metric with TrailingWindow baseline.\n", + "Analyzer \"frequent-items-drift-analyzer-x2hr9z\" targets 27 columns and ran on 26 columns in the diagnosed segment.\n", "\n", "\n", - "Diagnostic segment is \"purpose=car&verification_status=Source Verified\".\n", + "Diagnostic segment is \"overall\".\n", "Diagnostic interval contains 30 batches.\n", "\n", - "Diagnostic interval rollup contains 10473 rows for the diagnosed columns.\n", + "Diagnostic interval rollup contains 1674392 rows for the diagnosed columns.\n", "\n", "Analysis results summary:\n", - "Found non-failed results for 1 columns and 30 batches.\n", - "Found 30 anomalies in 1 columns, with up to 100.0% (30) batches having anomalies per column and 100.0% (30.0) on average.\n", + "Found non-failed results for 26 columns and 30 batches.\n", + "Found 34 anomalies in 3 columns, with up to 100.0% (30) batches having anomalies per column and 36.7% (11.0) on average.\n", "Columns with anomalies are:\n", - "| | 0 |\n", - "|---:|:----------------------------------|\n", - "| 0 | ('pred_credit_risk (output)', 30) |\n", + "| | 0 |\n", + "|---:|:----------------|\n", + "| 0 | ('issue_d', 30) |\n", + "| 1 | ('url', 3) |\n", + "| 2 | ('desc', 1) |\n", "\n", "No failures were detected.\n", "\n", - "No issues impacting diagnosis quality were detected\n", + "Conditions that may impact diagnosis quality include:\n", + "\t* analyzer_changed: Analyzer changed within the diagnostic interval - detectors ['stale_analysis', 'changing_discrete', 'low_drift_threshold', 'missing_baseline_batches', 'small_nonnull_batches']\n", + "\n", "Conditions that may contribute to noise include:\n", - "\t* Condition low_drift_threshold (drift threshold of 0.02 is lower than typical value of 0.7 for the hellinger algorithm)\n", - "\t* Condition 
small_nonnull_batches (less than 500 non-null records in 50% or more of the batches) for 1 columns: ['pred_credit_risk (output)']\n", + "\t* Condition changing_discrete (many values are unique across batches) for 3 columns: ['desc', 'issue_d', 'url']\n", + "\t* Condition small_nonnull_batches (less than 500 non-null records in 50% or more of the batches) for 4 columns: ['desc', 'issue_d', 'url', 'desc']\n", "\n", "Anomalies for columns with these conditions:\n", - "| | 0 |\n", - "|:--------------------------|----:|\n", - "| pred_credit_risk (output) | 30 |\n", - "Accounting for 30 anomalies out of 30\n" + "| | 0 |\n", + "|:--------|----:|\n", + "| issue_d | 30 |\n", + "| url | 3 |\n", + "| desc | 1 |\n", + "Accounting for 34 anomalies out of 34\n" ] } ], "source": [ "print(monitor_report.describe())" ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-26T20:12:07.217841Z", - "start_time": "2024-04-26T20:12:07.216120Z" - } - }, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/examples/example_notebooks/diagnoser.ipynb b/examples/example_notebooks/diagnoser.ipynb index 140c2fb..ff4d687 100644 --- a/examples/example_notebooks/diagnoser.ipynb +++ b/examples/example_notebooks/diagnoser.ipynb @@ -18,14 +18,14 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 1, "metadata": { "pycharm": { "name": "#%%\n" }, "ExecuteTime": { - "end_time": "2024-04-26T20:11:47.303557Z", - "start_time": "2024-04-26T20:11:47.300880Z" + "end_time": "2024-05-03T01:31:30.980544Z", + "start_time": "2024-05-03T01:31:30.977943Z" } }, "outputs": [], @@ -48,11 +48,11 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2024-04-26T20:11:49.832834Z", - "start_time": "2024-04-26T20:11:47.324580Z" + "end_time": "2024-05-03T01:31:33.173740Z", + "start_time": "2024-05-03T01:31:30.988187Z" } }, "outputs": [], @@ 
-60,10 +60,10 @@ "import getpass\n", "from whylabs_toolkit.monitor.diagnoser.helpers.utils import env_setup\n", "\n", - "org_id = 'org-0'\n", - "dataset_id = 'model-0'\n", + "org_id = input(\"Enter org ID\")\n", + "dataset_id = input(\"Enter model/dataset ID\")\n", "api_key = getpass.getpass()\n", - "api_endpoint = 'https://songbird.development.whylabsdev.com'\n", + "api_endpoint = 'https://api.whylabsapp.com'\n", "\n", "env_setup(\n", " org_id=org_id,\n", @@ -82,11 +82,11 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2024-04-26T20:11:49.847125Z", - "start_time": "2024-04-26T20:11:49.841234Z" + "end_time": "2024-05-03T01:31:33.433678Z", + "start_time": "2024-05-03T01:31:33.175869Z" } }, "outputs": [], @@ -106,19 +106,19 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2024-04-26T20:11:54.604061Z", - "start_time": "2024-04-26T20:11:49.849546Z" + "end_time": "2024-05-03T01:31:40.402540Z", + "start_time": "2024-05-03T01:31:33.436496Z" } }, "outputs": [ { "data": { - "text/plain": "MonitorDiagnosisReport(orgId='org-0', datasetId='model-0', analyzerId='kind-cyan-kangaroo-1253-analyzer', interval='2024-03-26T00:00:00.000Z/2024-04-25T00:00:00.000Z', expectedBatchCount=30, diagnosticData=DiagnosticDataSummary(diagnosticSegment=Segment(tags=[SegmentTag(key='purpose', value='car'), SegmentTag(key='verification_status', value='Source Verified')]), diagnosticProfile=ProfileSummary(minRowName='pred_credit_risk (output)', minRowCount=10473, maxRowName='pred_credit_risk (output)', maxRowCount=10473), diagnosticBatches=BatchesSummary(minBatchName='pred_credit_risk (output)', minBatchCount=30, maxBatchName='pred_credit_risk (output)', maxBatchCount=30), analysisResults=AnalysisResultsSummary(results=ResultRecord(diagnosedColumnCount=1, batchCount=30), failures=FailureRecord(totalFailuresCount=0, maxFailuresCount=0, 
meanFailuresCount=0, byColumnCount=[], byTypeCount=[]), anomalies=AnomalyRecord(totalAnomalyCount=30, maxAnomalyCount=30, meanAnomalyCount=30, batchCount=30, byColumnCount=[NamedCount(name='pred_credit_risk (output)', count=30)], byColumnBatchCount=[NamedCount(name='pred_credit_risk (output)', count=30)])), targetedColumnCount=1), qualityIssues=[], conditions=[ConditionRecord(columns=None, info={'threshold': 0.02, 'expected': 0.7, 'algo': 'hellinger'}, summary='drift threshold of 0.02 is lower than typical value of 0.7 for the hellinger algorithm', name='low_drift_threshold'), ConditionRecord(columns=['pred_credit_risk (output)'], info=None, summary='less than 500 non-null records in 50% or more of the batches', name='small_nonnull_batches')], monitor=Monitor(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1703279098033, author='user_1759fb08_1a01_4852_9ed4_91c6fceede45', description=None), id='kind-cyan-kangaroo-1253', displayName='kind-cyan-kangaroo-1253', tags=None, analyzerIds=['kind-cyan-kangaroo-1253-analyzer'], schedule=ImmediateSchedule(type='immediate'), disabled=None, severity=3, mode=DigestMode(type='DIGEST', filter=None, creationTimeOffset=None, datasetTimestampOffset='P7D', groupBy=None), actions=[]), analyzer=Analyzer(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1703279095485, author='user_1759fb08_1a01_4852_9ed4_91c6fceede45', description=None), id='kind-cyan-kangaroo-1253-analyzer', displayName=None, tags=['featureSelection:all', 'discreteness:non-discrete'], schedule=FixedCadenceSchedule(type='fixed', cadence=, exclusionRanges=None), disabled=None, disableTargetRollup=None, targetMatrix=ColumnMatrix(segments=[Segment(tags=[SegmentTag(key='purpose', value='car'), SegmentTag(key='verification_status', value='Source Verified')])], type=, include=[], exclude=[], profileId=None), dataReadinessDuration=None, batchCoolDownPeriod=None, backfillGracePeriodDuration=None, config=DriftConfig(schemaVersion=None, params=None, 
metric=, type=, algorithm='hellinger', threshold=0.02, minBatchSize=1, baseline=TrailingWindowBaseline(datasetId=None, inheritSegment=None, type=, size=7, offset=None, exclusionRanges=None))), analyzedColumnCount=1)" + "text/plain": "MonitorDiagnosisReport(orgId='org-0', datasetId='model-0', analyzerId='frequent-items-drift-analyzer-x2hr9z', interval='2024-04-02T00:00:00.000Z/2024-05-02T00:00:00.000Z', expectedBatchCount=0, diagnosticData=DiagnosticDataSummary(diagnosticSegment=Segment(tags=[]), diagnosticProfile=ProfileSummary(minRowName='desc', minRowCount=1674392, maxRowName='desc', maxRowCount=1674392), diagnosticBatches=BatchesSummary(minBatchName='desc', minBatchCount=30, maxBatchName='desc', maxBatchCount=30), analysisResults=AnalysisResultsSummary(results=ResultRecord(diagnosedColumnCount=26, batchCount=30), failures=FailureRecord(totalFailuresCount=0, maxFailuresCount=0, meanFailuresCount=0, byColumnCount=[], byTypeCount=[]), anomalies=AnomalyRecord(totalAnomalyCount=34, maxAnomalyCount=30, meanAnomalyCount=11, batchCount=30, byColumnCount=[NamedCount(name='issue_d', count=30), NamedCount(name='url', count=3), NamedCount(name='desc', count=1)], byColumnBatchCount=[NamedCount(name='addr_state', count=30), NamedCount(name='application_type', count=30), NamedCount(name='debt_settlement_flag', count=30), NamedCount(name='desc', count=2), NamedCount(name='disbursement_method', count=30), NamedCount(name='earliest_cr_line', count=30), NamedCount(name='emp_length', count=30), NamedCount(name='emp_title', count=30), NamedCount(name='grade', count=30), NamedCount(name='hardship_flag', count=30), NamedCount(name='home_ownership', count=30), NamedCount(name='initial_list_status', count=30), NamedCount(name='issue_d', count=30), NamedCount(name='last_credit_pull_d', count=30), NamedCount(name='last_pymnt_d', count=30), NamedCount(name='loan_status', count=30), NamedCount(name='next_pymnt_d', count=30), NamedCount(name='purpose', count=30), 
NamedCount(name='pymnt_plan', count=30), NamedCount(name='sub_grade', count=30), NamedCount(name='term', count=30), NamedCount(name='title', count=30), NamedCount(name='url', count=30), NamedCount(name='verification_status', count=30), NamedCount(name='verification_status_joint', count=30), NamedCount(name='zip_code', count=30)])), targetedColumnCount=30), qualityIssues=[], conditions=[ConditionRecord(columns=['desc', 'issue_d', 'url'], info=None, summary='many values are unique across batches', name='changing_discrete'), ConditionRecord(columns=['desc'], info=None, summary='less than 500 non-null records in 50% or more of the batches', name='small_nonnull_batches')], monitor=Monitor(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1705536890090, author='user_809f777d_3741_4991_8ced_42f09b883ac7', description=None), id='frequent-items-drift-monitor-x2hr9z', displayName=None, tags=None, analyzerIds=['frequent-items-drift-analyzer-x2hr9z'], schedule=ImmediateSchedule(type='immediate'), disabled=False, severity=3, mode=DigestMode(type='DIGEST', filter=None, creationTimeOffset=None, datasetTimestampOffset=None, groupBy=None), actions=[GlobalAction(type='global', target='email')]), analyzer=Analyzer(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1705536888574, author='user_809f777d_3741_4991_8ced_42f09b883ac7', description=None), id='frequent-items-drift-analyzer-x2hr9z', displayName=None, tags=None, schedule=FixedCadenceSchedule(type='fixed', cadence=, exclusionRanges=None), disabled=None, disableTargetRollup=None, targetMatrix=ColumnMatrix(segments=[Segment(tags=[])], type=, include=[], exclude=[], profileId=None), dataReadinessDuration=None, batchCoolDownPeriod=None, backfillGracePeriodDuration=None, config=DriftConfig(schemaVersion=None, params=None, metric=, type=, algorithm='hellinger', threshold=0.7, minBatchSize=1, baseline=TrailingWindowBaseline(datasetId=None, inheritSegment=None, type=, size=7, offset=None, 
exclusionRanges=None))), analyzedColumnCount=26)" }, - "execution_count": 34, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -130,11 +130,11 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 5, "metadata": { "ExecuteTime": { - "end_time": "2024-04-26T20:11:54.614329Z", - "start_time": "2024-04-26T20:11:54.607449Z" + "end_time": "2024-05-03T01:31:40.415056Z", + "start_time": "2024-05-03T01:31:40.405237Z" } }, "outputs": [ @@ -142,37 +142,42 @@ "name": "stdout", "output_type": "stream", "text": [ - "Diagnosis is for monitor \"kind-cyan-kangaroo-1253\" [kind-cyan-kangaroo-1253] in model-0 org-0, over interval 2024-03-26T00:00:00.000Z/2024-04-25T00:00:00.000Z.\n", + "Diagnosis is for monitor \"frequent-items-drift-monitor-x2hr9z\" [frequent-items-drift-monitor-x2hr9z] in model-0 org-0, over interval 2024-04-02T00:00:00.000Z/2024-05-02T00:00:00.000Z.\n", + "Monitor has 1 notification actions ['email'].\n", "\n", - "Analyzer is drift configuration for histogram metric with TrailingWindow baseline.\n", - "Analyzer \"kind-cyan-kangaroo-1253-analyzer\" targets 1 columns and ran on 1 columns in the diagnosed segment.\n", + "Analyzer is drift configuration for frequent_items metric with TrailingWindow baseline.\n", + "Analyzer \"frequent-items-drift-analyzer-x2hr9z\" targets 30 columns and ran on 26 columns in the diagnosed segment.\n", "\n", "\n", - "Diagnostic segment is \"purpose=car&verification_status=Source Verified\".\n", + "Diagnostic segment is \"overall\".\n", "Diagnostic interval contains 30 batches.\n", "\n", - "Diagnostic interval rollup contains 10473 rows for the diagnosed columns.\n", + "Diagnostic interval rollup contains 1674392 rows for the diagnosed columns.\n", "\n", "Analysis results summary:\n", - "Found non-failed results for 1 columns and 30 batches.\n", - "Found 30 anomalies in 1 columns, with up to 100.0% (30) batches having anomalies per column and 100.0% (30.0) on average.\n", + "Found 
non-failed results for 26 columns and 30 batches.\n", + "Found 34 anomalies in 3 columns, with up to 100.0% (30) batches having anomalies per column and 36.7% (11.0) on average.\n", "Columns with anomalies are:\n", - "| | 0 |\n", - "|---:|:----------------------------------|\n", - "| 0 | ('pred_credit_risk (output)', 30) |\n", + "| | 0 |\n", + "|---:|:----------------|\n", + "| 0 | ('issue_d', 30) |\n", + "| 1 | ('url', 3) |\n", + "| 2 | ('desc', 1) |\n", "\n", "No failures were detected.\n", "\n", "No issues impacting diagnosis quality were detected\n", "Conditions that may contribute to noise include:\n", - "\t* Condition low_drift_threshold (drift threshold of 0.02 is lower than typical value of 0.7 for the hellinger algorithm)\n", - "\t* Condition small_nonnull_batches (less than 500 non-null records in 50% or more of the batches) for 1 columns: ['pred_credit_risk (output)']\n", + "\t* Condition changing_discrete (many values are unique across batches) for 3 columns: ['desc', 'issue_d', 'url']\n", + "\t* Condition small_nonnull_batches (less than 500 non-null records in 50% or more of the batches) for 4 columns: ['desc', 'issue_d', 'url', 'desc']\n", "\n", "Anomalies for columns with these conditions:\n", - "| | 0 |\n", - "|:--------------------------|----:|\n", - "| pred_credit_risk (output) | 30 |\n", - "Accounting for 30 anomalies out of 30\n" + "| | 0 |\n", + "|:--------|----:|\n", + "| issue_d | 30 |\n", + "| url | 3 |\n", + "| desc | 1 |\n", + "Accounting for 34 anomalies out of 34\n" ] } ], @@ -194,11 +199,11 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2024-04-26T20:11:54.618780Z", - "start_time": "2024-04-26T20:11:54.615546Z" + "end_time": "2024-05-03T01:31:40.419753Z", + "start_time": "2024-05-03T01:31:40.415990Z" } }, "outputs": [], @@ -209,11 +214,11 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": 
"2024-04-26T20:11:54.624171Z", - "start_time": "2024-04-26T20:11:54.620454Z" + "end_time": "2024-05-03T01:31:40.424467Z", + "start_time": "2024-05-03T01:31:40.420491Z" } }, "outputs": [ @@ -224,37 +229,28 @@ "{\n", " \"orgId\": \"org-0\",\n", " \"datasetId\": \"model-0\",\n", - " \"analyzerId\": \"kind-cyan-kangaroo-1253-analyzer\",\n", - " \"interval\": \"2024-03-26T00:00:00.000Z/2024-04-25T00:00:00.000Z\",\n", - " \"expectedBatchCount\": 30,\n", + " \"analyzerId\": \"frequent-items-drift-analyzer-x2hr9z\",\n", + " \"interval\": \"2024-04-02T00:00:00.000Z/2024-05-02T00:00:00.000Z\",\n", + " \"expectedBatchCount\": 0,\n", " \"diagnosticData\": {\n", " \"diagnosticSegment\": {\n", - " \"tags\": [\n", - " {\n", - " \"key\": \"purpose\",\n", - " \"value\": \"car\"\n", - " },\n", - " {\n", - " \"key\": \"verification_status\",\n", - " \"value\": \"Source Verified\"\n", - " }\n", - " ]\n", + " \"tags\": []\n", " },\n", " \"diagnosticProfile\": {\n", - " \"minRowName\": \"pred_credit_risk (output)\",\n", - " \"minRowCount\": 10473,\n", - " \"maxRowName\": \"pred_credit_risk (output)\",\n", - " \"maxRowCount\": 10473\n", + " \"minRowName\": \"desc\",\n", + " \"minRowCount\": 1674392,\n", + " \"maxRowName\": \"desc\",\n", + " \"maxRowCount\": 1674392\n", " },\n", " \"diagnosticBatches\": {\n", - " \"minBatchName\": \"pred_credit_risk (output)\",\n", + " \"minBatchName\": \"desc\",\n", " \"minBatchCount\": 30,\n", - " \"maxBatchName\": \"pred_credit_risk (output)\",\n", + " \"maxBatchName\": \"desc\",\n", " \"maxBatchCount\": 30\n", " },\n", " \"analysisResults\": {\n", " \"results\": {\n", - " \"diagnosedColumnCount\": 1,\n", + " \"diagnosedColumnCount\": 26,\n", " \"batchCount\": 30\n", " },\n", " \"failures\": {\n", @@ -265,41 +261,149 @@ " \"byTypeCount\": []\n", " },\n", " \"anomalies\": {\n", - " \"totalAnomalyCount\": 30,\n", + " \"totalAnomalyCount\": 34,\n", " \"maxAnomalyCount\": 30,\n", - " \"meanAnomalyCount\": 30,\n", + " \"meanAnomalyCount\": 11,\n", " 
\"batchCount\": 30,\n", " \"byColumnCount\": [\n", " {\n", - " \"name\": \"pred_credit_risk (output)\",\n", + " \"name\": \"issue_d\",\n", " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"url\",\n", + " \"count\": 3\n", + " },\n", + " {\n", + " \"name\": \"desc\",\n", + " \"count\": 1\n", " }\n", " ],\n", " \"byColumnBatchCount\": [\n", " {\n", - " \"name\": \"pred_credit_risk (output)\",\n", + " \"name\": \"addr_state\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"application_type\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"debt_settlement_flag\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"desc\",\n", + " \"count\": 2\n", + " },\n", + " {\n", + " \"name\": \"disbursement_method\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"earliest_cr_line\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"emp_length\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"emp_title\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"grade\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"hardship_flag\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"home_ownership\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"initial_list_status\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"issue_d\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"last_credit_pull_d\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"last_pymnt_d\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"loan_status\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"next_pymnt_d\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"purpose\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"pymnt_plan\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"sub_grade\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": 
\"term\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"title\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"url\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"verification_status\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"verification_status_joint\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"zip_code\",\n", " \"count\": 30\n", " }\n", " ]\n", " }\n", " },\n", - " \"targetedColumnCount\": 1\n", + " \"targetedColumnCount\": 30\n", " },\n", " \"qualityIssues\": [],\n", " \"conditions\": [\n", " {\n", - " \"columns\": null,\n", - " \"info\": {\n", - " \"threshold\": 0.02,\n", - " \"expected\": 0.7,\n", - " \"algo\": \"hellinger\"\n", - " },\n", - " \"summary\": \"drift threshold of 0.02 is lower than typical value of 0.7 for the hellinger algorithm\",\n", - " \"name\": \"low_drift_threshold\"\n", + " \"columns\": [\n", + " \"desc\",\n", + " \"issue_d\",\n", + " \"url\"\n", + " ],\n", + " \"info\": null,\n", + " \"summary\": \"many values are unique across batches\",\n", + " \"name\": \"changing_discrete\"\n", " },\n", " {\n", " \"columns\": [\n", - " \"pred_credit_risk (output)\"\n", + " \"desc\"\n", " ],\n", " \"info\": null,\n", " \"summary\": \"less than 500 non-null records in 50% or more of the batches\",\n", @@ -310,44 +414,46 @@ " \"metadata\": {\n", " \"version\": 1,\n", " \"schemaVersion\": 1,\n", - " \"updatedTimestamp\": 1703279098033,\n", - " \"author\": \"user_1759fb08_1a01_4852_9ed4_91c6fceede45\",\n", + " \"updatedTimestamp\": 1705536890090,\n", + " \"author\": \"user_809f777d_3741_4991_8ced_42f09b883ac7\",\n", " \"description\": null\n", " },\n", - " \"id\": \"kind-cyan-kangaroo-1253\",\n", - " \"displayName\": \"kind-cyan-kangaroo-1253\",\n", + " \"id\": \"frequent-items-drift-monitor-x2hr9z\",\n", + " \"displayName\": null,\n", " \"tags\": null,\n", " \"analyzerIds\": [\n", - " \"kind-cyan-kangaroo-1253-analyzer\"\n", + " 
\"frequent-items-drift-analyzer-x2hr9z\"\n", " ],\n", " \"schedule\": {\n", " \"type\": \"immediate\"\n", " },\n", - " \"disabled\": null,\n", + " \"disabled\": false,\n", " \"severity\": 3,\n", " \"mode\": {\n", " \"type\": \"DIGEST\",\n", " \"filter\": null,\n", " \"creationTimeOffset\": null,\n", - " \"datasetTimestampOffset\": \"P7D\",\n", + " \"datasetTimestampOffset\": null,\n", " \"groupBy\": null\n", " },\n", - " \"actions\": []\n", + " \"actions\": [\n", + " {\n", + " \"type\": \"global\",\n", + " \"target\": \"email\"\n", + " }\n", + " ]\n", " },\n", " \"analyzer\": {\n", " \"metadata\": {\n", " \"version\": 1,\n", " \"schemaVersion\": 1,\n", - " \"updatedTimestamp\": 1703279095485,\n", - " \"author\": \"user_1759fb08_1a01_4852_9ed4_91c6fceede45\",\n", + " \"updatedTimestamp\": 1705536888574,\n", + " \"author\": \"user_809f777d_3741_4991_8ced_42f09b883ac7\",\n", " \"description\": null\n", " },\n", - " \"id\": \"kind-cyan-kangaroo-1253-analyzer\",\n", + " \"id\": \"frequent-items-drift-analyzer-x2hr9z\",\n", " \"displayName\": null,\n", - " \"tags\": [\n", - " \"featureSelection:all\",\n", - " \"discreteness:non-discrete\"\n", - " ],\n", + " \"tags\": null,\n", " \"schedule\": {\n", " \"type\": \"fixed\",\n", " \"cadence\": \"daily\",\n", @@ -358,24 +464,15 @@ " \"targetMatrix\": {\n", " \"segments\": [\n", " {\n", - " \"tags\": [\n", - " {\n", - " \"key\": \"purpose\",\n", - " \"value\": \"car\"\n", - " },\n", - " {\n", - " \"key\": \"verification_status\",\n", - " \"value\": \"Source Verified\"\n", - " }\n", - " ]\n", + " \"tags\": []\n", " }\n", " ],\n", " \"type\": \"column\",\n", " \"include\": [\n", - " \"group:continuous\"\n", + " \"group:discrete\"\n", " ],\n", " \"exclude\": [\n", - " \"group:input\"\n", + " \"group:output\"\n", " ],\n", " \"profileId\": null\n", " },\n", @@ -385,10 +482,10 @@ " \"config\": {\n", " \"schemaVersion\": null,\n", " \"params\": null,\n", - " \"metric\": \"histogram\",\n", + " \"metric\": \"frequent_items\",\n", " 
\"type\": \"drift\",\n", " \"algorithm\": \"hellinger\",\n", - " \"threshold\": 0.02,\n", + " \"threshold\": 0.7,\n", " \"minBatchSize\": 1,\n", " \"baseline\": {\n", " \"datasetId\": null,\n", @@ -400,7 +497,7 @@ " }\n", " }\n", " },\n", - " \"analyzedColumnCount\": 1\n", + " \"analyzedColumnCount\": 26\n", "}\n" ] } @@ -424,11 +521,11 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2024-04-26T20:11:54.628392Z", - "start_time": "2024-04-26T20:11:54.624956Z" + "end_time": "2024-05-03T01:31:40.435769Z", + "start_time": "2024-05-03T01:31:40.425350Z" } }, "outputs": [ @@ -436,7 +533,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "1. Make a manual change to the analyzer to address small_nonnull_batches: less than 500 non-null records in 50% or more of the batches for ['pred_credit_risk (output)']\n" + "1. Remove columns from the analyzer for ['desc', 'issue_d', 'url']\n", + "2. Make a manual change to the analyzer to address small_nonnull_batches: less than 500 non-null records in 50% or more of the batches for ['desc']\n" ] } ], @@ -465,18 +563,20 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 9, "metadata": { "ExecuteTime": { - "end_time": "2024-04-26T20:11:54.632913Z", - "start_time": "2024-04-26T20:11:54.629301Z" + "end_time": "2024-05-03T01:31:40.438830Z", + "start_time": "2024-05-03T01:31:40.436652Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", - "text": [] + "text": [ + "Remove columns from the analyzer for ['desc', 'issue_d', 'url']\n" + ] } ], "source": [ @@ -486,18 +586,21 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 10, "metadata": { "ExecuteTime": { - "end_time": "2024-04-26T20:11:54.635587Z", - "start_time": "2024-04-26T20:11:54.633557Z" + "end_time": "2024-05-03T01:31:41.510071Z", + "start_time": "2024-05-03T01:31:40.439570Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", - 
"text": [] + "text": [ + "Successfully made the following changes:\n", + "\t* Remove columns from the analyzer for ['desc', 'issue_d', 'url']\n" + ] } ], "source": [ @@ -522,20 +625,20 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 11, "metadata": { "ExecuteTime": { - "end_time": "2024-04-26T20:11:54.645911Z", - "start_time": "2024-04-26T20:11:54.636354Z" + "end_time": "2024-05-03T01:31:41.528882Z", + "start_time": "2024-05-03T01:31:41.513691Z" } }, "outputs": [ { "data": { - "text/plain": " monitor_id \\\n0 kind-cyan-kangaroo-1253 \n1 cooperative-maroon-parrot-8886 \n2 famous-salmon-cobra-8902 \n3 proud-seagreen-carabeef-65 \n4 None \n.. ... \n94 glamorous-orchid-turtle-6425 \n95 breakable-limegreen-shrew-7623 \n96 hilarious-powderblue-chamois-8115 \n97 horrible-magenta-sandpiper-8117 \n98 unsightly-bisque-lemur-1917 \n\n analyzer_id metric column_count \\\n0 kind-cyan-kangaroo-1253-analyzer histogram 1 \n1 discrete-drift-jensenshannon-analyzer frequent_items 1 \n2 famous-salmon-cobra-8902-analyzer min 1 \n3 proud-seagreen-carabeef-65-analyzer histogram 1 \n4 cooperative-maroon-parrot-8886-analyzer frequent_items 1 \n.. ... ... ... \n94 glamorous-orchid-turtle-6425-analyzer histogram 1 \n95 breakable-limegreen-shrew-7623-analyzer histogram 1 \n96 hilarious-powderblue-chamois-8115-analyzer histogram 1 \n97 horrible-magenta-sandpiper-8117-analyzer frequent_items 1 \n98 unsightly-bisque-lemur-1917-analyzer frequent_items 1 \n\n segment_count anomaly_count max_anomaly_per_column \\\n0 1 30 30 \n1 1 30 30 \n2 1 30 30 \n3 1 30 30 \n4 1 30 30 \n.. ... ... ... \n94 1 2 2 \n95 1 2 2 \n96 1 2 2 \n97 1 2 2 \n98 1 1 1 \n\n min_anomaly_per_column avg_anomaly_per_column action_count \\\n0 30 30 0 \n1 30 30 0 \n2 30 30 0 \n3 30 30 0 \n4 30 30 0 \n.. ... ... ... \n94 2 2 0 \n95 2 2 0 \n96 2 2 0 \n97 2 2 0 \n98 1 1 0 \n\n action_targets \n0 [] \n1 [] \n2 [] \n3 [] \n4 [] \n.. ... 
\n94 [] \n95 [] \n96 [] \n97 [] \n98 [] \n\n[99 rows x 11 columns]", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
monitor_idanalyzer_idmetriccolumn_countsegment_countanomaly_countmax_anomaly_per_columnmin_anomaly_per_columnavg_anomaly_per_columnaction_countaction_targets
0kind-cyan-kangaroo-1253kind-cyan-kangaroo-1253-analyzerhistogram11303030300[]
1cooperative-maroon-parrot-8886discrete-drift-jensenshannon-analyzerfrequent_items11303030300[]
2famous-salmon-cobra-8902famous-salmon-cobra-8902-analyzermin11303030300[]
3proud-seagreen-carabeef-65proud-seagreen-carabeef-65-analyzerhistogram11303030300[]
4Nonecooperative-maroon-parrot-8886-analyzerfrequent_items11303030300[]
....................................
94glamorous-orchid-turtle-6425glamorous-orchid-turtle-6425-analyzerhistogram1122220[]
95breakable-limegreen-shrew-7623breakable-limegreen-shrew-7623-analyzerhistogram1122220[]
96hilarious-powderblue-chamois-8115hilarious-powderblue-chamois-8115-analyzerhistogram1122220[]
97horrible-magenta-sandpiper-8117horrible-magenta-sandpiper-8117-analyzerfrequent_items1122220[]
98unsightly-bisque-lemur-1917unsightly-bisque-lemur-1917-analyzerfrequent_items1111110[]
\n

99 rows × 11 columns

\n
" + "text/plain": " monitor_id \\\n0 frequent-items-drift-monitor-x2hr9z \n1 discrete-distribution-22ef37c9-monitor \n2 smoggy-chartreuse-owl-3387 \n3 frequent-items-drift-monitor-bx6m80 \n4 frequent-items-drift-monitor-mat0jo \n5 frequent-items-drift-monitor-01rbfl \n6 frequent-items-drift-monitor-0foigt \n7 frequent-items-drift-monitor-3c0hc2 \n8 frequent-items-drift-monitor-9gmtix \n9 elated-palegreen-jaguar-6432 \n10 inferred-data-type-fec5a735-monitor \n11 unique-ratio-b7b84aee-monitor \n12 missing-values-ratio-35881327-monitor \n13 numerical-drift-monitor-6oxi83 \n14 numerical-drift-monitor-8yugth \n15 continuous-distribution-956a280c-monitor \n16 dull-floralwhite-raven-5521 \n\n analyzer_id metric column_count \\\n0 frequent-items-drift-analyzer-x2hr9z frequent_items 3 \n1 discrete-distribution-22ef37c9 frequent_items 3 \n2 smoggy-chartreuse-owl-3387-analyzer frequent_items 3 \n3 frequent-items-drift-analyzer-bx6m80 frequent_items 3 \n4 frequent-items-drift-analyzer-mat0jo frequent_items 3 \n5 frequent-items-drift-analyzer-01rbfl frequent_items 3 \n6 frequent-items-drift-analyzer-0foigt frequent_items 3 \n7 frequent-items-drift-analyzer-3c0hc2 frequent_items 3 \n8 frequent-items-drift-analyzer-9gmtix frequent_items 3 \n9 elated-palegreen-jaguar-6432-analyzer histogram 9 \n10 inferred-data-type-fec5a735 inferred_data_type 1 \n11 unique-ratio-b7b84aee unique_est_ratio 69 \n12 missing-values-ratio-35881327 count_null_ratio 21 \n13 numerical-drift-analyzer-6oxi83 histogram 1 \n14 numerical-drift-analyzer-8yugth histogram 1 \n15 continuous-distribution-956a280c histogram 1 \n16 dull-floralwhite-raven-5521-analyzer count 2 \n\n segment_count anomaly_count max_anomaly_per_column \\\n0 1 34 30 \n1 1 34 30 \n2 1 34 30 \n3 1 34 30 \n4 1 34 30 \n5 1 34 30 \n6 1 34 30 \n7 1 34 30 \n8 1 34 30 \n9 1 75 19 \n10 1 14 14 \n11 1 104 4 \n12 1 27 3 \n13 1 2 2 \n14 1 2 2 \n15 1 2 2 \n16 1 3 2 \n\n min_anomaly_per_column avg_anomaly_per_column action_count \\\n0 1 11 1 \n1 1 11 0 
\n2 1 11 0 \n3 1 11 0 \n4 1 11 2 \n5 1 11 1 \n6 1 11 0 \n7 1 11 1 \n8 1 11 1 \n9 2 8 0 \n10 14 14 2 \n11 1 1 0 \n12 1 1 0 \n13 2 2 0 \n14 2 2 0 \n15 2 2 0 \n16 1 1 0 \n\n action_targets \n0 [email] \n1 [] \n2 [] \n3 [] \n4 [email, slack] \n5 [email] \n6 [] \n7 [email] \n8 [email] \n9 [] \n10 [email, slack] \n11 [] \n12 [] \n13 [] \n14 [] \n15 [] \n16 [] ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
monitor_idanalyzer_idmetriccolumn_countsegment_countanomaly_countmax_anomaly_per_columnmin_anomaly_per_columnavg_anomaly_per_columnaction_countaction_targets
0frequent-items-drift-monitor-x2hr9zfrequent-items-drift-analyzer-x2hr9zfrequent_items3134301111[email]
1discrete-distribution-22ef37c9-monitordiscrete-distribution-22ef37c9frequent_items3134301110[]
2smoggy-chartreuse-owl-3387smoggy-chartreuse-owl-3387-analyzerfrequent_items3134301110[]
3frequent-items-drift-monitor-bx6m80frequent-items-drift-analyzer-bx6m80frequent_items3134301110[]
4frequent-items-drift-monitor-mat0jofrequent-items-drift-analyzer-mat0jofrequent_items3134301112[email, slack]
5frequent-items-drift-monitor-01rbflfrequent-items-drift-analyzer-01rbflfrequent_items3134301111[email]
6frequent-items-drift-monitor-0foigtfrequent-items-drift-analyzer-0foigtfrequent_items3134301110[]
7frequent-items-drift-monitor-3c0hc2frequent-items-drift-analyzer-3c0hc2frequent_items3134301111[email]
8frequent-items-drift-monitor-9gmtixfrequent-items-drift-analyzer-9gmtixfrequent_items3134301111[email]
9elated-palegreen-jaguar-6432elated-palegreen-jaguar-6432-analyzerhistogram917519280[]
10inferred-data-type-fec5a735-monitorinferred-data-type-fec5a735inferred_data_type11141414142[email, slack]
11unique-ratio-b7b84aee-monitorunique-ratio-b7b84aeeunique_est_ratio6911044110[]
12missing-values-ratio-35881327-monitormissing-values-ratio-35881327count_null_ratio211273110[]
13numerical-drift-monitor-6oxi83numerical-drift-analyzer-6oxi83histogram1122220[]
14numerical-drift-monitor-8yugthnumerical-drift-analyzer-8yugthhistogram1122220[]
15continuous-distribution-956a280c-monitorcontinuous-distribution-956a280chistogram1122220[]
16dull-floralwhite-raven-5521dull-floralwhite-raven-5521-analyzercount2132110[]
\n
" }, - "execution_count": 41, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -558,11 +661,11 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 20, "metadata": { "ExecuteTime": { - "end_time": "2024-04-26T20:11:57.298847Z", - "start_time": "2024-04-26T20:11:54.646676Z" + "end_time": "2024-05-03T01:59:10.186840Z", + "start_time": "2024-05-03T01:59:03.667269Z" } }, "outputs": [ @@ -570,41 +673,50 @@ "name": "stdout", "output_type": "stream", "text": [ - "Diagnosis is for monitor \"discrete-drift-jensenshannon\" [cooperative-maroon-parrot-8886] in model-0 org-0, over interval 2024-03-26T00:00:00.000Z/2024-04-25T00:00:00.000Z.\n", + "discrete-distribution-22ef37c9-monitor\n", + "Diagnosis is for monitor \"discrete-distribution-22ef37c9-monitor\" [discrete-distribution-22ef37c9-monitor] in model-0 org-0, over interval 2024-04-02T00:00:00.000Z/2024-05-02T00:00:00.000Z.\n", "\n", "Analyzer is drift configuration for frequent_items metric with TrailingWindow baseline.\n", - "Analyzer \"discrete-drift-jensenshannon-analyzer\" targets 30 columns and ran on 26 columns in the diagnosed segment.\n", + "Analyzer \"discrete-distribution-22ef37c9\" targets 30 columns and ran on 26 columns in the diagnosed segment.\n", "\n", "\n", "Diagnostic segment is \"overall\".\n", "Diagnostic interval contains 30 batches.\n", "\n", - "Diagnostic interval rollup contains 1945487 rows for the diagnosed columns.\n", + "Diagnostic interval rollup contains 1674392 rows for the diagnosed columns.\n", "\n", "Analysis results summary:\n", "Found non-failed results for 26 columns and 30 batches.\n", - "Found 30 anomalies in 1 columns, with up to 100.0% (30) batches having anomalies per column and 100.0% (30.0) on average.\n", + "Found 34 anomalies in 3 columns, with up to 100.0% (30) batches having anomalies per column and 36.7% (11.0) on average.\n", "Columns with anomalies are:\n", "| | 0 |\n", "|---:|:----------------|\n", "| 0 | ('issue_d', 
30) |\n", + "| 1 | ('url', 3) |\n", + "| 2 | ('desc', 1) |\n", "\n", "No failures were detected.\n", "\n", - "No issues impacting diagnosis quality were detected\n", + "Conditions that may impact diagnosis quality include:\n", + "\t* analyzer_changed: Analyzer changed within the diagnostic interval - detectors ['stale_analysis', 'changing_discrete', 'low_drift_threshold', 'missing_baseline_batches', 'small_nonnull_batches']\n", + "\n", "Conditions that may contribute to noise include:\n", - "\t* Condition changing_discrete (many values are unique across batches) for 1 columns: ['issue_d']\n", + "\t* Condition changing_discrete (many values are unique across batches) for 3 columns: ['desc', 'issue_d', 'url']\n", + "\t* Condition small_nonnull_batches (less than 500 non-null records in 50% or more of the batches) for 4 columns: ['desc', 'issue_d', 'url', 'desc']\n", "\n", "Anomalies for columns with these conditions:\n", "| | 0 |\n", "|:--------|----:|\n", "| issue_d | 30 |\n", - "Accounting for 30 anomalies out of 30\n" + "| url | 3 |\n", + "| desc | 1 |\n", + "Accounting for 34 anomalies out of 34\n" ] } ], "source": [ "diagnoser.monitor_id_to_diagnose = noisy_monitors_df.iloc[1]['monitor_id']\n", + "print(diagnoser.monitor_id_to_diagnose)\n", "monitor_report = diagnoser.diagnose()\n", "print(monitor_report.describe())" ] @@ -620,21 +732,21 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 21, "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-04-26T20:11:57.307971Z", - "start_time": "2024-04-26T20:11:57.300890Z" + "end_time": "2024-05-03T01:59:21.543539Z", + "start_time": "2024-05-03T01:59:21.533198Z" } }, "outputs": [ { "data": { - "text/plain": " monitor_id \\\n0 frequent-items-drift-monitor-u31vmb \n1 frequent-items-drift-monitor-uu0ax8 \n2 frequent-items-drift-monitor-48ukw1 \n3 frequent-items-drift-monitor-jepz7t \n4 frequent-items-drift-monitor-pxexvn \n5 nice-burlywood-tarsier-4771 \n6 energetic-black-cobra-7838 
\n7 elated-gray-baboon-4620 \n8 old-crimson-starling-2516 \n9 uninterested-blueviolet-reindeer-9950 \n10 numerical-drift-monitor-zy4q8v \n11 unique-estimate-ratio-monitor-ccf7cl \n12 numerical-drift-monitor-jpodsg \n13 numerical-drift-monitor-60dfcc \n\n analyzer_id metric \\\n0 frequent-items-drift-analyzer-u31vmb frequent_items \n1 frequent-items-drift-analyzer-uu0ax8 frequent_items \n2 frequent-items-drift-analyzer-48ukw1 frequent_items \n3 frequent-items-drift-analyzer-jepz7t frequent_items \n4 frequent-items-drift-analyzer-pxexvn frequent_items \n5 nice-burlywood-tarsier-4771-analyzer unique_est \n6 energetic-black-cobra-7838-analyzer unique_est \n7 elated-gray-baboon-4620-analyzer count_null_ratio \n8 old-crimson-starling-2516-analyzer frequent_items \n9 uninterested-blueviolet-reindeer-9950-analyzer count \n10 numerical-drift-analyzer-zy4q8v histogram \n11 unique-estimate-ratio-analyzer-ccf7cl unique_est_ratio \n12 numerical-drift-analyzer-jpodsg histogram \n13 numerical-drift-analyzer-60dfcc histogram \n\n column_count segment_count anomaly_count max_anomaly_per_column \\\n0 2 1 31 30 \n1 2 1 31 30 \n2 2 1 31 30 \n3 2 1 31 30 \n4 2 1 31 30 \n5 7 1 106 30 \n6 7 1 80 30 \n7 13 1 64 30 \n8 2 1 24 23 \n9 77 1 152 9 \n10 3 1 18 8 \n11 104 1 394 7 \n12 1 1 2 2 \n13 1 1 2 2 \n\n min_anomaly_per_column avg_anomaly_per_column action_count \\\n0 1 15 2 \n1 1 15 3 \n2 1 15 2 \n3 1 15 2 \n4 1 15 2 \n5 2 15 2 \n6 2 11 1 \n7 1 4 1 \n8 1 12 1 \n9 1 1 1 \n10 2 6 1 \n11 1 3 2 \n12 2 2 2 \n13 2 2 2 \n\n action_targets \n0 [email, slack] \n1 [email, slack, email-victor-at-whylabs] \n2 [email, slack] \n3 [email, slack] \n4 [email, slack] \n5 [slack, email] \n6 [email] \n7 [email] \n8 [email] \n9 [christine-test-email] \n10 [email] \n11 [email, slack] \n12 [email, slack] \n13 [email, slack] ", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
monitor_idanalyzer_idmetriccolumn_countsegment_countanomaly_countmax_anomaly_per_columnmin_anomaly_per_columnavg_anomaly_per_columnaction_countaction_targets
0frequent-items-drift-monitor-u31vmbfrequent-items-drift-analyzer-u31vmbfrequent_items2131301152[email, slack]
1frequent-items-drift-monitor-uu0ax8frequent-items-drift-analyzer-uu0ax8frequent_items2131301153[email, slack, email-victor-at-whylabs]
2frequent-items-drift-monitor-48ukw1frequent-items-drift-analyzer-48ukw1frequent_items2131301152[email, slack]
3frequent-items-drift-monitor-jepz7tfrequent-items-drift-analyzer-jepz7tfrequent_items2131301152[email, slack]
4frequent-items-drift-monitor-pxexvnfrequent-items-drift-analyzer-pxexvnfrequent_items2131301152[email, slack]
5nice-burlywood-tarsier-4771nice-burlywood-tarsier-4771-analyzerunique_est71106302152[slack, email]
6energetic-black-cobra-7838energetic-black-cobra-7838-analyzerunique_est7180302111[email]
7elated-gray-baboon-4620elated-gray-baboon-4620-analyzercount_null_ratio1316430141[email]
8old-crimson-starling-2516old-crimson-starling-2516-analyzerfrequent_items2124231121[email]
9uninterested-blueviolet-reindeer-9950uninterested-blueviolet-reindeer-9950-analyzercount7711529111[christine-test-email]
10numerical-drift-monitor-zy4q8vnumerical-drift-analyzer-zy4q8vhistogram31188261[email]
11unique-estimate-ratio-monitor-ccf7clunique-estimate-ratio-analyzer-ccf7clunique_est_ratio10413947132[email, slack]
12numerical-drift-monitor-jpodsgnumerical-drift-analyzer-jpodsghistogram1122222[email, slack]
13numerical-drift-monitor-60dfccnumerical-drift-analyzer-60dfcchistogram1122222[email, slack]
\n
" + "text/plain": " monitor_id analyzer_id \\\n0 frequent-items-drift-monitor-x2hr9z frequent-items-drift-analyzer-x2hr9z \n1 frequent-items-drift-monitor-mat0jo frequent-items-drift-analyzer-mat0jo \n2 frequent-items-drift-monitor-01rbfl frequent-items-drift-analyzer-01rbfl \n3 frequent-items-drift-monitor-3c0hc2 frequent-items-drift-analyzer-3c0hc2 \n4 frequent-items-drift-monitor-9gmtix frequent-items-drift-analyzer-9gmtix \n5 inferred-data-type-fec5a735-monitor inferred-data-type-fec5a735 \n\n metric column_count segment_count anomaly_count \\\n0 frequent_items 3 1 34 \n1 frequent_items 3 1 34 \n2 frequent_items 3 1 34 \n3 frequent_items 3 1 34 \n4 frequent_items 3 1 34 \n5 inferred_data_type 1 1 14 \n\n max_anomaly_per_column min_anomaly_per_column avg_anomaly_per_column \\\n0 30 1 11 \n1 30 1 11 \n2 30 1 11 \n3 30 1 11 \n4 30 1 11 \n5 14 14 14 \n\n action_count action_targets \n0 1 [email] \n1 2 [email, slack] \n2 1 [email] \n3 1 [email] \n4 1 [email] \n5 2 [email, slack] ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
monitor_idanalyzer_idmetriccolumn_countsegment_countanomaly_countmax_anomaly_per_columnmin_anomaly_per_columnavg_anomaly_per_columnaction_countaction_targets
0frequent-items-drift-monitor-x2hr9zfrequent-items-drift-analyzer-x2hr9zfrequent_items3134301111[email]
1frequent-items-drift-monitor-mat0jofrequent-items-drift-analyzer-mat0jofrequent_items3134301112[email, slack]
2frequent-items-drift-monitor-01rbflfrequent-items-drift-analyzer-01rbflfrequent_items3134301111[email]
3frequent-items-drift-monitor-3c0hc2frequent-items-drift-analyzer-3c0hc2frequent_items3134301111[email]
4frequent-items-drift-monitor-9gmtixfrequent-items-drift-analyzer-9gmtixfrequent_items3134301111[email]
5inferred-data-type-fec5a735-monitorinferred-data-type-fec5a735inferred_data_type11141414142[email, slack]
\n
" }, - "execution_count": 43, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -642,19 +754,6 @@ "source": [ "pd.DataFrame.from_records([m.dict() for m in diagnoser.noisy_monitors_with_actions])\n" ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-26T20:11:57.310423Z", - "start_time": "2024-04-26T20:11:57.308851Z" - } - }, - "outputs": [], - "source": [] } ], "metadata": { From 19f3cdec48b73589840b1c1a96705129254c5b11 Mon Sep 17 00:00:00 2001 From: Christine Draper Date: Thu, 2 May 2024 22:05:23 -0400 Subject: [PATCH 14/14] Bump version for release to prod --- .bumpversion.cfg | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 033fb2e..95bb0a5 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.0-dev0 +current_version = 0.1.0 tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? serialize = diff --git a/pyproject.toml b/pyproject.toml index 9743f24..aa889ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "whylabs-toolkit" -version = "0.1.0-dev0" +version = "0.1.0" description = "Whylabs Toolkit package." authors = ["Murilo Mendonca ", "Anthony Naddeo ", "Christine Draper "]