diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 8535547..95bb0a5 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.0.18 +current_version = 0.1.0 tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? serialize = diff --git a/Makefile b/Makefile index 5370fbb..c20d5f3 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ format-fix: poetry run autoflake --in-place --remove-unused-variables $(PY_SOURCE) setup: - poetry install + poetry install -E diagnoser test: poetry run pytest diff --git a/README.md b/README.md index 5484946..8a4bd60 100644 --- a/README.md +++ b/README.md @@ -13,10 +13,11 @@ pip install whylabs_toolkit The available packages that we have enable different use-cases for the `whylabs_toolkit`. To get started, navigate to one of the following sections and find useful tutorials there. -| Package | Usage | -|---------------------|----------------------| -| [Monitor Manager](https://github.com/whylabs/whylabs-toolkit/blob/mainline/whylabs_toolkit/monitor/manager/README.md) | Author and modify existing WhyLabs monitor with Python | -| [WhyLabs Helpers](https://github.com/whylabs/whylabs-toolkit/blob/mainline/whylabs_toolkit/helpers/README.md) | Interact with and modify your Datasets and ML Models specs in WhyLabs. | +| Package | Usage | +|---------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------| +| [Monitor Manager](https://github.com/whylabs/whylabs-toolkit/blob/mainline/whylabs_toolkit/monitor/manager/README.md) | Author and modify existing WhyLabs monitor with Python. | +| [Monitor Diagnoser](https://github.com/whylabs/whylabs-toolkit/blob/mainline/whylabs_toolkit/monitor/diagnoser/README.md) | Diagnose problems with monitors. | +| [WhyLabs Helpers](https://github.com/whylabs/whylabs-toolkit/blob/mainline/whylabs_toolkit/helpers/README.md) | Interact with and modify your Datasets and ML Models specs in WhyLabs. | ## Development diff --git a/examples/example_notebooks/customized_diagnoser.ipynb b/examples/example_notebooks/customized_diagnoser.ipynb new file mode 100644 index 0000000..5f183a7 --- /dev/null +++ b/examples/example_notebooks/customized_diagnoser.ipynb @@ -0,0 +1,546 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# Detecting noisy monitors\n", + "\n", + "This notebook shows how to use the WhyLabs Monitor Diagnoser to customize the diagnosis of a noisy monitor. It interacts with the diagnoser to get information on noisy and failing monitors, and to make selections about which monitor, segment and columns to diagnose.\n", + "\n", + "## Install requirements" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "ExecuteTime": { + "end_time": "2024-05-03T02:03:15.122705Z", + "start_time": "2024-05-03T02:03:15.119284Z" + } + }, + "outputs": [], + "source": [ + "#%pip install whylabs-toolkit[diagnoser]\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Setup whylabs API connection\n", + "\n", + "First, set up the information to connect to WhyLabs. 
Update the org_id, dataset_id and api_key in the following before running it.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-03T02:03:31.740325Z", + "start_time": "2024-05-03T02:03:15.137102Z" + } + }, + "outputs": [], + "source": [ + "import getpass\n", + "from whylabs_toolkit.monitor.diagnoser.helpers.utils import env_setup\n", + "\n", + "org_id = input(\"Enter org ID\")\n", + "dataset_id = input(\"Enter model/dataset ID\")\n", + "api_key = getpass.getpass(\"Enter API key\")\n", + "api_endpoint = 'https://api.whylabsapp.com'\n", + "\n", + "env_setup(\n", + " org_id=org_id,\n", + " dataset_id=dataset_id,\n", + " api_key=api_key,\n", + " whylabs_endpoint=api_endpoint\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "Then initialize the Monitor Diagnoser with the org_id and dataset_id." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-03T02:03:32.055442Z", + "start_time": "2024-05-03T02:03:31.743114Z" + } + }, + "outputs": [], + "source": [ + "from whylabs_toolkit.monitor.diagnoser.monitor_diagnoser import MonitorDiagnoser\n", + "diagnoser = MonitorDiagnoser(org_id, dataset_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Running a customized diagnosis\n", + "## Get the recommended diagnostic interval\n", + "\n", + "Get the dataset start/end time, granularity, and a recommended diagnostic interval for the dataset. The diagnoser will use this interval unless you override it by setting the `diagnostic_interval` property." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2024-05-03T02:03:32.690456Z", + "start_time": "2024-05-03T02:03:32.056136Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "(TimeRange(start=datetime.datetime(2021, 5, 20, 0, 0, tzinfo=datetime.timezone.utc), end=datetime.datetime(2024, 5, 2, 21, 0, tzinfo=datetime.timezone.utc)),\n ,\n '2024-04-02T00:00:00.000Z/2024-05-02T00:00:00.000Z')" + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lineage, granularity, interval = diagnoser.choose_dataset_batches()\n", + "lineage, granularity, interval" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get information on noisy and failing monitors\n", + "\n", + "Get information on how many anomalies are detected by each monitor in the dataset. The results are ordered so that the monitors with the most anomalies per column are first (i.e. monitors which are firing on the many batches for certain columns). Beyond that, results with a higher average number of anomalies per column are considered noisier." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-03T02:03:33.421897Z", + "start_time": "2024-05-03T02:03:32.692755Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": " monitor_id \\\n0 frequent-items-drift-monitor-x2hr9z \n1 discrete-distribution-22ef37c9-monitor \n2 smoggy-chartreuse-owl-3387 \n3 frequent-items-drift-monitor-bx6m80 \n4 frequent-items-drift-monitor-mat0jo \n5 frequent-items-drift-monitor-01rbfl \n6 frequent-items-drift-monitor-0foigt \n7 frequent-items-drift-monitor-3c0hc2 \n8 frequent-items-drift-monitor-9gmtix \n9 elated-palegreen-jaguar-6432 \n10 inferred-data-type-fec5a735-monitor \n11 unique-ratio-b7b84aee-monitor \n12 missing-values-ratio-35881327-monitor \n13 numerical-drift-monitor-6oxi83 \n14 numerical-drift-monitor-8yugth \n15 continuous-distribution-956a280c-monitor \n16 dull-floralwhite-raven-5521 \n\n analyzer_id metric column_count \\\n0 frequent-items-drift-analyzer-x2hr9z frequent_items 3 \n1 discrete-distribution-22ef37c9 frequent_items 3 \n2 smoggy-chartreuse-owl-3387-analyzer frequent_items 3 \n3 frequent-items-drift-analyzer-bx6m80 frequent_items 3 \n4 frequent-items-drift-analyzer-mat0jo frequent_items 3 \n5 frequent-items-drift-analyzer-01rbfl frequent_items 3 \n6 frequent-items-drift-analyzer-0foigt frequent_items 3 \n7 frequent-items-drift-analyzer-3c0hc2 frequent_items 3 \n8 frequent-items-drift-analyzer-9gmtix frequent_items 3 \n9 elated-palegreen-jaguar-6432-analyzer histogram 9 \n10 inferred-data-type-fec5a735 inferred_data_type 1 \n11 unique-ratio-b7b84aee unique_est_ratio 69 \n12 missing-values-ratio-35881327 count_null_ratio 21 \n13 numerical-drift-analyzer-6oxi83 histogram 1 \n14 numerical-drift-analyzer-8yugth histogram 1 \n15 continuous-distribution-956a280c histogram 1 \n16 dull-floralwhite-raven-5521-analyzer count 2 \n\n segment_count anomaly_count max_anomaly_per_column \\\n0 1 34 30 \n1 1 34 30 \n2 1 34 30 \n3 1 34 30 \n4 1 34 30 \n5 1 34 30 \n6 1 34 30 \n7 1 34 30 \n8 1 34 30 \n9 1 75 19 \n10 1 14 14 \n11 1 104 4 \n12 1 27 3 \n13 1 2 2 \n14 1 2 2 \n15 1 2 2 \n16 1 3 2 \n\n min_anomaly_per_column avg_anomaly_per_column action_count \\\n0 1 11 1 \n1 1 11 0 \n2 1 11 0 \n3 1 11 0 \n4 1 11 2 \n5 1 11 1 \n6 1 11 0 \n7 1 11 1 \n8 1 11 1 \n9 2 8 0 \n10 14 14 2 \n11 1 1 0 \n12 1 1 0 \n13 2 2 0 \n14 2 2 0 \n15 2 2 0 \n16 1 1 0 \n\n action_targets \n0 [email] \n1 [] \n2 [] \n3 [] \n4 [email, slack] \n5 [email] \n6 [] \n7 [email] \n8 [email] \n9 [] \n10 [email, slack] \n11 [] \n12 [] \n13 [] \n14 [] \n15 [] \n16 [] ", + "text/html": "
[HTML rendering of the noisy monitors dataframe omitted; identical to the text/plain output above]
" + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "noisy_monitors = diagnoser.detect_noisy_monitors()\n", + "noisy_monitors_df = pd.DataFrame.from_records([m.dict() for m in noisy_monitors])\n", + "noisy_monitors_df" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "Once you have run `detect_noisy_monitors`, you can retrieve the result at any time via the `noisy_monitors` property. You can also retrieve\n", + " information about monitors with analysis failures using `failed_monitors`. " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-03T02:03:33.427628Z", + "start_time": "2024-05-03T02:03:33.422561Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": " monitor_id analyzer_id \\\n0 inferred-data-type-fec5a735-monitor inferred-data-type-fec5a735 \n1 missing-values-ratio-35881327-monitor missing-values-ratio-35881327 \n2 unique-ratio-b7b84aee-monitor unique-ratio-b7b84aee \n\n metric failed_count max_failed_per_column \\\n0 inferred_data_type 3 3 \n1 count_null_ratio 1 1 \n2 unique_est_ratio 1 1 \n\n min_failed_per_column avg_failed_per_column action_count action_targets \n0 3 3 2 [email, slack] \n1 1 1 0 [] \n2 1 1 0 [] ", + "text/html": "
[HTML rendering of the failed monitors dataframe omitted; identical to the text/plain output above]
" + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "failed_monitors_df = pd.DataFrame.from_records([n.dict() for n in diagnoser.failed_monitors])\n", + "failed_monitors_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From this information, the diagnoser chooses the most noisy monitor that has notification actions to diagnose. This choice can be overridden by setting the `monitor_id_to_diagnose` property of the diagnoser to the desired monitor id. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2024-05-03T02:03:33.431118Z", + "start_time": "2024-05-03T02:03:33.428470Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "'frequent-items-drift-monitor-x2hr9z'" + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "diagnoser.monitor_id_to_diagnose" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "We can get the monitor object from the diagnoser, to see its display name and any other useful information." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-03T02:03:33.434461Z", + "start_time": "2024-05-03T02:03:33.432056Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "Monitor(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1705536890090, author='user_809f777d_3741_4991_8ced_42f09b883ac7', description=None), id='frequent-items-drift-monitor-x2hr9z', displayName=None, tags=None, analyzerIds=['frequent-items-drift-analyzer-x2hr9z'], schedule=ImmediateSchedule(type='immediate'), disabled=False, severity=3, mode=DigestMode(type='DIGEST', filter=None, creationTimeOffset=None, datasetTimestampOffset=None, groupBy=None), actions=[GlobalAction(type='global', target='email')])" + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "diagnoser.monitor_to_diagnose" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "We can similarly see the configuration of the analyzer that is being diagnosed.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-03T02:03:33.623760Z", + "start_time": "2024-05-03T02:03:33.435077Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "Analyzer(metadata=Metadata(version=2, schemaVersion=1, updatedTimestamp=1714699900837, author='user_c9292ec40407f7b580f0a2c90745ebfba2b9e6ea81c848ef944d31e48a45f98', description=None), id='frequent-items-drift-analyzer-x2hr9z', displayName=None, tags=None, schedule=FixedCadenceSchedule(type='fixed', cadence=, exclusionRanges=None), disabled=None, disableTargetRollup=None, targetMatrix=ColumnMatrix(segments=[Segment(tags=[])], type=, include=[], exclude=[, 'desc', 'issue_d', 'url'], profileId=None), dataReadinessDuration=None, batchCoolDownPeriod=None, backfillGracePeriodDuration=None, config=DriftConfig(schemaVersion=None, params=None, metric=, type=, algorithm='hellinger', threshold=0.7, minBatchSize=1, baseline=TrailingWindowBaseline(datasetId=None, inheritSegment=None, type=, size=7, offset=None, exclusionRanges=None)))" + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "diagnoser.analyzer_to_diagnose" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "## Get information on noisy and failing segments in the analyzer\n", + "\n", + "Now we use the diagnoser to get information about noisy and failing segments in the analyzer, so we can choose a segment to diagnose. The results are sorted so the segment with the most anomalies for the selected monitor is first." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-03T02:03:33.839587Z", + "start_time": "2024-05-03T02:03:33.624489Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": " segment total_anomalies batch_count\n0 overall 34 30", + "text/html": "
[HTML rendering of the noisy segments dataframe omitted; identical to the text/plain output above]
" + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from whylabs_toolkit.monitor.diagnoser.helpers.utils import segment_as_readable_text\n", + "\n", + "noisy_segments = diagnoser.detect_noisy_segments()\n", + "noisy_segments_df = pd.DataFrame.from_records([n.dict() for n in noisy_segments])\n", + "noisy_segments_df['segment'] = [segment_as_readable_text(n.segment.tags) for n in noisy_segments]\n", + "noisy_segments_df" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "The diagnoser chooses the noisiest segment to diagnose. This can be changed by setting the `diagnostic_segment` property." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2024-05-03T02:03:33.846498Z", + "start_time": "2024-05-03T02:03:33.842929Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "'overall'" + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "segment_as_readable_text(diagnoser.diagnostic_segment.tags)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get information on noisy columns \n", + "\n", + "The next step is to get information on the noisy columns within the segment, so we can choose a subset of columns to diagnose. " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2024-05-03T02:03:33.988624Z", + "start_time": "2024-05-03T02:03:33.847364Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": " column total_anomalies\n0 issue_d 30\n1 url 3\n2 desc 1\n3 disbursement_method 0\n4 earliest_cr_line 0\n5 emp_length 0\n6 emp_title 0\n7 grade 0\n8 hardship_flag 0\n9 home_ownership 0\n10 initial_list_status 0\n11 last_credit_pull_d 0\n12 last_pymnt_d 0\n13 loan_status 0\n14 next_pymnt_d 0\n15 purpose 0\n16 pymnt_plan 0\n17 sub_grade 0\n18 term 0\n19 title 0\n20 verification_status 0\n21 verification_status_joint 0\n22 addr_state 0\n23 zip_code 0\n24 application_type 0\n25 debt_settlement_flag 0", + "text/html": "
[HTML rendering of the noisy columns dataframe omitted; identical to the text/plain output above]
" + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "noisy_columns = diagnoser.detect_noisy_columns()\n", + "noisy_columns_df = pd.DataFrame.from_records([n.dict() for n in noisy_columns])\n", + "noisy_columns_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The API limits diagnosis to 100 columns at a time, so we choose the top 100 noisy columns. We could then iterate through other columns if desired." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2024-05-03T02:03:33.992473Z", + "start_time": "2024-05-03T02:03:33.989400Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "['issue_d',\n 'url',\n 'desc',\n 'disbursement_method',\n 'earliest_cr_line',\n 'emp_length',\n 'emp_title',\n 'grade',\n 'hardship_flag',\n 'home_ownership',\n 'initial_list_status',\n 'last_credit_pull_d',\n 'last_pymnt_d',\n 'loan_status',\n 'next_pymnt_d',\n 'purpose',\n 'pymnt_plan',\n 'sub_grade',\n 'term',\n 'title',\n 'verification_status',\n 'verification_status_joint',\n 'addr_state',\n 'zip_code',\n 'application_type',\n 'debt_settlement_flag']" + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "columns = list(noisy_columns_df.column[:100])\n", + "columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Ask for a monitor diagnosis\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2024-05-03T02:03:38.883541Z", + "start_time": "2024-05-03T02:03:33.993121Z" + } + }, + "outputs": [], + "source": [ + "# for now, we need to enforce this to run using local server\n", + "import os\n", + "monitor_report = diagnoser.diagnose(columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-03T02:03:38.892248Z", + "start_time": "2024-05-03T02:03:38.884434Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Diagnosis is for monitor \"frequent-items-drift-monitor-x2hr9z\" [frequent-items-drift-monitor-x2hr9z] in model-0 org-0, over interval 2024-04-02T00:00:00.000Z/2024-05-02T00:00:00.000Z.\n", + "Monitor has 1 notification actions ['email'].\n", + "\n", + "Analyzer is drift configuration for frequent_items metric with TrailingWindow baseline.\n", + "Analyzer \"frequent-items-drift-analyzer-x2hr9z\" targets 27 columns and ran on 26 columns in the diagnosed segment.\n", + "\n", + "\n", + "Diagnostic segment is \"overall\".\n", + "Diagnostic interval contains 30 batches.\n", + "\n", + "Diagnostic interval rollup contains 1674392 rows for the diagnosed columns.\n", + "\n", + "Analysis results summary:\n", + "Found non-failed results for 26 columns and 30 batches.\n", + "Found 34 anomalies in 3 columns, with up to 100.0% (30) batches having anomalies per column and 36.7% (11.0) on average.\n", + "Columns with anomalies are:\n", + "| | 0 |\n", + "|---:|:----------------|\n", + "| 0 | ('issue_d', 30) |\n", + "| 1 | ('url', 3) |\n", + "| 2 | ('desc', 1) |\n", + "\n", + "No failures were detected.\n", + "\n", + "Conditions that may impact diagnosis quality include:\n", + "\t* analyzer_changed: Analyzer changed within the diagnostic interval - detectors ['stale_analysis', 'changing_discrete', 'low_drift_threshold', 'missing_baseline_batches', 'small_nonnull_batches']\n", + "\n", + "Conditions that may 
contribute to noise include:\n", + "\t* Condition changing_discrete (many values are unique across batches) for 3 columns: ['desc', 'issue_d', 'url']\n", + "\t* Condition small_nonnull_batches (less than 500 non-null records in 50% or more of the batches) for 4 columns: ['desc', 'issue_d', 'url', 'desc']\n", + "\n", + "Anomalies for columns with these conditions:\n", + "| | 0 |\n", + "|:--------|----:|\n", + "| issue_d | 30 |\n", + "| url | 3 |\n", + "| desc | 1 |\n", + "Accounting for 34 anomalies out of 34\n" + ] + } + ], + "source": [ + "print(monitor_report.describe())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/example_notebooks/diagnoser.ipynb b/examples/example_notebooks/diagnoser.ipynb new file mode 100644 index 0000000..ff4d687 --- /dev/null +++ b/examples/example_notebooks/diagnoser.ipynb @@ -0,0 +1,780 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# Detecting noisy monitors\n", + "\n", + "This notebook shows how to detect noisy monitors in a dataset using the WhyLabs Monitor Diagnoser. It uses the diagnoser to automatically detect the noisiest monitor for dataset, get a diagnosis of\n", + "the conditions causing the noise, get recommended changes and where automatable, apply those changes.\n", + "\n", + "## Install requirements" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "ExecuteTime": { + "end_time": "2024-05-03T01:31:30.980544Z", + "start_time": "2024-05-03T01:31:30.977943Z" + } + }, + "outputs": [], + "source": [ + "# %pip install whylabs-toolkit[diagnoser]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Setup whylabs API connection\n", + "\n", + "First, set up the information to connect to WhyLabs. Update the org_id, dataset_id and api_key in the following before running it.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2024-05-03T01:31:33.173740Z", + "start_time": "2024-05-03T01:31:30.988187Z" + } + }, + "outputs": [], + "source": [ + "import getpass\n", + "from whylabs_toolkit.monitor.diagnoser.helpers.utils import env_setup\n", + "\n", + "org_id = input(\"Enter org ID\")\n", + "dataset_id = input(\"Enter model/dataset ID\")\n", + "api_key = getpass.getpass()\n", + "api_endpoint = 'https://api.whylabsapp.com'\n", + "\n", + "env_setup(\n", + " org_id=org_id,\n", + " dataset_id=dataset_id,\n", + " api_key=api_key,\n", + " whylabs_endpoint=api_endpoint\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Initialize the Monitor Diagnoser with the org_id and dataset_id." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2024-05-03T01:31:33.433678Z", + "start_time": "2024-05-03T01:31:33.175869Z" + } + }, + "outputs": [], + "source": [ + "from whylabs_toolkit.monitor.diagnoser.monitor_diagnoser import MonitorDiagnoser\n", + "diagnoser = MonitorDiagnoser(org_id, dataset_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run the default diagnosis\n", + "\n", + "With no further input, the diagnoser will make a series of calls to identify the noisiest monitor, segment and columns; and then perform a diagnosis." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2024-05-03T01:31:40.402540Z", + "start_time": "2024-05-03T01:31:33.436496Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "MonitorDiagnosisReport(orgId='org-0', datasetId='model-0', analyzerId='frequent-items-drift-analyzer-x2hr9z', interval='2024-04-02T00:00:00.000Z/2024-05-02T00:00:00.000Z', expectedBatchCount=0, diagnosticData=DiagnosticDataSummary(diagnosticSegment=Segment(tags=[]), diagnosticProfile=ProfileSummary(minRowName='desc', minRowCount=1674392, maxRowName='desc', maxRowCount=1674392), diagnosticBatches=BatchesSummary(minBatchName='desc', minBatchCount=30, maxBatchName='desc', maxBatchCount=30), analysisResults=AnalysisResultsSummary(results=ResultRecord(diagnosedColumnCount=26, batchCount=30), failures=FailureRecord(totalFailuresCount=0, maxFailuresCount=0, meanFailuresCount=0, byColumnCount=[], byTypeCount=[]), anomalies=AnomalyRecord(totalAnomalyCount=34, maxAnomalyCount=30, meanAnomalyCount=11, batchCount=30, byColumnCount=[NamedCount(name='issue_d', count=30), NamedCount(name='url', count=3), NamedCount(name='desc', count=1)], byColumnBatchCount=[NamedCount(name='addr_state', count=30), NamedCount(name='application_type', count=30), NamedCount(name='debt_settlement_flag', count=30), NamedCount(name='desc', count=2), NamedCount(name='disbursement_method', count=30), NamedCount(name='earliest_cr_line', count=30), NamedCount(name='emp_length', count=30), NamedCount(name='emp_title', count=30), NamedCount(name='grade', count=30), NamedCount(name='hardship_flag', count=30), NamedCount(name='home_ownership', count=30), NamedCount(name='initial_list_status', count=30), NamedCount(name='issue_d', count=30), NamedCount(name='last_credit_pull_d', count=30), NamedCount(name='last_pymnt_d', count=30), NamedCount(name='loan_status', count=30), NamedCount(name='next_pymnt_d', count=30), NamedCount(name='purpose', count=30), NamedCount(name='pymnt_plan', count=30), NamedCount(name='sub_grade', count=30), NamedCount(name='term', count=30), NamedCount(name='title', count=30), NamedCount(name='url', count=30), NamedCount(name='verification_status', count=30), NamedCount(name='verification_status_joint', count=30), NamedCount(name='zip_code', count=30)])), targetedColumnCount=30), qualityIssues=[], conditions=[ConditionRecord(columns=['desc', 'issue_d', 'url'], info=None, summary='many values are unique across batches', name='changing_discrete'), ConditionRecord(columns=['desc'], info=None, summary='less than 500 non-null records in 50% or more of the batches', name='small_nonnull_batches')], monitor=Monitor(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1705536890090, author='user_809f777d_3741_4991_8ced_42f09b883ac7', description=None), id='frequent-items-drift-monitor-x2hr9z', displayName=None, tags=None, 
analyzerIds=['frequent-items-drift-analyzer-x2hr9z'], schedule=ImmediateSchedule(type='immediate'), disabled=False, severity=3, mode=DigestMode(type='DIGEST', filter=None, creationTimeOffset=None, datasetTimestampOffset=None, groupBy=None), actions=[GlobalAction(type='global', target='email')]), analyzer=Analyzer(metadata=Metadata(version=1, schemaVersion=1, updatedTimestamp=1705536888574, author='user_809f777d_3741_4991_8ced_42f09b883ac7', description=None), id='frequent-items-drift-analyzer-x2hr9z', displayName=None, tags=None, schedule=FixedCadenceSchedule(type='fixed', cadence=, exclusionRanges=None), disabled=None, disableTargetRollup=None, targetMatrix=ColumnMatrix(segments=[Segment(tags=[])], type=, include=[], exclude=[], profileId=None), dataReadinessDuration=None, batchCoolDownPeriod=None, backfillGracePeriodDuration=None, config=DriftConfig(schemaVersion=None, params=None, metric=, type=, algorithm='hellinger', threshold=0.7, minBatchSize=1, baseline=TrailingWindowBaseline(datasetId=None, inheritSegment=None, type=, size=7, offset=None, exclusionRanges=None))), analyzedColumnCount=26)" + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "monitor_report = diagnoser.diagnose()\n", + "monitor_report" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2024-05-03T01:31:40.415056Z", + "start_time": "2024-05-03T01:31:40.405237Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Diagnosis is for monitor \"frequent-items-drift-monitor-x2hr9z\" [frequent-items-drift-monitor-x2hr9z] in model-0 org-0, over interval 2024-04-02T00:00:00.000Z/2024-05-02T00:00:00.000Z.\n", + "Monitor has 1 notification actions ['email'].\n", + "\n", + "Analyzer is drift configuration for frequent_items metric with TrailingWindow baseline.\n", + "Analyzer \"frequent-items-drift-analyzer-x2hr9z\" targets 30 columns and ran on 26 columns in the diagnosed segment.\n", + "\n", + "\n", + "Diagnostic segment is \"overall\".\n", + "Diagnostic interval contains 30 batches.\n", + "\n", + "Diagnostic interval rollup contains 1674392 rows for the diagnosed columns.\n", + "\n", + "Analysis results summary:\n", + "Found non-failed results for 26 columns and 30 batches.\n", + "Found 34 anomalies in 3 columns, with up to 100.0% (30) batches having anomalies per column and 36.7% (11.0) on average.\n", + "Columns with anomalies are:\n", + "| | 0 |\n", + "|---:|:----------------|\n", + "| 0 | ('issue_d', 30) |\n", + "| 1 | ('url', 3) |\n", + "| 2 | ('desc', 1) |\n", + "\n", + "No failures were detected.\n", + "\n", + "No issues impacting diagnosis quality were detected\n", + "Conditions that may contribute to noise include:\n", + "\t* Condition changing_discrete (many values are unique across batches) for 3 columns: ['desc', 'issue_d', 'url']\n", + "\t* Condition small_nonnull_batches (less than 500 non-null records in 50% or more of the batches) for 4 columns: ['desc', 'issue_d', 'url', 'desc']\n", + "\n", + "Anomalies for columns with these conditions:\n", + "| | 0 |\n", + "|:--------|----:|\n", + "| issue_d | 30 |\n", + "| url | 3 |\n", + "| desc | 1 |\n", + "Accounting for 34 anomalies out of 34\n" + ] + } + ], + "source": [ + "print(monitor_report.describe())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2024-03-09T21:59:48.180867Z", + "start_time": "2024-03-09T21:59:48.177537Z" + } + }, + "source": [ + "The monitor report can be 
serialized to a JSON file for later use." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2024-05-03T01:31:40.419753Z", + "start_time": "2024-05-03T01:31:40.415990Z" + } + }, + "outputs": [], + "source": [ + "with open('monitor_report.json', 'w') as f:\n", + " f.write(monitor_report.json())" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2024-05-03T01:31:40.424467Z", + "start_time": "2024-05-03T01:31:40.420491Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"orgId\": \"org-0\",\n", + " \"datasetId\": \"model-0\",\n", + " \"analyzerId\": \"frequent-items-drift-analyzer-x2hr9z\",\n", + " \"interval\": \"2024-04-02T00:00:00.000Z/2024-05-02T00:00:00.000Z\",\n", + " \"expectedBatchCount\": 0,\n", + " \"diagnosticData\": {\n", + " \"diagnosticSegment\": {\n", + " \"tags\": []\n", + " },\n", + " \"diagnosticProfile\": {\n", + " \"minRowName\": \"desc\",\n", + " \"minRowCount\": 1674392,\n", + " \"maxRowName\": \"desc\",\n", + " \"maxRowCount\": 1674392\n", + " },\n", + " \"diagnosticBatches\": {\n", + " \"minBatchName\": \"desc\",\n", + " \"minBatchCount\": 30,\n", + " \"maxBatchName\": \"desc\",\n", + " \"maxBatchCount\": 30\n", + " },\n", + " \"analysisResults\": {\n", + " \"results\": {\n", + " \"diagnosedColumnCount\": 26,\n", + " \"batchCount\": 30\n", + " },\n", + " \"failures\": {\n", + " \"totalFailuresCount\": 0,\n", + " \"maxFailuresCount\": 0,\n", + " \"meanFailuresCount\": 0,\n", + " \"byColumnCount\": [],\n", + " \"byTypeCount\": []\n", + " },\n", + " \"anomalies\": {\n", + " \"totalAnomalyCount\": 34,\n", + " \"maxAnomalyCount\": 30,\n", + " \"meanAnomalyCount\": 11,\n", + " \"batchCount\": 30,\n", + " \"byColumnCount\": [\n", + " {\n", + " \"name\": \"issue_d\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"url\",\n", + " \"count\": 3\n", + " },\n", + " {\n", + " \"name\": \"desc\",\n", + " \"count\": 1\n", + " }\n", + " ],\n", + " \"byColumnBatchCount\": [\n", + " {\n", + " \"name\": \"addr_state\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"application_type\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"debt_settlement_flag\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"desc\",\n", + " \"count\": 2\n", + " },\n", + " {\n", + " \"name\": \"disbursement_method\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"earliest_cr_line\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"emp_length\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"emp_title\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"grade\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"hardship_flag\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"home_ownership\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"initial_list_status\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"issue_d\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"last_credit_pull_d\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"last_pymnt_d\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"loan_status\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"next_pymnt_d\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"purpose\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": 
\"pymnt_plan\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"sub_grade\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"term\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"title\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"url\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"verification_status\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"verification_status_joint\",\n", + " \"count\": 30\n", + " },\n", + " {\n", + " \"name\": \"zip_code\",\n", + " \"count\": 30\n", + " }\n", + " ]\n", + " }\n", + " },\n", + " \"targetedColumnCount\": 30\n", + " },\n", + " \"qualityIssues\": [],\n", + " \"conditions\": [\n", + " {\n", + " \"columns\": [\n", + " \"desc\",\n", + " \"issue_d\",\n", + " \"url\"\n", + " ],\n", + " \"info\": null,\n", + " \"summary\": \"many values are unique across batches\",\n", + " \"name\": \"changing_discrete\"\n", + " },\n", + " {\n", + " \"columns\": [\n", + " \"desc\"\n", + " ],\n", + " \"info\": null,\n", + " \"summary\": \"less than 500 non-null records in 50% or more of the batches\",\n", + " \"name\": \"small_nonnull_batches\"\n", + " }\n", + " ],\n", + " \"monitor\": {\n", + " \"metadata\": {\n", + " \"version\": 1,\n", + " \"schemaVersion\": 1,\n", + " \"updatedTimestamp\": 1705536890090,\n", + " \"author\": \"user_809f777d_3741_4991_8ced_42f09b883ac7\",\n", + " \"description\": null\n", + " },\n", + " \"id\": \"frequent-items-drift-monitor-x2hr9z\",\n", + " \"displayName\": null,\n", + " \"tags\": null,\n", + " \"analyzerIds\": [\n", + " \"frequent-items-drift-analyzer-x2hr9z\"\n", + " ],\n", + " \"schedule\": {\n", + " \"type\": \"immediate\"\n", + " },\n", + " \"disabled\": false,\n", + " \"severity\": 3,\n", + " \"mode\": {\n", + " \"type\": \"DIGEST\",\n", + " \"filter\": null,\n", + " \"creationTimeOffset\": null,\n", + " \"datasetTimestampOffset\": null,\n", + " \"groupBy\": null\n", + " },\n", + " \"actions\": [\n", + " {\n", + " \"type\": \"global\",\n", + " \"target\": \"email\"\n", + " }\n", + " ]\n", + " },\n", + " \"analyzer\": {\n", + " \"metadata\": {\n", + " \"version\": 1,\n", + " \"schemaVersion\": 1,\n", + " \"updatedTimestamp\": 1705536888574,\n", + " \"author\": \"user_809f777d_3741_4991_8ced_42f09b883ac7\",\n", + " \"description\": null\n", + " },\n", + " \"id\": \"frequent-items-drift-analyzer-x2hr9z\",\n", + " \"displayName\": null,\n", + " \"tags\": null,\n", + " \"schedule\": {\n", + " \"type\": \"fixed\",\n", + " \"cadence\": \"daily\",\n", + " \"exclusionRanges\": null\n", + " },\n", + " \"disabled\": null,\n", + " \"disableTargetRollup\": null,\n", + " \"targetMatrix\": {\n", + " \"segments\": [\n", + " {\n", + " \"tags\": []\n", + " }\n", + " ],\n", + " \"type\": \"column\",\n", + " \"include\": [\n", + " \"group:discrete\"\n", + " ],\n", + " \"exclude\": [\n", + " \"group:output\"\n", + " ],\n", + " \"profileId\": null\n", + " },\n", + " \"dataReadinessDuration\": null,\n", + " \"batchCoolDownPeriod\": null,\n", + " \"backfillGracePeriodDuration\": null,\n", + " \"config\": {\n", + " \"schemaVersion\": null,\n", + " \"params\": null,\n", + " \"metric\": \"frequent_items\",\n", + " \"type\": \"drift\",\n", + " \"algorithm\": \"hellinger\",\n", + " \"threshold\": 0.7,\n", + " \"minBatchSize\": 1,\n", + " \"baseline\": {\n", + " \"datasetId\": null,\n", + " \"inheritSegment\": null,\n", + " \"type\": \"TrailingWindow\",\n", + " \"size\": 7,\n", + " \"offset\": null,\n", + " \"exclusionRanges\": null\n", + " }\n", + " }\n", + " 
},\n", + " \"analyzedColumnCount\": 26\n", + "}\n" + ] + } + ], + "source": [ + "from whylabs_toolkit.monitor.diagnoser.models import MonitorDiagnosisReport\n", + "\n", + "with open('monitor_report.json', 'r') as f:\n", + " monitor_report = MonitorDiagnosisReport.parse_raw(f.read())\n", + "print(monitor_report.json(indent=2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Ask for recommended changes\n", + "\n", + "Given the diagnosis report for the monitor, the ChangeRecommender will recommend changes to make to the monitor. By default it will make recommendations for all columns where it has detected noise-related conditions. Set the `min_anomaly_count` property to restrict this to only columns that caused a certain number of anomalies.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2024-05-03T01:31:40.435769Z", + "start_time": "2024-05-03T01:31:40.425350Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1. Remove columns from the analyzer for ['desc', 'issue_d', 'url']\n", + "2. Make a manual change to the analyzer to address small_nonnull_batches: less than 500 non-null records in 50% or more of the batches for ['desc']\n" + ] + } + ], + "source": [ + "from whylabs_toolkit.monitor.diagnoser.recommendation.change_recommender import ChangeRecommender\n", + "\n", + "recommender = ChangeRecommender(monitor_report)\n", + "recommender.min_anomaly_count = 1\n", + "changes = recommender.recommend()\n", + "print('\\n'.join([f'{i+1}. {c.describe()}' for i, c in enumerate(changes)]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2024-03-09T21:56:56.392829Z", + "start_time": "2024-03-09T21:56:56.250185Z" + } + }, + "source": [ + "## Execute automatable changes\n", + "\n", + "A subset of recommended changes can be executed automatically by the recommender. Pass the ones you want to make into the `make_changes` call, or pass all changes if you want it to make all of the automatable changes." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2024-05-03T01:31:40.438830Z", + "start_time": "2024-05-03T01:31:40.436652Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Remove columns from the analyzer for ['desc', 'issue_d', 'url']\n" + ] + } + ], + "source": [ + "automatable_changes = [c for c in changes if c.can_automate()]\n", + "print('\\n'.join([c.describe() for c in automatable_changes]))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2024-05-03T01:31:41.510071Z", + "start_time": "2024-05-03T01:31:40.439570Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Successfully made the following changes:\n", + "\t* Remove columns from the analyzer for ['desc', 'issue_d', 'url']\n" + ] + } + ], + "source": [ + "change_results = recommender.make_changes(automatable_changes)\n", + "print(change_results.describe())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "start_time": "2024-03-09T21:56:56.394187Z" + } + }, + "source": [ + "Note that the monitor will still appear to the diagnoser as the noisiest monitor until enough time has passed for the impact of the monitor changes to be observed. 
You may want to use the WhyLabs preview UI to view what impacts may be expected from the change.\n", + "\n", + "## Reviewing other noisy monitors\n", + "\n", + "The diagnoser can be used to review other noisy monitors in the dataset. The `noisy_monitors` property will return a list of the noisiest monitors, and the `monitor_id_to_diagnose` property can be set to the monitor_id of the monitor to diagnose." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2024-05-03T01:31:41.528882Z", + "start_time": "2024-05-03T01:31:41.513691Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": " monitor_id \\\n0 frequent-items-drift-monitor-x2hr9z \n1 discrete-distribution-22ef37c9-monitor \n2 smoggy-chartreuse-owl-3387 \n3 frequent-items-drift-monitor-bx6m80 \n4 frequent-items-drift-monitor-mat0jo \n5 frequent-items-drift-monitor-01rbfl \n6 frequent-items-drift-monitor-0foigt \n7 frequent-items-drift-monitor-3c0hc2 \n8 frequent-items-drift-monitor-9gmtix \n9 elated-palegreen-jaguar-6432 \n10 inferred-data-type-fec5a735-monitor \n11 unique-ratio-b7b84aee-monitor \n12 missing-values-ratio-35881327-monitor \n13 numerical-drift-monitor-6oxi83 \n14 numerical-drift-monitor-8yugth \n15 continuous-distribution-956a280c-monitor \n16 dull-floralwhite-raven-5521 \n\n analyzer_id metric column_count \\\n0 frequent-items-drift-analyzer-x2hr9z frequent_items 3 \n1 discrete-distribution-22ef37c9 frequent_items 3 \n2 smoggy-chartreuse-owl-3387-analyzer frequent_items 3 \n3 frequent-items-drift-analyzer-bx6m80 frequent_items 3 \n4 frequent-items-drift-analyzer-mat0jo frequent_items 3 \n5 frequent-items-drift-analyzer-01rbfl frequent_items 3 \n6 frequent-items-drift-analyzer-0foigt frequent_items 3 \n7 frequent-items-drift-analyzer-3c0hc2 frequent_items 3 \n8 frequent-items-drift-analyzer-9gmtix frequent_items 3 \n9 elated-palegreen-jaguar-6432-analyzer histogram 9 \n10 inferred-data-type-fec5a735 inferred_data_type 1 \n11 unique-ratio-b7b84aee unique_est_ratio 69 \n12 missing-values-ratio-35881327 count_null_ratio 21 \n13 numerical-drift-analyzer-6oxi83 histogram 1 \n14 numerical-drift-analyzer-8yugth histogram 1 \n15 continuous-distribution-956a280c histogram 1 \n16 dull-floralwhite-raven-5521-analyzer count 2 \n\n segment_count anomaly_count max_anomaly_per_column \\\n0 1 34 30 \n1 1 34 30 \n2 1 34 30 \n3 1 34 30 \n4 1 34 30 \n5 1 34 30 \n6 1 34 30 \n7 1 34 30 \n8 1 34 30 \n9 1 75 19 \n10 1 14 14 \n11 1 104 4 \n12 1 27 3 \n13 1 2 2 \n14 1 2 2 \n15 1 2 2 \n16 1 3 2 \n\n min_anomaly_per_column avg_anomaly_per_column action_count \\\n0 1 11 1 \n1 1 11 0 \n2 1 11 0 \n3 1 11 0 \n4 1 11 2 \n5 1 11 1 \n6 1 11 0 \n7 1 11 1 \n8 1 11 1 \n9 2 8 0 \n10 14 14 2 \n11 1 1 0 \n12 1 1 0 \n13 2 2 0 \n14 2 2 0 \n15 2 2 0 \n16 1 1 0 \n\n action_targets \n0 [email] \n1 [] \n2 [] \n3 [] \n4 [email, slack] \n5 [email] \n6 [] \n7 [email] \n8 [email] \n9 [] \n10 [email, slack] \n11 [] \n12 [] \n13 [] \n14 [] \n15 [] \n16 [] ", + "text/html": "
[HTML rendering of the noisy monitors dataframe omitted; identical to the text/plain output above]
" + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "noisy_monitors_df = pd.DataFrame.from_records([m.dict() for m in diagnoser.noisy_monitors])\n", + "noisy_monitors_df" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2024-03-09T21:56:56.396893Z", + "start_time": "2024-03-09T21:56:56.395306Z" + } + }, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "ExecuteTime": { + "end_time": "2024-05-03T01:59:10.186840Z", + "start_time": "2024-05-03T01:59:03.667269Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "discrete-distribution-22ef37c9-monitor\n", + "Diagnosis is for monitor \"discrete-distribution-22ef37c9-monitor\" [discrete-distribution-22ef37c9-monitor] in model-0 org-0, over interval 2024-04-02T00:00:00.000Z/2024-05-02T00:00:00.000Z.\n", + "\n", + "Analyzer is drift configuration for frequent_items metric with TrailingWindow baseline.\n", + "Analyzer \"discrete-distribution-22ef37c9\" targets 30 columns and ran on 26 columns in the diagnosed segment.\n", + "\n", + "\n", + "Diagnostic segment is \"overall\".\n", + "Diagnostic interval contains 30 batches.\n", + "\n", + "Diagnostic interval rollup contains 1674392 rows for the diagnosed columns.\n", + "\n", + "Analysis results summary:\n", + "Found non-failed results for 26 columns and 30 batches.\n", + "Found 34 anomalies in 3 columns, with up to 100.0% (30) batches having anomalies per column and 36.7% (11.0) on average.\n", + "Columns with anomalies are:\n", + "| | 0 |\n", + "|---:|:----------------|\n", + "| 0 | ('issue_d', 30) |\n", + "| 1 | ('url', 3) |\n", + "| 2 | ('desc', 1) |\n", + "\n", + "No failures were detected.\n", + "\n", + "Conditions that may impact diagnosis quality include:\n", + "\t* analyzer_changed: Analyzer changed within the diagnostic interval - detectors ['stale_analysis', 'changing_discrete', 'low_drift_threshold', 'missing_baseline_batches', 'small_nonnull_batches']\n", + "\n", + "Conditions that may contribute to noise include:\n", + "\t* Condition changing_discrete (many values are unique across batches) for 3 columns: ['desc', 'issue_d', 'url']\n", + "\t* Condition small_nonnull_batches (less than 500 non-null records in 50% or more of the batches) for 4 columns: ['desc', 'issue_d', 'url', 'desc']\n", + "\n", + "Anomalies for columns with these conditions:\n", + "| | 0 |\n", + "|:--------|----:|\n", + "| issue_d | 30 |\n", + "| url | 3 |\n", + "| desc | 1 |\n", + "Accounting for 34 anomalies out of 34\n" + ] + } + ], + "source": [ + "diagnoser.monitor_id_to_diagnose = noisy_monitors_df.iloc[1]['monitor_id']\n", + "print(diagnoser.monitor_id_to_diagnose)\n", + "monitor_report = diagnoser.diagnose()\n", + "print(monitor_report.describe())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "You can also use the `noisy_monitors_with_actions` property to prioritize noise in monitors with actions, as these are most likely to cause alert fatigue." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-03T01:59:21.543539Z", + "start_time": "2024-05-03T01:59:21.533198Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": " monitor_id analyzer_id \\\n0 frequent-items-drift-monitor-x2hr9z frequent-items-drift-analyzer-x2hr9z \n1 frequent-items-drift-monitor-mat0jo frequent-items-drift-analyzer-mat0jo \n2 frequent-items-drift-monitor-01rbfl frequent-items-drift-analyzer-01rbfl \n3 frequent-items-drift-monitor-3c0hc2 frequent-items-drift-analyzer-3c0hc2 \n4 frequent-items-drift-monitor-9gmtix frequent-items-drift-analyzer-9gmtix \n5 inferred-data-type-fec5a735-monitor inferred-data-type-fec5a735 \n\n metric column_count segment_count anomaly_count \\\n0 frequent_items 3 1 34 \n1 frequent_items 3 1 34 \n2 frequent_items 3 1 34 \n3 frequent_items 3 1 34 \n4 frequent_items 3 1 34 \n5 inferred_data_type 1 1 14 \n\n max_anomaly_per_column min_anomaly_per_column avg_anomaly_per_column \\\n0 30 1 11 \n1 30 1 11 \n2 30 1 11 \n3 30 1 11 \n4 30 1 11 \n5 14 14 14 \n\n action_count action_targets \n0 1 [email] \n1 2 [email, slack] \n2 1 [email] \n3 1 [email] \n4 1 [email] \n5 2 [email, slack] ", + "text/html": "
[HTML rendering of the monitors-with-actions dataframe omitted; identical to the text/plain output above]
" + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame.from_records([m.dict() for m in diagnoser.noisy_monitors_with_actions])\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/poetry.lock b/poetry.lock index 7823881..a0d8759 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,10 +1,9 @@ -# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. [[package]] name = "attrs" version = "23.1.0" description = "Classes Without Boilerplate" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -23,7 +22,6 @@ tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pyte name = "autoflake" version = "2.1.1" description = "Removes unused imports and unused variables" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -39,7 +37,6 @@ tomli = {version = ">=2.0.1", markers = "python_version < \"3.11\""} name = "black" version = "22.12.0" description = "The uncompromising code formatter." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -75,7 +72,6 @@ uvloop = ["uvloop (>=0.15.2)"] name = "bump2version" version = "1.0.1" description = "Version-bump your software with a single command!" -category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -87,7 +83,6 @@ files = [ name = "bumpversion" version = "0.6.0" description = "Version-bump your software with a single command!" -category = "dev" optional = false python-versions = "*" files = [ @@ -102,7 +97,6 @@ bump2version = "*" name = "click" version = "8.1.3" description = "Composable command line interface toolkit" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -117,7 +111,6 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." 
-category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -129,7 +122,6 @@ files = [ name = "exceptiongroup" version = "1.1.1" description = "Backport of PEP 654 (exception groups)" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -144,7 +136,6 @@ test = ["pytest (>=6)"] name = "importlib-resources" version = "5.12.0" description = "Read resources from Python packages" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -163,7 +154,6 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -171,11 +161,24 @@ files = [ {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] +[[package]] +name = "isodate" +version = "0.6.1" +description = "An ISO 8601 date/time/duration parser and formatter" +optional = true +python-versions = "*" +files = [ + {file = "isodate-0.6.1-py2.py3-none-any.whl", hash = "sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96"}, + {file = "isodate-0.6.1.tar.gz", hash = "sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9"}, +] + +[package.dependencies] +six = "*" + [[package]] name = "jsonschema" version = "4.17.3" description = "An implementation of JSON Schema validation for Python" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -197,7 +200,6 @@ format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339- name = "mypy" version = "1.0.1" description = "Optional static typing for Python" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -244,7 +246,6 @@ reports = ["lxml"] name = "mypy-extensions" version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." 
-category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -252,11 +253,47 @@ files = [ {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] +[[package]] +name = "numpy" +version = "1.24.4" +description = "Fundamental package for array computing in Python" +optional = true +python-versions = ">=3.8" +files = [ + {file = "numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64"}, + {file = "numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1"}, + {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4"}, + {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ffe43c74893dbf38c2b0a1f5428760a1a9c98285553c89e12d70a96a7f3a4d6"}, + {file = "numpy-1.24.4-cp310-cp310-win32.whl", hash = "sha256:4c21decb6ea94057331e111a5bed9a79d335658c27ce2adb580fb4d54f2ad9bc"}, + {file = "numpy-1.24.4-cp310-cp310-win_amd64.whl", hash = "sha256:b4bea75e47d9586d31e892a7401f76e909712a0fd510f58f5337bea9572c571e"}, + {file = "numpy-1.24.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f136bab9c2cfd8da131132c2cf6cc27331dd6fae65f95f69dcd4ae3c3639c810"}, + {file = "numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2926dac25b313635e4d6cf4dc4e51c8c0ebfed60b801c799ffc4c32bf3d1254"}, + {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:222e40d0e2548690405b0b3c7b21d1169117391c2e82c378467ef9ab4c8f0da7"}, + {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7215847ce88a85ce39baf9e89070cb860c98fdddacbaa6c0da3ffb31b3350bd5"}, + {file = "numpy-1.24.4-cp311-cp311-win32.whl", hash = "sha256:4979217d7de511a8d57f4b4b5b2b965f707768440c17cb70fbf254c4b225238d"}, + {file = "numpy-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:b7b1fc9864d7d39e28f41d089bfd6353cb5f27ecd9905348c24187a768c79694"}, + {file = "numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1452241c290f3e2a312c137a9999cdbf63f78864d63c79039bda65ee86943f61"}, + {file = "numpy-1.24.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:04640dab83f7c6c85abf9cd729c5b65f1ebd0ccf9de90b270cd61935eef0197f"}, + {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5425b114831d1e77e4b5d812b69d11d962e104095a5b9c3b641a218abcc050e"}, + {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd80e219fd4c71fc3699fc1dadac5dcf4fd882bfc6f7ec53d30fa197b8ee22dc"}, + {file = "numpy-1.24.4-cp38-cp38-win32.whl", hash = "sha256:4602244f345453db537be5314d3983dbf5834a9701b7723ec28923e2889e0bb2"}, + {file = "numpy-1.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:692f2e0f55794943c5bfff12b3f56f99af76f902fc47487bdfe97856de51a706"}, + {file = "numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2541312fbf09977f3b3ad449c4e5f4bb55d0dbf79226d7724211acc905049400"}, + {file = "numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9667575fb6d13c95f1b36aca12c5ee3356bf001b714fc354eb5465ce1609e62f"}, + {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a86ed21e4f87050382c7bc96571755193c4c1392490744ac73d660e8f564a9"}, + {file = 
"numpy-1.24.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d11efb4dbecbdf22508d55e48d9c8384db795e1b7b51ea735289ff96613ff74d"}, + {file = "numpy-1.24.4-cp39-cp39-win32.whl", hash = "sha256:6620c0acd41dbcb368610bb2f4d83145674040025e5536954782467100aa8835"}, + {file = "numpy-1.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:befe2bf740fd8373cf56149a5c23a0f601e82869598d41f8e188a0e9869926f8"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:31f13e25b4e304632a4619d0e0777662c2ffea99fcae2029556b17d8ff958aef"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2"}, + {file = "numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463"}, +] + [[package]] name = "packaging" version = "23.1" description = "Core utilities for Python packages" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -264,11 +301,77 @@ files = [ {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, ] +[[package]] +name = "pandas" +version = "2.0.3" +description = "Powerful data structures for data analysis, time series, and statistics" +optional = true +python-versions = ">=3.8" +files = [ + {file = "pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8"}, + {file = "pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f"}, + {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183"}, + {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0"}, + {file = "pandas-2.0.3-cp310-cp310-win32.whl", hash = "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210"}, + {file = "pandas-2.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e"}, + {file = "pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8"}, + {file = "pandas-2.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26"}, + {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d"}, + {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df"}, + {file = "pandas-2.0.3-cp311-cp311-win32.whl", hash = "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd"}, + {file = "pandas-2.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b"}, + {file = "pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061"}, + {file = "pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = 
"sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5"}, + {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089"}, + {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0"}, + {file = "pandas-2.0.3-cp38-cp38-win32.whl", hash = "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02"}, + {file = "pandas-2.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78"}, + {file = "pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b"}, + {file = "pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e"}, + {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b"}, + {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641"}, + {file = "pandas-2.0.3-cp39-cp39-win32.whl", hash = "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682"}, + {file = "pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc"}, + {file = "pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.20.3", markers = "python_version < \"3.10\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, + {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, +] +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.1" + +[package.extras] +all = ["PyQt5 (>=5.15.1)", "SQLAlchemy (>=1.4.16)", "beautifulsoup4 (>=4.9.3)", "bottleneck (>=1.3.2)", "brotlipy (>=0.7.0)", "fastparquet (>=0.6.3)", "fsspec (>=2021.07.0)", "gcsfs (>=2021.07.0)", "html5lib (>=1.1)", "hypothesis (>=6.34.2)", "jinja2 (>=3.0.0)", "lxml (>=4.6.3)", "matplotlib (>=3.6.1)", "numba (>=0.53.1)", "numexpr (>=2.7.3)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pandas-gbq (>=0.15.0)", "psycopg2 (>=2.8.6)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "python-snappy (>=0.6.0)", "pyxlsb (>=1.0.8)", "qtpy (>=2.2.0)", "s3fs (>=2021.08.0)", "scipy (>=1.7.1)", "tables (>=3.6.1)", "tabulate (>=0.8.9)", "xarray (>=0.21.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)", "zstandard (>=0.15.2)"] +aws = ["s3fs (>=2021.08.0)"] +clipboard = ["PyQt5 (>=5.15.1)", "qtpy (>=2.2.0)"] +compression = ["brotlipy (>=0.7.0)", "python-snappy (>=0.6.0)", "zstandard (>=0.15.2)"] +computation = ["scipy (>=1.7.1)", "xarray (>=0.21.0)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pyxlsb (>=1.0.8)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)"] +feather = ["pyarrow (>=7.0.0)"] +fss = ["fsspec (>=2021.07.0)"] +gcp = ["gcsfs (>=2021.07.0)", "pandas-gbq (>=0.15.0)"] +hdf5 = ["tables (>=3.6.1)"] +html = ["beautifulsoup4 (>=4.9.3)", "html5lib (>=1.1)", "lxml (>=4.6.3)"] +mysql = ["SQLAlchemy (>=1.4.16)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.0.0)", "tabulate (>=0.8.9)"] +parquet = ["pyarrow (>=7.0.0)"] +performance = ["bottleneck 
(>=1.3.2)", "numba (>=0.53.1)", "numexpr (>=2.7.1)"] +plot = ["matplotlib (>=3.6.1)"] +postgresql = ["SQLAlchemy (>=1.4.16)", "psycopg2 (>=2.8.6)"] +spss = ["pyreadstat (>=1.1.2)"] +sql-other = ["SQLAlchemy (>=1.4.16)"] +test = ["hypothesis (>=6.34.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.6.3)"] + [[package]] name = "pathspec" version = "0.11.1" description = "Utility library for gitignore style pattern matching of file paths." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -280,7 +383,6 @@ files = [ name = "pkgutil-resolve-name" version = "1.3.10" description = "Resolve a name to an object." -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -292,7 +394,6 @@ files = [ name = "platformdirs" version = "3.5.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -308,7 +409,6 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest- name = "pluggy" version = "1.0.0" description = "plugin and hook calling mechanisms for python" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -324,7 +424,6 @@ testing = ["pytest", "pytest-benchmark"] name = "protobuf" version = "4.22.3" description = "" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -345,48 +444,47 @@ files = [ [[package]] name = "pydantic" -version = "1.10.7" +version = "1.10.15" description = "Data validation and settings management using python type hints" -category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "pydantic-1.10.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e79e999e539872e903767c417c897e729e015872040e56b96e67968c3b918b2d"}, - {file = "pydantic-1.10.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:01aea3a42c13f2602b7ecbbea484a98169fb568ebd9e247593ea05f01b884b2e"}, - {file = "pydantic-1.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:516f1ed9bc2406a0467dd777afc636c7091d71f214d5e413d64fef45174cfc7a"}, - {file = "pydantic-1.10.7-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae150a63564929c675d7f2303008d88426a0add46efd76c3fc797cd71cb1b46f"}, - {file = "pydantic-1.10.7-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ecbbc51391248116c0a055899e6c3e7ffbb11fb5e2a4cd6f2d0b93272118a209"}, - {file = "pydantic-1.10.7-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f4a2b50e2b03d5776e7f21af73e2070e1b5c0d0df255a827e7c632962f8315af"}, - {file = "pydantic-1.10.7-cp310-cp310-win_amd64.whl", hash = "sha256:a7cd2251439988b413cb0a985c4ed82b6c6aac382dbaff53ae03c4b23a70e80a"}, - {file = "pydantic-1.10.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:68792151e174a4aa9e9fc1b4e653e65a354a2fa0fed169f7b3d09902ad2cb6f1"}, - {file = "pydantic-1.10.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dfe2507b8ef209da71b6fb5f4e597b50c5a34b78d7e857c4f8f3115effaef5fe"}, - {file = "pydantic-1.10.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10a86d8c8db68086f1e30a530f7d5f83eb0685e632e411dbbcf2d5c0150e8dcd"}, - {file = "pydantic-1.10.7-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d75ae19d2a3dbb146b6f324031c24f8a3f52ff5d6a9f22f0683694b3afcb16fb"}, - {file = "pydantic-1.10.7-cp311-cp311-musllinux_1_1_i686.whl", 
hash = "sha256:464855a7ff7f2cc2cf537ecc421291b9132aa9c79aef44e917ad711b4a93163b"}, - {file = "pydantic-1.10.7-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:193924c563fae6ddcb71d3f06fa153866423ac1b793a47936656e806b64e24ca"}, - {file = "pydantic-1.10.7-cp311-cp311-win_amd64.whl", hash = "sha256:b4a849d10f211389502059c33332e91327bc154acc1845f375a99eca3afa802d"}, - {file = "pydantic-1.10.7-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:cc1dde4e50a5fc1336ee0581c1612215bc64ed6d28d2c7c6f25d2fe3e7c3e918"}, - {file = "pydantic-1.10.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0cfe895a504c060e5d36b287ee696e2fdad02d89e0d895f83037245218a87fe"}, - {file = "pydantic-1.10.7-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:670bb4683ad1e48b0ecb06f0cfe2178dcf74ff27921cdf1606e527d2617a81ee"}, - {file = "pydantic-1.10.7-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:950ce33857841f9a337ce07ddf46bc84e1c4946d2a3bba18f8280297157a3fd1"}, - {file = "pydantic-1.10.7-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:c15582f9055fbc1bfe50266a19771bbbef33dd28c45e78afbe1996fd70966c2a"}, - {file = "pydantic-1.10.7-cp37-cp37m-win_amd64.whl", hash = "sha256:82dffb306dd20bd5268fd6379bc4bfe75242a9c2b79fec58e1041fbbdb1f7914"}, - {file = "pydantic-1.10.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8c7f51861d73e8b9ddcb9916ae7ac39fb52761d9ea0df41128e81e2ba42886cd"}, - {file = "pydantic-1.10.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6434b49c0b03a51021ade5c4daa7d70c98f7a79e95b551201fff682fc1661245"}, - {file = "pydantic-1.10.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64d34ab766fa056df49013bb6e79921a0265204c071984e75a09cbceacbbdd5d"}, - {file = "pydantic-1.10.7-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:701daea9ffe9d26f97b52f1d157e0d4121644f0fcf80b443248434958fd03dc3"}, - {file = "pydantic-1.10.7-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:cf135c46099ff3f919d2150a948ce94b9ce545598ef2c6c7bf55dca98a304b52"}, - {file = "pydantic-1.10.7-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b0f85904f73161817b80781cc150f8b906d521fa11e3cdabae19a581c3606209"}, - {file = "pydantic-1.10.7-cp38-cp38-win_amd64.whl", hash = "sha256:9f6f0fd68d73257ad6685419478c5aece46432f4bdd8d32c7345f1986496171e"}, - {file = "pydantic-1.10.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c230c0d8a322276d6e7b88c3f7ce885f9ed16e0910354510e0bae84d54991143"}, - {file = "pydantic-1.10.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:976cae77ba6a49d80f461fd8bba183ff7ba79f44aa5cfa82f1346b5626542f8e"}, - {file = "pydantic-1.10.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d45fc99d64af9aaf7e308054a0067fdcd87ffe974f2442312372dfa66e1001d"}, - {file = "pydantic-1.10.7-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d2a5ebb48958754d386195fe9e9c5106f11275867051bf017a8059410e9abf1f"}, - {file = "pydantic-1.10.7-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:abfb7d4a7cd5cc4e1d1887c43503a7c5dd608eadf8bc615413fc498d3e4645cd"}, - {file = "pydantic-1.10.7-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:80b1fab4deb08a8292d15e43a6edccdffa5377a36a4597bb545b93e79c5ff0a5"}, - {file = "pydantic-1.10.7-cp39-cp39-win_amd64.whl", hash = "sha256:d71e69699498b020ea198468e2480a2f1e7433e32a3a99760058c6520e2bea7e"}, - {file = "pydantic-1.10.7-py3-none-any.whl", hash = 
"sha256:0cd181f1d0b1d00e2b705f1bf1ac7799a2d938cce3376b8007df62b29be3c2c6"}, - {file = "pydantic-1.10.7.tar.gz", hash = "sha256:cfc83c0678b6ba51b0532bea66860617c4cd4251ecf76e9846fa5a9f3454e97e"}, + {file = "pydantic-1.10.15-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:22ed12ee588b1df028a2aa5d66f07bf8f8b4c8579c2e96d5a9c1f96b77f3bb55"}, + {file = "pydantic-1.10.15-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:75279d3cac98186b6ebc2597b06bcbc7244744f6b0b44a23e4ef01e5683cc0d2"}, + {file = "pydantic-1.10.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50f1666a9940d3d68683c9d96e39640f709d7a72ff8702987dab1761036206bb"}, + {file = "pydantic-1.10.15-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82790d4753ee5d00739d6cb5cf56bceb186d9d6ce134aca3ba7befb1eedbc2c8"}, + {file = "pydantic-1.10.15-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:d207d5b87f6cbefbdb1198154292faee8017d7495a54ae58db06762004500d00"}, + {file = "pydantic-1.10.15-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e49db944fad339b2ccb80128ffd3f8af076f9f287197a480bf1e4ca053a866f0"}, + {file = "pydantic-1.10.15-cp310-cp310-win_amd64.whl", hash = "sha256:d3b5c4cbd0c9cb61bbbb19ce335e1f8ab87a811f6d589ed52b0254cf585d709c"}, + {file = "pydantic-1.10.15-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c3d5731a120752248844676bf92f25a12f6e45425e63ce22e0849297a093b5b0"}, + {file = "pydantic-1.10.15-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c365ad9c394f9eeffcb30a82f4246c0006417f03a7c0f8315d6211f25f7cb654"}, + {file = "pydantic-1.10.15-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3287e1614393119c67bd4404f46e33ae3be3ed4cd10360b48d0a4459f420c6a3"}, + {file = "pydantic-1.10.15-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:be51dd2c8596b25fe43c0a4a59c2bee4f18d88efb8031188f9e7ddc6b469cf44"}, + {file = "pydantic-1.10.15-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6a51a1dd4aa7b3f1317f65493a182d3cff708385327c1c82c81e4a9d6d65b2e4"}, + {file = "pydantic-1.10.15-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4e316e54b5775d1eb59187f9290aeb38acf620e10f7fd2f776d97bb788199e53"}, + {file = "pydantic-1.10.15-cp311-cp311-win_amd64.whl", hash = "sha256:0d142fa1b8f2f0ae11ddd5e3e317dcac060b951d605fda26ca9b234b92214986"}, + {file = "pydantic-1.10.15-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:7ea210336b891f5ea334f8fc9f8f862b87acd5d4a0cbc9e3e208e7aa1775dabf"}, + {file = "pydantic-1.10.15-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3453685ccd7140715e05f2193d64030101eaad26076fad4e246c1cc97e1bb30d"}, + {file = "pydantic-1.10.15-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9bea1f03b8d4e8e86702c918ccfd5d947ac268f0f0cc6ed71782e4b09353b26f"}, + {file = "pydantic-1.10.15-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:005655cabc29081de8243126e036f2065bd7ea5b9dff95fde6d2c642d39755de"}, + {file = "pydantic-1.10.15-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:af9850d98fc21e5bc24ea9e35dd80a29faf6462c608728a110c0a30b595e58b7"}, + {file = "pydantic-1.10.15-cp37-cp37m-win_amd64.whl", hash = "sha256:d31ee5b14a82c9afe2bd26aaa405293d4237d0591527d9129ce36e58f19f95c1"}, + {file = "pydantic-1.10.15-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5e09c19df304b8123938dc3c53d3d3be6ec74b9d7d0d80f4f4b5432ae16c2022"}, + {file = 
"pydantic-1.10.15-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7ac9237cd62947db00a0d16acf2f3e00d1ae9d3bd602b9c415f93e7a9fc10528"}, + {file = "pydantic-1.10.15-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:584f2d4c98ffec420e02305cf675857bae03c9d617fcfdc34946b1160213a948"}, + {file = "pydantic-1.10.15-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bbc6989fad0c030bd70a0b6f626f98a862224bc2b1e36bfc531ea2facc0a340c"}, + {file = "pydantic-1.10.15-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d573082c6ef99336f2cb5b667b781d2f776d4af311574fb53d908517ba523c22"}, + {file = "pydantic-1.10.15-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6bd7030c9abc80134087d8b6e7aa957e43d35714daa116aced57269a445b8f7b"}, + {file = "pydantic-1.10.15-cp38-cp38-win_amd64.whl", hash = "sha256:3350f527bb04138f8aff932dc828f154847fbdc7a1a44c240fbfff1b57f49a12"}, + {file = "pydantic-1.10.15-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:51d405b42f1b86703555797270e4970a9f9bd7953f3990142e69d1037f9d9e51"}, + {file = "pydantic-1.10.15-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a980a77c52723b0dc56640ced396b73a024d4b74f02bcb2d21dbbac1debbe9d0"}, + {file = "pydantic-1.10.15-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67f1a1fb467d3f49e1708a3f632b11c69fccb4e748a325d5a491ddc7b5d22383"}, + {file = "pydantic-1.10.15-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:676ed48f2c5bbad835f1a8ed8a6d44c1cd5a21121116d2ac40bd1cd3619746ed"}, + {file = "pydantic-1.10.15-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:92229f73400b80c13afcd050687f4d7e88de9234d74b27e6728aa689abcf58cc"}, + {file = "pydantic-1.10.15-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2746189100c646682eff0bce95efa7d2e203420d8e1c613dc0c6b4c1d9c1fde4"}, + {file = "pydantic-1.10.15-cp39-cp39-win_amd64.whl", hash = "sha256:394f08750bd8eaad714718812e7fab615f873b3cdd0b9d84e76e51ef3b50b6b7"}, + {file = "pydantic-1.10.15-py3-none-any.whl", hash = "sha256:28e552a060ba2740d0d2aabe35162652c1459a0b9069fe0db7f4ee0e18e74d58"}, + {file = "pydantic-1.10.15.tar.gz", hash = "sha256:ca832e124eda231a60a041da4f013e3ff24949d94a01154b137fc2f2a43c3ffb"}, ] [package.dependencies] @@ -400,7 +498,6 @@ email = ["email-validator (>=1.0.3)"] name = "pyflakes" version = "3.0.1" description = "passive checker of Python programs" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -412,7 +509,6 @@ files = [ name = "pyrsistent" version = "0.19.3" description = "Persistent/Functional/Immutable data structures" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -449,7 +545,6 @@ files = [ name = "pytest" version = "7.3.1" description = "pytest: simple powerful testing with Python" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -472,7 +567,6 @@ testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "no name = "python-dateutil" version = "2.8.2" description = "Extensions to the standard Python datetime module" -category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ @@ -483,11 +577,21 @@ files = [ [package.dependencies] six = ">=1.5" +[[package]] +name = "pytz" +version = "2024.1" +description = "World timezone definitions, modern and historical" +optional = true +python-versions = "*" +files = [ + {file = "pytz-2024.1-py2.py3-none-any.whl", hash = 
"sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"}, + {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, +] + [[package]] name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -495,11 +599,24 @@ files = [ {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] +[[package]] +name = "tabulate" +version = "0.8.9" +description = "Pretty-print tabular data" +optional = true +python-versions = "*" +files = [ + {file = "tabulate-0.8.9-py3-none-any.whl", hash = "sha256:d7c013fe7abbc5e491394e10fa845f8f32fe54f8dc60c6622c6cf482d25d47e4"}, + {file = "tabulate-0.8.9.tar.gz", hash = "sha256:eb1d13f25760052e8931f2ef80aaf6045a6cceb47514db8beab24cded16f13a7"}, +] + +[package.extras] +widechars = ["wcwidth"] + [[package]] name = "tomli" version = "2.0.1" description = "A lil' TOML parser" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -507,28 +624,48 @@ files = [ {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] +[[package]] +name = "types-python-dateutil" +version = "2.9.0.20240316" +description = "Typing stubs for python-dateutil" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-python-dateutil-2.9.0.20240316.tar.gz", hash = "sha256:5d2f2e240b86905e40944dd787db6da9263f0deabef1076ddaed797351ec0202"}, + {file = "types_python_dateutil-2.9.0.20240316-py3-none-any.whl", hash = "sha256:6b8cb66d960771ce5ff974e9dd45e38facb81718cc1e208b10b1baccbfdbee3b"}, +] + [[package]] name = "typing-extensions" -version = "4.5.0" -description = "Backported and Experimental Type Hints for Python 3.7+" -category = "main" +version = "4.11.0" +description = "Backported and Experimental Type Hints for Python 3.8+" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" +files = [ + {file = "typing_extensions-4.11.0-py3-none-any.whl", hash = "sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a"}, + {file = "typing_extensions-4.11.0.tar.gz", hash = "sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0"}, +] + +[[package]] +name = "tzdata" +version = "2024.1" +description = "Provider of IANA time zone data" +optional = true +python-versions = ">=2" files = [ - {file = "typing_extensions-4.5.0-py3-none-any.whl", hash = "sha256:fb33085c39dd998ac16d1431ebc293a8b3eedd00fd4a32de0ff79002c19511b4"}, - {file = "typing_extensions-4.5.0.tar.gz", hash = "sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb"}, + {file = "tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252"}, + {file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"}, ] [[package]] name = "urllib3" -version = "2.0.1" +version = "2.0.7" description = "HTTP library with thread-safe connection pooling, file post, and more." 
-category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "urllib3-2.0.1-py3-none-any.whl", hash = "sha256:d75e5ece05ff170e323303fd924edf29e705f5ae057c489f453a686b639bb68a"}, - {file = "urllib3-2.0.1.tar.gz", hash = "sha256:2ce66a68134be469f5df5d46d724237489b3cd85b2bba2223dbbee1746548826"}, + {file = "urllib3-2.0.7-py3-none-any.whl", hash = "sha256:fdb6d215c776278489906c2f8916e6e7d4f5a9b602ccbcfdf7f016fc8da0596e"}, + {file = "urllib3-2.0.7.tar.gz", hash = "sha256:c97dfde1f7bd43a71c8d2a58e369e9b2bf692d1334ea9f9cae55add7d0dd0f84"}, ] [package.extras] @@ -539,14 +676,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "whylabs-client" -version = "0.6.2" +version = "0.6.3" description = "WhyLabs API client" -category = "main" optional = false python-versions = ">=3.6" files = [ - {file = "whylabs-client-0.6.2.tar.gz", hash = "sha256:7d8181317cd75f643935b45e87377fd21e17dd9117674a8996cf9a469be65a90"}, - {file = "whylabs_client-0.6.2-py3-none-any.whl", hash = "sha256:3fe5edff4415ac15426a8aaed6adaf7e803818e997b825535e5aa5417757b7ae"}, + {file = "whylabs-client-0.6.3.tar.gz", hash = "sha256:4df4daa436f7899c60575c5a72641a2b3cbfe9d2f0cc0d6b4831746d13342088"}, + {file = "whylabs_client-0.6.3-py3-none-any.whl", hash = "sha256:050bcfd1493fbb303f38b02b750fb5321abeeed1e775f7dfd570998d3bf5719b"}, ] [package.dependencies] @@ -557,7 +693,6 @@ urllib3 = ">=1.25.3" name = "whylogs" version = "1.1.39" description = "Profile and monitor your ML data pipeline end-to-end" -category = "main" optional = false python-versions = ">=3.7.1,<4" files = [ @@ -589,7 +724,6 @@ whylabs = ["requests (>=2.27,<3.0)"] name = "whylogs-sketching" version = "3.4.1.dev3" description = "sketching library of whylogs" -category = "main" optional = false python-versions = "*" files = [ @@ -630,7 +764,6 @@ files = [ name = "zipp" version = "3.15.0" description = "Backport of pathlib-compatible object wrapper for zip files" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -642,7 +775,10 @@ files = [ docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] +[extras] +diagnoser = ["isodate", "numpy", "pandas", "python-dateutil", "tabulate"] + [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "33f20a2d8395aad5fc1d6edd38492ef18ef8cfb74b5bbe4d019395f967b899b7" +content-hash = "20fc341db7d79a6f3190c5e3da4008c508584233d74c491ccbd91528c325d681" diff --git a/pyproject.toml b/pyproject.toml index 57d45e2..aa889ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,9 @@ [tool.poetry] name = "whylabs-toolkit" -version = "0.0.18" -description = "Whylabs CLI and Helpers package." -authors = ["Murilo Mendonca ", "Anthony Naddeo "] +version = "0.1.0" +description = "Whylabs Toolkit package." 
+authors = ["Murilo Mendonca ", "Anthony Naddeo ", + "Christine Draper "] license = "Apache-2.0 license" readme = "README.md" packages = [{include = "whylabs_toolkit/**/*.py"}] @@ -10,10 +11,19 @@ include = ["whylabs_toolkit/monitor/schema/schema.json"] [tool.poetry.dependencies] python = "^3.8" -whylabs-client = "^0.6.0" -pydantic = "^1.10.4" +whylabs-client = "^0.6.3" +pydantic = "^1.10.15" whylogs = "^1.1.26" jsonschema = "^4.17.3" +typing-extensions = "^4.11.0" +urllib3 = "^2.0.2, <2.1" + +# diagnoser extra dependencies +pandas = { version="^2.0.3", optional=true } +numpy = { version="^1.24.1", optional=true } +tabulate = { version="^0.8.9", optional=true } +isodate = { version="^0.6.1", optional=true } +python-dateutil = { version="^2.8.2", optional=true } [tool.poetry.group.dev.dependencies] autoflake = "^2.0.1" @@ -21,6 +31,7 @@ pytest = "^7.2.0" black = "^22.10.0" mypy = "~1.0.1" bumpversion = "^0.6.0" +types-python-dateutil = "^2.9.0.20240316" [tool.black] line-length = 140 @@ -31,3 +42,6 @@ build-backend = "poetry.core.masonry.api" [tool.flake8] max-line-length = 140 + +[tool.poetry.extras] +diagnoser = ["pandas", "numpy", "tabulate", "isodate", "python-dateutil"] \ No newline at end of file diff --git a/tests/helpers/test_model.py b/tests/helpers/test_model.py index ef9cd3f..7e3be36 100644 --- a/tests/helpers/test_model.py +++ b/tests/helpers/test_model.py @@ -55,4 +55,4 @@ def test_create_custom_metric(models_api: ModelsApi) -> None: assert entity["metrics"]["temperature.median"].to_dict() == {'column': 'temperature', 'default_metric': 'median','label': 'temperature.median'} - models_api.delete_entity_schema_metric(org_id=org_id, dataset_id="model-7", metric_label="temperature.median") \ No newline at end of file + models_api.delete_entity_schema_metric(org_id=org_id, dataset_id="model-7", metric_name="temperature.median") \ No newline at end of file diff --git a/tests/monitor/diagnoser/__init__.py b/tests/monitor/diagnoser/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/monitor/diagnoser/converters/__init__.py b/tests/monitor/diagnoser/converters/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/monitor/diagnoser/converters/test_granularity.py b/tests/monitor/diagnoser/converters/test_granularity.py new file mode 100644 index 0000000..5e18a5c --- /dev/null +++ b/tests/monitor/diagnoser/converters/test_granularity.py @@ -0,0 +1,33 @@ +from whylabs_toolkit.monitor.diagnoser.converters.granularity import calculate_num_batches + + +def test_calculate_num_batches_hourly(): + assert calculate_num_batches('2022-01-01T00:00:00Z/2022-01-01T03:30:00Z', 'hourly') == 3 + assert calculate_num_batches('2022-01-01T00:00:00Z/2022-01-03T01:00:00Z', 'hourly') == 49 + assert calculate_num_batches('2022-01-01T00:00:00Z/2022-01-02T00:00:00Z', 'hourly') == 24 + + +def test_calculate_num_batches_daily(): + assert calculate_num_batches('2022-01-01T00:00:00Z/2022-01-02T00:00:00Z', 'daily') == 1 + assert calculate_num_batches('2022-01-01T00:00:00Z/2022-01-09T00:00:00Z', 'daily') == 8 + + +def test_calculate_num_batches_weekly(): + assert calculate_num_batches('2022-01-01T00:00:00Z/2022-01-02T00:00:00Z', 'weekly') == 0 + assert calculate_num_batches('2022-01-01T00:00:00Z/2022-01-09T00:00:00Z', 'weekly') == 1 + + +def test_calculate_num_batches_monthly(): + assert calculate_num_batches('2022-01-01T00:00:00Z/2022-02-02T00:00:00Z', 'monthly') == 1 + + +def test_calculate_num_batches_duration(): + assert calculate_num_batches('2022-01-01T00:00:00Z/P3D', 
'daily') == 3
+    assert calculate_num_batches('2022-01-01T00:00:00Z/P1W', 'daily') == 7
+    assert calculate_num_batches('2022-01-01T00:00:00Z/P1D', 'hourly') == 24
+
+
+def test_calculate_num_batches_format():
+    assert calculate_num_batches('2022-01-01T00:00/2022-01-02T00:00', 'daily') == 1
+    assert calculate_num_batches('2022-01-01/2022-01-02', 'daily') == 1
+    assert calculate_num_batches('2022-01-01/P1D', 'daily') == 1
diff --git a/tests/monitor/diagnoser/recommendation/__init__.py b/tests/monitor/diagnoser/recommendation/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/monitor/diagnoser/recommendation/test_changes.py b/tests/monitor/diagnoser/recommendation/test_changes.py
new file mode 100644
index 0000000..6f77d6f
--- /dev/null
+++ b/tests/monitor/diagnoser/recommendation/test_changes.py
@@ -0,0 +1,21 @@
+from whylabs_toolkit.monitor.diagnoser.models import ConditionRecord
+from whylabs_toolkit.monitor.diagnoser.recommendation.recommended_change import RecommendedChange
+
+
+def test_from_condition():
+    info = {'k1': 3}
+    condition = ConditionRecord(name="fixed_threshold_mismatch", summary='a mismatch', columns=['col1', 'col3', 'col4'], info=info)
+    change = RecommendedChange.from_condition(condition)
+    assert change.columns == condition.columns
+    assert change.info == condition.info
+
+
+def test_merge_changes():
+    change1 = RecommendedChange(columns=['c1', 'c2'], info={'f1': 1, 'f2': 2})
+    change2 = RecommendedChange(columns=['c1', 'c3'], info={'f1': 0, 'f3': 3})
+    merged = change1.merge(change2)
+    assert change1.columns == ['c1', 'c2']
+    assert change2.columns == ['c1', 'c3']
+    assert set(merged.columns) == {'c1', 'c2', 'c3'}
+    assert merged.info == {'f1': 0, 'f2': 2, 'f3': 3}
+
diff --git a/tests/monitor/diagnoser/recommendation/test_remove_columns.py b/tests/monitor/diagnoser/recommendation/test_remove_columns.py
new file mode 100644
index 0000000..20788c7
--- /dev/null
+++ b/tests/monitor/diagnoser/recommendation/test_remove_columns.py
@@ -0,0 +1,38 @@
+from typing import Optional
+
+from whylabs_toolkit.monitor.models import Analyzer
+
+from whylabs_toolkit.monitor.diagnoser.models import ConditionRecord
+from whylabs_toolkit.monitor.diagnoser.recommendation.remove_columns import RemoveColumns
+
+
+def gen_analyzer(metric='mean', config: Optional[dict] = None,
+                 target_matrix: Optional[dict] = None, baseline: Optional[dict] = None):
+    target_matrix = {'type': 'column', 'include': ['col1']} if target_matrix is None else target_matrix
+    config = {'type': 'fixed', 'metric': metric, 'upper': 1.0} if config is None else config
+    if config['type'] != 'fixed':
+        config['baseline'] = {'type': 'TrailingWindow', 'size': 7} if baseline is None else baseline
+    return Analyzer.parse_obj(
+        {
+            'id': 'test_analyzer',
+            'config': config,
+            'targetMatrix': target_matrix,
+        })
+
+
+def test_remove_columns():
+    analyzer = gen_analyzer(target_matrix={'type': 'column', 'include': ['col1', 'col2'], 'exclude': ['col3']})
+    condition = ConditionRecord(name='fixed_threshold', summary='', columns=['col1', 'col3', 'col4'])
+    change = RemoveColumns.from_condition(condition)
+    result = change.generate_config(analyzer)
+    assert len(result) == 1
+    updated = result[0]
+    assert updated.targetMatrix.include == ['col2']
+    assert sorted(updated.targetMatrix.exclude) == ['col3', 'col4']
+
+
+def test_remove_columns2():
+    analyzer = gen_analyzer(target_matrix={'type': 'column', 'include': ['col1', 'col2'], 'exclude': ['col3']})
+    action = RemoveColumns(['col1', 'col2'])
+    result = 
action.generate_config(analyzer) + assert len(result) == 0 diff --git a/tests/monitor/diagnoser/test_helpers.py b/tests/monitor/diagnoser/test_helpers.py new file mode 100644 index 0000000..fe67cb8 --- /dev/null +++ b/tests/monitor/diagnoser/test_helpers.py @@ -0,0 +1,12 @@ +import pandas as pd +from pandas.testing import assert_series_equal + +from whylabs_toolkit.monitor.diagnoser.helpers.describe import filter_by_index + + +def test_filter_by_index(): + to_sort = pd.Series([0, 1, 1], index=['c3', 'c4', 'c1']) + ref = pd.Series([10, 9, 8], index=['c1', 'c2', 'c3']) + expected = pd.Series([10, 8, 0], index=['c1', 'c3', 'c4']) + assert_series_equal(filter_by_index(to_sort.index, ref), expected) + assert_series_equal(filter_by_index(['c3', 'c4', 'c1'], ref), expected) diff --git a/whylabs_toolkit/helpers/monitor_helpers.py b/whylabs_toolkit/helpers/monitor_helpers.py index 2778eeb..fb32ee7 100644 --- a/whylabs_toolkit/helpers/monitor_helpers.py +++ b/whylabs_toolkit/helpers/monitor_helpers.py @@ -78,6 +78,19 @@ def get_analyzers( return None +def time_period_to_granularity(time_period: str) -> Granularity: + if time_period == "PT1H": + return Granularity.hourly + + if time_period == "P1W": + return Granularity.weekly + + if time_period == "P1M": + return Granularity.monthly + + return Granularity.daily + + def get_model_granularity( org_id: Optional[str] = None, dataset_id: Optional[str] = None, config: Config = Config() ) -> Optional[Granularity]: @@ -87,16 +100,8 @@ def get_model_granularity( api = get_models_api(config=config) model_meta = api.get_model(org_id=org_id, model_id=dataset_id) - time_period_to_gran = { - "H": Granularity.hourly, - "D": Granularity.daily, - "W": Granularity.weekly, - "M": Granularity.monthly, - } if model_meta: - for key, value in time_period_to_gran.items(): - if key in model_meta["time_period"]: - return value + return time_period_to_granularity(model_meta["time_period"]) return None diff --git a/whylabs_toolkit/helpers/utils.py b/whylabs_toolkit/helpers/utils.py index 3d09266..b981501 100644 --- a/whylabs_toolkit/helpers/utils.py +++ b/whylabs_toolkit/helpers/utils.py @@ -1,4 +1,5 @@ from whylabs_client.api.dataset_profile_api import DatasetProfileApi +from whylabs_client.api.monitor_diagnostics_api import MonitorDiagnosticsApi from whylabs_client.api.models_api import ModelsApi from whylabs_client.api.notification_settings_api import NotificationSettingsApi from whylabs_client.api.monitor_api import MonitorApi @@ -21,3 +22,7 @@ def get_notification_api(config: Config = Config()) -> NotificationSettingsApi: def get_monitor_api(config: Config = Config()) -> MonitorApi: return MonitorApi(api_client=create_client(config=config)) + + +def get_monitor_diagnostics_api(config: Config = Config()) -> MonitorDiagnosticsApi: + return MonitorDiagnosticsApi(api_client=create_client(config=config)) diff --git a/whylabs_toolkit/monitor/diagnoser/README.md b/whylabs_toolkit/monitor/diagnoser/README.md new file mode 100644 index 0000000..26de78c --- /dev/null +++ b/whylabs_toolkit/monitor/diagnoser/README.md @@ -0,0 +1,26 @@ +# Noisy monitor diagnosis + +This package helps users diagnose and fix noisy monitors in WhyLabs. This workflow has the following steps: +* Identify the noisiest monitors for a selected organization and dataset, and choose one to diagnose. +* Identify the noisiest segment of the monitor to be the diagnostic segment. +* Within that segment, identify the noisiest columns. 
+* Identify the conditions contributing to the noise in the diagnostic segment and noisiest columns.
+* Determine the appropriate action to take to fix the conditions contributing to the noise.
+* Apply the actions to the monitor.
+
+Most of the above steps are automated by the monitor diagnoser for common noise conditions, although in some cases the
+diagnoser may not match the dataset to any known conditions; we are happy to work with you to improve the diagnoser in
+such cases. Users will usually still need to judge which action is most appropriate to fix the monitor. A recommender
+is provided to suggest reasonable actions and to automate some of the basic ones.
+
+## Usage
+To start using the diagnoser, install whylabs_toolkit including the diagnoser extra from PyPI with:
+```bash
+pip install 'whylabs_toolkit[diagnoser]'
+```
+
+See [diagnoser.ipynb](/examples/example_notebooks/diagnoser.ipynb) for an end-to-end example of identifying noisy
+monitors, diagnosing the conditions contributing to noise, and getting recommendations for fixing them.
+
+See [customized_diagnoser.ipynb](/examples/example_notebooks/customized_diagnoser.ipynb) for an example of how to
+customize the diagnosis for your specific needs.
\ No newline at end of file
diff --git a/whylabs_toolkit/monitor/diagnoser/__init__.py b/whylabs_toolkit/monitor/diagnoser/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/whylabs_toolkit/monitor/diagnoser/constants.py b/whylabs_toolkit/monitor/diagnoser/constants.py
new file mode 100644
index 0000000..70cdc15
--- /dev/null
+++ b/whylabs_toolkit/monitor/diagnoser/constants.py
@@ -0,0 +1,4 @@
+MAX_COLUMNS = 100
+DEFAULT_BATCHES = 30
+MAX_PROFILES = 10000
+assert DEFAULT_BATCHES * MAX_COLUMNS <= MAX_PROFILES
diff --git a/whylabs_toolkit/monitor/diagnoser/converters/__init__.py b/whylabs_toolkit/monitor/diagnoser/converters/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/whylabs_toolkit/monitor/diagnoser/converters/granularity.py b/whylabs_toolkit/monitor/diagnoser/converters/granularity.py
new file mode 100644
index 0000000..5be8d25
--- /dev/null
+++ b/whylabs_toolkit/monitor/diagnoser/converters/granularity.py
@@ -0,0 +1,41 @@
+from dateutil.relativedelta import relativedelta
+from whylabs_toolkit.utils.granularity import Granularity
+from isodate import parse_datetime, parse_duration, parse_date
+
+
+def batches_to_timedelta(time_period: str, batches: int) -> relativedelta:
+    if time_period == "PT1H":
+        return relativedelta(hours=batches)
+
+    if time_period == "P1W":
+        return relativedelta(weeks=batches)
+
+    if time_period == "P1M":
+        return relativedelta(months=batches)
+
+    return relativedelta(days=batches)
+
+
+def calculate_num_batches(interval: str, granularity: str) -> int:
+    # Parse the ISO8601 interval string into a start datetime plus an end datetime or duration
+    start, end = interval.split("/")
+    start_date = parse_datetime(start) if "T" in start else parse_date(start)
+    try:
+        end_date = parse_datetime(end) if "T" in end else parse_date(end)
+    except ValueError:
+        end_date = start_date + parse_duration(end)
+
+    # Count the number of whole batches in the interval, based on the granularity
+    delta = end_date - start_date
+    if granularity == "hourly":
+        difference = int(delta.total_seconds() // 3600)
+    elif granularity == "daily":
+        difference = delta.days
+    elif granularity == "weekly":
+        difference = delta.days // 7
+    elif granularity == "monthly":
+        difference = 
relativedelta(end_date, start_date).months + else: + raise ValueError(f"Unsupported granularity: {granularity}") + + return difference diff --git a/whylabs_toolkit/monitor/diagnoser/helpers/__init__.py b/whylabs_toolkit/monitor/diagnoser/helpers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/whylabs_toolkit/monitor/diagnoser/helpers/describe.py b/whylabs_toolkit/monitor/diagnoser/helpers/describe.py new file mode 100644 index 0000000..7d52d2d --- /dev/null +++ b/whylabs_toolkit/monitor/diagnoser/helpers/describe.py @@ -0,0 +1,31 @@ +from typing import List, Union + +import pandas as pd + + +def describe_truncated_list(vals: List[str], num: int = 10) -> str: + if len(vals) <= num: + return str(vals) + return f"{vals[0:num]} and {len(vals) - num} more" + + +def describe_truncated_table(df: Union[pd.DataFrame, pd.Series], num: int = 10) -> str: + if len(df) <= num: + table = df.to_markdown() + return str(table) if table is not None else "No data to display." + return f"{df[0:num].to_markdown()}\n and {len(df) - num} more" + + +def filter_by_index(items: Union[pd.Index, list], ref: pd.Series) -> pd.Series: + """ + Filters the reference by items in its index. Appends 0 values for any + items not in the ref index. + + Example use... ref is anomalies by column, items are columns in a condition. + """ + index = items if isinstance(items, pd.Index) else pd.Index(items) + diff = index.difference(ref.index) + if len(diff) == 0: + return ref.loc[index].sort_index() + expanded_ref = pd.concat([ref, pd.Series([0] * len(diff), index=diff)]) + return expanded_ref.loc[index].sort_index() diff --git a/whylabs_toolkit/monitor/diagnoser/helpers/utils.py b/whylabs_toolkit/monitor/diagnoser/helpers/utils.py new file mode 100644 index 0000000..7591b8e --- /dev/null +++ b/whylabs_toolkit/monitor/diagnoser/helpers/utils.py @@ -0,0 +1,68 @@ +import os +from typing import List, Optional + +from whylabs_client.api.monitor_diagnostics_api import MonitorDiagnosticsApi + +from whylabs_toolkit.helpers.client import create_client +from whylabs_toolkit.helpers.config import Config +from whylabs_toolkit.monitor.models import SegmentTag + + +def get_monitor_diagnostics_api(config: Config = Config()) -> MonitorDiagnosticsApi: + """ + Get the monitor diagnostics API, which is used to interact with the WhyLabs Monitor Diagnostics service + to diagnose noisy monitors. + :param config: + :return: + """ + return MonitorDiagnosticsApi(api_client=create_client(config=config)) + + +def env_setup( + org_id: str, dataset_id: str, api_key: Optional[str] = None, whylabs_endpoint: Optional[str] = None +) -> None: + """ + Set environment variables to work with both whylabs-toolkit and whylogs. Will pick up the API + key from the environment if not provided as a parameter. 
+    :param org_id:
+    :param dataset_id:
+    :param api_key:
+    :param whylabs_endpoint:
+    :return:
+    """
+    os.environ["WHYLABS_API_KEY"] = api_key if api_key else os.environ.get("WHYLABS_API_KEY", "")
+    if not os.environ["WHYLABS_API_KEY"]:
+        raise Exception("Please provide an API key")
+    os.environ["WHYLABS_DEFAULT_ORG_ID"] = org_id
+    os.environ["ORG_ID"] = org_id
+    os.environ["WHYLABS_DEFAULT_DATASET_ID"] = dataset_id
+    if whylabs_endpoint:
+        os.environ["WHYLABS_API_ENDPOINT"] = whylabs_endpoint
+        os.environ["WHYLABS_HOST"] = whylabs_endpoint
+
+
+def segment_to_text(segment: List[SegmentTag]) -> str:
+    if segment is None or len(segment) == 0:
+        return ""
+    text = ""
+    for tag in segment:
+        if len(text) > 0:
+            text += "&"
+        text += f"{tag.key}={tag.value}"
+    return text
+
+
+def segment_as_readable_text(segment: List[SegmentTag]) -> str:
+    text = segment_to_text(segment)
+    return "overall" if text == "" else text
+
+
+def text_to_segment(text: str) -> List[SegmentTag]:
+    if text == "":
+        return []
+    tags = []
+    parts = text.split("&")
+    for part in parts:
+        [key, value] = part.split("=", 1)
+        tags.append(SegmentTag(key=key, value=value))
+    return tags
diff --git a/whylabs_toolkit/monitor/diagnoser/models/__init__.py b/whylabs_toolkit/monitor/diagnoser/models/__init__.py
new file mode 100644
index 0000000..f72a901
--- /dev/null
+++ b/whylabs_toolkit/monitor/diagnoser/models/__init__.py
@@ -0,0 +1,2 @@
+from .noisy_monitors import *
+from .diagnosis_report import *
diff --git a/whylabs_toolkit/monitor/diagnoser/models/diagnosis_report.py b/whylabs_toolkit/monitor/diagnoser/models/diagnosis_report.py
new file mode 100644
index 0000000..6226842
--- /dev/null
+++ b/whylabs_toolkit/monitor/diagnoser/models/diagnosis_report.py
@@ -0,0 +1,269 @@
+import pandas as pd
+from typing import Dict, List, Optional, Tuple
+from pydantic import BaseModel
+from whylabs_toolkit.monitor.models import (
+    Analyzer,
+    Monitor,
+    Segment,
+    TargetLevel,
+    FixedThresholdsConfig,
+    ConjunctionConfig,
+    DisjunctionConfig,
+    GlobalAction,
+)
+
+from whylabs_toolkit.monitor.diagnoser.helpers.describe import (
+    describe_truncated_table,
+    filter_by_index,
+    describe_truncated_list,
+)
+from whylabs_toolkit.monitor.diagnoser.helpers.utils import segment_as_readable_text
+
+
+class SegmentReport(BaseModel):
+    batchCount: int
+    segment: Segment
+    totalAnomalies: int
+    totalFailures: int
+    totalColumns: int
+
+
+class NamedCount(BaseModel):
+    name: str
+    count: int
+
+    def to_tuple(self) -> Tuple[str, int]:
+        return self.name, self.count
+
+
+class ConditionRecord(BaseModel):
+    columns: Optional[List[str]]  # not present for some conditions like stale analysis
+    info: Optional[Dict]
+    summary: str
+    name: str
+
+
+class QualityIssueRecord(BaseModel):
+    name: str
+    description: str
+    detectors: List[str]
+
+
+class ProfileSummary(BaseModel):
+    minRowName: str
+    minRowCount: int
+    maxRowName: str
+    maxRowCount: int
+
+    def describe(self) -> str:
+        count_desc = (
+            str(self.minRowCount)
+            if self.minRowCount == self.maxRowCount
+            else f"{self.minRowCount} - {self.maxRowCount}"
+        )
+        return f"Diagnostic interval rollup contains {count_desc} rows for the diagnosed columns.\n"
+
+
+class BatchesSummary(BaseModel):
+    minBatchName: str
+    minBatchCount: int
+    maxBatchName: str
+    maxBatchCount: int
+
+    def describe(self) -> str:
+        count_desc = (
+            str(self.minBatchCount)
+            if self.minBatchCount == self.maxBatchCount
+            else f"{self.minBatchCount} - {self.maxBatchCount}"
+        )
+        return f"Diagnostic interval contains {count_desc} batches.\n"
+
+
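+# Each record below summarizes one facet of the analysis results and renders a
+# human-readable fragment via describe(); AnalyzerDiagnosisReport stitches the
+# fragments together into the full report text.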
+class ResultRecord(BaseModel): + diagnosedColumnCount: int + batchCount: int + + def describe(self) -> str: + return f"Found non-failed results for {self.diagnosedColumnCount} columns and {self.batchCount} batches." + + +class FailureRecord(BaseModel): + totalFailuresCount: int + maxFailuresCount: int + meanFailuresCount: int + byColumnCount: List[NamedCount] + byTypeCount: List[NamedCount] + + def describe(self) -> str: + failures = pd.DataFrame([c.to_tuple() for c in self.byColumnCount]) + failure_types = [t.name for t in self.byTypeCount] + if len(failures) == 0: + return "No failures were detected." + return ( + f"Found {self.totalFailuresCount} failed results, with up to {self.maxFailuresCount} " + f"failures per column and {self.meanFailuresCount} failures on average.\n" + f"Failure types are {describe_truncated_list(failure_types)}\n" + f"Columns with failures are: \n{describe_truncated_table(failures)}\n" + ) + + +class AnomalyRecord(BaseModel): + totalAnomalyCount: int + maxAnomalyCount: int + meanAnomalyCount: int + batchCount: int + byColumnCount: List[NamedCount] + byColumnBatchCount: List[NamedCount] + + def describe(self) -> str: + counts = pd.Series([c.to_tuple() for c in self.byColumnCount]) + max_count = int(self.maxAnomalyCount) + max_pct = max_count * 100 / self.batchCount + mean_count = float(self.meanAnomalyCount) + mean_pct = mean_count * 100 / self.batchCount + return ( + f"Found {self.totalAnomalyCount} anomalies in {len(self.byColumnCount)} columns, with up to " + f"{max_pct:.1f}% ({max_count}) batches having anomalies per column and " + f"{mean_pct:.1f}% ({mean_count:.1f}) on average.\n" + f"Columns with anomalies are:\n{describe_truncated_table(counts)}\n" + ) + + +class AnalysisResultsSummary(BaseModel): + results: ResultRecord + failures: FailureRecord + anomalies: AnomalyRecord + + def describe(self) -> str: + return ( + f"Analysis results summary:\n" + f"{self.results.describe()}\n" + f"{self.anomalies.describe()}\n" + f"{self.failures.describe()}\n" + ) + + +class DiagnosticDataSummary(BaseModel): + diagnosticSegment: Segment + diagnosticProfile: Optional[ProfileSummary] + diagnosticBatches: Optional[BatchesSummary] + analysisResults: Optional[AnalysisResultsSummary] + targetedColumnCount: int + + def describe(self) -> str: + return "\n".join( + [ + f'Diagnostic segment is "{segment_as_readable_text(self.diagnosticSegment.tags)}".', + self.diagnosticBatches.describe() if self.diagnosticBatches is not None else "", + self.diagnosticProfile.describe() if self.diagnosticProfile is not None else "", + self.analysisResults.describe() if self.analysisResults is not None else "", + ] + ) + + +class AnalyzerDiagnosisReport(BaseModel): + orgId: str + datasetId: str + analyzerId: str + interval: str + expectedBatchCount: int + diagnosticData: DiagnosticDataSummary + qualityIssues: List[QualityIssueRecord] + conditions: List[ConditionRecord] + + def describe(self) -> str: + text = "\n".join([self.diagnosticData.describe(), self.describe_quality_issues(), self.describe_conditions()]) + return text + + def describe_quality_issues(self) -> str: + if len(self.qualityIssues) == 0: + return "No issues impacting diagnosis quality were detected" + text = "Conditions that may impact diagnosis quality include:\n" + for issue in self.qualityIssues: + text += f"\t* {issue.name}: {issue.description} - detectors {issue.detectors}\n" + return text + + def describe_conditions(self) -> str: + if len(self.conditions) == 0: + return "No conditions related to noise were detected." 
+ text = "Conditions that may contribute to noise include:\n" + condition_cols: List[str] = [] + for condition in self.conditions: + text += f"\t* Condition {condition.name} ({condition.summary})" + if condition.columns is not None: + condition_cols += condition.columns + col_text = describe_truncated_list(condition_cols, 10) + text += f" for {len(condition_cols)} columns: {col_text}" + text += "\n" + + cols = pd.Series(condition_cols).unique() + if len(cols) > 0: + text += f"\nAnomalies for columns with these conditions:\n" + by_col_count = ( + self.diagnosticData.analysisResults.anomalies.byColumnCount + if (self.diagnosticData.analysisResults is not None) + else [] + ) + count_tuples = [c.to_tuple() for c in by_col_count] + idx, values = zip(*count_tuples) + count_by_col = pd.Series(values, idx) + cols_with_count = filter_by_index(cols.tolist(), count_by_col).sort_values(ascending=False) + cols_with_count.rename("anomalies") + text += describe_truncated_table(cols_with_count) + text += f"\nAccounting for {cols_with_count.sum()} anomalies out of " f"{count_by_col.sum()}\n" + + return text + + +class MonitorDiagnosisReport(AnalyzerDiagnosisReport): + monitor: Optional[Monitor] # sometimes there isn't one, e.g. it's been deleted + analyzer: Optional[Analyzer] + analyzedColumnCount: int + + def describe(self) -> str: + text = "\n".join([self.describe_monitor(), self.describe_analyzer(), super().describe()]) + return text + + def describe_monitor(self) -> str: + if self.monitor is None: + return "Monitor has been deleted.\n" + text = ( + f'Diagnosis is for monitor "{self.monitor.displayName if self.monitor.displayName else self.monitor.id}" ' + f"[{self.monitor.id}] in {self.datasetId} {self.orgId}, over interval {self.interval}.\n" + ) + if len(self.monitor.actions) > 0: + text += f"Monitor has {len(self.monitor.actions)} notification actions " + text += f"{[a.target for a in self.monitor.actions if isinstance(a, GlobalAction)]}.\n" + return text + + def describe_analyzer(self) -> str: + if self.analyzer is None: + return "No analyzer found.\n" + if isinstance(self.analyzer.config, ConjunctionConfig) or isinstance(self.analyzer.config, DisjunctionConfig): + return f"\nAnalyzer is a composite {self.analyzer.config.type}." + baseline = ( + "no baseline" + if (isinstance(self.analyzer.config, FixedThresholdsConfig) or self.analyzer.config.baseline is None) + else f"{self.analyzer.config.baseline.type} baseline" + ) + targeting_desc = "" + if self.analyzer is None: + return "" + metric = self.analyzer.config.metric + if self.analyzer.targetMatrix is not None and self.analyzer.targetMatrix.type == TargetLevel.column: + targeting_desc = ( + f'\nAnalyzer "{self.analyzer.id}" targets {self.diagnosticData.targetedColumnCount} ' + f"columns and ran on {self.analyzedColumnCount} columns in the diagnosed segment.\n" + ) + text = f"Analyzer is {self.analyzer.config.type} configuration for {metric} metric with {baseline}." 
+
+
+class MonitorDiagnosisReportList(BaseModel):
+    __root__: List[MonitorDiagnosisReport]
diff --git a/whylabs_toolkit/monitor/diagnoser/models/noisy_monitors.py b/whylabs_toolkit/monitor/diagnoser/models/noisy_monitors.py
new file mode 100644
index 0000000..ee6c5b7
--- /dev/null
+++ b/whylabs_toolkit/monitor/diagnoser/models/noisy_monitors.py
@@ -0,0 +1,46 @@
+from typing import Optional, List
+
+from pydantic import BaseModel
+from whylabs_toolkit.monitor.models import Segment
+
+
+class NoisyMonitorStats(BaseModel):
+    monitor_id: Optional[str]
+    analyzer_id: str
+    metric: str
+    column_count: int
+    segment_count: int
+    anomaly_count: int
+    max_anomaly_per_column: int
+    min_anomaly_per_column: int
+    avg_anomaly_per_column: int
+    action_count: int
+    action_targets: List[str]
+
+
+class FailedMonitorStats(BaseModel):
+    monitor_id: Optional[str]
+    analyzer_id: str
+    metric: str
+    failed_count: int
+    max_failed_per_column: int
+    min_failed_per_column: int
+    avg_failed_per_column: int
+    action_count: int
+    action_targets: List[str]
+
+
+class NoisySegmentStats(BaseModel):
+    segment: Segment
+    total_anomalies: int
+    batch_count: int
+
+
+class FailedSegmentStats(BaseModel):
+    segment: Segment
+    total_failed: int
+
+
+class NoisyColumnStats(BaseModel):
+    column: str
+    total_anomalies: int
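+
+# Illustrative parsing example (the payload is hypothetical): these stats
+# models are populated from diagnostics API response dicts, e.g.
+#
+#     NoisyColumnStats.parse_obj({"column": "age", "total_anomalies": 12})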
diff --git a/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py b/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py
new file mode 100644
index 0000000..193132f
--- /dev/null
+++ b/whylabs_toolkit/monitor/diagnoser/monitor_diagnoser.py
@@ -0,0 +1,387 @@
+import os
+
+import pandas as pd
+from typing import Tuple, List, Optional, Dict
+
+from pydantic import ValidationError
+from whylabs_client.model.analyzer_segment_columns_diagnostic_request import AnalyzerSegmentColumnsDiagnosticRequest
+from whylabs_client.model.analyzer_segment_columns_diagnostic_response import AnalyzerSegmentColumnsDiagnosticResponse
+from whylabs_client.model.analyzer_segments_diagnostic_request import AnalyzerSegmentsDiagnosticRequest
+from whylabs_client.model.analyzer_segments_diagnostic_response import AnalyzerSegmentsDiagnosticResponse
+from whylabs_client.model.analyzers_diagnostic_response import AnalyzersDiagnosticResponse
+from whylabs_client.model.diagnosis_request import DiagnosisRequest
+from whylabs_client.model.diagnostic_interval_request import DiagnosticIntervalRequest
+from whylabs_client.model.diagnostic_interval_response import DiagnosticIntervalResponse
+from whylabs_client.model.analyzers_diagnostic_request import AnalyzersDiagnosticRequest
+from whylabs_client.model.segment import Segment as WhyLabsSegment
+from whylabs_client.model.segment_tag import SegmentTag as WhyLabsSegmentTag
+from whylabs_toolkit.helpers.utils import get_monitor_api, get_models_api
+from whylabs_toolkit.monitor.models import TimeRange, Monitor, Segment, Analyzer, EntitySchema
+from whylabs_toolkit.utils.granularity import Granularity
+
+from whylabs_toolkit.monitor.diagnoser.helpers.utils import get_monitor_diagnostics_api, segment_as_readable_text
+from whylabs_toolkit.helpers.monitor_helpers import time_period_to_granularity
+from whylabs_toolkit.monitor.diagnoser.constants import DEFAULT_BATCHES
+from whylabs_toolkit.monitor.diagnoser.models import (
+    NoisyMonitorStats,
+    FailedMonitorStats,
+    FailedSegmentStats,
+    NoisySegmentStats,
+    NoisyColumnStats,
+    MonitorDiagnosisReport,
+)
+from whylabs_toolkit.monitor.diagnoser.targeting import targeted_columns
+
+
+def to_mapped_dict(obj: object) -> object:
+    """
+    Convert a WhyLabs Client class instance into a JSON dictionary with keys mapped to the API schema. For example,
+    the pythonized 'org_id' attribute becomes 'orgId'.
+    :param obj: a whylabs_client model instance, a list, or a plain value
+    :return: the equivalent dict, list, or value with API-schema keys
+    """
+    if hasattr(obj, "to_dict") and hasattr(obj, "attribute_map"):
+        return {obj.attribute_map[k]: to_mapped_dict(getattr(obj, k)) for k, _ in obj.to_dict().items()}
+    if isinstance(obj, list):
+        return [to_mapped_dict(i) for i in obj]
+    return obj
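+
+# Worked example (illustrative): a whylabs_client response object with
+# attribute_map {"org_id": "orgId"} and to_dict() {"org_id": "org-123"}
+# becomes {"orgId": "org-123"}, matching the casing the pydantic report
+# models expect.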
+
+
+class MonitorDiagnoser:
+    def __init__(self, org_id: str, dataset_id: str):
+        self.org_id: str = org_id
+        self.dataset_id: str = dataset_id
+        self.desired_batches: int = DEFAULT_BATCHES
+        self.granularity: Optional[Granularity] = None
+        self._diagnostics_api = get_monitor_diagnostics_api()
+        self._monitor_api = get_monitor_api()
+        self._models_api = get_models_api()
+        self._monitor_configs: Optional[List[Monitor]] = None
+        self._noisy_monitors: Optional[List[NoisyMonitorStats]] = None
+        self._failed_monitors: Optional[List[FailedMonitorStats]] = None
+        self._noisy_segments: Optional[List[NoisySegmentStats]] = None
+        self._failed_segments: Optional[List[FailedSegmentStats]] = None
+        self._noisy_columns: Optional[List[NoisyColumnStats]] = None
+        self._diagnostic_interval: Optional[str] = None
+        self._monitor_id: Optional[str] = None
+        self._diagnostic_segment: Optional[Segment] = None
+        self._analyzer: Optional[Analyzer] = None
+        self._diagnosed_columns: Optional[List[str]] = None
+        self._diagnosis: Optional[MonitorDiagnosisReport] = None
+        self.schema: Optional[EntitySchema] = None
+
+    @property
+    def noisy_monitors(self) -> List[NoisyMonitorStats]:
+        if self._noisy_monitors is None:
+            raise Exception('Run "detect_noisy_monitors" first to get the noisy monitors.')
+        return self._noisy_monitors
+
+    @property
+    def noisy_monitors_with_actions(self) -> List[NoisyMonitorStats]:
+        return [m for m in self.noisy_monitors if m.action_count > 0]
+
+    @property
+    def noisy_monitors_without_actions(self) -> List[NoisyMonitorStats]:
+        return [m for m in self.noisy_monitors if m.action_count == 0]
+
+    @property
+    def failed_monitors(self) -> List[FailedMonitorStats]:
+        if self._failed_monitors is None:
+            raise Exception('Run "detect_noisy_monitors" first to get the failed monitors.')
+        return self._failed_monitors
+
+    @property
+    def noisy_segments(self) -> List[NoisySegmentStats]:
+        if self._noisy_segments is None:
+            raise Exception('Run "detect_noisy_segments" first to get the noisy segments.')
+        return self._noisy_segments
+
+    @property
+    def failed_segments(self) -> List[FailedSegmentStats]:
+        if self._failed_segments is None:
+            raise Exception('Run "detect_noisy_segments" first to get the failed segments.')
+        return self._failed_segments
+
+    @property
+    def noisy_columns(self) -> List[NoisyColumnStats]:
+        if self._noisy_columns is None:
+            raise Exception('Run "detect_noisy_columns" first to get the noisy columns.')
+        return self._noisy_columns
+
+    @property
+    def monitor_configs(self) -> List[Monitor]:
+        if self._monitor_configs is None:
+            config = self._monitor_api.get_monitor_config_v3(self.org_id, self.dataset_id)
+            self._monitor_configs = []
+            for m in config.get("monitors", []):
+                try:
+                    self._monitor_configs.append(Monitor.parse_obj(m))
+                except ValidationError:
+                    pass  # skipping monitors with validation problems
+        return self._monitor_configs
+
+    @property
+    def diagnostic_interval(self) -> str:
+        if self._diagnostic_interval is None:
+            raise Exception('Set a diagnostic interval first, e.g. by running "choose_dataset_batches"')
+        return self._diagnostic_interval
+
+    @diagnostic_interval.setter
+    def diagnostic_interval(self, interval: str) -> None:
+        self._diagnostic_interval = interval
+
+    @property
+    def diagnostic_segment(self) -> Segment:
+        if self._diagnostic_segment is None:
+            raise Exception('Set the "diagnostic_segment" property first, e.g. by running "detect_noisy_segments"')
+        return self._diagnostic_segment
+
+    @diagnostic_segment.setter
+    def diagnostic_segment(self, segment: Segment) -> None:
+        if self._diagnostic_segment != segment:
+            self._diagnostic_segment = segment
+            self._noisy_columns = None
+            self._diagnosis = None
+
+    @property
+    def monitor_id_to_diagnose(self) -> str:
+        if self._monitor_id is None:
+            raise Exception('Set the "monitor_id" property first, e.g. by running "detect_noisy_monitors"')
+        return self._monitor_id
+
+    @monitor_id_to_diagnose.setter
+    def monitor_id_to_diagnose(self, monitor_id: str) -> None:
+        if self._monitor_id != monitor_id:
+            self._monitor_id = monitor_id
+            # Reset anything specific to the monitor
+            self._analyzer = None
+            self._noisy_segments = None
+            self._failed_segments = None
+            self._noisy_columns = None
+            self._diagnosis = None
+            self._diagnostic_segment = None
+
+    @property
+    def monitor_to_diagnose(self) -> Optional[Monitor]:
+        return next((m for m in self.monitor_configs if m.id == self._monitor_id), None)
+
+    def targeted_columns(self) -> List[str]:
+        if self.schema is None:
+            self.schema = self._models_api.get_entity_schema(self.org_id, self.dataset_id)
+        return targeted_columns(self.analyzer_to_diagnose.targetMatrix, self.schema)
+
+    @property
+    def analyzer_to_diagnose(self) -> Analyzer:
+        if self._analyzer is None:
+            analyzer_id = self.get_analyzer_id_for_monitor()
+            resp = self._monitor_api.get_analyzer(self.org_id, self.dataset_id, analyzer_id)
+            self._analyzer = Analyzer.parse_obj(resp)
+        return self._analyzer
+
+    def choose_dataset_batches(self) -> Tuple[TimeRange, Granularity, str]:
+        """
+        Based on the dataset's batch frequency, lineage (start/end) and the desired number of batches,
+        recommends a diagnostic interval for the dataset.
+        :return: tuple of lineage, granularity, interval
+        """
+        # get recommended diagnostic interval and the dataset's batch frequency
+        resp: DiagnosticIntervalResponse = self._diagnostics_api.recommend_diagnostic_interval(
+            self.org_id, DiagnosticIntervalRequest(dataset_id=self.dataset_id, batches=self.desired_batches)
+        )
+        time_period = resp.time_period
+        self._diagnostic_interval = resp.interval
+        if resp.start_timestamp is None or resp.end_timestamp is None:
+            raise Exception("No existing batch data")
+
+        lineage = TimeRange(start=resp.start_timestamp, end=resp.end_timestamp)
+        self.granularity = time_period_to_granularity(time_period)
+
+        return lineage, self.granularity, resp.interval
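+
+    # Illustrative values (hypothetical): for a daily dataset with
+    # desired_batches=30, choose_dataset_batches() may return something like
+    #     (TimeRange(...), Granularity.daily,
+    #      "2024-01-01T00:00:00.000Z/2024-01-31T00:00:00.000Z")
+    # and stores the interval for the subsequent detection calls.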
+
+    def detect_noisy_monitors(self) -> List[NoisyMonitorStats]:
+        """
+        Detects noisy monitors for the dataset. The summary statistics are returned and made available in the
+        noisy_monitors property.
+        :return: List of noisy monitor statistics, ordered with the noisiest first
+        """
+
+        def merge_monitor_actions(item: Dict, mon_acts: List[Dict]) -> Dict:
+            monitor_action = next((m for m in mon_acts if m["analyzer_id"] == item["analyzer_id"]), None)
+            if monitor_action:
+                item.update(monitor_action)
+            else:
+                item["action_count"] = 0
+                item["action_targets"] = []
+            return item
+
+        if self._diagnostic_interval is None:
+            self.choose_dataset_batches()
+        resp: AnalyzersDiagnosticResponse = self._diagnostics_api.detect_noisy_analyzers(
+            self.org_id, AnalyzersDiagnosticRequest(dataset_id=self.dataset_id, interval=self._diagnostic_interval)
+        )
+        monitor_actions = [
+            {
+                "monitor_id": m.id,
+                "analyzer_id": m.analyzerIds[0] if len(m.analyzerIds) > 0 else None,
+                "action_count": len(m.actions),
+                "action_targets": [a.target for a in m.actions if a.type == "global"],
+            }
+            for m in self.monitor_configs
+        ]
+        self._noisy_monitors = [
+            NoisyMonitorStats.parse_obj(merge_monitor_actions(item.to_dict(), monitor_actions))
+            for item in resp.noisy_analyzers
+        ]
+        self._failed_monitors = [
+            FailedMonitorStats.parse_obj(merge_monitor_actions(item.to_dict(), monitor_actions))
+            for item in resp.failed_analyzers
+        ]
+        if len(self._noisy_monitors) == 0:
+            raise Exception("No noisy monitors found")
+        if self._monitor_id is None:
+            self._monitor_id = self._noisy_monitors[0].monitor_id
+        return self._noisy_monitors
+
+    def get_analyzer_id_for_monitor(self) -> str:
+        analyzer_id: Optional[str] = next(
+            (m.analyzerIds[0] for m in self.monitor_configs if m.id == self.monitor_id_to_diagnose), None
+        )
+        if analyzer_id is None:
+            raise Exception(f"No analyzer found for monitor {self.monitor_id_to_diagnose}")
+        return analyzer_id
+
+    def detect_noisy_segments(self) -> List[NoisySegmentStats]:
+        analyzer_id = self.get_analyzer_id_for_monitor()
+        resp: AnalyzerSegmentsDiagnosticResponse = self._diagnostics_api.detect_noisy_segments(
+            self.org_id,
+            AnalyzerSegmentsDiagnosticRequest(
+                dataset_id=self.dataset_id, analyzer_id=analyzer_id, interval=self._diagnostic_interval
+            ),
+        )
+        self._noisy_segments = [NoisySegmentStats.parse_obj(n.to_dict()) for n in resp.noisy_segments]
+        self._failed_segments = [FailedSegmentStats.parse_obj(n.to_dict()) for n in resp.failed_segments]
+        self.diagnostic_segment = self._noisy_segments[0].segment
+        return self._noisy_segments
+
+    def detect_noisy_columns(self) -> List[NoisyColumnStats]:
+        analyzer_id = self.get_analyzer_id_for_monitor()
+        resp: AnalyzerSegmentColumnsDiagnosticResponse = self._diagnostics_api.detect_noisy_columns(
+            self.org_id,
+            AnalyzerSegmentColumnsDiagnosticRequest(
+                dataset_id=self.dataset_id,
+                analyzer_id=analyzer_id,
+                interval=self._diagnostic_interval,
+                segment=WhyLabsSegment(tags=[WhyLabsSegmentTag(t.key, t.value) for t in self.diagnostic_segment.tags]),
+            ),
+        )
+        self._noisy_columns = [NoisyColumnStats.parse_obj(n.to_dict()) for n in resp.noisy_columns]
+        return self._noisy_columns
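+
+    # Typical narrowing sequence (methods defined in this class):
+    # detect_noisy_monitors() -> detect_noisy_segments() -> detect_noisy_columns(),
+    # each step narrowing what diagnose() will analyze; diagnose() also runs any
+    # missing steps itself with default choices.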
+
+    def describe_segments(self) -> str:
+        with_anomalies = [s for s in self.noisy_segments if s.total_anomalies > 0]
+        with_failures = [s for s in self.failed_segments if s.total_failed > 0]
+        text = (
+            f"{len(with_anomalies)} of {len(self.noisy_segments)} analyzed segments have anomalies "
+            f"and {len(with_failures)} have failures\n\n"
+        )
+        if len(with_anomalies):
+            text += "Segments with anomalies:\n"
+            # convert the pydantic models to dicts so from_records sees field/value pairs
+            text += pd.DataFrame.from_records([s.dict() for s in with_anomalies]).to_markdown()
+            text += "\n"
+        if len(with_failures):
+            text += "Segments with failures:\n"
+            text += pd.DataFrame.from_records([s.dict() for s in with_failures]).to_markdown()
+            text += "\n"
+        noisiest = segment_as_readable_text(self.noisy_segments[0].segment.tags)
+        text += f"Noisiest segment selected for diagnosis: {noisiest}\n"
+        return text
+
+    def describe_columns(self) -> str:
+        cols = self.noisy_columns
+        text = f"Analysis ran on {len(cols)} columns in the diagnosed segment.\n"
+        text += pd.DataFrame.from_records([c.dict() for c in cols]).to_markdown()
+        return text
+
+    def diagnose(self, columns: Optional[List[str]] = None) -> MonitorDiagnosisReport:
+        if self._diagnostic_interval is None:
+            self.choose_dataset_batches()
+        if self._monitor_id is None:
+            self.detect_noisy_monitors()
+        if self._diagnostic_segment is None:
+            self.detect_noisy_segments()
+        if columns is None:
+            if self._noisy_columns is None:
+                self.detect_noisy_columns()
+            self._diagnosed_columns = [c.column for c in self.noisy_columns[:100]]
+        else:
+            self._diagnosed_columns = columns[:100]
+        use_local_server = os.environ.get("USE_LOCAL_SERVER", False)
+        if use_local_server:
+            # Call the server function directly if configured to do so (for testing)
+            try:
+                from smart_config.server.server import DiagnosisRequest as DiagnoserDiagnosisRequest
+                from smart_config.server.diagnosis.analyzer_diagnoser import AnalyzerDiagnoser
+
+                if use_local_server == "library":
+                    # Call server code directly
+                    analyzer_diagnoser = AnalyzerDiagnoser(
+                        self.org_id,
+                        self.dataset_id,
+                        self.get_analyzer_id_for_monitor(),
+                        self.diagnostic_interval,
+                        os.environ["WHYLABS_API_KEY"],
+                    )
+                    analyzer_diagnoser.assemble_data([t for t in self.diagnostic_segment.tags], self._diagnosed_columns)
+                    analyzer_diagnoser.run_detectors()
+                    report = analyzer_diagnoser.summarize_diagnosis()
+                    report_dict = report.dict()
+                else:
+                    # Call local instance of server
+                    from smart_config.server.service.diagnosis_service import DiagnosisService
+
+                    diagnosis_service = DiagnosisService(
+                        options={
+                            "headers": {
+                                "Accept": "application/json",
+                                "Content-Type": "application/json",
+                                "X-API-KEY": os.environ["WHYLABS_API_KEY"],
+                            }
+                        }
+                    )
+                    report_dict = diagnosis_service.diagnose_sync(
+                        DiagnoserDiagnosisRequest(
+                            orgId=self.org_id,
+                            datasetId=self.dataset_id,
+                            analyzerId=self.get_analyzer_id_for_monitor(),
+                            interval=self.diagnostic_interval,
+                            columns=self._diagnosed_columns,
+                            segment=self.diagnostic_segment,
+                        )
+                    )
+            except ImportError:
+                raise Exception("USE_LOCAL_SERVER is set but server library is not available.")
+        else:
+            # Call the diagnosis API via the WhyLabs client
+            response = self._diagnostics_api.diagnose_analyzer_sync(
+                self.org_id,
+                DiagnosisRequest(
+                    dataset_id=self.dataset_id,
+                    analyzer_id=self.get_analyzer_id_for_monitor(),
+                    interval=self.diagnostic_interval,
+                    columns=self._diagnosed_columns,
+                    segment=WhyLabsSegment(
+                        tags=[WhyLabsSegmentTag(t.key, t.value) for t in self.diagnostic_segment.tags]
+                    ),
+                ),
+            )
+            report_dict = to_mapped_dict(response)
+
+        self._diagnosis = MonitorDiagnosisReport(
+            **report_dict,
+            analyzer=self.analyzer_to_diagnose,
+            monitor=self.monitor_to_diagnose,
+            analyzedColumnCount=len(self.noisy_columns),
+        )
+        return self._diagnosis
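+
+# End-to-end sketch (illustrative; assumes env_setup from the diagnoser helpers
+# has been called with valid credentials, and the IDs are hypothetical):
+#
+#     diagnoser = MonitorDiagnoser("org-0", "model-0")
+#     diagnoser.choose_dataset_batches()
+#     diagnoser.detect_noisy_monitors()
+#     report = diagnoser.diagnose()
+#     print(report.describe())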
diff --git a/whylabs_toolkit/monitor/diagnoser/recommendation/__init__.py b/whylabs_toolkit/monitor/diagnoser/recommendation/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/whylabs_toolkit/monitor/diagnoser/recommendation/change_recommender.py b/whylabs_toolkit/monitor/diagnoser/recommendation/change_recommender.py
new file mode 100644
index 0000000..40660c2
--- /dev/null
+++ b/whylabs_toolkit/monitor/diagnoser/recommendation/change_recommender.py
@@ -0,0 +1,169 @@
+from __future__ import annotations
+from typing import List, NamedTuple, Optional
+import pandas as pd
+from whylabs_client.api.monitor_api import MonitorApi
+from whylabs_toolkit.helpers.utils import get_monitor_api
+from whylabs_toolkit.monitor.models import Analyzer, Monitor
+
+from whylabs_toolkit.monitor.diagnoser.recommendation.recommended_change import RecommendedChange
+from whylabs_toolkit.monitor.diagnoser.recommendation.manual_change import ManualChange
+from whylabs_toolkit.monitor.diagnoser.recommendation.remove_columns import RemoveColumns
+from whylabs_toolkit.monitor.diagnoser.models.diagnosis_report import (
+    MonitorDiagnosisReport,
+    ConditionRecord,
+)
+
+
+class ChangeResults(NamedTuple):
+    succeeded: List[RecommendedChange]
+    failed: List[RecommendedChange]
+    errors: List[str]
+    manual: List[RecommendedChange]
+
+    def describe(self) -> str:
+        description = ""
+        if len(self.succeeded):
+            description += "Successfully made the following changes:\n"
+            description += "\n".join(["\t* " + c.describe() for c in self.succeeded]) + "\n"
+        if len(self.failed):
+            description += "Failed to make the following changes:\n"
+            description += "\n".join(["\t* " + c.describe() for c in self.failed])
+            description += "\nErrors:\n"
+            description += "\n".join(["\t* " + e for e in self.errors]) + "\n"
+        if len(self.manual):
+            description += "The following changes require manual intervention:\n"
+            description += "\n".join(["\t* " + c.describe() for c in self.manual]) + "\n"
+        return description
+
+
+class ChangeRecommender:
+
+    _condition_order = [
+        # specific conditions unlikely to be rectified by other actions
+        "changing_discrete",
+        "changing_continuous",
+        "few_unique",
+        "many_unique",
+        "very_few_unique",
+        "late_upload_mismatch",
+        "narrow_threshold_band",
+        "small_nonnull_batches",
+        # most general conditions
+        "stale_analysis",
+        "low_drift_threshold",
+        "fixed_threshold_mismatch",
+        "stddev_insufficient_baseline",
+        "missing_baseline_batches",
+        "fixed_baseline_mismatch",
+    ]
+
+    def __init__(self, report: MonitorDiagnosisReport):
+        self._min_anomaly_count = 0
+        self.report = report
+        self.org_id = report.orgId
+        self.dataset_id = report.datasetId
+        self.analyzer = report.analyzer
+        self.monitor = report.monitor
+        self._monitor_api: Optional[MonitorApi] = None  # created lazily
+
+    @property
+    def monitor_api(self) -> MonitorApi:
+        if self._monitor_api is None:
+            self._monitor_api = get_monitor_api()
+        return self._monitor_api
+
+    def _sort_conditions(self, conditions: List[ConditionRecord]) -> List[ConditionRecord]:
+        # sort known conditions by priority; unknown condition names sort last
+        def order(c: ConditionRecord) -> int:
+            return self._condition_order.index(c.name) if c.name in self._condition_order else len(self._condition_order)
+
+        return sorted(conditions, key=order)
+
+    @staticmethod
+    def _best_change_for_condition(condition: ConditionRecord) -> RecommendedChange:
+        if condition.columns is None:
+            raise ValueError("Condition must have columns to recommend a change")
+        if condition.name in ["changing_discrete", "changing_continuous"]:
+            return RemoveColumns(columns=condition.columns, info=condition.info)
+        info = condition.info if condition.info else {}
+        info["condition"] = condition.name
+        info["summary"] = condition.summary
+        return ManualChange(columns=condition.columns, info=info)
+
+    @property
+    def min_anomaly_count(self) -> int:
+        return self._min_anomaly_count
+
+    @min_anomaly_count.setter
+    def min_anomaly_count(self, count: int) -> None:
+        self._min_anomaly_count = count
+
+    def recommend(self) -> List[RecommendedChange]:
+        by_col_count = (
+            self.report.diagnosticData.analysisResults.anomalies.byColumnCount
+            if (self.report.diagnosticData.analysisResults is not None)
+            else []
+        )
+        count_tuples = [c.to_tuple() for c in by_col_count]
+        if len(count_tuples) == 0:
+            # no anomaly counts means there is nothing to address
+            return []
+        cols, counts = zip(*count_tuples)
+        anom_count = pd.Series(counts, index=cols)
+        cols_to_address = anom_count[anom_count >= self.min_anomaly_count]
+        changes = []
+        # find the best actions for the cols that pass min anomaly criteria
+        for c in self._sort_conditions(self.report.conditions):
+            c.columns = list(cols_to_address.filter(items=c.columns if c.columns else []).index)
+            if len(c.columns) > 0:
+                changes.append(self._best_change_for_condition(c))
+        return changes
+
+    def _update_analyzer(self, updated: Analyzer) -> None:
+        self.monitor_api.put_analyzer(
+            org_id=self.org_id,
+            dataset_id=self.dataset_id,
+            analyzer_id=updated.id,
+            body=updated.dict(exclude_none=True),
+        )
+
+    def _delete_monitor(self) -> None:
+        if self.monitor is not None and self.analyzer is not None:
+            analyzer: Analyzer = self.analyzer
+            self.monitor_api.delete_monitor(org_id=self.org_id, dataset_id=self.dataset_id, monitor_id=self.monitor.id)
+            self.monitor_api.delete_analyzer(org_id=self.org_id, dataset_id=self.dataset_id, analyzer_id=analyzer.id)
+
+    def _add_new_monitor(self, new_analyzer: Analyzer) -> None:
+        # exclude the existing id so it is not passed twice alongside id=new_analyzer.id
+        new_monitor = (
+            Monitor(**self.monitor.dict(exclude={"id"}), id=new_analyzer.id)
+            if self.monitor
+            else Monitor(id=new_analyzer.id)
+        )
+        self.monitor_api.put_monitor(
+            org_id=self.org_id,
+            dataset_id=self.dataset_id,
+            monitor_id=new_analyzer.id,  # use same id as the analyzer
+            body=new_monitor.json(exclude_none=True),
+        )
+        self.monitor_api.put_analyzer(
+            org_id=self.org_id,
+            dataset_id=self.dataset_id,
+            analyzer_id=new_analyzer.id,
+            body=new_analyzer.json(exclude_none=True),
+        )
+
+    def make_changes(self, changes: Optional[List[RecommendedChange]] = None) -> ChangeResults:
+        changes = self.recommend() if changes is None else changes
+        succeeded: List[RecommendedChange] = []
+        failed: List[RecommendedChange] = []
+        errors: List[str] = []
+        for c in changes:
+            if c.can_automate() and self.analyzer:
+                try:
+                    changed_analyzers = c.generate_config(self.analyzer)
+                    if next((a.id for a in changed_analyzers), None) is None:
+                        # Delete existing analyzer/monitor as there's nothing useful left in it
+                        self._delete_monitor()
+                    # update existing or create new monitor(s)
+                    for changed in changed_analyzers:
+                        if changed.id == self.analyzer.id:
+                            self._update_analyzer(changed)
+                        else:
+                            self._add_new_monitor(changed)
+                    succeeded.append(c)
+                except Exception as e:
+                    failed.append(c)
+                    errors.append(f"{c.name} failed with {e}")
+        return ChangeResults(succeeded, failed, errors, [c for c in changes if not c.can_automate()])
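+
+# Usage sketch (illustrative; assumes a MonitorDiagnosisReport obtained from
+# MonitorDiagnoser.diagnose()):
+#
+#     recommender = ChangeRecommender(report)
+#     recommender.min_anomaly_count = 5
+#     results = recommender.make_changes(recommender.recommend())
+#     print(results.describe())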
diff --git a/whylabs_toolkit/monitor/diagnoser/recommendation/manual_change.py b/whylabs_toolkit/monitor/diagnoser/recommendation/manual_change.py
new file mode 100644
index 0000000..b885fe3
--- /dev/null
+++ b/whylabs_toolkit/monitor/diagnoser/recommendation/manual_change.py
@@ -0,0 +1,15 @@
+from whylabs_toolkit.monitor.diagnoser.recommendation.recommended_change import RecommendedChange
+
+
+class ManualChange(RecommendedChange):
+    name = "manual_change"
+    summary = "Make a manual change to the analyzer to address {condition}: {summary}"
+    required_info = ["condition"]
+    manual = True
+
+    def summarize(self) -> str:
+        condition = self.info.get("condition", "") if self.info else ""
+        if condition == "narrow_threshold_band":
+            # a percent diff of 0 would be bad... need to add info to differentiate
+            return "Move columns to a new analyzer that uses absolute diff, percent diff or fixed thresholds"
+        return super().summarize()
diff --git a/whylabs_toolkit/monitor/diagnoser/recommendation/recommended_change.py b/whylabs_toolkit/monitor/diagnoser/recommendation/recommended_change.py
new file mode 100644
index 0000000..b900974
--- /dev/null
+++ b/whylabs_toolkit/monitor/diagnoser/recommendation/recommended_change.py
@@ -0,0 +1,60 @@
+from __future__ import annotations
+from typing import Optional, List
+
+from whylabs_toolkit.monitor.models import Analyzer
+
+from whylabs_toolkit.monitor.diagnoser.models import ConditionRecord
+from whylabs_toolkit.monitor.diagnoser.helpers.describe import describe_truncated_list
+
+
+class RecommendedChange:
+    name = ""
+    summary = ""
+    manual = True
+    required_info: List[str] = []
+
+    @classmethod
+    def from_condition(cls, condition: ConditionRecord) -> RecommendedChange:
+        return cls(condition.columns if condition.columns is not None else [], condition.info)
+
+    def __init__(self, columns: List[str], info: Optional[dict] = None):
+        self.columns = columns
+        self.info = info
+
+    def merge(self, change: RecommendedChange) -> RecommendedChange:
+        if change.name != self.name:
+            raise ValueError(f"Cannot merge {self.name} and {change.name}")
+        # construct via type(self) so merging preserves the subclass
+        merged = type(self)(list(set(self.columns) | set(change.columns)), self.info)
+        merged.merge_info(change.info)
+        return merged
+
+    def merge_info(self, info: Optional[dict]) -> Optional[dict]:
+        if self.info is None:
+            self.info = info
+        elif info is not None:
+            self.info = {**self.info, **info}
+        return self.info
+
+    def summarize(self) -> str:
+        info = self.info if self.info else {}
+        return self.summary.format(**info)
+
+    def describe(self) -> str:
+        return f"{self.summarize()} for {describe_truncated_list(self.columns)}"
+
+    def can_automate(self) -> bool:
+        # info is a dict, so look up keys rather than attributes
+        return all((self.info or {}).get(f, False) for f in self.required_info) and not self.manual
+
+    def _check_can_do(self, analyzer: Analyzer) -> bool:
+        if self.manual:
+            raise Exception(f"{self.name} has not been automated")
+        if not self.can_automate():
+            raise Exception(
+                f"{self.name} requires extra information "
+                f"{[f for f in self.required_info if self.info is None or f not in self.info.keys()]}"
+            )
+        return True
+
+    def generate_config(self, analyzer: Analyzer) -> List[Analyzer]:
+        self._check_can_do(analyzer)
+        return [analyzer]
diff --git a/whylabs_toolkit/monitor/diagnoser/recommendation/remove_columns.py b/whylabs_toolkit/monitor/diagnoser/recommendation/remove_columns.py
new file mode 100644
index 0000000..531a40a
--- /dev/null
+++ b/whylabs_toolkit/monitor/diagnoser/recommendation/remove_columns.py
@@ -0,0 +1,39 @@
+from typing import List, Union
+
+from whylabs_toolkit.monitor.models import Analyzer, TargetLevel, ColumnMatrix, DatasetMatrix
+
+from whylabs_toolkit.monitor.diagnoser.recommendation.recommended_change import RecommendedChange
+from whylabs_toolkit.monitor.models.analyzer import ColumnGroups
+
+
+class RemoveColumns(RecommendedChange):
+    name = "remove_columns"
+    summary = "Remove columns from the analyzer"
+    required_info: List[str] = []
+    manual = False
+
+    def _check_can_do(self, analyzer: Analyzer) -> bool:
+        if analyzer.targetMatrix.type == TargetLevel.dataset:
+            raise ValueError("Cannot remove columns from a dataset level target matrix")
+        return super()._check_can_do(analyzer)
+
+    def generate_config(self, analyzer: Analyzer) -> List[Analyzer]:
+        self._check_can_do(analyzer)
+        if isinstance(analyzer.targetMatrix, DatasetMatrix):
+            return [analyzer]
+        target_matrix: ColumnMatrix = analyzer.targetMatrix
+        include: List[str] = target_matrix.include if target_matrix.include is not None else []
+        exclude: List[Union[ColumnGroups, str]] = target_matrix.exclude if target_matrix.exclude is not None else []
+        to_remove = set(self.columns)
+        # remove from includes if possible, otherwise exclude
+        remove_includes = set(include).intersection(to_remove)
+        target_matrix.include = list(set(include) - to_remove)
+        target_matrix.exclude = list(set(exclude).union(to_remove - remove_includes))
+        # if nothing's left to target, just remove the analyzer
+        if len(target_matrix.include) == 0:
+            return []
+        return [analyzer]
diff --git a/whylabs_toolkit/monitor/diagnoser/targeting.py b/whylabs_toolkit/monitor/diagnoser/targeting.py
new file mode 100644
index 0000000..9f95608
--- /dev/null
+++ b/whylabs_toolkit/monitor/diagnoser/targeting.py
@@ -0,0 +1,33 @@
+from typing import List, Union, Set
+
+from whylabs_toolkit.monitor.models import EntitySchema, ColumnMatrix, DatasetMatrix
+
+
+def expand_target(target: str, schema: EntitySchema) -> List[str]:
+    if target == "*":
+        return [str(k) for k in schema.columns.keys()]
+    col_items = schema.columns.items()
+    if target == "group:discrete":
+        return [name for (name, c) in col_items if c.discreteness == "discrete"]
+    if target == "group:continuous":
+        return [name for (name, c) in col_items if c.discreteness == "continuous"]
+    if target == "group:input":
+        return [name for (name, c) in col_items if c.classifier == "input"]
+    if target == "group:output":
+        return [name for (name, c) in col_items if c.classifier == "output"]
+    return [target]
+
+
+def targeted_columns(target_matrix: Union[ColumnMatrix, DatasetMatrix], schema: EntitySchema) -> List[str]:
+    if target_matrix is None:
+        return []
+    if isinstance(target_matrix, DatasetMatrix):
+        return ["__internal__datasetMetrics"]
+    columns: Set[str] = set()
+    if target_matrix.include is not None:
+        for include in target_matrix.include:
+            columns.update(expand_target(include, schema))
+    if target_matrix.exclude is not None:
+        for exclude in target_matrix.exclude:
+            columns = columns - set(expand_target(exclude, schema))
+    return list(columns)
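+
+# Worked example (hypothetical schema): targeted_columns expands include/exclude
+# patterns such as "*" and "group:discrete" into a concrete column list:
+#
+#     matrix = ColumnMatrix(include=["*"], exclude=["group:output"])
+#     cols = targeted_columns(matrix, schema)  # every column except outputs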