From abdea3ecb77871e890ec3c5b10e7d6505c7eda8a Mon Sep 17 00:00:00 2001
From: Vipul Gupta
Date: Mon, 8 May 2023 21:42:22 +0530
Subject: [PATCH] Added Grammar check as data integrity (#247)

---
 .../grammar_check.ipynb                       | 68 +++++++++----------
 .../core/classes/monitors/data_integrity.py   | 30 +++++---
 .../ee/classes/measurables/llm_measurables.py |  1 -
 3 files changed, 52 insertions(+), 47 deletions(-)

diff --git a/examples/conversation_summarization/grammar_check.ipynb b/examples/conversation_summarization/grammar_check.ipynb
index 6496e748..dc762e99 100644
--- a/examples/conversation_summarization/grammar_check.ipynb
+++ b/examples/conversation_summarization/grammar_check.ipynb
@@ -52,7 +52,7 @@
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-      "model_id": "284d33c36ea244fb81019ff599345e94",
+      "model_id": "a33964146a8d4528906a6b6e1c6bf6cd",
       "version_major": 2,
       "version_minor": 0
      },
@@ -65,7 +65,8 @@
     }
    ],
    "source": [
-    "samsum_dataset = load_dataset(\"samsum\")"
+    "samsum_dataset = load_dataset(\"samsum\")\n",
+    "dataset = samsum_dataset['test'][0:50]"
    ]
   },
   {
@@ -131,7 +132,7 @@
    "            'feature_name': 'summary'\n",
    "        },\n",
    "    ],\n",
-   "    \"update_freq\": 200,\n",
+   "    \"update_freq\": 50,\n",
    "    # 'initial_dataset': reference_dataset_file,\n",
    "    \"do_clustering\": False,\n",
    "}"
   ]
  },
@@ -140,6 +141,23 @@
  {
   "cell_type": "code",
   "execution_count": 5,
+  "id": "242b028b-e458-416a-8568-6e461c815d40",
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "di_check = {\n",
+   "    \"type\": uptrain.Monitor.DATA_INTEGRITY,\n",
+   "    \"measurable_args\": {\n",
+   "        'type': uptrain.MeasurableType.GRAMMAR_SCORE,\n",
+   "    },\n",
+   "    \"integrity_type\": \"grammar_check\",\n",
+   "    \"threshold\": 60,\n",
+   "}"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 6,
   "id": "42e0ee13-9bb7-4902-9d27-cac52b583af3",
   "metadata": {},
   "outputs": [
@@ -147,17 +165,17 @@
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "Deleting the folder: uptrain_smart_data\n",
-     "Deleting the folder: uptrain_logs\n"
+     "Deleting contents of the folder: uptrain_smart_data\n",
+     "Deleting contents of the folder: uptrain_logs\n"
     ]
    }
   ],
   "source": [
    "config = {\n",
-   "    \"checks\": [umap_check],\n",
+   "    \"checks\": [umap_check, di_check],\n",
    "    \"logging_args\": {\"st_logging\": True},\n",
    "    # ADD your OpenAI API key below\n",
-   "    \"license_args\": {\"openai_key\": \"YOUR_KEY_HERE\"}\n",
+   "    \"license_args\": {\"openai_key\": \"YOUR KEY HERE\"}\n",
    "}\n",
    "\n",
    "framework = uptrain.Framework(cfg_dict=config)"
   ]
  },
@@ -165,39 +183,17 @@
  {
   "cell_type": "code",
-  "execution_count": 8,
-  "id": "86ca0230-31b1-4059-a5f1-7d56d51e7134",
-  "metadata": {},
-  "outputs": [],
-  "source": [
-   "batch_size = 200"
-  ]
- },
- {
-  "cell_type": "code",
-  "execution_count": 9,
+  "execution_count": null,
   "id": "fdd126d7-89d0-4164-a012-d44c08be4849",
   "metadata": {},
-  "outputs": [
-   {
-    "name": "stdout",
-    "output_type": "stream",
-    "text": [
-     "100 predictions logged for samsum test\n"
-    ]
-   }
-  ],
+  "outputs": [],
   "source": [
    "\"\"\"\n",
    "Run the model in production and pass \n",
-   "800 data points from SAMSum test.\n",
+   "50 data points from SAMSum test.\n",
    "\"\"\"\n",
-   " \n",
-   "d_type = 'test'\n",
-   "dataset_name = 'samsum'\n",
-   "dataset = samsum_dataset['test']\n",
    "\n",
-   "f = open(os.path.join(data_dir, f\"out_{d_type}_{dataset_name}_summaries.json\"))\n",
+   "f = open(os.path.join(data_dir, \"out_test_samsum_summaries.json\"))\n",
    "all_summaries = json.load(f)\n",
    "f.close()\n",
    "\n",
    "'''\n",
    "In the example, we are using pre-computed BERT embeddings\n",
@@ -207,10 +203,11 @@
    "But any other embeddings, such as the ones generated by the\n",
"encoder can be used as well.\n", "\"\"\"\n", - "f = open(os.path.join(data_dir, f\"out_{d_type}_{dataset_name}_bert_embs.json\"))\n", + "f = open(os.path.join(data_dir, f\"out_test_samsum_bert_embs.json\"))\n", "all_bert_embs = json.load(f)\n", "f.close()\n", "\n", + "batch_size = 10\n", "for idx in range(len(all_bert_embs)//batch_size):\n", " idxs = slice(idx*batch_size, (idx+1)*batch_size)\n", " this_batch = dataset['summary'][idxs]\n", @@ -219,13 +216,12 @@ " inputs = {\n", " 'id': list(range(idx*batch_size, (idx+1)*batch_size)),\n", " 'bert_embs': np.array(all_bert_embs[idxs]),\n", - " 'dataset_label': [dataset_name]*batch_size,\n", + " 'dataset_label': ['samsum']*batch_size,\n", " 'dialog': this_batch_dialog,\n", " 'summary': this_batch,\n", " }\n", " idens = framework.log(inputs=inputs, outputs=all_summaries[idxs])\n", - " break\n", - "print(f\"{(idx+1)*batch_size} predictions logged for {dataset_name} {d_type}\")" + "print(f\"{(idx+1)*batch_size} predictions logged.\")" ] } ], diff --git a/uptrain/core/classes/monitors/data_integrity.py b/uptrain/core/classes/monitors/data_integrity.py index 5335651e..115f18e7 100644 --- a/uptrain/core/classes/monitors/data_integrity.py +++ b/uptrain/core/classes/monitors/data_integrity.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd from uptrain.core.classes.monitors import AbstractMonitor -from uptrain.core.classes.measurables import MeasurableResolver +from uptrain.ee.classes.measurables import GrammerScoreMeasurable from uptrain.constants import Monitor from uptrain.core.lib.helper_funcs import read_json @@ -13,6 +13,7 @@ class DataIntegrity(AbstractMonitor): def base_init(self, fw, check): self.integrity_type = check["integrity_type"] + # Threshold value for data integrity check self.threshold = check.get("threshold", None) # Threshold for when to alert on percentage of outliers (default 2%) self.outliers_alert_thres = check.get("outliers_alert_thres", 2) @@ -30,15 +31,15 @@ def base_check(self, inputs, outputs, gts=None, extra_args={}): signal_value = np.squeeze(np.array(signal_value)) if self.integrity_type == "non_null": - has_issue = signal_value == None + self.has_issue = signal_value == None elif self.integrity_type == "less_than": - has_issue = signal_value > self.threshold + self.has_issue = signal_value > self.threshold elif self.integrity_type == "equal_to": - has_issue = signal_value == self.threshold + self.has_issue = signal_value == self.threshold elif self.integrity_type == "greater_than": - has_issue = signal_value < self.threshold + self.has_issue = signal_value < self.threshold elif self.integrity_type == "minus_one": - has_issue = signal_value == -1 + self.has_issue = signal_value == -1 elif self.integrity_type == "z_score": if self.threshold is None: self.threshold = 3 @@ -49,9 +50,9 @@ def base_check(self, inputs, outputs, gts=None, extra_args={}): # Calculating Z-scores w.r.t. 
            else:
                 z_score = zscore(signal_value)
-            has_issue = np.abs(z_score) > self.threshold
-            outliers = signal_value[has_issue]
-            valid_signal = signal_value[~has_issue]
+            self.has_issue = np.abs(z_score) > self.threshold
+            outliers = signal_value[self.has_issue]
+            valid_signal = signal_value[~self.has_issue]
 
             feat_name = self.measurable.col_name()
             plot_name = f"z_score_feature_{feat_name}"
@@ -75,12 +76,14 @@
                 alert = f"{len(outliers)} of {len(z_score)} samples have Z-Score > {self.threshold} ({perc}%)",
                 dashboard_name = self.dashboard_name
             )
+        elif self.integrity_type == "grammar_check":
+            self.has_issue = signal_value < self.threshold
         else:
             raise NotImplementedError(
                 "Data integrity check {} not implemented".format(self.integrity_type)
             )
 
         self.count += len(signal_value)
-        self.num_issues += np.sum(np.array(has_issue))
+        self.num_issues += np.sum(np.array(self.has_issue))
 
         self.log_handler.add_scalars(
             self.integrity_type + "_outliers_ratio",
@@ -92,6 +95,13 @@
     def need_ground_truth(self):
         return False
+
+    def base_is_data_interesting(self, inputs, outputs, gts=None, extra_args={}):
+        # Keep `reasons` index-aligned with `self.has_issue`
+        reasons = ["None"] * len(extra_args["id"])
+        for idx in range(len(extra_args["id"])):
+            if self.has_issue[idx]:
+                reasons[idx] = "Data Integrity Issue, Type: {}".format(self.integrity_type)
+        return self.has_issue, reasons
 
     def get_ref_data_stats(self):
         """
diff --git a/uptrain/ee/classes/measurables/llm_measurables.py b/uptrain/ee/classes/measurables/llm_measurables.py
index bd8450c5..03748f3e 100644
--- a/uptrain/ee/classes/measurables/llm_measurables.py
+++ b/uptrain/ee/classes/measurables/llm_measurables.py
@@ -1,5 +1,4 @@
 import re
-import os
 
 from uptrain.core.lib.helper_funcs import read_json, dependency_required
 from uptrain.core.classes.measurables import Measurable
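
Note: the snippet below is a minimal, self-contained sketch of how the new "grammar_check" integrity type introduced in this patch can be exercised, distilled from the notebook cells above. It assumes the same uptrain config API the notebook uses; the summaries are made-up examples, the OpenAI key is a placeholder (GRAMMAR_SCORE is scored via the OpenAI API), and the bert_embs/dialog fields needed only by the UMAP check are omitted. Exactly which column GrammerScoreMeasurable scores follows from its implementation, not from this sketch.

import uptrain

# Data integrity check using the new "grammar_check" type: rows whose
# grammar score falls below `threshold` are flagged as issues.
grammar_check = {
    "type": uptrain.Monitor.DATA_INTEGRITY,
    "measurable_args": {
        "type": uptrain.MeasurableType.GRAMMAR_SCORE,
    },
    "integrity_type": "grammar_check",
    "threshold": 60,
}

framework = uptrain.Framework(cfg_dict={
    "checks": [grammar_check],
    # A real key is required since grammar scoring calls the OpenAI API
    "license_args": {"openai_key": "YOUR KEY HERE"},
})

# Made-up summaries; in the notebook these come from the SAMSum test split
summaries = [
    "Amanda baked cookies and will bring Jerry some tomorrow.",
    "Him go store yesterday buyed milks.",
]
inputs = {
    "id": list(range(len(summaries))),
    "dataset_label": ["samsum"] * len(summaries),
    "summary": summaries,
}
framework.log(inputs=inputs, outputs=summaries)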