Commit

Added Grammar check as data integrity (#247)
vipgupta authored May 8, 2023
1 parent 7407595 commit abdea3e
Showing 3 changed files with 52 additions and 47 deletions.
68 changes: 32 additions & 36 deletions examples/conversation_summarization/grammar_check.ipynb
@@ -52,7 +52,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "284d33c36ea244fb81019ff599345e94",
"model_id": "a33964146a8d4528906a6b6e1c6bf6cd",
"version_major": 2,
"version_minor": 0
},
@@ -65,7 +65,8 @@
}
],
"source": [
"samsum_dataset = load_dataset(\"samsum\")"
"samsum_dataset = load_dataset(\"samsum\")\n",
"dataset = samsum_dataset['test'][0:50]"
]
},
{
@@ -131,7 +132,7 @@
" 'feature_name': 'summary'\n",
" },\n",
" ],\n",
" \"update_freq\": 200,\n",
" \"update_freq\": 50,\n",
" # 'initial_dataset': reference_dataset_file,\n",
" \"do_clustering\": False,\n",
"}"
@@ -140,64 +141,59 @@
{
"cell_type": "code",
"execution_count": 5,
"id": "242b028b-e458-416a-8568-6e461c815d40",
"metadata": {},
"outputs": [],
"source": [
"di_check = data_integrity_check = {\n",
" \"type\": uptrain.Monitor.DATA_INTEGRITY,\n",
" \"measurable_args\": {\n",
" 'type': uptrain.MeasurableType.GRAMMAR_SCORE,\n",
" },\n",
" \"integrity_type\": \"grammar_check\",\n",
" \"threshold\": 60,\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "42e0ee13-9bb7-4902-9d27-cac52b583af3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Deleting the folder: uptrain_smart_data\n",
"Deleting the folder: uptrain_logs\n"
"Deleting contents of the folder: uptrain_smart_data\n",
"Deleting contents of the folder: uptrain_logs\n"
]
}
],
"source": [
"config = {\n",
" \"checks\": [umap_check],\n",
" \"checks\": [umap_check, di_check],\n",
" \"logging_args\": {\"st_logging\": True},\n",
" # ADD your OpenAI API key below\n",
" \"license_args\": {\"openai_key\": \"YOUR_KEY_HERE\"}\n",
" \"license_args\": {\"openai_key\": \"YOUR KEY HERE\"}\n",
"}\n",
"\n",
"framework = uptrain.Framework(cfg_dict=config)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "86ca0230-31b1-4059-a5f1-7d56d51e7134",
"metadata": {},
"outputs": [],
"source": [
"batch_size = 200"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"id": "fdd126d7-89d0-4164-a012-d44c08be4849",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100 predictions logged for samsum test\n"
]
}
],
"outputs": [],
"source": [
"\"\"\"\n",
"Run the model in production and pass \n",
"800 data points from SAMSum test.\n",
"\"\"\"\n",
" \n",
"d_type = 'test'\n",
"dataset_name = 'samsum'\n",
"dataset = samsum_dataset['test']\n",
"\n",
"f = open(os.path.join(data_dir, f\"out_{d_type}_{dataset_name}_summaries.json\"))\n",
"f = open(os.path.join(data_dir, f\"out_test_samsum_summaries.json\"))\n",
"all_summaries = json.load(f)\n",
"f.close()\n",
"\n",
@@ -207,10 +203,11 @@
"But any other embeddings, such as the ones generated by the\n",
"encoder can be used as well.\n",
"\"\"\"\n",
"f = open(os.path.join(data_dir, f\"out_{d_type}_{dataset_name}_bert_embs.json\"))\n",
"f = open(os.path.join(data_dir, f\"out_test_samsum_bert_embs.json\"))\n",
"all_bert_embs = json.load(f)\n",
"f.close()\n",
"\n",
"batch_size = 10\n",
"for idx in range(len(all_bert_embs)//batch_size):\n",
" idxs = slice(idx*batch_size, (idx+1)*batch_size)\n",
" this_batch = dataset['summary'][idxs]\n",
@@ -219,13 +216,12 @@
" inputs = {\n",
" 'id': list(range(idx*batch_size, (idx+1)*batch_size)),\n",
" 'bert_embs': np.array(all_bert_embs[idxs]),\n",
" 'dataset_label': [dataset_name]*batch_size,\n",
" 'dataset_label': ['samsum']*batch_size,\n",
" 'dialog': this_batch_dialog,\n",
" 'summary': this_batch,\n",
" }\n",
" idens = framework.log(inputs=inputs, outputs=all_summaries[idxs])\n",
" break\n",
"print(f\"{(idx+1)*batch_size} predictions logged for {dataset_name} {d_type}\")"
"print(f\"{(idx+1)*batch_size} predictions logged.\")"
]
}
],
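Taken together, the notebook changes above define a grammar-score data integrity check (`di_check`), register it next to the existing UMAP check, and log the SAMSum summaries in batches of 10. A minimal sketch of the same wiring, assuming the `uptrain` package from this repository, a valid OpenAI key, and two made-up rows in place of the notebook's dataset and embedding files:

```python
import uptrain

# Grammar check as a data integrity monitor: flag generated summaries whose
# LLM-assigned grammar score falls below 60 (the threshold used in the notebook).
di_check = {
    "type": uptrain.Monitor.DATA_INTEGRITY,
    "measurable_args": {
        "type": uptrain.MeasurableType.GRAMMAR_SCORE,
    },
    "integrity_type": "grammar_check",
    "threshold": 60,
}

config = {
    "checks": [di_check],                        # the notebook also passes a UMAP check here
    "logging_args": {"st_logging": True},        # stream results to the Streamlit dashboard
    "license_args": {"openai_key": "YOUR KEY HERE"},  # grammar scoring calls the OpenAI API
}
framework = uptrain.Framework(cfg_dict=config)

# Log one small batch; the outputs are the model-generated summaries that the
# grammar check scores. The dialogs and summaries below are placeholders.
inputs = {
    "id": [0, 1],
    "dataset_label": ["samsum"] * 2,
    "dialog": ["Amy: Lunch today? Ben: Sure, noon works.", "Amy: Running late. Ben: No problem."],
    "summary": ["Amy and Ben agree to meet for lunch at noon.", "Amy tells Ben she is running late."],
}
outputs = ["Amy and Ben will have lunch at noon.", "Amy is running late and Ben is fine with it."]
framework.log(inputs=inputs, outputs=outputs)
```

With `st_logging` enabled, the monitor reports the running fraction of flagged rows as a `grammar_check_outliers_ratio` scalar on the dashboard, per the logging call in `data_integrity.py` below.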
30 changes: 20 additions & 10 deletions uptrain/core/classes/monitors/data_integrity.py
@@ -1,7 +1,7 @@
import numpy as np
import pandas as pd
from uptrain.core.classes.monitors import AbstractMonitor
from uptrain.core.classes.measurables import MeasurableResolver
from uptrain.ee.classes.measurables import GrammerScoreMeasurable
from uptrain.constants import Monitor
from uptrain.core.lib.helper_funcs import read_json

@@ -13,6 +13,7 @@ class DataIntegrity(AbstractMonitor):

def base_init(self, fw, check):
self.integrity_type = check["integrity_type"]
# Threshold value for data integrity check
self.threshold = check.get("threshold", None)
# Threshold for when to alert on percentage of outliers (default 2%)
self.outliers_alert_thres = check.get("outliers_alert_thres", 2)
@@ -30,15 +31,15 @@ def base_check(self, inputs, outputs, gts=None, extra_args={}):
signal_value = np.squeeze(np.array(signal_value))

if self.integrity_type == "non_null":
has_issue = signal_value == None
self.has_issue = signal_value == None
elif self.integrity_type == "less_than":
has_issue = signal_value > self.threshold
self.has_issue = signal_value > self.threshold
elif self.integrity_type == "equal_to":
has_issue = signal_value == self.threshold
self.has_issue = signal_value == self.threshold
elif self.integrity_type == "greater_than":
has_issue = signal_value < self.threshold
self.has_issue = signal_value < self.threshold
elif self.integrity_type == "minus_one":
has_issue = signal_value == -1
self.has_issue = signal_value == -1
elif self.integrity_type == "z_score":
if self.threshold is None:
self.threshold = 3
@@ -49,9 +50,9 @@ def base_check(self, inputs, outputs, gts=None, extra_args={}):
# Calculating Z-scores w.r.t. the current dataset
else:
z_score = zscore(signal_value)
has_issue = np.abs(z_score) > self.threshold
outliers = signal_value[has_issue]
valid_signal = signal_value[~has_issue]
self.has_issue = np.abs(z_score) > self.threshold
outliers = signal_value[self.has_issue]
valid_signal = signal_value[~self.has_issue]

feat_name = self.measurable.col_name()
plot_name = f"z_score_feature_{feat_name}"
@@ -75,12 +76,14 @@ def base_check(self, inputs, outputs, gts=None, extra_args={}):
alert = f"{len(outliers)} of {len(z_score)} samples have Z-Score > {self.threshold} ({perc}%)",
dashboard_name = self.dashboard_name
)
elif self.integrity_type == "grammar_check":
self.has_issue = signal_value < self.threshold
else:
raise NotImplementedError(
"Data integrity check {} not implemented".format(self.integrity_type)
)
self.count += len(signal_value)
self.num_issues += np.sum(np.array(has_issue))
self.num_issues += np.sum(np.array(self.has_issue))

self.log_handler.add_scalars(
self.integrity_type + "_outliers_ratio",
@@ -92,6 +95,13 @@ def base_check(self, inputs, outputs, gts=None, extra_args={}):

def need_ground_truth(self):
return False

def base_is_data_interesting(self, inputs, outputs, gts=None, extra_args={}):
reasons = ["None"] * len(extra_args["id"])
for idx in range(len(extra_args["id"])):
if self.has_issue[idx]:
reasons.append("Data Integrity Issue, Type: {}".format(self.integrity_type))
return self.has_issue, reasons

def get_ref_data_stats(self):
"""
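The core of the change to `DataIntegrity` is the new `grammar_check` branch: a row is flagged when its grammar score falls below the configured threshold, the mask is kept on `self.has_issue`, and `base_is_data_interesting` reuses that mask to report which rows are worth retaining and why. A standalone sketch of the flagging logic, using plain NumPy rather than the monitor class and returning one reason per row:

```python
import numpy as np

def grammar_check_issues(grammar_scores, threshold=60):
    # Mirrors the new "grammar_check" integrity branch: an issue is any
    # score strictly below the threshold.
    signal_value = np.squeeze(np.array(grammar_scores))
    has_issue = signal_value < threshold
    reasons = [
        "Data Integrity Issue, Type: grammar_check" if issue else "None"
        for issue in has_issue
    ]
    return has_issue, reasons

scores = [95, 40, 72, 55]
mask, reasons = grammar_check_issues(scores)
print(mask)     # [False  True False  True]
print(reasons)  # ['None', 'Data Integrity Issue, Type: grammar_check', 'None', ...]
```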
1 change: 0 additions & 1 deletion uptrain/ee/classes/measurables/llm_measurables.py
@@ -1,5 +1,4 @@
import re
import os
from uptrain.core.lib.helper_funcs import read_json, dependency_required
from uptrain.core.classes.measurables import Measurable

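The grammar scores consumed by the new check come from `GrammerScoreMeasurable` in this file, which asks an LLM to rate grammatical correctness (the change here only drops an unused `os` import). Purely as an illustrative stand-in, with the prompt, model name, and response parsing all assumed rather than taken from this file, such a scorer might look like:

```python
import re
import openai  # the API key is supplied through uptrain's license_args in the notebook

def grammar_score(text: str) -> int:
    # Hypothetical sketch: ask the chat model for a 0-100 grammar score and
    # pull the first integer out of the reply; -1 signals an unparsable answer.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{
            "role": "user",
            "content": "Rate the grammatical correctness of the following text on a "
                       "scale of 0 to 100. Reply with the number only.\n\n" + text,
        }],
    )
    reply = response["choices"][0]["message"]["content"]
    match = re.search(r"\d+", reply)
    return int(match.group()) if match else -1
```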
