Add speech to text example (#231)
* Add speech to text example

* revert commits to example scripts

* revert commits to example scripts
sourabhagr authored May 5, 2023
1 parent e6707a1 commit 7ae27d3
Showing 12 changed files with 487 additions and 11 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -154,7 +154,7 @@ We are constantly working to make UpTrain better. Want a new feature or need any

# License 💻

This repo is published under the Apache 2.0 license. We're currently focused on developing non-enterprise offerings that should cover most use cases by adding more features and extending to more models. We're also working towards adding a hosted offering - [contact us](mailto:[email protected]) if you are interested.
This repo is published under the Apache 2.0 license, with the exception of the ee directory, which will contain premium features requiring an enterprise license in the future. We're currently focused on developing non-enterprise offerings that should cover most use cases by adding more features and extending to more models. We're also working towards adding a hosted offering - [contact us](mailto:[email protected]) if you are interested.

# Stay Updated ☎️
We are continuously adding tons of features and use cases. Please support us by giving the project a star ⭐!
346 changes: 346 additions & 0 deletions examples/speech_to_text/run.ipynb

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions uptrain/__init__.py
@@ -11,5 +11,7 @@
    PlotType,
    Statistic,
    Visual,
    ComparisonMetric,
    ComparisonModel
)
from uptrain.core.encoders import UpTrainEncoder
9 changes: 9 additions & 0 deletions uptrain/constants.py
@@ -54,6 +54,7 @@ class Monitor(str, Enum):
    CONCEPT_DRIFT = "concept_drift"
    POPULARITY_BIAS = "popularity_bias"
    DATA_INTEGRITY = "data_integrity"
    OUTPUT_COMPARISON = "output_comparison"


class PlotType(str, Enum):
@@ -76,3 +77,11 @@ class Visual(str, Enum):
    TSNE = "t-SNE"
    SHAP = "SHAP"
    PLOT = "PLOT"


class ComparisonModel(str, Enum):
    FASTER_WHISPER = "faster_whisper"


class ComparisonMetric(str, Enum):
    ROGUE_L_F1 = "rogue-l-f"
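For reference, a minimal sketch of a check config that uses the new enum values (the key names follow what the `OutputComparison` monitor added later in this commit reads in `base_init`; the `measurable_args`/`comparison_model_input_args` shapes, the feature name, and the threshold are illustrative assumptions, not taken from this diff):

```python
from uptrain.constants import Monitor, MeasurableType, ComparisonModel, ComparisonMetric

# Hypothetical output-comparison check: compare production transcriptions
# against a faster-whisper reference model using ROUGE-L F1.
output_comparison_check = {
    "type": Monitor.OUTPUT_COMPARISON,
    "measurable_args": {
        "type": MeasurableType.PREDICTION,      # production model outputs
    },
    "comparison_model_input_args": {
        "type": MeasurableType.INPUT_FEATURE,
        "feature_name": "audio_files",          # placeholder feature name
    },
    "comparison_model": ComparisonModel.FASTER_WHISPER,
    "comparison_metric": ComparisonMetric.ROGUE_L_F1,
    "threshold": 0.6,                           # placeholder similarity threshold
}
```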
9 changes: 5 additions & 4 deletions uptrain/core/classes/framework.py
@@ -232,14 +232,15 @@ def check_and_add_data(self, inputs, outputs, gts=None, extra_args={}):
                }
            )

        if self.log_data:
            # Log all the data-points into all_data warehouse
            add_data_to_warehouse(deepcopy(data), self.path_all_data)

        # Check for any monitors
        self.check(data, extra_args)
        self.predicted_count += self.batch_size

        if self.log_data:
            data.update(extra_args)
            # Log all the data-points into all_data warehouse
            add_data_to_warehouse(deepcopy(data), self.path_all_data)

        # Smartly add data for retraining
        self.smartly_add_data(data, extra_args)
        self.extra_args = extra_args
4 changes: 4 additions & 0 deletions uptrain/core/classes/managers/check_manager.py
@@ -11,6 +11,7 @@
    ModelBias,
    DataIntegrity,
    EdgeCase,
    OutputComparison
)
from uptrain.core.classes.statistics import (
    Distance,
@@ -99,6 +100,9 @@ def add_monitor(self, check):
                )
                integrity_managers.append(DataIntegrity(self.fw, check_copy))
            self.monitors_to_check.extend(integrity_managers)
        elif check["type"] == Monitor.OUTPUT_COMPARISON:
            comparison_monitor = OutputComparison(self.fw, check)
            self.monitors_to_check.append(comparison_monitor)
        else:
            raise Exception("Monitor type not Supported")

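To round out the dispatch path above, a sketch of how such a check might be handed to the framework (this assumes the `Framework(cfg_dict=...)` entry point, the `"checks"` config key, and the `framework.log(inputs=..., outputs=...)` pattern used in UpTrain's existing examples; the speech-to-text notebook added in this commit is the authoritative usage):

```python
import uptrain

# 'output_comparison_check' is the hypothetical config sketched earlier.
cfg = {"checks": [output_comparison_check]}
framework = uptrain.Framework(cfg_dict=cfg)

# CheckManager routes each logged batch to the OutputComparison monitor,
# which transcribes the audio with faster-whisper and scores the overlap.
framework.log(
    inputs={"audio_files": ["sample_1.wav"]},        # placeholder batch
    outputs=["production transcription goes here"],  # placeholder outputs
)
```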
10 changes: 6 additions & 4 deletions uptrain/core/classes/measurables/measurable_resolver.py
@@ -1,5 +1,7 @@
from uptrain.core.classes.measurables import (
    Measurable,
    InputFeatureMeasurable,
    OutputFeatureMeasurable,
    FeatureMeasurable,
    FeatureConcatMeasurable,
    ConditionMeasurable,
@@ -35,11 +37,11 @@ def resolve(self, framework) -> Measurable:
        resolve_args = self._args
        measurable_type = resolve_args["type"]
        if measurable_type == MeasurableType.INPUT_FEATURE:
            return FeatureMeasurable(framework, resolve_args["feature_name"], "inputs")
            return InputFeatureMeasurable(framework, resolve_args["feature_name"])
        elif measurable_type == MeasurableType.FEATURE_CONCAT:
            return FeatureConcatMeasurable(framework, resolve_args["feat_name_list"])
        elif measurable_type == MeasurableType.PREDICTION:
            return FeatureMeasurable(framework, resolve_args["feature_name"], "outputs")
            return OutputFeatureMeasurable(framework)
        elif measurable_type == MeasurableType.CUSTOM:
            return CustomMeasurable(framework, resolve_args)
        elif measurable_type == MeasurableType.ACCURACY:
@@ -51,13 +53,13 @@ def resolve(self, framework) -> Measurable:
        elif measurable_type == MeasurableType.CONDITION_ON_INPUT:
            return ConditionMeasurable(
                framework,
                FeatureMeasurable(framework, resolve_args["feature_name"], "inputs"),
                InputFeatureMeasurable(framework, resolve_args["feature_name"]),
                resolve_args["condition_args"],
            )
        elif measurable_type == MeasurableType.CONDITION_ON_PREDICTION:
            return ConditionMeasurable(
                framework,
                FeatureMeasurable(framework, resolve_args["feature_name"], "outputs"),
                OutputFeatureMeasurable(framework),
                resolve_args["condition_args"],
            )
        elif measurable_type == MeasurableType.SCALAR_FROM_EMBEDDING:
3 changes: 1 addition & 2 deletions uptrain/core/classes/measurables/output_feature.py
@@ -6,9 +6,8 @@
class OutputFeatureMeasurable(Measurable):
    """Class that returns the output feature corresponding to the feature name."""

    def __init__(self, framework, feature_name) -> None:
    def __init__(self, framework) -> None:
        super().__init__(framework)
        self.feature_name = feature_name

    def _compute(self, inputs=None, outputs=None, gts=None, extra=None) -> Any:
        return outputs
1 change: 1 addition & 0 deletions uptrain/core/classes/monitors/__init__.py
@@ -8,3 +8,4 @@
from .edge_case import EdgeCase
from .model_bias import ModelBias
from .data_integrity import DataIntegrity
from .output_comparison import OutputComparison
2 changes: 2 additions & 0 deletions uptrain/core/classes/monitors/data_integrity.py
@@ -33,6 +33,8 @@ def base_check(self, inputs, outputs, gts=None, extra_args={}):
            has_issue = signal_value == None
        elif self.integrity_type == "less_than":
            has_issue = signal_value > self.threshold
        elif self.integrity_type == "equal_to":
            has_issue = signal_value == self.threshold
        elif self.integrity_type == "greater_than":
            has_issue = signal_value < self.threshold
        elif self.integrity_type == "minus_one":
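A minimal sketch of a config that would exercise the new `equal_to` branch (the key names are assumed to mirror the existing `less_than`/`greater_than` integrity checks; the feature name and sentinel value are placeholders):

```python
from uptrain.constants import Monitor, MeasurableType

# Flag rows whose measured value equals a sentinel, e.g. an audio duration
# of 0 seconds that indicates an empty or failed recording.
equal_to_check = {
    "type": Monitor.DATA_INTEGRITY,
    "measurable_args": {
        "type": MeasurableType.INPUT_FEATURE,
        "feature_name": "audio_duration",  # placeholder feature name
    },
    "integrity_type": "equal_to",
    "threshold": 0,
}
```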
79 changes: 79 additions & 0 deletions uptrain/core/classes/monitors/output_comparison.py
@@ -0,0 +1,79 @@
import numpy as np
from uptrain.core.classes.monitors import AbstractMonitor
from uptrain.core.classes.measurables import MeasurableResolver
from uptrain.constants import Monitor, ComparisonModel, ComparisonMetric


class OutputComparison(AbstractMonitor):
    dashboard_name = "output_comparison"
    monitor_type = Monitor.OUTPUT_COMPARISON

    def base_init(self, fw, check):
        self.comparison_model_base = check['comparison_model']
        self.comparison_model_resolved = ComparisonModelResolver().resolve(check['comparison_model'])
        self.comparison_model_inputs = MeasurableResolver(check.get("comparison_model_input_args", None)).resolve(fw)
        self.comparison_metric_base = check['comparison_metric']
        self.comparison_metric_resolved = ComparisonMetricResolver().resolve(check['comparison_metric'])
        self.threshold = check['threshold']
        self.count = 0

    def base_check(self, inputs, outputs, gts=None, extra_args={}):
        # Production outputs to be compared against the reference model.
        vals = self.measurable.compute_and_log(
            inputs, outputs, gts=gts, extra=extra_args
        )

        comparison_model_inputs = self.comparison_model_inputs.compute_and_log(
            inputs, outputs, gts=gts, extra=extra_args
        )

        # Run the reference model on the same inputs and score its outputs
        # against the production outputs with the configured metric.
        comparison_model_outputs = self.comparison_model_resolved(comparison_model_inputs)
        batch_metrics = self.comparison_metric_resolved(vals, comparison_model_outputs)
        self.batch_metrics = batch_metrics

        extra_args.update({
            self.comparison_model_base + " outputs": comparison_model_outputs,
            self.comparison_metric_base: batch_metrics,
        })

        feat_name = self.comparison_metric_base
        plot_name = f"{feat_name} Comparison - Production vs {self.comparison_model_base}"
        self.count += len(extra_args['id'])

        self.log_handler.add_scalars(
            plot_name,
            {"y_" + feat_name: np.mean(batch_metrics)},
            self.count,
            self.dashboard_name,
            file_name=plot_name,
        )

    def need_ground_truth(self):
        return False

    def base_is_data_interesting(self, inputs, outputs, gts=None, extra_args={}):
        # A data point is flagged when its similarity to the reference model's
        # output falls below the configured threshold.
        is_interesting = self.batch_metrics < self.threshold
        reasons = []
        for idx in range(len(extra_args["id"])):
            if is_interesting[idx] == 0:
                reasons.append("None")
            else:
                reasons.append(f"Different output compared to {self.comparison_model_base}")
        return is_interesting, reasons


class ComparisonModelResolver:

    def resolve(self, model):
        if model == ComparisonModel.FASTER_WHISPER:
            from uptrain.ee.lib.algorithms import faster_whisper_speech_to_text
            return faster_whisper_speech_to_text
        else:
            raise Exception(f"{model} can't be resolved")


class ComparisonMetricResolver:

    def resolve(self, metric):
        if metric == ComparisonMetric.ROGUE_L_F1:
            from uptrain.ee.lib.algorithms import rogue_l_similarity
            return rogue_l_similarity
        else:
            raise Exception(f"{metric} can't be resolved")
31 changes: 31 additions & 0 deletions uptrain/ee/lib/algorithms.py
@@ -0,0 +1,31 @@
import numpy as np
from uptrain.core.lib.helper_funcs import fn_dependency_required

# Optional dependencies: the decorated functions below raise a helpful error
# if the corresponding package is missing.
try:
    import faster_whisper
except ImportError:
    faster_whisper = None

try:
    import rouge
except ImportError:
    rouge = None


@fn_dependency_required(faster_whisper, "faster_whisper")
def faster_whisper_speech_to_text(audio_files):
    """Transcribe a list of audio files with the faster-whisper large-v2 model."""
    model_size = "large-v2"
    model = faster_whisper.WhisperModel(model_size, device="cpu", compute_type="int8")
    prescribed_texts = []
    for audio_file in audio_files:
        segments, _ = model.transcribe(audio_file, beam_size=5)
        prescribed_text = ''
        for segment in segments:
            prescribed_text += segment.text
        prescribed_texts.append(prescribed_text)
    return prescribed_texts


@fn_dependency_required(rouge, "rouge")
def rogue_l_similarity(text1_list, text2_list):
    """Return the ROUGE-L F1 score for each pair of texts in the two lists."""
    r = rouge.Rouge()
    res = r.get_scores([x.lower() for x in text1_list], [x.lower() for x in text2_list])
    return np.array([x['rouge-l']['f'] for x in res])
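
And a standalone sketch of the metric helper by itself (assuming only the `rouge` package is installed; `faster_whisper_speech_to_text` additionally downloads the large-v2 Whisper model, so it is not exercised here):

```python
from uptrain.ee.lib.algorithms import rogue_l_similarity

production = ["the quick brown fox", "uptrain monitors ml models"]
reference = ["the quick brown fox jumps", "uptrain monitors ml models"]

# One ROUGE-L F1 score per pair of texts, in [0, 1]; identical strings
# score (approximately) 1.0.
print(rogue_l_similarity(production, reference))
```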
