Add Vocal Sound, Multilingual Librispeech, and FLEURS scenarios (#3130)
ImKeTT authored Nov 6, 2024
1 parent 68651b9 commit 137698a
Showing 10 changed files with 703 additions and 3 deletions.
6 changes: 5 additions & 1 deletion setup.cfg
@@ -54,6 +54,8 @@ install_requires=
scipy~=1.10
uncertainty-calibration~=0.1.4
scikit-learn~=1.1
jiwer~=3.0
rapidfuzz~=3.10

# Models and Metrics Extras
transformers~=4.40 # For anthropic_client, vision_language.huggingface_vlm_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics)
@@ -146,7 +148,7 @@ mistral =
mistralai~=1.1

openai =
openai~=1.0
openai~=1.52
tiktoken~=0.7
pydantic~=2.0 # For model_dump(mode="json") - openai only requires pydantic>=1.9.0

@@ -289,6 +291,8 @@ audiolm =

# For metrics
pycocoevalcap~=1.2
jiwer~=3.0
rapidfuzz~=3.10

# Install everything
all =
44 changes: 44 additions & 0 deletions src/helm/benchmark/metrics/evaluate_reference_metrics.py
@@ -9,6 +9,7 @@
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import numpy as np
from jiwer import wer, mer, wip, cer

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
@@ -202,6 +203,45 @@ def cider(gold: str, pred: str) -> float:
return average_score


def wa_score(gold: str, pred: str) -> float:
    # Word Accuracy (WA) equals 1 - word error rate (WER), a common metric
    # for evaluating the accuracy of speech recognition systems.
    # Note that this score can be negative because the WER can exceed 1.
    # https://huggingface.co/learn/audio-course/en/chapter5/evaluation#word-error-rate
    if not pred:
        return 0
    wa_ret = 1 - wer(gold, pred)
    return wa_ret


def ma_score(gold: str, pred: str) -> float:
    # Match Accuracy (MA) equals 1 - match error rate (MER), which evaluates
    # the accuracy of speech recognition systems.
    if not pred:
        return 0
    ma_ret = 1 - mer(gold, pred)
    return ma_ret


def wip_score(gold: str, pred: str) -> float:
    # Word Information Preservation (WIP) measures how much word information
    # a speech recognition system preserves (higher is better).
    if not pred:
        return 0
    wip_ret = wip(gold, pred)
    return wip_ret


def ca_score(gold: str, pred: str) -> float:
    # Character Accuracy (CA) equals 1 - character error rate (CER), which
    # evaluates the accuracy of speech recognition systems at the character level.
    if not pred:
        return 0
    ca_ret = 1 - cer(gold, pred)
    return ca_ret


def extract_set_from_text(
set_str: str,
set_start_str: str = " is ",
@@ -351,6 +391,10 @@ def compute_metrics_helper(
"chinese_rouge_2": get_chinese_rouge_function("rouge2"),
"cleva_math_result_match": cleva_math_result_match,
"absolute_value_difference": absolute_value_difference,
"wa_score": wa_score,
"ma_score": ma_score,
"wip_score": wip_score,
"ca_score": ca_score,
}

stats: List[Stat] = []
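As a sanity check, the four new scores can be computed directly with jiwer (a minimal sketch, assuming the jiwer~3.0 pin above; the gold/pred pair is invented for illustration):

from jiwer import cer, mer, wer, wip

gold = "the quick brown fox jumps over the lazy dog"
pred = "the quick brown fox jumped over a lazy dog"

print(f"WA  = {1 - wer(gold, pred):.3f}")  # word accuracy; negative if WER > 1
print(f"MA  = {1 - mer(gold, pred):.3f}")  # match accuracy, in [0, 1]
print(f"WIP = {wip(gold, pred):.3f}")      # word information preserved, in [0, 1]
print(f"CA  = {1 - cer(gold, pred):.3f}")  # character accuracy; negative if CER > 1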
8 changes: 8 additions & 0 deletions src/helm/benchmark/presentation/run_entries_speech.conf
@@ -3,4 +3,12 @@ entries: [

# TODO: populate with the rest of the languages
{description: "covost2:source_language=English,target_language=Chinese,model=audiolm", priority: 1}

{description: "vocal_sound:model=audiolm", priority: 1}

# TODO: populate with the rest of the languages
{description: "multilingual_librispeech:language=Dutch,model=audiolm", priority: 1}

# TODO: populate with the rest of the languages
{description: "fleurs:language=Finnish,model=audiolm", priority: 1}
]
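Each entry's description names a function registered with @run_spec_function and passes its key=value pairs as arguments. Conceptually the resolution works roughly like this (a simplified sketch, not HELM's actual resolver, which also handles model expansion and quoting):

def parse_run_entry(description: str):
    # "fleurs:language=Finnish,model=audiolm" splits into the run-spec name
    # and its keyword arguments.
    name, _, arg_str = description.partition(":")
    args = dict(pair.split("=", 1) for pair in arg_str.split(",") if pair)
    return name, args


assert parse_run_entry("fleurs:language=Finnish,model=audiolm") == (
    "fleurs",
    {"language": "Finnish", "model": "audiolm"},
)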
69 changes: 69 additions & 0 deletions src/helm/benchmark/run_specs/audio_run_specs.py
@@ -8,6 +8,7 @@
from helm.benchmark.metrics.common_metric_specs import (
get_classification_metric_specs,
get_exact_match_metric_specs,
get_basic_metric_specs,
)
from helm.benchmark.metrics.metric import MetricSpec
from helm.benchmark.run_spec import RunSpec, run_spec_function
@@ -49,6 +50,10 @@ def get_machine_translation_metric_specs() -> List[MetricSpec]:
return [MetricSpec(class_name="helm.benchmark.metrics.machine_translation_metrics.MachineTranslationMetric")]


def _get_audio_recognition_metric_specs() -> List[MetricSpec]:
return get_basic_metric_specs(["wa_score", "ma_score", "wip_score", "ca_score"])


########################################################################################################################
# RunSpecs

@@ -90,3 +95,67 @@ def get_covost2_run_spec(source_language: str, target_language: str) -> RunSpec:
metric_specs=metric_specs,
groups=["covost2"],
)


@run_spec_function("vocal_sound")
def get_vocal_sound_run_spec() -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.audio_language.vocal_sound_scenario.VocalSoundScenario",
)
adapter_spec = _get_generation_adapter_spec(
instructions="Listen to the audio and classify the speaker behavior. Choose only from these options:"
'"Cough", "Laughter", "Sigh", "Sneeze", "Sniff", or "Throat clearing". Respond with just the behavior.',
max_tokens=5,
)
metric_specs = get_exact_match_metric_specs() + get_classification_metric_specs()
return RunSpec(
name="vocal_sound",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["vocal_sound"],
)


@run_spec_function("multilingual_librispeech")
def get_multilingual_librispeech_run_spec(language: str) -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.audio_language.multilingual_librispeech_scenario."
"MultilingualLibriSpeechScenario",
args={"language": language},
)
adapter_spec = _get_generation_adapter_spec(
instructions="Listen to the audio and generate an accurate transcript of the spoken content. "
"Respond with only the transcript text.",
max_tokens=100,
)
metric_specs = _get_audio_recognition_metric_specs()
return RunSpec(
name="multilingual_librispeech",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["multilingual_librispeech"],
)


@run_spec_function("fleurs")
def get_fleurs_run_spec(language: str) -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.audio_language.fleurs_scenario.FLEURSScenario",
args={"language": language},
)
adapter_spec = _get_generation_adapter_spec(
instructions="Listen to the audio and identify the language spoken. Choose from these"
'options only: "Finnish", "Bulgarian", "Hebrew", "Zulu", "Bengali", "Thai",'
'"Mandarin Chinese". Respond with just the language name.',
max_tokens=5,
)
metric_specs = get_exact_match_metric_specs() + get_classification_metric_specs()
return RunSpec(
name="fleurs",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["fleurs"],
)
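Note that the multi-line instructions above rely on Python's implicit concatenation of adjacent string literals, which inserts no separator; the spaces at each literal boundary are therefore load-bearing. A minimal illustration:

prompt = (
    "Listen to the audio and identify the language spoken. Choose only from these "
    'options: "Finnish".'
)
assert prompt == (
    'Listen to the audio and identify the language spoken. Choose only from these options: "Finnish".'
)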
4 changes: 2 additions & 2 deletions src/helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py
@@ -48,14 +48,14 @@ class AudioMNISTScenario(Scenario):

def get_instances(self, output_path: str) -> List[Instance]:
instances: List[Instance] = []
wav_save_dir: str = os.path.join(output_path, "wav_files")
wav_save_dir: str = os.path.join(output_path, "audio_files")
ensure_directory_exists(wav_save_dir)
for row in tqdm(load_dataset("flexthink/audiomnist", cache_dir=output_path, split=TEST_SPLIT)):
local_audio_path = os.path.join(wav_save_dir, row["audio"]["path"])
audio_array = row["audio"]["array"]
ensure_wav_file_exists_from_array(local_audio_path, audio_array, row["audio"]["sampling_rate"])
input = Input(
multimedia_content=MultimediaObject([MediaObject(content_type="audio/mpeg", location=local_audio_path)])
multimedia_content=MultimediaObject([MediaObject(content_type="audio/wav", location=local_audio_path)])
)
references = [Reference(Output(text=str(row["digit"])), tags=[CORRECT_TAG])]
instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
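The ensure_wav_file_exists_from_array helper used above comes from HELM's scenario utilities; here is a hypothetical sketch of the behavior it provides, written against the soundfile library as an assumption (this is not HELM's actual implementation):

import os

import numpy as np
import soundfile as sf  # assumed here; HELM's real helper may use another backend


def ensure_wav_file_exists_from_array(path: str, array: np.ndarray, sampling_rate: int) -> None:
    # Materialize the dataset's in-memory samples as a WAV file on disk,
    # skipping the write if an earlier run already produced it.
    if not os.path.exists(path):
        sf.write(path, array, sampling_rate)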
[Diffs for the remaining changed files, including the new vocal_sound, multilingual_librispeech, and fleurs scenario modules, did not render on this page.]
