Add Common_Voice_15 and RSB audio scenarios #3147

Merged · 6 commits · Nov 10, 2024
3 changes: 1 addition & 2 deletions setup.cfg
@@ -54,8 +54,6 @@ install_requires=
scipy~=1.10
uncertainty-calibration~=0.1.4
scikit-learn~=1.1
jiwer~=3.0
rapidfuzz~=3.10

# Models and Metrics Extras
transformers~=4.40 # For anthropic_client, vision_language.huggingface_vlm_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics)
@@ -293,6 +291,7 @@ audiolm =
pycocoevalcap~=1.2
jiwer~=3.0
rapidfuzz~=3.10
jieba~=0.42.1

# Install everything
all =
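A note on the dependency change above: jiwer and rapidfuzz move out of the core install_requires and now live only under the audiolm extra, together with the new jieba pin. This is why the metric functions below import them lazily. Users would install the extra with something like pip install "crfm-helm[audiolm]" (package name assumed from the HELM project's published distribution; check the setup.cfg metadata if it differs).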
61 changes: 60 additions & 1 deletion src/helm/benchmark/metrics/evaluate_reference_metrics.py
@@ -9,7 +9,6 @@
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import numpy as np
from jiwer import wer, mer, wip, cer

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
@@ -208,6 +207,11 @@ def wa_score(gold: str, pred: str) -> float:
# metric used to evaluate the accuracy of speech recognition systems.
# Note that this metric could be negative because the WER might be greater than 1.
# https://huggingface.co/learn/audio-course/en/chapter5/evaluation#word-error-rate
try:
from jiwer import wer
except ModuleNotFoundError as e:
handle_module_not_found_error(e, ["audiolm"])

if not pred:
return 0

@@ -218,6 +222,11 @@ def wa_score(gold: str, pred: str) -> float:
def ma_score(gold: str, pred: str) -> float:
# Match Accuracy (MA) equals 1 - match error rate (MER), and is used to evaluate the accuracy of
# speech recognition systems.
try:
from jiwer import mer
except ModuleNotFoundError as e:
handle_module_not_found_error(e, ["audiolm"])

if not pred:
return 0
mer_ret = mer(gold, pred)
@@ -227,6 +236,11 @@ def ma_score(gold: str, pred: str) -> float:
def wip_score(gold: str, pred: str) -> float:
# Word information preservation (WIP) measures how much of the spoken information is preserved by
# speech recognition systems.
try:
from jiwer import wip
except ModuleNotFoundError as e:
handle_module_not_found_error(e, ["audiolm"])

if not pred:
return 0
wip_ret = wip(gold, pred)
@@ -236,12 +250,53 @@ def wip_score(gold: str, pred: str) -> float:
def ca_score(gold: str, pred: str) -> float:
# Character accuracy (CA) equals the character error rate (CER), used to evaluate the accuracy
# of speech recognition systems.
try:
from jiwer import cer
except ModuleNotFoundError as e:
handle_module_not_found_error(e, ["audiolm"])

if not pred:
return 0
cer_ret = cer(gold, pred)
return cer_ret


def chinese_wa_score(gold: str, pred: str) -> float:
try:
import jieba
except ModuleNotFoundError as e:
handle_module_not_found_error(e, ["audiolm"])

return wa_score(" ".join(jieba.cut(gold)), " ".join(jieba.cut(pred)))


def chinese_ma_score(gold: str, pred: str) -> float:
try:
import jieba
except ModuleNotFoundError as e:
handle_module_not_found_error(e, ["audiolm"])

return ma_score(" ".join(jieba.cut(gold)), " ".join(jieba.cut(pred)))


def chinese_wip_score(gold: str, pred: str) -> float:
try:
import jieba
except ModuleNotFoundError as e:
handle_module_not_found_error(e, ["audiolm"])

return wip_score(" ".join(jieba.cut(gold)), " ".join(jieba.cut(pred)))


def chinese_ca_score(gold: str, pred: str) -> float:
try:
import jieba
except ModuleNotFoundError as e:
handle_module_not_found_error(e, ["audiolm"])

return ca_score(" ".join(jieba.cut(gold)), " ".join(jieba.cut(pred)))


def extract_set_from_text(
set_str: str,
set_start_str: str = " is ",
@@ -395,6 +450,10 @@ def compute_metrics_helper(
"ma_score": ma_score,
"wip_score": wip_score,
"ca_score": ca_score,
"chinese_wa_score": chinese_wa_score,
"chinese_ma_score": chinese_ma_score,
"chinese_wip_score": chinese_wip_score,
"chinese_ca_score": chinese_ca_score,
}

stats: List[Stat] = []
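Why the jieba wrappers above are needed: jiwer computes word-level metrics by splitting on whitespace, so unsegmented Chinese text would be scored as a single token. Segmenting with jieba first restores word boundaries. A minimal standalone sketch (assuming the audiolm extras are installed; the example strings are illustrative):

import jieba
from jiwer import wer

gold = "今天天气很好"
pred = "今天天气好"

# Unsegmented: each string is one whitespace-delimited token, so WER is all-or-nothing.
print(wer(gold, pred))  # 1.0

# Segmented: jieba.cut yields words; joining with spaces gives jiwer real word boundaries.
print(wer(" ".join(jieba.cut(gold)), " ".join(jieba.cut(pred))))  # a fractional WER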
57 changes: 55 additions & 2 deletions src/helm/benchmark/run_specs/audio_run_specs.py
@@ -60,6 +60,10 @@ def _get_open_ended_generation_metric_specs() -> List[MetricSpec]:
)


def _get_chinese_audio_recognition_metric_specs() -> List[MetricSpec]:
return get_basic_metric_specs(["chinese_wa_score", "chinese_ma_score", "chinese_wip_score", "chinese_ca_score"])


########################################################################################################################
# RunSpecs

@@ -173,7 +177,10 @@ def get_multilingual_librispeech_run_spec(language: str) -> RunSpec:
"Respond with only the transcript text.",
max_tokens=100,
)
metric_specs = _get_audio_recognition_metric_specs()
if "chinese" in language.lower():
metric_specs = _get_chinese_audio_recognition_metric_specs()
else:
metric_specs = _get_audio_recognition_metric_specs()
return RunSpec(
name="multilingual_librispeech",
scenario_spec=scenario_spec,
@@ -208,7 +215,7 @@ def get_fleurs_run_spec(language: str) -> RunSpec:
@run_spec_function("audiocaps")
def get_audiocaps_run_spec() -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.audio_language.audiocaps_senario.AudioCapsScenario"
class_name="helm.benchmark.scenarios.audio_language.audiocaps_scenario.AudioCapsScenario"
)
adapter_spec = _get_generation_adapter_spec(
instructions="Generate a caption for the following audio. The caption should be short and does "
@@ -223,3 +230,49 @@ def get_audiocaps_run_spec() -> RunSpec:
metric_specs=metric_specs,
groups=["audiocaps"],
)


@run_spec_function("common_voice_15")
def get_common_voice_15_run_spec(language: str) -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.audio_language.common_voice_15_scenario.CommonVoice15Scenario",
args={"language": language},
)
adapter_spec = _get_generation_adapter_spec(
instructions="Listen to the audio and generate an accurate transcript of the spoken content. "
"Respond with only the transcript text.",
max_tokens=100,
)
# Chinese characters are not supported in the default metrics
if "chinese" in language.lower():
metric_specs = _get_chinese_audio_recognition_metric_specs()
else:
metric_specs = _get_audio_recognition_metric_specs()
return RunSpec(
name="common_voice_15",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["common_voice_15"],
)


@run_spec_function("speech_robust_bench")
def get_speech_robust_bench_run_spec(subject: str) -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.audio_language.speech_robust_bench_scenario.SpeechRobustBenchScenario",
args={"subject": subject},
)
adapter_spec = _get_generation_adapter_spec(
instructions="Listen to the audio and generate an accurate transcript of the spoken content. "
"Respond with only the transcript text.",
max_tokens=100,
)
metric_specs = _get_audio_recognition_metric_specs()
return RunSpec(
name="speech_robust_bench",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["speech_robust_bench"],
)
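Both new run specs pick their metrics by inspecting the string argument parsed from the run entry. A hedged sketch of exercising that dispatch directly (calling the function defined above; importing it pulls in HELM's full dependency set):

from helm.benchmark.run_specs.audio_run_specs import get_common_voice_15_run_spec

spec = get_common_voice_15_run_spec(language="chinese_hk")
assert spec.name == "common_voice_15"
# "chinese" in language.lower() holds here, so the jieba-based metric specs were selected.

From the command line, the corresponding run entries would look roughly like common_voice_15:language=english or speech_robust_bench:subject=..., passed to helm-run (exact invocation assumed; see the HELM documentation).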
src/helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py
@@ -16,7 +16,7 @@
from datasets import load_dataset
from helm.common.media_object import MediaObject, MultimediaObject
from helm.common.general import ensure_directory_exists
from helm.common.audio_utils import ensure_wav_file_exists_from_array
from helm.common.audio_utils import ensure_audio_file_exists_from_array


class AudioMNISTScenario(Scenario):
Expand Down Expand Up @@ -53,7 +53,7 @@ def get_instances(self, output_path: str) -> List[Instance]:
for row in tqdm(load_dataset("flexthink/audiomnist", cache_dir=output_path, split=TEST_SPLIT)):
local_audio_path = os.path.join(wav_save_dir, row["audio"]["path"])
audio_array = row["audio"]["array"]
ensure_wav_file_exists_from_array(local_audio_path, audio_array, row["audio"]["sampling_rate"])
ensure_audio_file_exists_from_array(local_audio_path, audio_array, row["audio"]["sampling_rate"])
input = Input(
multimedia_content=MultimediaObject([MediaObject(content_type="audio/wav", location=local_audio_path)])
)
src/helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py (new file)
@@ -0,0 +1,98 @@
"""Scenarios for audio models"""

from typing import List

from helm.benchmark.scenarios.scenario import (
Scenario,
Instance,
Reference,
TEST_SPLIT,
CORRECT_TAG,
Input,
Output,
)
from collections import OrderedDict
from tqdm import tqdm
from datasets import load_dataset
from helm.common.media_object import MediaObject, MultimediaObject
from helm.common.hierarchical_logger import hlog


class CommonVoice15Scenario(Scenario):
"""CommonVoice15 Scenario

    The most recent release of Common Voice 15 (Ardila et al., 2019) includes 114 languages. Over 50,000
    individuals have participated so far, resulting in 2,500 hours of collected audio. This is the largest
    public-domain audio corpus for speech recognition, in terms of both hours and languages. The task is to
    recognize the speech from the audio sample.

Paper: https://arxiv.org/abs/1912.06670
Code: https://github.com/common-voice/common-voice

Citation:
@article{ardila2019common,
title={Common voice: A massively-multilingual speech corpus},
author={Ardila, Rosana and Branson, Megan and Davis, Kelly and
Henretty, Michael and Kohler, Michael and Meyer, Josh and Morais,
Reuben and Saunders, Lindsay and Tyers, Francis M and Weber, Gregor},
journal={arXiv preprint arXiv:1912.06670},
year={2019}
}

"""

HF_DATASET_NAME = "mozilla-foundation/common_voice_15_0"

# Four languages randomly selected from the 114 languages in the Common Voice 15 dataset, following
# Qwen2-Audio (https://arxiv.org/abs/2407.10759). The full list of languages is:
# https://huggingface.co/datasets/mozilla-foundation/common_voice_15_0/blob/main/languages.py
_COMMON_VOICE_TEST_LANG_TO_ID = OrderedDict(
[
("English", "en"),
("Chinese_hk", "zh-HK"),
("German", "de"),
("French", "fr"),
]
)

name = "common_voice_15"
description = "Speech recognition for 4 of the 114 languages in Common Voice 15 \
([Ardila et al., 2019](https://arxiv.org/abs/1912.06670))."
tags: List[str] = ["audio", "recognition"]

def __init__(self, language: str) -> None:
super().__init__()

language = language.capitalize()
if language not in CommonVoice15Scenario._COMMON_VOICE_TEST_LANG_TO_ID.keys():
raise ValueError(
f"Invalid language. Valid languages are: {CommonVoice15Scenario._COMMON_VOICE_TEST_LANG_TO_ID.keys()}"
)

self._language: str = language
hlog(
    "You need to sign in to Hugging Face to download this dataset. Please make sure "
    "you are authenticated before downloading."
)

def get_instances(self, output_path: str) -> List[Instance]:
instances: List[Instance] = []
language_category = CommonVoice15Scenario._COMMON_VOICE_TEST_LANG_TO_ID[self._language]
for row in tqdm(
load_dataset(
CommonVoice15Scenario.HF_DATASET_NAME,
name=language_category,
cache_dir=output_path,
split=TEST_SPLIT,
)
):
local_audio_path = row["path"]
answer = row["sentence"]
input = Input(
multimedia_content=MultimediaObject([MediaObject(content_type="audio/mpeg", location=local_audio_path)])
)
references = [Reference(Output(text=answer), tags=[CORRECT_TAG])]
instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
return instances
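One practical note on the hlog warning in this scenario: mozilla-foundation/common_voice_15_0 is a gated dataset on the Hugging Face Hub, so load_dataset only succeeds after the user accepts the dataset terms and authenticates. A sketch of the one-time setup (assuming huggingface_hub is installed):

# Accept the terms on the dataset page first, then authenticate:
from huggingface_hub import login

login()  # prompts for an access token; running `huggingface-cli login` in a shell also works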
10 changes: 9 additions & 1 deletion src/helm/benchmark/scenarios/audio_language/fleurs_scenario.py
@@ -29,7 +29,15 @@ class FLEURSScenario(Scenario):
Code: https://tensorflow.org/datasets/catalog/xtreme_s

Citation:

@inproceedings{conneau2023fleurs,
title={Fleurs: Few-shot learning evaluation of universal representations of speech},
author={Conneau, Alexis and Ma, Min and Khanuja, Simran and Zhang, Yu and Axelrod,
Vera and Dalmia, Siddharth and Riesa, Jason and Rivera, Clara and Bapna, Ankur},
booktitle={2022 IEEE Spoken Language Technology Workshop (SLT)},
pages={798--805},
year={2023},
organization={IEEE}
}
"""

HF_DATASET_NAME = "google/xtreme_s"
src/helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py
@@ -11,7 +11,7 @@
Input,
Output,
)
from helm.common.audio_utils import ensure_wav_file_exists_from_array
from helm.common.audio_utils import ensure_audio_file_exists_from_array
from helm.common.general import ensure_directory_exists
from helm.common.media_object import MediaObject, MultimediaObject

@@ -65,7 +65,7 @@ def get_instances(self, output_path: str) -> List[Instance]:
wav_path = os.path.join(wav_dir, row["audio"]["path"])
print(len(row["audio"]["array"]))
print(list(row["audio"]["array"])[0:10])
ensure_wav_file_exists_from_array(
ensure_audio_file_exists_from_array(
wav_path, row["audio"]["array"], sample_rate=IEMOCAPAudioScenario.SAMPLE_RATE
)
input = Input(
src/helm/benchmark/scenarios/audio_language/meld_audio_scenario.py
@@ -15,7 +15,7 @@
Input,
Output,
)
from helm.common.audio_utils import ensure_wav_file_exists_from_array, get_array_from_audio_file
from helm.common.audio_utils import ensure_audio_file_exists_from_array, get_array_from_audio_file
from helm.common.media_object import MediaObject, MultimediaObject
from helm.common.general import ensure_directory_exists, ensure_file_downloaded

@@ -98,7 +98,7 @@ def get_instances(self, output_path: str) -> List[Instance]:
flac_file_name = f"dia{row.Dialogue_ID}_utt{row.Utterance_ID}.flac"
flac_file_path = os.path.join(flac_dir, flac_file_name)
audio_array = get_array_from_audio_file(flac_file_path, MELDAudioScenario.SAMPLE_RATE)
ensure_wav_file_exists_from_array(wav_file_path, audio_array, MELDAudioScenario.SAMPLE_RATE)
ensure_audio_file_exists_from_array(wav_file_path, audio_array, MELDAudioScenario.SAMPLE_RATE)
input = Input(
multimedia_content=MultimediaObject(
media_objects=[MediaObject(location=wav_file_path, content_type="audio/wav")]
src/helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py
@@ -15,7 +15,7 @@
from tqdm import tqdm
from datasets import load_dataset
from helm.common.media_object import MediaObject, MultimediaObject
from helm.common.audio_utils import ensure_mp3_file_exists_from_array
from helm.common.audio_utils import ensure_audio_file_exists_from_array


class MultilingualLibriSpeechScenario(Scenario):
@@ -70,7 +70,7 @@ def get_instances(self, output_path: str) -> List[Instance]:
):
local_audio_path = os.path.join(audio_save_dir, row["original_path"].split("/")[-1])
# download to the local path
ensure_mp3_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
ensure_audio_file_exists_from_array(local_audio_path, row["audio"]["array"], row["audio"]["sampling_rate"])
answer = row["transcript"]
input = Input(
multimedia_content=MultimediaObject([MediaObject(content_type="audio/mpeg", location=local_audio_path)])
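A closing note on the ensure_wav_file_exists_from_array / ensure_mp3_file_exists_from_array to ensure_audio_file_exists_from_array renames running through these scenario files: the call sites suggest a single helper that writes an audio array to disk once, with the container format following the file extension. A hypothetical sketch of such a helper (the real implementation lives in helm.common.audio_utils and may differ):

import os

import numpy as np
import soundfile as sf  # recent libsndfile builds can write mp3 as well as wav/flac

def ensure_audio_file_exists_from_array(path: str, array: np.ndarray, sample_rate: int) -> None:
    # Skip the write if the file is already cached on disk.
    if os.path.exists(path):
        return
    # The output format (wav, flac, mp3, ...) is inferred from the extension of `path`.
    sf.write(path, np.asarray(array), samplerate=sample_rate)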