Add Vocal Sound, Multilingual Librispeech, and FLEURS scenarios (#3130)
ImKeTT authored Nov 6, 2024
1 parent 68651b9 commit 137698a
Showing 10 changed files with 703 additions and 3 deletions.
6 changes: 5 additions & 1 deletion setup.cfg
@@ -54,6 +54,8 @@ install_requires=
scipy~=1.10
uncertainty-calibration~=0.1.4
scikit-learn~=1.1
jiwer~=3.0
rapidfuzz~=3.10

# Models and Metrics Extras
transformers~=4.40 # For anthropic_client, vision_language.huggingface_vlm_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics)
@@ -146,7 +148,7 @@ mistral =
mistralai~=1.1

openai =
openai~=1.0
openai~=1.52
tiktoken~=0.7
pydantic~=2.0 # For model_dump(mode="json") - openai only requires pydantic>=1.9.0

@@ -289,6 +291,8 @@ audiolm =

# For metrics
pycocoevalcap~=1.2
jiwer~=3.0
rapidfuzz~=3.10

# Install everything
all =
44 changes: 44 additions & 0 deletions src/helm/benchmark/metrics/evaluate_reference_metrics.py
@@ -9,6 +9,7 @@
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import numpy as np
from jiwer import wer, mer, wip, cer

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
@@ -202,6 +203,45 @@ def cider(gold: str, pred: str) -> float:
return average_score


def wa_score(gold: str, pred: str) -> float:
    # Word Accuracy (WA) equals 1 - word error rate (WER), a common metric
    # for evaluating the accuracy of speech recognition systems.
    # Note that this score can be negative because the WER can exceed 1.
    # https://huggingface.co/learn/audio-course/en/chapter5/evaluation#word-error-rate
    if not pred:
        return 0
    wa_ret = 1 - wer(gold, pred)
    return wa_ret


def ma_score(gold: str, pred: str) -> float:
    # Match Accuracy (MA) equals 1 - match error rate (MER), which evaluates
    # the accuracy of speech recognition systems.
    if not pred:
        return 0
    ma_ret = 1 - mer(gold, pred)
    return ma_ret


def wip_score(gold: str, pred: str) -> float:
    # Word Information Preservation (WIP) measures how much word information
    # a speech recognition system preserves (higher is better).
    if not pred:
        return 0
    wip_ret = wip(gold, pred)
    return wip_ret


def ca_score(gold: str, pred: str) -> float:
    # Character Accuracy (CA) equals 1 - character error rate (CER), which
    # evaluates the accuracy of speech recognition systems at the character level.
    if not pred:
        return 0
    ca_ret = 1 - cer(gold, pred)
    return ca_ret


def extract_set_from_text(
set_str: str,
set_start_str: str = " is ",
@@ -351,6 +391,10 @@ def compute_metrics_helper(
"chinese_rouge_2": get_chinese_rouge_function("rouge2"),
"cleva_math_result_match": cleva_math_result_match,
"absolute_value_difference": absolute_value_difference,
"wa_score": wa_score,
"ma_score": ma_score,
"wip_score": wip_score,
"ca_score": ca_score,
}

stats: List[Stat] = []
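As a sanity check, the four new scores can be computed directly with jiwer (a minimal sketch, assuming the jiwer~3.0 pin above; the gold/pred pair is invented for illustration):

from jiwer import cer, mer, wer, wip

gold = "the quick brown fox jumps over the lazy dog"
pred = "the quick brown fox jumped over a lazy dog"

print(f"WA  = {1 - wer(gold, pred):.3f}")  # word accuracy; negative if WER > 1
print(f"MA  = {1 - mer(gold, pred):.3f}")  # match accuracy, in [0, 1]
print(f"WIP = {wip(gold, pred):.3f}")      # word information preserved, in [0, 1]
print(f"CA  = {1 - cer(gold, pred):.3f}")  # character accuracy; negative if CER > 1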
8 changes: 8 additions & 0 deletions src/helm/benchmark/presentation/run_entries_speech.conf
@@ -3,4 +3,12 @@ entries: [

# TODO: populate with the rest of the languages
{description: "covost2:source_language=English,target_language=Chinese,model=audiolm", priority: 1}

{description: "vocal_sound:model=audiolm", priority: 1}

# TODO: populate with the rest of the languages
{description: "multilingual_librispeech:language=Dutch,model=audiolm", priority: 1}

# TODO: populate with the rest of the languages
{description: "fleurs:language=Finnish,model=audiolm", priority: 1}
]
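Each entry's description names a function registered with @run_spec_function and passes its key=value pairs as arguments. Conceptually the resolution works roughly like this (a simplified sketch, not HELM's actual resolver, which also handles model expansion and quoting):

def parse_run_entry(description: str):
    # "fleurs:language=Finnish,model=audiolm" splits into the run-spec name
    # and its keyword arguments.
    name, _, arg_str = description.partition(":")
    args = dict(pair.split("=", 1) for pair in arg_str.split(",") if pair)
    return name, args


assert parse_run_entry("fleurs:language=Finnish,model=audiolm") == (
    "fleurs",
    {"language": "Finnish", "model": "audiolm"},
)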
69 changes: 69 additions & 0 deletions src/helm/benchmark/run_specs/audio_run_specs.py
@@ -8,6 +8,7 @@
from helm.benchmark.metrics.common_metric_specs import (
get_classification_metric_specs,
get_exact_match_metric_specs,
get_basic_metric_specs,
)
from helm.benchmark.metrics.metric import MetricSpec
from helm.benchmark.run_spec import RunSpec, run_spec_function
@@ -49,6 +50,10 @@ def get_machine_translation_metric_specs() -> List[MetricSpec]:
return [MetricSpec(class_name="helm.benchmark.metrics.machine_translation_metrics.MachineTranslationMetric")]


def _get_audio_recognition_metric_specs() -> List[MetricSpec]:
return get_basic_metric_specs(["wa_score", "ma_score", "wip_score", "ca_score"])


########################################################################################################################
# RunSpecs

@@ -90,3 +95,67 @@ def get_covost2_run_spec(source_language: str, target_language: str) -> RunSpec:
metric_specs=metric_specs,
groups=["covost2"],
)


@run_spec_function("vocal_sound")
def get_vocal_sound_run_spec() -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.audio_language.vocal_sound_scenario.VocalSoundScenario",
)
adapter_spec = _get_generation_adapter_spec(
instructions="Listen to the audio and classify the speaker behavior. Choose only from these options:"
'"Cough", "Laughter", "Sigh", "Sneeze", "Sniff", or "Throat clearing". Respond with just the behavior.',
max_tokens=5,
)
metric_specs = get_exact_match_metric_specs() + get_classification_metric_specs()
return RunSpec(
name="vocal_sound",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["vocal_sound"],
)


@run_spec_function("multilingual_librispeech")
def get_multilingual_librispeech_run_spec(language: str) -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.audio_language.multilingual_librispeech_scenario."
"MultilingualLibriSpeechScenario",
args={"language": language},
)
adapter_spec = _get_generation_adapter_spec(
instructions="Listen to the audio and generate an accurate transcript of the spoken content. "
"Respond with only the transcript text.",
max_tokens=100,
)
metric_specs = _get_audio_recognition_metric_specs()
return RunSpec(
name="multilingual_librispeech",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["multilingual_librispeech"],
)


@run_spec_function("fleurs")
def get_fleurs_run_spec(language: str) -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.audio_language.fleurs_scenario.FLEURSScenario",
args={"language": language},
)
adapter_spec = _get_generation_adapter_spec(
instructions="Listen to the audio and identify the language spoken. Choose from these"
'options only: "Finnish", "Bulgarian", "Hebrew", "Zulu", "Bengali", "Thai",'
'"Mandarin Chinese". Respond with just the language name.',
max_tokens=5,
)
metric_specs = get_exact_match_metric_specs() + get_classification_metric_specs()
return RunSpec(
name="fleurs",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["fleurs"],
)
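Note that the multi-line instructions above rely on Python's implicit concatenation of adjacent string literals, which inserts no separator; the spaces at each literal boundary are therefore load-bearing. A minimal illustration:

prompt = (
    "Listen to the audio and identify the language spoken. Choose only from these "
    'options: "Finnish".'
)
assert prompt == (
    'Listen to the audio and identify the language spoken. Choose only from these options: "Finnish".'
)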
4 changes: 2 additions & 2 deletions src/helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py
@@ -48,14 +48,14 @@ class AudioMNISTScenario(Scenario):

def get_instances(self, output_path: str) -> List[Instance]:
instances: List[Instance] = []
wav_save_dir: str = os.path.join(output_path, "wav_files")
wav_save_dir: str = os.path.join(output_path, "audio_files")
ensure_directory_exists(wav_save_dir)
for row in tqdm(load_dataset("flexthink/audiomnist", cache_dir=output_path, split=TEST_SPLIT)):
local_audio_path = os.path.join(wav_save_dir, row["audio"]["path"])
audio_array = row["audio"]["array"]
ensure_wav_file_exists_from_array(local_audio_path, audio_array, row["audio"]["sampling_rate"])
input = Input(
multimedia_content=MultimediaObject([MediaObject(content_type="audio/mpeg", location=local_audio_path)])
multimedia_content=MultimediaObject([MediaObject(content_type="audio/wav", location=local_audio_path)])
)
references = [Reference(Output(text=str(row["digit"])), tags=[CORRECT_TAG])]
instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
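The ensure_wav_file_exists_from_array helper used above comes from HELM's scenario utilities; here is a hypothetical sketch of the behavior it provides, written against the soundfile library as an assumption (this is not HELM's actual implementation):

import os

import numpy as np
import soundfile as sf  # assumed here; HELM's real helper may use another backend


def ensure_wav_file_exists_from_array(path: str, array: np.ndarray, sampling_rate: int) -> None:
    # Materialize the dataset's in-memory samples as a WAV file on disk,
    # skipping the write if an earlier run already produced it.
    if not os.path.exists(path):
        sf.write(path, array, sampling_rate)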
[Diffs for the remaining changed files, including the new vocal_sound, multilingual_librispeech, and fleurs scenario modules, did not render on this page.]
