Skip to content

Commit

Permalink
CoVost-2: Speech Machine Translation (#3106)
Browse files Browse the repository at this point in the history
  • Loading branch information
teetone authored Oct 28, 2024
1 parent 49e8a11 commit 4b82dfd
Show file tree
Hide file tree
Showing 9 changed files with 266 additions and 10 deletions.
5 changes: 4 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,10 @@ heim =
audiolm =
crfm-helm[openai]

# For HuggingFace audio datasets
soundfile~=0.12
librosa~=0.10

# For OpenFlamingo
einops~=0.7.0
einops-exts~=0.0.4
Expand All @@ -284,7 +288,6 @@ audiolm =

# For Qwen2-Audio
transformers~=4.45.1
librosa~=0.10.2
transformers_stream_generator~=0.0.4
scipy~=1.10
torchvision>=0.14.1,<3.0.0
Expand Down
6 changes: 6 additions & 0 deletions src/helm/benchmark/presentation/run_entries_speech.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
entries: [
{description: "audio_mnist:model=audiolm", priority: 1}

# TODO: populate with the rest of the languages
{description: "covost2:source_language=English,target_language=Chinese,model=audiolm", priority: 1}
]
41 changes: 39 additions & 2 deletions src/helm/benchmark/run_specs/audio_run_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,16 @@
get_classification_metric_specs,
get_exact_match_metric_specs,
)
from helm.benchmark.metrics.metric import MetricSpec
from helm.benchmark.run_spec import RunSpec, run_spec_function
from helm.benchmark.scenarios.scenario import ScenarioSpec


def _get_multimodal_generation_adapter_spec(
########################################################################################################################
# AdapterSpecs


def _get_generation_adapter_spec(
max_tokens: int,
instructions: str = "",
max_train_instances: int = 0,
Expand All @@ -36,12 +41,24 @@ def _get_multimodal_generation_adapter_spec(
)


########################################################################################################################
# MetricSpecs


def get_machine_translation_metric_specs() -> List[MetricSpec]:
    """Return the metric specs used to score machine translation outputs.

    The returned list holds a single `MetricSpec` that points at `MachineTranslationMetric`.
    """
    metric_class_name: str = "helm.benchmark.metrics.machine_translation_metrics.MachineTranslationMetric"
    machine_translation_spec = MetricSpec(class_name=metric_class_name)
    return [machine_translation_spec]


########################################################################################################################
# RunSpecs


@run_spec_function("audio_mnist")
def get_audio_mnist_run_spec() -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.audio_language.audio_mnist_scenario.AudioMNISTScenario"
)
adapter_spec = _get_multimodal_generation_adapter_spec(
adapter_spec = _get_generation_adapter_spec(
instructions="Classify the spoken digit. Respond with only a single digit.",
max_tokens=5,
)
Expand All @@ -53,3 +70,23 @@ def get_audio_mnist_run_spec() -> RunSpec:
metric_specs=metric_specs,
groups=["audio_mnist"],
)


@run_spec_function("covost2")
def get_covost2_run_spec(source_language: str, target_language: str) -> RunSpec:
    """Build the RunSpec for CoVoST 2 speech translation from `source_language` into `target_language`.

    Both language names are forwarded unchanged to the scenario as constructor arguments and are also
    interpolated into the run name and the translation instructions given to the model.
    """
    run_name = f"covost2:source_language={source_language},target_language={target_language}"
    scenario_args = {"source_language": source_language, "target_language": target_language}

    return RunSpec(
        name=run_name,
        scenario_spec=ScenarioSpec(
            class_name="helm.benchmark.scenarios.audio_language.covost2_scenario.CoVoST2Scenario",
            args=scenario_args,
        ),
        adapter_spec=_get_generation_adapter_spec(
            instructions=f"Translate from {source_language} to {target_language}.",
            max_tokens=50,
        ),
        # Translations are scored with the machine translation metric.
        metric_specs=get_machine_translation_metric_specs(),
        groups=["covost2"],
    )
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class AudioMNISTScenario(Scenario):
WAV_URL_TEMPLATE = r"https://github.com/soerenab/AudioMNIST/raw/544b0f4bc65227e54332e665d5e02c24be6732c2/data/{speaker_id}/{digit}_{speaker_id}_{trial_index}.wav" # noqa: E501

name = "audio_mnist"
description = "Classify an audio sample of a spoken digit"
description = "Classify an audio sample of a spoken digit ([Becker et al, 2023](https://arxiv.org/abs/1807.03418))."
tags = ["audio", "classification"]

def get_instances(self, output_path: str) -> List[Instance]:
Expand Down
162 changes: 162 additions & 0 deletions src/helm/benchmark/scenarios/audio_language/covost2_scenario.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
from typing import Dict, List
import os

from datasets import load_dataset
from tqdm import tqdm

from helm.benchmark.scenarios.scenario import (
Scenario,
Instance,
Reference,
TEST_SPLIT,
CORRECT_TAG,
Input,
Output,
)
from helm.common.media_object import MediaObject, MultimediaObject


class CoVoST2Scenario(Scenario):
    """
    CoVost-2 is a large-scale multilingual speech translation corpus covering translations from 21 languages
    into English and from English into 15 languages.
    The dataset contains the audio, transcriptions, and translations in the following languages:
    French, German, Dutch, Russian, Spanish, Italian, Turkish, Persian, Swedish, Mongolian, Chinese,
    Welsh, Catalan, Slovenian, Estonian, Indonesian, Arabic, Tamil, Portuguese, Latvian, and Japanese.
    Paper: https://arxiv.org/abs/2007.10310
    Dataset: https://huggingface.co/datasets/facebook/covost2
    Requires downloading Common Voice Corpus 4 from https://commonvoice.mozilla.org/en/datasets
    Citation:
    @misc{wang2020covost2massivelymultilingual,
        title={CoVoST 2 and Massively Multilingual Speech-to-Text Translation},
        author={Changhan Wang and Anne Wu and Juan Pino},
        year={2020},
        eprint={2007.10310},
        archivePrefix={arXiv},
        primaryClass={cs.CL},
        url={https://arxiv.org/abs/2007.10310},
    }
    """

    # Maps the language names accepted by the constructor to the language codes used in the subset names.
    LANGUAGE_TO_CODE: Dict[str, str] = {
        "English": "en",
        "German": "de",
        "French": "fr",
        "Spanish": "es",
        "Italian": "it",
        "Portuguese": "pt",
        "Russian": "ru",
        "Chinese": "zh-CN",
        "Japanese": "ja",
        "Turkish": "tr",
        "Persian": "fa",
        "Arabic": "ar",
        "Dutch": "nl",
        "Swedish": "sv-SE",
        "Indonesian": "id",
        "Tamil": "ta",
        "Latvian": "lv",
        "Slovenian": "sl",
        "Welsh": "cy",
        "Mongolian": "mn",
        "Estonian": "et",
        # Catalan is listed in the docstring and in VALID_SUBSETS ("en_ca", "ca_en"), so it needs a code too;
        # without this entry those two subsets could never be selected.
        "Catalan": "ca",
    }

    # Subset names of the form "<source code>_<target code>" that this scenario accepts.
    VALID_SUBSETS: List[str] = [
        "en_de",
        "en_tr",
        "en_fa",
        "en_sv-SE",
        "en_mn",
        "en_zh-CN",
        "en_cy",
        "en_ca",
        "en_sl",
        "en_et",
        "en_id",
        "en_ar",
        "en_ta",
        "en_lv",
        "en_ja",
        "fr_en",
        "de_en",
        "es_en",
        "ca_en",
        "it_en",
        "ru_en",
        "zh-CN_en",
        "pt_en",
        "fa_en",
        "et_en",
        "mn_en",
        "nl_en",
        "tr_en",
        "ar_en",
        "sv-SE_en",
        "lv_en",
        "sl_en",
        "ta_en",
        "ja_en",
        "id_en",
        "cy_en",
    ]

    name = "covost2"
    description = (
        "A large scale multilingual speech translation corpus ([Wang et al., 2020](https://arxiv.org/abs/2007.10310))."
    )
    tags = ["audio", "translation", "multilinguality"]

    def __init__(self, source_language: str, target_language: str) -> None:
        """Select the translation subset for the given language pair.

        Args:
            source_language: Name of the spoken language; must be a key of `LANGUAGE_TO_CODE` (e.g. "English").
            target_language: Name of the language to translate into; must be a key of `LANGUAGE_TO_CODE`.

        Raises:
            ValueError: If either name is not a key of `LANGUAGE_TO_CODE`, or if the resulting
                "<source code>_<target code>" pair is not listed in `VALID_SUBSETS`.
        """
        super().__init__()

        if source_language not in self.LANGUAGE_TO_CODE or target_language not in self.LANGUAGE_TO_CODE:
            raise ValueError(f"Invalid language. Valid languages are: {list(self.LANGUAGE_TO_CODE.keys())}")

        # Get the corresponding language codes
        source_language_code: str = self.LANGUAGE_TO_CODE[source_language]
        target_language_code: str = self.LANGUAGE_TO_CODE[target_language]

        subset: str = f"{source_language_code}_{target_language_code}"
        if subset not in self.VALID_SUBSETS:
            raise ValueError(f"Invalid subset: {subset}. Valid subsets are: {self.VALID_SUBSETS}")

        self._subset: str = subset
        self._source_language: str = source_language

    def get_instances(self, output_path: str) -> List[Instance]:
        """Load the test split of the selected subset and turn every row into an `Instance`.

        The Common Voice audio for the source language is not downloaded here: it must already be
        present at `<output_path>/<source language name>`.

        Args:
            output_path: Directory that holds the manually downloaded audio and is also passed to
                `load_dataset` as its cache directory.

        Returns:
            One instance per dataset row, whose input is the audio file and whose single correct
            reference is the row's translation.

        Raises:
            AssertionError: If the audio directory, or an audio file referenced by a row, does not exist.
        """
        data_dir: str = os.path.join(output_path, self._source_language)
        assert os.path.exists(data_dir), (
            f"Download the {self._source_language} subset from Common Voice Corpus 4 "
            f"(https://commonvoice.mozilla.org/en/datasets) and unzip and place at {data_dir}."
        )

        split: str = TEST_SPLIT
        dataset = load_dataset(
            "facebook/covost2",
            self._subset,
            cache_dir=output_path,
            data_dir=data_dir,
            split=split,
            trust_remote_code=True,
        )

        instances: List[Instance] = []
        for row in tqdm(dataset):
            audio_path: str = row["file"]
            assert os.path.exists(audio_path), f"Audio file does not exist at path: {audio_path}"

            # Named `audio_input` rather than `input` so the builtin is not shadowed.
            audio_input = Input(
                multimedia_content=MultimediaObject([MediaObject(content_type="audio/mpeg", location=audio_path)])
            )
            references = [Reference(Output(text=row["translation"]), tags=[CORRECT_TAG])]
            instances.append(Instance(input=audio_input, references=references, split=split))

        return instances
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,13 @@ metrics:
description: Fraction of instances that the predicted output matches a correct reference exactly.
lower_is_better: false

# Machine Translation metrics
- name: bleu
display_name: BLEU
short_display_name: BLEU
description: BLEU score based on [Post (2018)](https://aclanthology.org/W18-6319/).
lower_is_better: false

############################################################
perturbations: []

Expand Down Expand Up @@ -117,9 +124,9 @@ run_groups:
- name: audio_mnist
display_name: AudioMNIST
description: >
The AudioMNIST ([Becker et al, 2023](https://arxiv.org/abs/1807.03418)) dataset consists of a dataset of 30000 audio samples of
The AudioMNIST dataset consists of a dataset of 30000 audio samples of
spoken digits (0-9) of 60 different speakers. The task is to classify the digit from the
audio sample.
audio sample ([Becker et al, 2023](https://arxiv.org/abs/1807.03418)).
metric_groups:
- accuracy
- efficiency
Expand All @@ -133,3 +140,27 @@ run_groups:
who: 60 different speakers
when: "2018"
language: English

- name: covost2
display_name: CoVost-2
description: >
CoVost-2 is a large-scale multilingual speech translation corpus covering translations from 21 languages
into English and from English into 15 languages.
The dataset contains the audio, transcriptions, and translations in the following languages:
French, German, Dutch, Russian, Spanish, Italian, Turkish, Persian, Swedish, Mongolian, Chinese,
Welsh, Catalan, Slovenian, Estonian, Indonesian, Arabic, Tamil, Portuguese, Latvian, and Japanese
([Wang et al, 2020](https://arxiv.org/abs/2007.10310)).
metric_groups:
- accuracy
- efficiency
- general_information
environment:
main_name: bleu
main_split: test
taxonomy:
task: audio machine translation
what: audio, transcriptions, and translations in 21 languages
who: real speakers
when: "2020"
language: 21 languages
17 changes: 13 additions & 4 deletions src/helm/clients/openai_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,13 +142,22 @@ def _make_chat_request(self, request: Request) -> RequestResult:
image_object: Dict[str, str] = {"url": f"data:image/jpeg;base64,{base64_image}"}
content.append({"type": "image_url", "image_url": image_object})
elif media_object.is_type("audio") and media_object.location:
from helm.common.audio_utils import encode_base64 # type: ignore

base64_audio: str = (
encode_base64(media_object.location)
if media_object.is_local_file
else multimodal_request_utils.get_contents_as_base64(media_object.location)
)
format: str = media_object.content_type.split("/")[1]
if format == "mpeg":
# OpenAI expects "mp3" for mpeg audio
format = "mp3"

content.append(
{
"type": "input_audio",
"input_audio": {
"data": multimodal_request_utils.get_contents_as_base64(media_object.location),
"format": media_object.content_type.split("/")[1],
},
"input_audio": {"data": base64_audio, "format": format},
}
)
elif media_object.is_type(TEXT_TYPE):
Expand Down
8 changes: 8 additions & 0 deletions src/helm/common/audio_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import base64


def encode_base64(audio_path: str) -> str:
    """Read the audio file at `audio_path` and return its bytes as a base64-encoded UTF-8 string."""
    with open(audio_path, mode="rb") as stream:
        encoded: bytes = base64.b64encode(stream.read())
    return encoded.decode("utf-8")

0 comments on commit 4b82dfd

Please sign in to comment.