Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add AudioCaps scenario #3137

Merged
merged 4 commits into from
Nov 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/helm/benchmark/presentation/run_entries_speech.conf
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,6 @@ entries: [

# TODO: populate with the rest of the languages
{description: "fleurs:language=Finnish,model=audiolm", priority: 1}

{description: "audiocaps:model=audiolm", priority: 1}
]
26 changes: 26 additions & 0 deletions src/helm/benchmark/run_specs/audio_run_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,12 @@ def _get_audio_recognition_metric_specs() -> List[MetricSpec]:
return get_basic_metric_specs(["wa_score", "ma_score", "wip_score", "ca_score"])


def _get_open_ended_generation_metric_specs() -> List[MetricSpec]:
    """Metric specs for open-ended text generation (e.g. captioning): exact/quasi
    match, F1, ROUGE-L, BLEU-1/4, and CIDEr, on top of the basic metrics."""
    metric_names: List[str] = [
        "exact_match",
        "quasi_exact_match",
        "f1_score",
        "rouge_l",
        "bleu_1",
        "bleu_4",
        "cider",
    ]
    return get_basic_metric_specs(metric_names)


########################################################################################################################
# RunSpecs

Expand Down Expand Up @@ -159,3 +165,23 @@ def get_fleurs_run_spec(language: str) -> RunSpec:
metric_specs=metric_specs,
groups=["fleurs"],
)


@run_spec_function("audiocaps")
def get_audiocaps_run_spec() -> RunSpec:
    """Build the RunSpec for the AudioCaps audio-captioning scenario.

    Uses the open-ended-generation adapter with short-caption instructions and
    the open-ended generation metrics (BLEU/ROUGE/CIDEr etc.).
    """
    # NOTE(review): "senario" is misspelled, but it must match the actual module
    # filename (audiocaps_senario.py); rename both together if fixing.
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.audio_language.audiocaps_senario.AudioCapsScenario"
    )
    instructions = (
        "Generate a caption for the following audio. The caption should be short and does "
        "not need to be a complete sentence."
    )
    adapter_spec = _get_generation_adapter_spec(instructions=instructions, max_tokens=50)
    return RunSpec(
        name="audiocaps",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=_get_open_ended_generation_metric_specs(),
        groups=["audiocaps"],
    )
59 changes: 59 additions & 0 deletions src/helm/benchmark/scenarios/audio_language/audiocaps_senario.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from typing import List
import os

from helm.benchmark.scenarios.scenario import (
Scenario,
Instance,
Reference,
TEST_SPLIT,
CORRECT_TAG,
Input,
Output,
)
from tqdm import tqdm
from helm.common.media_object import MediaObject, MultimediaObject
from helm.common.general import ensure_file_downloaded
import pandas as pd


class AudioCapsScenario(Scenario):
    """AudioCaps

    AudioCaps is a large-scale dataset of about 46K audio clips to human-written text pairs collected
    via crowdsourcing on the AudioSet dataset, which covers a wide range of human and animal sounds,
    musical instruments and genres, and common everyday environmental sounds.

    Paper: https://aclanthology.org/N19-1011.pdf
    Code: https://github.com/cdjkim/audiocaps

    Citation:
    @inproceedings{audiocaps,
        title={AudioCaps: Generating Captions for Audios in The Wild},
        author={Kim, Chris Dongjoo and Kim, Byeongchang and Lee, Hyunmin and Kim, Gunhee},
        booktitle={NAACL-HLT},
        year={2019}
    }
    """

    # Archive of all test-split WAV clips and the CSV of reference captions,
    # re-hosted on Hugging Face.
    DOWNLOADING_URL = "https://huggingface.co/datasets/Olivia714/audiocaps/resolve/main/wav_files.zip"
    REFERENCE_URL = "https://huggingface.co/datasets/Olivia714/audiocaps/resolve/main/test.csv"

    name = "audiocaps"
    description = "A large-scale dataset of about 46K audio clips to human-written text pairs \
    ([Kim et al, 2019](https://aclanthology.org/N19-1011.pdf))."
    tags: List[str] = ["audio", "captioning"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download the AudioCaps test split and build one Instance per audio clip.

        Downloads (and unpacks) the WAV archive into ``output_path/wav_files``,
        then reads the reference CSV; each row yields an Instance whose input is
        the audio file and whose single correct reference is the caption.

        :param output_path: Directory under which the audio files are cached.
        :return: Test-split instances, one per row of the reference CSV.
        :raises FileNotFoundError: If a clip listed in the CSV is missing on disk.
        """
        instances: List[Instance] = []
        data_dir: str = os.path.join(output_path, "wav_files")
        ensure_file_downloaded(source_url=AudioCapsScenario.DOWNLOADING_URL, target_path=data_dir, unpack=True)
        for _, row in tqdm(pd.read_csv(AudioCapsScenario.REFERENCE_URL, sep=",").iterrows()):
            audiocap_id = row["audiocap_id"]
            audio_path: str = os.path.join(data_dir, f"{audiocap_id}.wav")
            # Explicit check instead of `assert`: asserts are stripped under
            # `python -O`, which would silently skip this validation.
            if not os.path.exists(audio_path):
                raise FileNotFoundError(f"Audio file does not exist at path: {audio_path}")
            # `input_` avoids shadowing the `input` builtin.
            input_ = Input(
                multimedia_content=MultimediaObject([MediaObject(content_type="audio/wav", location=audio_path)])
            )
            # Caption is cast to str defensively in case pandas parsed it as a
            # non-string dtype (e.g. NaN or numeric).
            references = [Reference(Output(text=str(row["caption"])), tags=[CORRECT_TAG])]
            instances.append(Instance(input=input_, references=references, split=TEST_SPLIT))
        return instances
20 changes: 20 additions & 0 deletions src/helm/benchmark/static/schema_speech.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -280,3 +280,23 @@ run_groups:
who: real speakers
when: "2022"
language: 102 languages

- name: audiocaps
display_name: AudioCaps
description: >
AudioCaps is a large-scale dataset of about 46K pairs of audio clips and human-written text
captions, collected via crowdsourcing on the AudioSet dataset, which covers a wide range of
human and animal sounds, musical instruments and genres, and common everyday environmental sounds.
([Kim et al, 2019](https://aclanthology.org/N19-1011.pdf)).
metric_groups:
- accuracy
- general_information
environment:
main_name: cider
main_split: test
taxonomy:
task: audio captioning
what: audio clips in the wild
who: real speakers
when: "2019"
language: English