Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add AudioCaps scenario #3137

Merged
merged 4 commits into from
Nov 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/helm/benchmark/presentation/run_entries_speech.conf
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,6 @@ entries: [

# TODO: populate with the rest of the languages
{description: "fleurs:language=Finnish,model=audiolm", priority: 1}

{description: "audiocaps:model=audiolm", priority: 1}
]
26 changes: 26 additions & 0 deletions src/helm/benchmark/run_specs/audio_run_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,12 @@ def _get_audio_recognition_metric_specs() -> List[MetricSpec]:
return get_basic_metric_specs(["wa_score", "ma_score", "wip_score", "ca_score"])


def _get_open_ended_generation_metric_specs() -> List[MetricSpec]:
    """Metric specs for open-ended text generation (e.g. captioning): exact/quasi
    match, F1, ROUGE-L, BLEU-1/4, and CIDEr, on top of the basic metrics."""
    metric_names: List[str] = [
        "exact_match",
        "quasi_exact_match",
        "f1_score",
        "rouge_l",
        "bleu_1",
        "bleu_4",
        "cider",
    ]
    return get_basic_metric_specs(metric_names)


########################################################################################################################
# RunSpecs

Expand Down Expand Up @@ -159,3 +165,23 @@ def get_fleurs_run_spec(language: str) -> RunSpec:
metric_specs=metric_specs,
groups=["fleurs"],
)


@run_spec_function("audiocaps")
def get_audiocaps_run_spec() -> RunSpec:
    """Build the RunSpec for the AudioCaps audio-captioning scenario.

    Uses the open-ended-generation adapter with short-caption instructions and
    the open-ended generation metrics (BLEU/ROUGE/CIDEr etc.).
    """
    # NOTE(review): "senario" is misspelled, but it must match the actual module
    # filename (audiocaps_senario.py); rename both together if fixing.
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.audio_language.audiocaps_senario.AudioCapsScenario"
    )
    instructions = (
        "Generate a caption for the following audio. The caption should be short and does "
        "not need to be a complete sentence."
    )
    adapter_spec = _get_generation_adapter_spec(instructions=instructions, max_tokens=50)
    return RunSpec(
        name="audiocaps",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=_get_open_ended_generation_metric_specs(),
        groups=["audiocaps"],
    )
59 changes: 59 additions & 0 deletions src/helm/benchmark/scenarios/audio_language/audiocaps_senario.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from typing import List
import os

from helm.benchmark.scenarios.scenario import (
Scenario,
Instance,
Reference,
TEST_SPLIT,
CORRECT_TAG,
Input,
Output,
)
from tqdm import tqdm
from helm.common.media_object import MediaObject, MultimediaObject
from helm.common.general import ensure_file_downloaded
import pandas as pd


class AudioCapsScenario(Scenario):
    """AudioCaps

    AudioCaps is a large-scale dataset of about 46K audio clips to human-written text pairs collected
    via crowdsourcing on the AudioSet dataset, which covers a wide range of human and animal sounds,
    musical instruments and genres, and common everyday environmental sounds.

    Paper: https://aclanthology.org/N19-1011.pdf
    Code: https://github.com/cdjkim/audiocaps

    Citation:
    @inproceedings{audiocaps,
        title={AudioCaps: Generating Captions for Audios in The Wild},
        author={Kim, Chris Dongjoo and Kim, Byeongchang and Lee, Hyunmin and Kim, Gunhee},
        booktitle={NAACL-HLT},
        year={2019}
    }
    """

    # Archive of all test-split WAV clips and the CSV of reference captions,
    # re-hosted on Hugging Face.
    DOWNLOADING_URL = "https://huggingface.co/datasets/Olivia714/audiocaps/resolve/main/wav_files.zip"
    REFERENCE_URL = "https://huggingface.co/datasets/Olivia714/audiocaps/resolve/main/test.csv"

    name = "audiocaps"
    description = "A large-scale dataset of about 46K audio clips to human-written text pairs \
    ([Kim et al, 2019](https://aclanthology.org/N19-1011.pdf))."
    tags: List[str] = ["audio", "captioning"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download the AudioCaps test split and build one Instance per audio clip.

        Downloads (and unpacks) the WAV archive into ``output_path/wav_files``,
        then reads the reference CSV; each row yields an Instance whose input is
        the audio file and whose single correct reference is the caption.

        :param output_path: Directory under which the audio files are cached.
        :return: Test-split instances, one per row of the reference CSV.
        :raises FileNotFoundError: If a clip listed in the CSV is missing on disk.
        """
        instances: List[Instance] = []
        data_dir: str = os.path.join(output_path, "wav_files")
        ensure_file_downloaded(source_url=AudioCapsScenario.DOWNLOADING_URL, target_path=data_dir, unpack=True)
        for _, row in tqdm(pd.read_csv(AudioCapsScenario.REFERENCE_URL, sep=",").iterrows()):
            audiocap_id = row["audiocap_id"]
            audio_path: str = os.path.join(data_dir, f"{audiocap_id}.wav")
            # Explicit check instead of `assert`: asserts are stripped under
            # `python -O`, which would silently skip this validation.
            if not os.path.exists(audio_path):
                raise FileNotFoundError(f"Audio file does not exist at path: {audio_path}")
            # `input_` avoids shadowing the `input` builtin.
            input_ = Input(
                multimedia_content=MultimediaObject([MediaObject(content_type="audio/wav", location=audio_path)])
            )
            # Caption is cast to str defensively in case pandas parsed it as a
            # non-string dtype (e.g. NaN or numeric).
            references = [Reference(Output(text=str(row["caption"])), tags=[CORRECT_TAG])]
            instances.append(Instance(input=input_, references=references, split=TEST_SPLIT))
        return instances
20 changes: 20 additions & 0 deletions src/helm/benchmark/static/schema_speech.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -280,3 +280,23 @@ run_groups:
who: real speakers
when: "2022"
language: 102 languages

- name: audiocaps
display_name: AudioCaps
description: >
AudioCaps is a large-scale dataset of about 46K pairs of audio clips and human-written text
captions, collected via crowdsourcing on the AudioSet dataset, which covers a wide range of
human and animal sounds, musical instruments and genres, and common everyday environmental sounds.
([Kim et al, 2019](https://aclanthology.org/N19-1011.pdf)).
metric_groups:
- accuracy
- general_information
environment:
main_name: cider
main_split: test
taxonomy:
task: audio captioning
what: audio clips in the wild
who: real speakers
when: "2019"
language: English