support audio & audio-text data reading (#95)

* fix opencc serialization error * support audio-text data reading * update multimodal_README * fix pre-commit error * modify audio_special_token * support only one target_field * fix pre-commit * add id for log
modelscope · Nov 24, 2023 · 155f9f6 · 155f9f6
1 parent 67026a8
commit 155f9f6
Show file tree

Hide file tree

Showing 7 changed files with 481 additions and 4 deletions.
diff --git a/configs/config_all.yaml b/configs/config_all.yaml
@@ -26,6 +26,8 @@ cache_compress: null                                        # The compression me
 # for multimodal data processing
 image_key: 'images'                                         # Key name of field to store the list of sample image paths.
 image_special_token: '<__dj__image>'                        # The special token that represents an image in the text. In default, it's "<__dj__image>". You can specify your own special token according to your input dataset.
+audio_key: 'audios'                                         # Key name of field to store the list of sample audio paths.
+audio_special_token: '<__dj__audio>'                        # The special token that represents an audio in the text. In default, it's "<__dj__audio>". You can specify your own special token according to your input dataset.
 
 eoc_special_token: '<|__dj__eoc|>'                          # The special token that represents the end of a chunk in the text. In default, it's "<|__dj__eoc|>". You can specify your own special token according to your input dataset.
 

diff --git a/data_juicer/utils/mm_utils.py b/data_juicer/utils/mm_utils.py
@@ -1,4 +1,4 @@
-from datasets import Image
+from datasets import Audio, Image
 
 from data_juicer.utils.constant import DEFAULT_PREFIX
 
@@ -8,6 +8,7 @@
 class SpecialTokens(object):
     # modality
     image = f'<{DEFAULT_PREFIX}image>'
+    audio = f'<{DEFAULT_PREFIX}audio>'
 
     # others
     eoc = f'<|{DEFAULT_PREFIX}eoc|>'
@@ -17,13 +18,23 @@ def load_images(paths):
     return [load_image(path) for path in paths]
 
 
+def load_audios(paths):
+    return [load_audio(path) for path in paths]
+
+
 def load_image(path):
     img_feature = Image()
     img = img_feature.decode_example(img_feature.encode_example(path))
     return img
 
 
-def get_image_size(path):
+def load_audio(path, sampling_rate=None):
+    aud_feature = Audio(sampling_rate)
+    aud = aud_feature.decode_example(aud_feature.encode_example(path))
+    return (aud['array'], aud['sampling_rate'])
+
+
+def get_image_size(path, ):
     import os
     return os.path.getsize(path)
 

diff --git a/tools/multimodal/README.md b/tools/multimodal/README.md
@@ -18,6 +18,7 @@ For now, dataset formats that are supported by Data-Juicer are listed in the fol
 | Format     | source_format_to_data_juicer_format | data_juicer_format_to_target_format | Ref.                                                                                                             |
 |------------|-------------------------------------|-------------------------------------|------------------------------------------------------------------------------------------------------------------|
 | LLaVA-like | `llava_to_dj.py`                    | `dj_to_llava.py`                    | [Format Description](https://github.com/haotian-liu/LLaVA/blob/main/docs/Finetune_Custom_Data.md#dataset-format) |
+| WavCaps-like  | `wavcaps_to_dj.py`                    | `dj_to_wavcaps.py`                    | [Format Description](https://github.com/XinhaoMei/WavCaps#table-of-contents) |
 
 For all tools, you can run the following command to find out the usage of them:
 
@@ -91,3 +92,39 @@ and converted datasets, so we can regard this sample is aligned with the origina
     }
 ]
 ```
+
+### WavCaps-like
+
+The [WavCaps](https://github.com/XinhaoMei/WavCaps#dataset) is composed of four sub-datasets: [FreeSound](https://freesound.org/), [BBC Sound Effects](https://sound-effects.bbcrewind.co.uk/),[SoundBible](https://soundbible.com/) and [AudioSet Strongly-labelled Subset](https://research.google.com/audioset/download_strong.html). Each sub-dataset has different fields. For example, the 'description' field is included in SoundBible, but does not exist in AudioSet. To ensure that the different sub-datasets can be properly merged after conversion, the union of all fields from the sub-datasets is used during the wavcaps_to_dj stage, and all fields are fully retained during the dj_to_wavcaps stage.
+
+```json
+# original dataset
+{ "num_captions_per_audio": 1,
+  "data": [{
+        "title": "Airplane Landing Airport",
+        "description": "Large commercial airplane landing at an airport runway.",
+        "author": "Daniel Simion",
+        "href": "2219-Airplane-Landing-Airport.html",
+        "caption": "An airplane is landing.",
+        "id": "2219",
+        "duration": 14.1424375,
+        "audio": "wav_path",
+        "download_link": "http://soundbible.com/grab.php?id=2219&type=wav"}]    
+}
+
+# converted dataset
+{ "num_captions_per_audio": 1,
+  "data": [{
+        "title": "Airplane Landing Airport",
+        "description": "Large commercial airplane landing at an airport runway.",
+        "author": "Daniel Simion",
+        "href": "2219-Airplane-Landing-Airport.html",
+        "caption": "An airplane is landing.",
+        "id": "2219",
+        "duration": 14.1424375,
+        "audio": "wav_path",
+        "download_link": "http://soundbible.com/grab.php?id=2219&type=wav",
+        "category": "",
+        "tags": "" }]    
+}
+```
diff --git a/tools/multimodal/README_ZH.md b/tools/multimodal/README_ZH.md
@@ -15,6 +15,7 @@
 | 格式        | source_format_to_data_juicer_format | data_juicer_format_to_target_format | 格式参考                                                                                               |
 |-----------|-------------------------------------|-------------------------------------|----------------------------------------------------------------------------------------------------|
 | 类LLaVA格式  | `llava_to_dj.py`                    | `dj_to_llava.py`                    | [格式描述](https://github.com/haotian-liu/LLaVA/blob/main/docs/Finetune_Custom_Data.md#dataset-format) |
+| 类WavCaps格式  | `wavcaps_to_dj.py`                    | `dj_to_wavcaps.py`                    | [格式描述](https://github.com/XinhaoMei/WavCaps#table-of-contents) |
 
 对于所有工具，您可以运行以下命令来了解它们的详细用法：
 
@@ -74,3 +75,37 @@ python tools/multimodal/source_format_to_data_juicer_format/llava_to_dj.py --hel
     }
 ]
 ```
+
+#### 类WavCaps格式
+[WavCaps](https://github.com/XinhaoMei/WavCaps#dataset) 数据集由 [FreeSound](https://freesound.org/)，[BBC Sound Effects](https://sound-effects.bbcrewind.co.uk/)，[SoundBible](https://soundbible.com/)，[AudioSet Strongly-labelled Subset](https://research.google.com/audioset/download_strong.html) 四个子数据集组成，每个数据集里都有不同的字段。例如SoundBible里包含了‘description’字段，而该字段在AudioSet里并不存在。为了保证不同子数据集在转换后能够正常合并，在wavcaps_to_dj阶段使用了所有子数据集字段的并集，并在dj_to_wavcaps阶段完整保留了所有字段。
+```json
+# 原始数据集
+{ "num_captions_per_audio": 1,
+  "data": [{
+        "title": "Airplane Landing Airport",
+        "description": "Large commercial airplane landing at an airport runway.",
+        "author": "Daniel Simion",
+        "href": "2219-Airplane-Landing-Airport.html",
+        "caption": "An airplane is landing.",
+        "id": "2219",
+        "duration": 14.1424375,
+        "audio": "wav_path",
+        "download_link": "http://soundbible.com/grab.php?id=2219&type=wav"}]    
+}
+
+# 转换后数据集
+{ "num_captions_per_audio": 1,
+  "data": [{
+        "title": "Airplane Landing Airport",
+        "description": "Large commercial airplane landing at an airport runway.",
+        "author": "Daniel Simion",
+        "href": "2219-Airplane-Landing-Airport.html",
+        "caption": "An airplane is landing.",
+        "id": "2219",
+        "duration": 14.1424375,
+        "audio": "wav_path",
+        "download_link": "http://soundbible.com/grab.php?id=2219&type=wav",
+        "category": "",
+        "tags": "" }]    
+}
+```
diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py b/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py
@@ -1,5 +1,5 @@
-# This tool is used to convert multimodal dataset in LLaVA format to a target
-# dataset in Data-Juicer format.
+# This tool is used to convert multimodal dataset in Data-Juicer format to a
+# target dataset in LLaVA format.
 #
 # Corresponding Data-Juicer format:
 #   - multi-chunk interleaved image-text sequence

diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py b/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
@@ -0,0 +1,166 @@
+# This tool is used to convert multimodal dataset in Data-Juicer format to a
+# target dataset in WavCaps format.
+#
+# Data-Juicer format:
+# {'id': 2219,
+#  'audios': ['./path/to/audio/2219.flac'],
+#  'text': '<__dj__audio>\n'
+#          'An airplane is landing. <|__dj__eoc|>',
+#  '__dj__meta__': {
+#       'num_captions_per_audio': 1,
+#       'title': 'Airplane Landing Airport',
+#       'description': 'Large commercial airplane landing at an airport runway.',  # noqa: E501
+#       'author': 'Daniel Simion',
+#       'href': '2219-Airplane-Landing-Airport.html',
+#       'caption': 'An airplane is landing.',
+#       'id': '2219',
+#       'duration': 14.1424375,
+#       'audio': 'wav_path',
+#       'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav',
+#       'category': '',
+#       'tags': '' }}
+# {'id': 2218,
+#  'audios': ['./path/to/audio/2218.flac'],
+#  'text': '<__dj__audio>\n'
+#          'Someone is ringing a bell. <|__dj__eoc|>',
+#  '__dj__meta__': {
+#       'num_captions_per_audio': 1,
+#       'title': 'Service Bell Help',
+#       'description': 'Customer ringing service bell in need of help in a store.',  # noqa: E501
+#       'author': 'Daniel Simion',
+#       'href': '2218-Service-Bell-Help.html',
+#       'caption': 'Someone is ringing a bell.',
+#       'id': '2218',
+#       'duration': 1.5698125,
+#       'audio': 'wav_path',
+#       'download_link': 'http://soundbible.com/grab.php?id=2218&type=wav',
+#       'category': '',
+#       'tags': '' }}
+#
+# Corresponding WavCps format:
+# { 'num_captions_per_audio': 1,
+#   'data': [{
+#       'title': 'Airplane Landing Airport',
+#       'description': 'Large commercial airplane landing at an airport runway.',  # noqa: E501
+#       'author': 'Daniel Simion',
+#       'href': '2219-Airplane-Landing-Airport.html',
+#       'caption': 'An airplane is landing.',
+#       'id': '2219',
+#       'duration': 14.1424375,
+#       'audio': 'wav_path',
+#       'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav'
+#   },  {
+#       'title': 'Service Bell Help',
+#       'description': 'Customer ringing service bell in need of help in a store.',  # noqa: E501
+#       'author': 'Daniel Simion',
+#       'href': '2218-Service-Bell-Help.html',
+#       'caption': 'Someone is ringing a bell.',
+#       'id': '2218',
+#       'duration': 1.5698125,
+#       'audio': 'wav_path',
+#       'download_link': 'http://soundbible.com/grab.php?id=2218&type=wav'
+#   },
+#   ...]
+# }
+
+import json
+import os
+
+import fire
+import jsonlines as jl
+from loguru import logger
+from tqdm import tqdm
+
+from data_juicer.utils.constant import Fields
+from data_juicer.utils.mm_utils import SpecialTokens
+
+
+@logger.catch
+def main(
+    dj_ds_path: str,
+    target_wavcaps_ds_path: str,
+    target_field: str = 'caption',
+    eoc_special_token: str = SpecialTokens.eoc,
+    audio_special_token: str = SpecialTokens.audio,
+    remove_eoc_at_last: bool = True,
+    remove_target_field_token: bool = False,
+    sent_seperator: str = '\n',
+):
+    """
+    Convert a Data-Juicer-format dataset to a WavCaps-like dataset.
+
+    :param dj_ds_path: path to the input dataset in Data-Juicer format.
+    :param target_wavcaps_ds_path: path to store the converted dataset in
+        WavCaps format.
+    :param target_field: the field used to describe audio in the WavCaps-like
+        dataset, which can be one of ['caption','title','description'].
+    :param eoc_special_token: the special token for "end of a chunk". It's used
+        to split conversation chunks explicitly. Default: <|__dj__eoc|> (from
+        Data-Juicer).
+    :param audio_special_token: the special token for audios. It's used to
+        locate the audios in the text.
+    :param remove_eoc_at_last: whether to remove the extra eoc_special_token at
+        the end of text. Default: True.
+    :param remove_target_field_token: whether to remove the extra
+        target_field_token at text.
+    :param sent_seperator: seperator to split different sentences. Default: \n.
+    """
+    # ----- Constant settings. Better not to change them. -----
+    from_format = '[[%s]]: '  # default handle method for the text label
+    # ----- Constant settings. Better not to change them. -----
+
+    if not os.path.exists(dj_ds_path):
+        raise FileNotFoundError(
+            f'Input dataset [{dj_ds_path}] can not be found.')
+    if not target_wavcaps_ds_path.endswith('.json'):
+        raise ValueError(
+            'Only support "json" target dataset file for WavCaps now.')
+    if os.path.dirname(target_wavcaps_ds_path) \
+            and not os.path.exists(os.path.dirname(target_wavcaps_ds_path)):
+        logger.info(
+            f'Create directory [{os.path.dirname(target_wavcaps_ds_path)}] '
+            f'for the target dataset.')
+        os.makedirs(os.path.dirname(target_wavcaps_ds_path))
+
+    if target_field not in ['caption', 'description', 'title']:
+        raise ValueError(
+            "target_field must be in '['caption', 'description', 'title']'")
+
+    logger.info('Start to convert.')
+    samples = {'num_captions_per_audio': 1, 'data': []}
+    with jl.open(dj_ds_path, 'r') as reader:
+        for sample in tqdm(reader):
+            id = sample['id']
+            if Fields.meta not in sample:
+                logger.warning(
+                    f'{Fields.meta} does not exist in this sample with '
+                    f'id [{id}].')
+                continue
+
+            if target_field not in sample[Fields.meta].keys():
+                logger.warning(
+                    f'{target_field} does not exist in this sample with '
+                    f'id [{id}].')
+                continue
+            samples['num_captions_per_audio'] = sample[
+                Fields.meta]['num_captions_per_audio']
+            del sample[Fields.meta]['num_captions_per_audio']
+
+            sample[Fields.meta][target_field] = sample['text'].replace(
+                audio_special_token + sent_seperator, '')
+            if remove_eoc_at_last:
+                sample[Fields.meta][target_field] = sample[
+                    Fields.meta][target_field].replace(eoc_special_token, '')
+            if remove_target_field_token:
+                sample[Fields.meta][target_field] = sample[
+                    Fields.meta][target_field].replace(
+                        from_format % target_field, '')
+            samples['data'].append(sample[Fields.meta])
+
+    logger.info(f'Start to write the converted dataset to '
+                f'[{target_wavcaps_ds_path}]...')
+    json.dump(samples, open(target_wavcaps_ds_path, 'w', encoding='utf-8'))
+
+
+if __name__ == '__main__':
+    fire.Fire(main)