Skip to content

Commit

Permalink
support audio & audio-text data reading (#95)
Browse files Browse the repository at this point in the history
* fix opencc serialization error

* support audio-text data reading

* update multimodal_README

* fix pre-commit error

* modify audio_special_token

* support only one target_field

* fix pre-commit

* add id for log
  • Loading branch information
chenhesen authored Nov 24, 2023
1 parent 67026a8 commit 155f9f6
Show file tree
Hide file tree
Showing 7 changed files with 481 additions and 4 deletions.
2 changes: 2 additions & 0 deletions configs/config_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ cache_compress: null # The compression me
# for multimodal data processing
image_key: 'images' # Key name of field to store the list of sample image paths.
image_special_token: '<__dj__image>' # The special token that represents an image in the text. By default, it's "<__dj__image>". You can specify your own special token according to your input dataset.
audio_key: 'audios' # Key name of field to store the list of sample audio paths.
audio_special_token: '<__dj__audio>' # The special token that represents an audio clip in the text. By default, it's "<__dj__audio>". You can specify your own special token according to your input dataset.

eoc_special_token: '<|__dj__eoc|>' # The special token that represents the end of a chunk in the text. By default, it's "<|__dj__eoc|>". You can specify your own special token according to your input dataset.

Expand Down
15 changes: 13 additions & 2 deletions data_juicer/utils/mm_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from datasets import Image
from datasets import Audio, Image

from data_juicer.utils.constant import DEFAULT_PREFIX

Expand All @@ -8,6 +8,7 @@
class SpecialTokens(object):
    """Special tokens placed in the text of multimodal samples.

    Every token is built around ``DEFAULT_PREFIX``.
    """
    # modality
    # token that represents an image in the text
    image = f'<{DEFAULT_PREFIX}image>'
    # token that represents an audio in the text
    audio = f'<{DEFAULT_PREFIX}audio>'

    # others
    # token that represents the end of a chunk in the text
    eoc = f'<|{DEFAULT_PREFIX}eoc|>'
Expand All @@ -17,13 +18,23 @@ def load_images(paths):
return [load_image(path) for path in paths]


def load_audios(paths):
    """Load every audio file listed in ``paths``.

    :param paths: iterable of audio file paths.
    :return: a list with one ``load_audio`` result per path, in input order.
    """
    audios = []
    for audio_path in paths:
        audios.append(load_audio(audio_path))
    return audios


def load_image(path):
    """Load the image at ``path`` through the ``datasets`` Image feature.

    :param path: path of the image file.
    :return: the value produced by ``Image.decode_example``.
    """
    feature = Image()
    # encode the path first, then let the feature decode it
    encoded = feature.encode_example(path)
    return feature.decode_example(encoded)


def get_image_size(path):
def load_audio(path, sampling_rate=None):
    """Load the audio at ``path`` through the ``datasets`` Audio feature.

    :param path: path of the audio file.
    :param sampling_rate: forwarded as the first argument of ``Audio``.
        Default: None.
    :return: a 2-tuple ``(array, sampling_rate)`` taken from the decoded
        example.
    """
    feature = Audio(sampling_rate)
    decoded = feature.decode_example(feature.encode_example(path))
    waveform = decoded['array']
    rate = decoded['sampling_rate']
    return waveform, rate


def get_image_size(path):
    """Return the on-disk size, in bytes, of the file at ``path``.

    Despite the name, this is the file size, not the pixel dimensions of
    the image.

    :param path: path of the file to measure.
    :return: file size in bytes.
    """
    import os
    file_stat = os.stat(path)
    return file_stat.st_size

Expand Down
37 changes: 37 additions & 0 deletions tools/multimodal/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ For now, dataset formats that are supported by Data-Juicer are listed in the fol
| Format | source_format_to_data_juicer_format | data_juicer_format_to_target_format | Ref. |
|------------|-------------------------------------|-------------------------------------|------------------------------------------------------------------------------------------------------------------|
| LLaVA-like | `llava_to_dj.py` | `dj_to_llava.py` | [Format Description](https://github.com/haotian-liu/LLaVA/blob/main/docs/Finetune_Custom_Data.md#dataset-format) |
| WavCaps-like | `wavcaps_to_dj.py` | `dj_to_wavcaps.py` | [Format Description](https://github.com/XinhaoMei/WavCaps#table-of-contents) |

For all tools, you can run the following command to find out the usage of them:

Expand Down Expand Up @@ -91,3 +92,39 @@ and converted datasets, so we can regard this sample is aligned with the origina
}
]
```

### WavCaps-like

[WavCaps](https://github.com/XinhaoMei/WavCaps#dataset) is composed of four sub-datasets: [FreeSound](https://freesound.org/), [BBC Sound Effects](https://sound-effects.bbcrewind.co.uk/), [SoundBible](https://soundbible.com/) and [AudioSet Strongly-labelled Subset](https://research.google.com/audioset/download_strong.html). Each sub-dataset has different fields. For example, the 'description' field is included in SoundBible, but does not exist in AudioSet. To ensure that the different sub-datasets can be properly merged after conversion, the union of all fields from the sub-datasets is used during the wavcaps_to_dj stage, and all fields are fully retained during the dj_to_wavcaps stage.

```json
# original dataset
{ "num_captions_per_audio": 1,
"data": [{
"title": "Airplane Landing Airport",
"description": "Large commercial airplane landing at an airport runway.",
"author": "Daniel Simion",
"href": "2219-Airplane-Landing-Airport.html",
"caption": "An airplane is landing.",
"id": "2219",
"duration": 14.1424375,
"audio": "wav_path",
"download_link": "http://soundbible.com/grab.php?id=2219&type=wav"}]
}

# converted dataset
{ "num_captions_per_audio": 1,
"data": [{
"title": "Airplane Landing Airport",
"description": "Large commercial airplane landing at an airport runway.",
"author": "Daniel Simion",
"href": "2219-Airplane-Landing-Airport.html",
"caption": "An airplane is landing.",
"id": "2219",
"duration": 14.1424375,
"audio": "wav_path",
"download_link": "http://soundbible.com/grab.php?id=2219&type=wav",
"category": "",
"tags": "" }]
}
```
35 changes: 35 additions & 0 deletions tools/multimodal/README_ZH.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
| 格式 | source_format_to_data_juicer_format | data_juicer_format_to_target_format | 格式参考 |
|-----------|-------------------------------------|-------------------------------------|----------------------------------------------------------------------------------------------------|
| 类LLaVA格式 | `llava_to_dj.py` | `dj_to_llava.py` | [格式描述](https://github.com/haotian-liu/LLaVA/blob/main/docs/Finetune_Custom_Data.md#dataset-format) |
| 类WavCaps格式 | `wavcaps_to_dj.py` | `dj_to_wavcaps.py` | [格式描述](https://github.com/XinhaoMei/WavCaps#table-of-contents) |

对于所有工具,您可以运行以下命令来了解它们的详细用法:

Expand Down Expand Up @@ -74,3 +75,37 @@ python tools/multimodal/source_format_to_data_juicer_format/llava_to_dj.py --hel
}
]
```

#### 类WavCaps格式
[WavCaps](https://github.com/XinhaoMei/WavCaps#dataset) 数据集由 [FreeSound](https://freesound.org/)、[BBC Sound Effects](https://sound-effects.bbcrewind.co.uk/)、[SoundBible](https://soundbible.com/)、[AudioSet Strongly-labelled Subset](https://research.google.com/audioset/download_strong.html) 四个子数据集组成,每个数据集里都有不同的字段。例如SoundBible里包含了‘description’字段,而该字段在AudioSet里并不存在。为了保证不同子数据集在转换后能够正常合并,在wavcaps_to_dj阶段使用了所有子数据集字段的并集,并在dj_to_wavcaps阶段完整保留了所有字段。
```json
# 原始数据集
{ "num_captions_per_audio": 1,
"data": [{
"title": "Airplane Landing Airport",
"description": "Large commercial airplane landing at an airport runway.",
"author": "Daniel Simion",
"href": "2219-Airplane-Landing-Airport.html",
"caption": "An airplane is landing.",
"id": "2219",
"duration": 14.1424375,
"audio": "wav_path",
"download_link": "http://soundbible.com/grab.php?id=2219&type=wav"}]
}

# 转换后数据集
{ "num_captions_per_audio": 1,
"data": [{
"title": "Airplane Landing Airport",
"description": "Large commercial airplane landing at an airport runway.",
"author": "Daniel Simion",
"href": "2219-Airplane-Landing-Airport.html",
"caption": "An airplane is landing.",
"id": "2219",
"duration": 14.1424375,
"audio": "wav_path",
"download_link": "http://soundbible.com/grab.php?id=2219&type=wav",
"category": "",
"tags": "" }]
}
```
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# This tool is used to convert multimodal dataset in LLaVA format to a target
# dataset in Data-Juicer format.
# This tool is used to convert multimodal dataset in Data-Juicer format to a
# target dataset in LLaVA format.
#
# Corresponding Data-Juicer format:
# - multi-chunk interleaved image-text sequence
Expand Down
166 changes: 166 additions & 0 deletions tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
# This tool is used to convert multimodal dataset in Data-Juicer format to a
# target dataset in WavCaps format.
#
# Data-Juicer format:
# {'id': 2219,
# 'audios': ['./path/to/audio/2219.flac'],
# 'text': '<__dj__audio>\n'
# 'An airplane is landing. <|__dj__eoc|>',
# '__dj__meta__': {
# 'num_captions_per_audio': 1,
# 'title': 'Airplane Landing Airport',
# 'description': 'Large commercial airplane landing at an airport runway.', # noqa: E501
# 'author': 'Daniel Simion',
# 'href': '2219-Airplane-Landing-Airport.html',
# 'caption': 'An airplane is landing.',
# 'id': '2219',
# 'duration': 14.1424375,
# 'audio': 'wav_path',
# 'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav',
# 'category': '',
# 'tags': '' }}
# {'id': 2218,
# 'audios': ['./path/to/audio/2218.flac'],
# 'text': '<__dj__audio>\n'
# 'Someone is ringing a bell. <|__dj__eoc|>',
# '__dj__meta__': {
# 'num_captions_per_audio': 1,
# 'title': 'Service Bell Help',
# 'description': 'Customer ringing service bell in need of help in a store.', # noqa: E501
# 'author': 'Daniel Simion',
# 'href': '2218-Service-Bell-Help.html',
# 'caption': 'Someone is ringing a bell.',
# 'id': '2218',
# 'duration': 1.5698125,
# 'audio': 'wav_path',
# 'download_link': 'http://soundbible.com/grab.php?id=2218&type=wav',
# 'category': '',
# 'tags': '' }}
#
# Corresponding WavCaps format:
# { 'num_captions_per_audio': 1,
# 'data': [{
# 'title': 'Airplane Landing Airport',
# 'description': 'Large commercial airplane landing at an airport runway.', # noqa: E501
# 'author': 'Daniel Simion',
# 'href': '2219-Airplane-Landing-Airport.html',
# 'caption': 'An airplane is landing.',
# 'id': '2219',
# 'duration': 14.1424375,
# 'audio': 'wav_path',
# 'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav'
# }, {
# 'title': 'Service Bell Help',
# 'description': 'Customer ringing service bell in need of help in a store.', # noqa: E501
# 'author': 'Daniel Simion',
# 'href': '2218-Service-Bell-Help.html',
# 'caption': 'Someone is ringing a bell.',
# 'id': '2218',
# 'duration': 1.5698125,
# 'audio': 'wav_path',
# 'download_link': 'http://soundbible.com/grab.php?id=2218&type=wav'
# },
# ...]
# }

import json
import os

import fire
import jsonlines as jl
from loguru import logger
from tqdm import tqdm

from data_juicer.utils.constant import Fields
from data_juicer.utils.mm_utils import SpecialTokens


@logger.catch
def main(
    dj_ds_path: str,
    target_wavcaps_ds_path: str,
    target_field: str = 'caption',
    eoc_special_token: str = SpecialTokens.eoc,
    audio_special_token: str = SpecialTokens.audio,
    remove_eoc_at_last: bool = True,
    remove_target_field_token: bool = False,
    sent_seperator: str = '\n',
):
    """
    Convert a Data-Juicer-format dataset to a WavCaps-like dataset.

    Samples without the meta field, or whose meta lacks ``target_field``,
    are skipped with a warning.

    :param dj_ds_path: path to the input dataset in Data-Juicer format.
    :param target_wavcaps_ds_path: path to store the converted dataset in
        WavCaps format. Must end with ".json".
    :param target_field: the field used to describe audio in the WavCaps-like
        dataset, which can be one of ['caption','title','description'].
    :param eoc_special_token: the special token for "end of a chunk". It's used
        to split conversation chunks explicitly. Default: <|__dj__eoc|> (from
        Data-Juicer).
    :param audio_special_token: the special token for audios. It's used to
        locate the audios in the text.
    :param remove_eoc_at_last: whether to remove the extra eoc_special_token at
        the end of text. Default: True.
    :param remove_target_field_token: whether to remove the extra
        target_field_token at text.
    :param sent_seperator: separator to split different sentences.
        Default: \\n.
    :raises FileNotFoundError: if ``dj_ds_path`` does not exist.
    :raises ValueError: if the target path is not a ".json" file or
        ``target_field`` is not one of the supported fields.
    """
    # ----- Constant settings. Better not to change them. -----
    from_format = '[[%s]]: '  # default handle method for the text label
    # ----- Constant settings. Better not to change them. -----

    if not os.path.exists(dj_ds_path):
        raise FileNotFoundError(
            f'Input dataset [{dj_ds_path}] can not be found.')
    if not target_wavcaps_ds_path.endswith('.json'):
        raise ValueError(
            'Only support "json" target dataset file for WavCaps now.')
    target_dir = os.path.dirname(target_wavcaps_ds_path)
    if target_dir and not os.path.exists(target_dir):
        logger.info(f'Create directory [{target_dir}] '
                    f'for the target dataset.')
        # exist_ok guards against the directory appearing between the
        # existence check above and this call
        os.makedirs(target_dir, exist_ok=True)

    if target_field not in ['caption', 'description', 'title']:
        raise ValueError(
            "target_field must be in '['caption', 'description', 'title']'")

    logger.info('Start to convert.')
    samples = {'num_captions_per_audio': 1, 'data': []}
    with jl.open(dj_ds_path, 'r') as reader:
        for sample in tqdm(reader):
            # the id is only used in log messages, so tolerate its absence
            sample_id = sample.get('id')
            if Fields.meta not in sample:
                logger.warning(
                    f'{Fields.meta} does not exist in this sample with '
                    f'id [{sample_id}].')
                continue

            meta = sample[Fields.meta]
            if target_field not in meta:
                logger.warning(
                    f'{target_field} does not exist in this sample with '
                    f'id [{sample_id}].')
                continue

            # 'num_captions_per_audio' is a dataset-level value in WavCaps:
            # lift it out of the per-sample meta. If a sample lacks it, keep
            # the value collected so far instead of aborting the conversion.
            num_captions = meta.pop('num_captions_per_audio', None)
            if num_captions is not None:
                samples['num_captions_per_audio'] = num_captions

            # rebuild the target field from the sample text with the
            # Data-Juicer special tokens stripped
            text = sample['text'].replace(
                audio_special_token + sent_seperator, '')
            if remove_eoc_at_last:
                # NOTE: this removes every occurrence of the eoc token, not
                # only a trailing one
                text = text.replace(eoc_special_token, '')
            if remove_target_field_token:
                text = text.replace(from_format % target_field, '')
            meta[target_field] = text
            samples['data'].append(meta)

    logger.info(f'Start to write the converted dataset to '
                f'[{target_wavcaps_ds_path}]...')
    # use a context manager so the output file is flushed and closed
    with open(target_wavcaps_ds_path, 'w', encoding='utf-8') as writer:
        json.dump(samples, writer)


if __name__ == '__main__':
    # expose main() and its parameters as a command-line interface via fire
    fire.Fire(main)
Loading

0 comments on commit 155f9f6

Please sign in to comment.