
Commit

add dataset preperation src
MoayedHajiAli committed Jun 24, 2024
1 parent 8c68695 commit 4aeaec9
Showing 5 changed files with 489 additions and 0 deletions.
4 changes: 4 additions & 0 deletions dataset_preperation/.gitignore
@@ -0,0 +1,4 @@
data
core*
.vscode
download_logs.txt
56 changes: 56 additions & 0 deletions dataset_preperation/README.md
@@ -0,0 +1,56 @@

# AutoCap Dataset Preparation

## Environment Initialization
For initializing your environment, please refer to the [general README](../README.md).

## Dataset Download
- We currently provide the following datasets:
  * `autocap_audioset_vggsounds`: contains **444,837** audio-text pairs.

**More datasets will be coming soon!**

```shell
python download.py --save_dir <path-to-save-dir> --dataset_name <dataset-subset>

# Example
python download.py --save_dir data/autocap --dataset_name autocap_audioset_vggsounds --audio_only
```
By default, the script will download videos along with their metadata.

We provide the following helpful arguments:
- `--sampling_rate`: Specifies the sampling rate at which the audio files are to be stored.
- `--audio_only`: Download only the audio files and discard the videos. This is helpful to save storage space.
- `--files_per_folder`: Downloaded files will be organized into many folders. This argument specifies how many files to store per folder.
- `--start_idx`, `--end_idx`: To download only a subset of the dataset.
- `--proxy`: For large downloads, YouTube might block your IP address. You can tunnel through another machine over SSH and pass the local proxy port using this argument (see the example below).
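
For the `--proxy` option, a dynamic SSH tunnel is one way to expose a remote machine as a local SOCKS5 proxy; the script then routes its requests through `socks5://127.0.0.1:<port>`. A minimal sketch (the host name is a placeholder):

```shell
# Open a SOCKS5 tunnel on local port 1080 through a remote machine (placeholder host).
ssh -D 1080 -N -f user@remote-host

# Point the downloader at that local port.
python download.py --save_dir data/autocap --dataset_name autocap_audioset_vggsounds --audio_only --proxy 1080
```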

## Dataset Organization
Once the dataset finishes downloading, run the following script:
```shell
python organize_dataset.py --save_dir <path-to-dataset> --dataset_name <key-to-store-dataset> --split <split-type> --files_per_subset <number_of_files_per_subset>

# Example
python organize_dataset.py --save_dir data/autocap --dataset_name autocap --split train
```
- If `--files_per_subset` is set to more than one, the dataset keys will be named `dataset_name_subset_1`, `dataset_name_subset_2`, etc.
- The dataset details can be found at `data/metadata/dataset_root.json`.
- Add the dataset keys under the `data` entry in your config file for audio generation and captioning experiments.

## Prepare Your Custom Dataset
You need to arrange your audio files in one folder using the following structure:
```
- Folder
- 000000
- Id_1.wav
- Id_1.json
- Id_2.wav
- Id_2.json
- 000001
- Id_3.wav
- Id_3.json
.
.
```
- In the JSON files, add metadata such as `title`, `description`, `video_caption`, and `gt_audio_caption` (see the sketch below).
- Organize your dataset by following the instructions in [Dataset Organization](#dataset-organization).
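
As a rough sketch of such a metadata file (the keys follow the fields listed above; the values and paths are placeholders, and your experiments may need additional fields), `Id_1.json` could be produced like this:

```python
import json

# Hypothetical metadata for Folder/000000/Id_1.wav; adjust the values to your own data.
metadata = {
    "title": "Acoustic guitar practice session",
    "description": "A short clip of fingerpicking recorded in a living room.",
    "video_caption": "A person plays an acoustic guitar while sitting on a couch.",
    "gt_audio_caption": "Gentle acoustic guitar fingerpicking with faint room noise.",
}

with open("Folder/000000/Id_1.json", "w") as f:
    json.dump(metadata, f)
```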
272 changes: 272 additions & 0 deletions dataset_preperation/download.py
@@ -0,0 +1,272 @@
import os
import shutil
from tqdm import tqdm
from multiprocessing import Pool, get_context
import yt_dlp
import logging
from io import StringIO
import json
import argparse
from functools import partial
from download_manager import get_dataset_json_file, dataset_urls

def download_yt_video(entry,
save_dir,
yt_cookie_path=None,
audio_only=False,
proxy=None,
audio_sampling_rate=44100,
resume=True,
files_per_folder=5000):
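# Each `entry` is produced by enumerate(...) over the parsed segments dict:
# (global index, (video_id, {"intervals": [{"start": ..., "end": ..., "text": ...}, ...]})).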

video_idx = entry[0]
video_id, intervals = entry[1][0], entry[1][1]['intervals']

for file_idx, video_info in enumerate(intervals):
start = video_info['start']
to = video_info['end']
autocap_caption = video_info.get('text', None)
subfolder_idx = f'{video_idx // files_per_folder:06}'
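# Format the clip start and end times as H:M:S strings (note: `dur` holds the formatted end time, not a duration).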
st = f'{int(start//3600)}:{int(start//60)-60*int(start//3600)}:{start%60}'
dur = f'{int(to//3600)}:{int(to//60)-60*int(to//3600)}:{to%60}'

outpath = os.path.join(save_dir, subfolder_idx)
os.makedirs(outpath, exist_ok=True)

if resume and os.path.isfile(os.path.join(outpath, f'{video_id}_{file_idx:03d}.json')):
continue
else:
ytdl_logger = logging.getLogger()
log_stream = StringIO()
logging.basicConfig(stream=log_stream, level=logging.INFO)

out_file_ext = 'wav' if audio_only else 'mp4'
format = 'bestaudio/best' if audio_only else 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio'
ydl_opts = {
"logger": ytdl_logger,
'outtmpl': f"/temps/id_{video_id}_{file_idx:03d}/audio.%(ext)s",
'format': format,
'quiet': True,
'ignoreerrors': False,
# 'write_thumbnail': True,
'writeinfojson': True, # This will write a separate .info.json with detailed info
# 'writesubtitles': True, # Attempt to download subtitles (transcripts)
# 'writeautomaticsub': True, # Attempt to download automatic subtitles (auto-generated transcripts)
'force_generic_extractor': True,
'postprocessor_args': ['-ar', str(audio_sampling_rate)],
'external_downloader':'ffmpeg',
'download_ranges': yt_dlp.utils.download_range_func([], [[start, to]]),
'force-keyframe-at-cuts': True,
'external_downloader_args':['-loglevel', 'quiet'],
}
if yt_cookie_path is not None:
ydl_opts['cookiefile'] = f'/temps/id_{video_id}_{file_idx:03d}/cookies.txt'
if audio_only:
ydl_opts['postprocessors'] = [{'key': 'FFmpegExtractAudio',
'preferredcodec': 'wav'}]
else:
ydl_opts['postprocessors'] = [{'key': 'FFmpegVideoConvertor',
'preferedformat': 'mp4', # Ensure the output is MP4
}]
if proxy is not None:
ydl_opts['proxy'] = f'socks5://127.0.0.1:{proxy}/'

url = f'https://www.youtube.com/watch?v={video_id}'
os.makedirs(f'/temps/id_{video_id}_{file_idx:03d}', exist_ok=True)
if yt_cookie_path is not None:
shutil.copy(yt_cookie_path, f'/temps/id_{video_id}_{file_idx:03d}/cookies.txt')
try:
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
file_exist = os.path.isfile(os.path.join(outpath, f'{video_id}_{file_idx:03d}.{out_file_ext}'))
info=ydl.extract_info(url, download=not file_exist)
filename = f'{video_id}_{file_idx:03d}.{out_file_ext}'
jsonname = f'{video_id}_{file_idx:03d}.json'
if not file_exist:
shutil.move(os.path.join(f'/temps/id_{video_id}_{file_idx:03d}',f'audio.{out_file_ext}'), os.path.join(outpath, filename))
else:
pass
file_meta = {'id':f'{video_id}','path': os.path.join(outpath, filename),'title': info['title'], 'url':url, 'start': start, 'end': to}

if autocap_caption is not None:
file_meta['autocap_caption'] = autocap_caption

# meta data
file_meta['resolution'] = info.get('resolution')
file_meta['fps'] = info.get('fps')
file_meta['aspect_ratio'] = info.get('aspect_ratio')
file_meta['audio_channels'] = info.get('audio_channels')

file_meta['description'] = info.get('description')
file_meta['uploader'] = info.get('uploader')
file_meta['upload_date'] = info.get('upload_date')
file_meta['duration'] = info.get('duration')
file_meta['view_count'] = info.get('view_count')
file_meta['like_count'] = info.get('like_count')
file_meta['channel_follower_count'] = info.get('channel_follower_count')
file_meta['dislike_count'] = info.get('dislike_count')
file_meta['channel_id'] = info.get('channel_id')
file_meta['channel_url'] = info.get('channel_url')
file_meta['channel_name'] = info.get('uploader')

print("save meta data for", os.path.join(outpath, jsonname))
json.dump(file_meta, open(os.path.join(outpath, jsonname),'w'))
os.system(f'rm -rf /temps/id_{video_id}_{file_idx:03d}')
except Exception as e:
os.system(f'rm -rf /temps/id_{video_id}_{file_idx:03d}')
print(f"Error downloading {os.path.join(outpath, f'{video_id}_{file_idx:03d}.json')}:", e)
return f'{url} - ytdl : {log_stream.getvalue()}, system : {str(e)}'
return None

def update_interval_dict(dict_1, dict_2):
"""
Combine two dictionaries, merging the intervals lists when a video id appears in both.
"""
for k, v in dict_2.items():
if k in dict_1:
dict_2[k]['intervals'] += dict_1[k]['intervals']

dict_1.update(dict_2)
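
# Expected input format (sketch): each line of the segments file is a standalone JSON object mapping a
# YouTube video id to its clip intervals, e.g.
# {"abc123XYZ_0": {"intervals": [{"start": 10.0, "end": 20.0, "text": "an example caption"}]}}
# Trailing newlines and commas are stripped before parsing, and unparsable lines are skipped.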

def read_video_segments_info(local_input_video_segments,
start_idx=0,
end_idx=int(1e9)):
all_video_segments = {}
with open(local_input_video_segments, 'r') as f:
last_idx = 0
for idx, json_str in enumerate(tqdm(f, desc="parsing json input")):
if idx > start_idx:
try:
if json_str.endswith('\n'):
json_str = json_str[:-1]
if json_str.endswith(','):
json_str = json_str[:-1]
json_object = json.loads(json_str)
update_interval_dict(all_video_segments, json_object)
except Exception as e:
print("[ERROR] Couldn't parse json string:", json_str)
continue
last_idx += 1

if last_idx >= end_idx:
break

return all_video_segments

def download_audioset_split(json_file,
save_dir,
yt_cookie_path,
audio_only=False,
proxy_port=None,
audio_sampling_rate=44100,
start_idx=0,
end_idx=int(1e9),
num_processes=os.cpu_count(),
resume=True,
files_per_folder=5000
):

os.makedirs(save_dir, exist_ok=True)

all_video_segments = read_video_segments_info(json_file,
start_idx=start_idx,
end_idx=end_idx)

download_audio_split = partial(download_yt_video,
save_dir=save_dir,
yt_cookie_path=yt_cookie_path,
audio_only=audio_only,
proxy=proxy_port,
audio_sampling_rate=audio_sampling_rate,
resume=resume,
files_per_folder=files_per_folder)

logs = []
p = get_context("spawn").Pool(num_processes*2)

# download_audio_split = partial(save_metadata, split=split) # save_metadata
with tqdm(total=len(all_video_segments),leave=False) as pbar:
for log in p.imap_unordered(download_audio_split, enumerate(all_video_segments.items(), start=start_idx)):
logs.append(log)
pbar.update()
p.close()
p.join()
logs = [l for l in logs if l is not None]
open(f'download_logs.txt','w').write('\n'.join(logs))

if __name__ == "__main__":
try:
shutil.rmtree('/temps')
except FileNotFoundError:
pass
os.makedirs('/temps', exist_ok=True)

parser = argparse.ArgumentParser()

parser.add_argument("--dataset_name",
type=str,
required=True,
help=f"Dataset to download. Available datasets: {list(dataset_urls.keys())}")

parser.add_argument("--input_file",
type=str,
default=None,
required=False,
help="Path to the JSON file that contains the dataset information. You may leave it empty to attempt to download the required file from the web")

parser.add_argument("--save_dir",
type=str,
required=False,
default='data/autocap/videos',
help="Where to save the downloaded files")

parser.add_argument("--audio_only",
required=False,
action='store_true',
help="Enable to only save the wav files and discard the videos")

parser.add_argument("--cookie_path",
type=str,
required=False,
default=None,
help="Path to your YouTube cookies file")

parser.add_argument("--sampling_rate",
type=int,
default=44100,
help="Audio sampling rate; the default is 44.1 kHz")

parser.add_argument("--proxy",
type=str,
default=None,
help="Provide a local SOCKS proxy port to use if YouTube blocks your IP")

parser.add_argument("--files_per_folder",
type=int,
default=50000,
help="How many files to store per folder")

parser.add_argument('--start_idx', '-s',
type=int, default=0,
help="Start index of the JSON objects in the provided file")

parser.add_argument('--end_idx', '-e', type=int, default=int(1e9),
help="End index of the JSON objects in the provided file")

parser.add_argument('--redownload', action='store_true',
help="redownload already downloaded files")

args = parser.parse_args()

if args.input_file is None or not os.path.exists(args.input_file):
args.input_file = get_dataset_json_file(args.dataset_name, args.input_file, download=True)

download_audioset_split(json_file=args.input_file,
save_dir=args.save_dir,
audio_only=args.audio_only,
audio_sampling_rate=args.sampling_rate,
yt_cookie_path=args.cookie_path,
proxy_port=args.proxy,
start_idx=args.start_idx,
end_idx=args.end_idx,
resume=not args.redownload,
files_per_folder=args.files_per_folder)
22 changes: 22 additions & 0 deletions dataset_preperation/download_manager.py
@@ -0,0 +1,22 @@
import os
import wget

save_dir = 'data/json_files'
dataset_urls = {"autocap_audioset_vggsounds":'https://huggingface.co/datasets/mali6/autocap/resolve/main/autocap_audioset_vggsounds.json'}


def get_dataset_json_file(dataset_name, dataset_json_file_path=None, download=True):
if dataset_json_file_path is None:
dataset_json_file_path = os.path.join(save_dir, f"{dataset_name}.json")
if os.path.exists(dataset_json_file_path):
return dataset_json_file_path
elif not download:
raise FileNotFoundError(f"[ERROR] Dataset json file does not exist at {dataset_json_file_path}. Please use the download flag to attempt to download it from the web, or manually download it from https://huggingface.co/datasets/mali6/autocap/")
else:
os.makedirs(save_dir, exist_ok=True)
if dataset_name not in dataset_urls:
raise ValueError(f"[ERROR] Dataset {dataset_name} is not recognized and its json file does not exist at {dataset_json_file_path}")
wget.download(dataset_urls[dataset_name], dataset_json_file_path)
print(f"[INFO] JSON file for dataset {dataset_name} is downloaded at {dataset_json_file_path}")
return dataset_json_file_path
