From 4aeaec98de75cac9ccc986a8c0ba2e857047b141 Mon Sep 17 00:00:00 2001
From: MoayedHajiAli
Date: Mon, 24 Jun 2024 02:18:56 +0000
Subject: [PATCH] add dataset preperation src

---
 dataset_preperation/.gitignore          |   4 +
 dataset_preperation/README.md           |  56 +++++
 dataset_preperation/download.py         | 272 ++++++++++++++++++++++++
 dataset_preperation/download_manager.py |  22 ++
 dataset_preperation/organize_dataset.py | 135 ++++++++++++
 5 files changed, 489 insertions(+)
 create mode 100644 dataset_preperation/.gitignore
 create mode 100644 dataset_preperation/README.md
 create mode 100644 dataset_preperation/download.py
 create mode 100644 dataset_preperation/download_manager.py
 create mode 100644 dataset_preperation/organize_dataset.py

diff --git a/dataset_preperation/.gitignore b/dataset_preperation/.gitignore
new file mode 100644
index 0000000..84818b6
--- /dev/null
+++ b/dataset_preperation/.gitignore
@@ -0,0 +1,4 @@
+data
+core*
+.vscode
+download_logs.txt
\ No newline at end of file
diff --git a/dataset_preperation/README.md b/dataset_preperation/README.md
new file mode 100644
index 0000000..fa13511
--- /dev/null
+++ b/dataset_preperation/README.md
@@ -0,0 +1,56 @@
+
+# AutoCap Dataset Preparation
+
+## Environment Initialization
+For initializing your environment, please refer to the [general README](../README.md).
+
+## Dataset Download
+- We currently provide the following datasets:
+  * autocap_audioset_vggsounds: containing **444,837** audio-text pairs.
+
+**More datasets are coming soon!**
+
+```shell
+python download.py --save_dir <save_dir> --dataset_name <dataset_name>
+
+# Example
+python download.py --save_dir data/autocap --dataset_name autocap_audioset_vggsounds --audio_only
+```
+By default, the script will download videos along with their metadata.
+
+We provide the following helpful arguments:
+- `--sampling_rate`: Specifies the sampling rate at which the audio files are stored.
+- `--audio_only`: Download only the audio files and discard the videos. This helps save storage space.
+- `--files_per_folder`: Downloaded files are organized into many folders. This argument specifies how many files to store per folder.
+- `--start_idx`, `--end_idx`: Download only a subset of the dataset.
+- `--proxy`: For large downloads, YouTube might block your IP address. You may SSH to another machine through a specific port and provide that port using this argument.
+
+## Dataset Organization
+Once the dataset finishes downloading, run the following script:
+```shell
+python organize_dataset.py --save_dir <save_dir> --dataset_name <dataset_name> --split <split> --files_per_subset <files_per_subset>
+
+# Example
+python organize_dataset.py --save_dir data/autocap --dataset_name autocap --split train
+```
+- If `--files_per_subset` is set to more than one, the dataset keys will be named dataset_name_subset_1, dataset_name_subset_2, etc.
+- The dataset details can be found at `data/metadata/dataset_root.json`.
+- Add the dataset keys under the `data` column in your config file for audio generation and captioning experiments.
+
+## Prepare Your Custom Dataset
+You need to arrange your audio files in one folder using the following structure:
+```
+- Folder
+  - 000000
+    - Id_1.wav
+    - Id_1.json
+    - Id_2.wav
+    - Id_2.json
+  - 000001
+    - Id_3.wav
+    - Id_3.json
+  .
+  .
+```
+- In the JSON files, add metadata such as title, description, video_caption, and gt_audio_caption (see the sketch below).
+- Organize your dataset by following the instructions in [Dataset Organization](#dataset-organization).
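The per-clip JSON files described in the custom-dataset section sit next to each audio file, much like the metadata JSONs that `download.py` produces. Below is a minimal sketch of writing one such record; the `data/my_dataset/000000` folder, the `Id_1` clip id, and all field values are illustrative assumptions, while the `title`, `description`, `video_caption`, and `gt_audio_caption` keys come from the README above.

```python
import json
from pathlib import Path

# Illustrative folder and clip id; replace them with your own layout.
clip_dir = Path("data/my_dataset/000000")
clip_dir.mkdir(parents=True, exist_ok=True)
clip_id = "Id_1"

# Metadata keys named in the README; the values here are placeholders.
metadata = {
    "title": "Street ambience",
    "description": "Traffic recorded from a balcony",
    "video_caption": "Cars pass by on a busy road",
    "gt_audio_caption": "Vehicles drive past while people talk in the background",
}

# The matching Id_1.wav audio file should sit next to this JSON file.
with open(clip_dir / f"{clip_id}.json", "w") as f:
    json.dump(metadata, f, indent=4)
```

Once the folder is populated this way, `organize_dataset.py` can be pointed at it with `--save_dir data/my_dataset`, exactly as in the Dataset Organization section.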
diff --git a/dataset_preperation/download.py b/dataset_preperation/download.py new file mode 100644 index 0000000..07a3531 --- /dev/null +++ b/dataset_preperation/download.py @@ -0,0 +1,272 @@ +import os +import shutil +from tqdm import tqdm +from multiprocessing import Pool, get_context +import yt_dlp +import logging +from io import StringIO +import json +import argparse +from functools import partial +from download_manager import get_dataset_json_file, dataset_urls + +def download_yt_video(entry, + save_dir, + yt_cookie_path=None, + audio_only=False, + proxy=None, + audio_sampling_rate=44100, + resume=True, + files_per_folder=5000): + + video_idx = entry[0] + video_id, intervals = entry[1][0], entry[1][1]['intervals'] + + for file_idx, video_info in enumerate(intervals): + start = video_info['start'] + to = video_info['end'] + autocap_caption = video_info.get('text', None) + subfolder_idx = f'{video_idx // files_per_folder:06}' + st = f'{int(start//3600)}:{int(start//60)-60*int(start//3600)}:{start%60}' + dur = f'{int(to//3600)}:{int(to//60)-60*int(to//3600)}:{to%60}' + + outpath = os.path.join(save_dir, subfolder_idx) + os.makedirs(outpath, exist_ok=True) + + if resume and os.path.isfile(os.path.join(outpath, f'{video_id}_{file_idx:03d}.json')): + continue + else: + ytdl_logger = logging.getLogger() + log_stream = StringIO() + logging.basicConfig(stream=log_stream, level=logging.INFO) + + out_file_ext = 'wav' if audio_only else 'mp4' + format = 'bestaudio/best' if audio_only else 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio' + ydl_opts = { + "logger": ytdl_logger, + 'outtmpl': f"/temps/id_{video_id}_{file_idx:03d}/audio.%(ext)s", + 'format': format, + 'quiet': True, + 'ignoreerrors': False, + # 'write_thumbnail': True, + 'writeinfojson': True, # This will write a separate .info.json with detailed info + # 'writesubtitles': True, # Attempt to download subtitles (transcripts) + # 'writeautomaticsub': True, # Attempt to download automatic subtitles (auto-generated transcripts) + 'force_generic_extractor': True, + 'postprocessor_args': ['-ar', str(audio_sampling_rate)], + 'external_downloader':'ffmpeg', + 'download_ranges': yt_dlp.utils.download_range_func([], [[start, to]]), + 'force-keyframe-at-cuts': True, + 'external_downloader_args':['-loglevel', 'quiet'], + } + if yt_cookie_path is not None: + ydl_opts['cookiefile'] = f'/temps/id_{video_id}_{file_idx:03d}/cookies.txt' + if audio_only: + ydl_opts['postprocessors'] = [{'key': 'FFmpegExtractAudio', + 'preferredcodec': 'wav'}] + else: + ydl_opts['postprocessors'] = [{'key': 'FFmpegVideoConvertor', + 'preferedformat': 'mp4', # Ensure the output is MP4 + }] + if proxy is not None: + ydl_opts['proxy'] = f'socks5://127.0.0.1:{proxy}/' + + url = f'https://www.youtube.com/watch?v={video_id}' + os.makedirs(f'/temps/id_{video_id}_{file_idx:03d}', exist_ok=True) + if yt_cookie_path is not None: + shutil.copy(yt_cookie_path, f'/temps/id_{video_id}_{file_idx:03d}/cookies.txt') + try: + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + file_exist = os.path.isfile(os.path.join(outpath, f'{video_id}_{file_idx:03d}.{out_file_ext}')) + info=ydl.extract_info(url, download=not file_exist) + filename = f'{video_id}_{file_idx:03d}.{out_file_ext}' + jsonname = f'{video_id}_{file_idx:03d}.json' + if not file_exist: + shutil.move(os.path.join(f'/temps/id_{video_id}_{file_idx:03d}',f'audio.{out_file_ext}'), os.path.join(outpath, filename)) + else: + pass + file_meta = {'id':f'{video_id}','path': os.path.join(outpath, filename),'title': info['title'], 
'url':url, 'start': start, 'end': to} + + if autocap_caption is not None: + file_meta['autocap_caption'] = autocap_caption + + # meta data + file_meta['resolution'] = info.get('resolution') + file_meta['fps'] = info.get('fps') + file_meta['aspect_ratio'] = info.get('aspect_ratio') + file_meta['audio_channels'] = info.get('audio_channels') + + file_meta['description'] = info.get('description') + file_meta['uploader'] = info.get('uploader') + file_meta['upload_date'] = info.get('upload_date') + file_meta['duration'] = info.get('duration') + file_meta['view_count'] = info.get('view_count') + file_meta['like_count'] = info.get('like_count') + file_meta['channel_follower_count'] = info.get('channel_follower_count') + file_meta['dislike_count'] = info.get('dislike_count') + file_meta['channel_id'] = info.get('channel_id') + file_meta['channel_url'] = info.get('channel_url') + file_meta['channel_name'] = info.get('uploader') + + print("save meta data for", os.path.join(outpath, jsonname)) + json.dump(file_meta, open(os.path.join(outpath, jsonname),'w')) + os.system(f'rm -rf /temps/id_{video_id}_{file_idx:03d}') + except Exception as e: + os.system(f'rm -rf /temps/id_{video_id}_{file_idx:03d}') + print(f"Error downloading {os.path.join(outpath, f'{video_id}_{file_idx:03d}.json')}:", e) + return f'{url} - ytdl : {log_stream.getvalue()}, system : {str(e)}' + return None + +def update_interval_dict(dict_1, dict_2): + """ + combine two dictionaries, and merge intervals list if it is replicated + """ + for k, v in dict_2.items(): + if k in dict_1: + dict_2[k]['intervals'] += dict_1[k]['intervals'] + + dict_1.update(dict_2) + +def read_video_segments_info(local_input_video_segments, + start_idx=0, + end_idx=int(1e9)): + all_video_segments = {} + with open(local_input_video_segments, 'r') as f: + last_idx = 0 + for idx, json_str in enumerate(tqdm(f, desc="parsing json input")): + if idx > start_idx: + try: + if json_str.endswith('\n'): + json_str = json_str[:-1] + if json_str.endswith(','): + json_str = json_str[:-1] + json_object = json.loads(json_str) + update_interval_dict(all_video_segments, json_object) + except Exception as e: + print("[ERROR] Couldn't parse json string:", json_str) + continue + last_idx += 1 + + if last_idx >= end_idx: + break + + return all_video_segments + +def download_audioset_split(json_file, + save_dir, + yt_cookie_path, + audio_only=False, + proxy_port=None, + audio_sampling_rate=44100, + start_idx=0, + end_idx=int(1e9), + num_processes=os.cpu_count(), + resume=True, + files_per_folder=5000 + ): + + os.makedirs(save_dir, exist_ok=True) + + all_video_segments = read_video_segments_info(json_file, + start_idx=start_idx, + end_idx=end_idx) + + download_audio_split = partial(download_yt_video, + save_dir=save_dir, + yt_cookie_path=yt_cookie_path, + audio_only=audio_only, + proxy=proxy_port, + audio_sampling_rate=audio_sampling_rate, + resume=resume, + files_per_folder=files_per_folder) + + logs = [] + p = get_context("spawn").Pool(num_processes*2) + + # download_audio_split = partial(save_metadata, split=split) # save_metadata + with tqdm(total=len(all_video_segments),leave=False) as pbar: + for log in p.imap_unordered(download_audio_split, enumerate(all_video_segments.items(), start=start_idx)): + logs.append(log) + pbar.update() + p.close() + p.join() + logs = [l for l in logs if l is not None] + open(f'download_logs.txt','w').write('\n'.join(logs)) + +if __name__ == "__main__": + try: + shutil.rmtree('/temps') + except FileNotFoundError: + pass + os.makedirs('/temps', 
exist_ok=True)
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--dataset_name",
+                        type=str,
+                        required=True,
+                        help=f"Provide the dataset name. Available datasets are {dataset_urls.keys()}")
+
+    parser.add_argument("--input_file",
+                        type=str,
+                        default=None,
+                        required=False,
+                        help="Provide the path to the JSON file that contains the dataset information. You may leave it empty to attempt to download the required file from the web")
+
+    parser.add_argument("--save_dir",
+                        type=str,
+                        required=False,
+                        default='data/autocap/videos',
+                        help="where to save the downloaded files")
+
+    parser.add_argument("--audio_only",
+                        required=False,
+                        action='store_true',
+                        help="Enable to only save the wav files and discard the videos")
+
+    parser.add_argument("--cookie_path",
+                        type=str,
+                        required=False,
+                        default=None,
+                        help="Path to your YouTube cookies file")
+
+    parser.add_argument("--sampling_rate",
+                        type=int,
+                        default=44100,
+                        help="Audio sampling rate, default is set to 44.1KHz")
+
+    parser.add_argument("--proxy",
+                        type=str,
+                        default=None,
+                        help="Provide a proxy port to bypass YouTube blocking your IP")
+
+    parser.add_argument("--files_per_folder",
+                        type=int,
+                        default=50000,
+                        help="How many files to store per folder")
+
+    parser.add_argument('--start_idx', '-s',
+                        type=int, default=0,
+                        help="start index of the json objects in the provided file")
+
+    parser.add_argument('--end_idx', '-e', type=int, default=int(1e9),
+                        help="end index of the json objects in the provided file")
+
+    parser.add_argument('--redownload', action='store_true',
+                        help="redownload already downloaded files")
+
+    args = parser.parse_args()
+
+    if args.input_file is None or not os.path.exists(args.input_file):
+        args.input_file = get_dataset_json_file(args.dataset_name, args.input_file, download=True)
+
+    download_audioset_split(json_file=args.input_file,
+                            save_dir=args.save_dir,
+                            audio_only=args.audio_only,
+                            audio_sampling_rate=args.sampling_rate,
+                            yt_cookie_path=args.cookie_path,
+                            proxy_port=args.proxy,
+                            start_idx=args.start_idx,
+                            end_idx=args.end_idx,
+                            resume=not args.redownload,
+                            files_per_folder=args.files_per_folder)
diff --git a/dataset_preperation/download_manager.py b/dataset_preperation/download_manager.py
new file mode 100644
index 0000000..6a78895
--- /dev/null
+++ b/dataset_preperation/download_manager.py
@@ -0,0 +1,22 @@
+import os
+import wget
+
+save_dir = 'data/json_files'
+dataset_urls = {"autocap_audioset_vggsounds":'https://huggingface.co/datasets/mali6/autocap/resolve/main/autocap_audioset_vggsounds.json'}
+
+
+def get_dataset_json_file(dataset_name, dataset_json_file_path=None, download=True):
+    if dataset_json_file_path is None:
+        dataset_json_file_path = os.path.join(save_dir, f"{dataset_name}.json")
+    if os.path.exists(dataset_json_file_path):
+        return dataset_json_file_path
+    elif not download:
+        raise FileNotFoundError(f"[ERROR] Dataset json file does not exist at {dataset_json_file_path}, please use the download flag to attempt to download it from the web or manually download it from https://huggingface.co/datasets/mali6/autocap/")
+    else:
+        os.makedirs(save_dir, exist_ok=True)
+        if dataset_name not in dataset_urls:
+            raise ValueError(f"[ERROR] Dataset {dataset_name} is not recognized and its json file does not exist at {dataset_json_file_path}")
+        wget.download(dataset_urls[dataset_name], dataset_json_file_path)
+        print(f"[INFO] JSON file for dataset {dataset_name} is downloaded at {dataset_json_file_path}")
+        return dataset_json_file_path
+    
\ No newline at end of file
diff --git a/dataset_preperation/organize_dataset.py b/dataset_preperation/organize_dataset.py
new file mode 100644
index 0000000..a2739f3
--- /dev/null
+++ b/dataset_preperation/organize_dataset.py
@@ -0,0 +1,135 @@
+import os
+import shutil
+from tqdm import tqdm
+from multiprocessing import Pool, get_context
+import logging
+from io import StringIO
+import json
+import argparse
+from pathlib import Path
+
+
+def load_json(file_path):
+    try:
+        with open(file_path, 'r') as file:
+            data = json.load(file)  # Attempt to read the JSON data
+
+    except json.JSONDecodeError as e:
+        with open(file_path, 'r') as file:
+            # Read the file content up to the point where the JSON is valid
+            file_content = file.read()
+            valid_json = file_content[:file_content.rfind('}')+1]
+
+        try:
+            data = json.loads(valid_json)  # Reload the valid JSON part
+        except json.JSONDecodeError:
+            print("Failed to recover JSON.")
+            return None
+
+    # Save the cleaned JSON data back to the file
+    if data is not None:
+        with open(file_path, 'w') as new_file:
+            json.dump(data, new_file, indent=4)
+    return data
+
+
+def load_file(fname):
+    with open(fname, "r") as f:
+        return f.read().split('\n')[:-1]
+
+def write_json(my_dict, fname):
+    with open(fname, "w") as json_file:
+        json.dump(my_dict, json_file, indent=4)
+
+def find_json_files(directory):
+    json_files = []
+
+    for root, dirs, files in os.walk(directory):
+        for file in files:
+            if file.endswith('.json'):
+                json_files.append(os.path.join(root, file))
+
+    return json_files
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--save_dir",
+                        type=str,
+                        required=True,
+                        help="directory where the downloaded files were saved")
+
+    parser.add_argument("--dataset_meta_file",
+                        required=False,
+                        type=str,
+                        default='data/metadata/dataset_root.json',
+                        help="path to the dataset root json file where the datafile paths will be stored")
+
+    parser.add_argument("--datafiles_dir",
+                        required=False,
+                        type=str,
+                        default='data/metadata/datafiles/autocap',
+                        help="directory where the datafiles will be stored")
+
+    parser.add_argument("--dataset_name",
+                        type=str,
+                        default='autocap',
+                        help="Name of the compiled dataset")
+
+    parser.add_argument("--files_per_subset",
+                        type=int,
+                        default=-1,
+                        help="How many files to include in each subset. -1 puts all files in a single subset")
+    parser.add_argument("--split",
+                        type=str,
+                        default='train',
+                        help="split of the dataset")
+
+    args = parser.parse_args()
+
+    # initialize all paths
+    Path(args.datafiles_dir).mkdir(parents=True, exist_ok=True)
+    Path(args.dataset_meta_file).parent.mkdir(parents=True, exist_ok=True)
+
+    # find all .json files
+    all_json_files = find_json_files(args.save_dir)
+
+    current_subset = 1
+    current_dataset_name = f"{args.dataset_name}_subset_{current_subset}" if args.files_per_subset > 0 else args.dataset_name
+    current_datafile_path = os.path.join(args.datafiles_dir, f"{current_dataset_name}_{args.split}.txt")
+    current_datafile = open(current_datafile_path, 'w')
+
+    all_datafiles_path = [(args.split, current_dataset_name, current_datafile_path)]
+    for idx, file_path in enumerate(all_json_files):
+        current_datafile.write(f"{os.path.relpath(file_path, args.save_dir)}\n")
+
+        if args.files_per_subset > 0 and (idx + 1) % args.files_per_subset == 0 and (idx+1) < len(all_json_files):
+            current_subset += 1
+            current_dataset_name = f"{args.dataset_name}_subset_{current_subset}"
+
+            # close the current file and open a new one
+            current_datafile.close()
+            current_datafile_path = os.path.join(args.datafiles_dir, f"{current_dataset_name}_{args.split}.txt")
+            all_datafiles_path.append((args.split, current_dataset_name, current_datafile_path))
+            current_datafile = open(current_datafile_path, 'w')
+
+    current_datafile.close()
+
+    # write the dataset root file
+    if os.path.exists(args.dataset_meta_file):
+        dataset_root = load_json(args.dataset_meta_file)
+    else:
+        dataset_root = {"metadata":{"path":{}}}
+
+    # add all datasets
+    for split, dataset_name, datafile_path in all_datafiles_path:
+        dataset_root[dataset_name] = os.path.abspath(args.save_dir)
+        dataset_root['metadata']['path'][dataset_name] = {}
+        dataset_root['metadata']['path'][dataset_name][split] = os.path.abspath(datafile_path)
+        for split_check in ['train', 'test', 'val']:
+            dataset_root['metadata']['path'][dataset_name][split_check] = dataset_root['metadata']['path'][dataset_name].get(split_check, "")
+
+    write_json(dataset_root, args.dataset_meta_file)
+    print("[INFO] Congrats! Done organizing the dataset.")
+    print("[INFO] Please use the following file path as the `metadata_root` in your experiment configurations:", os.path.abspath(args.dataset_meta_file))
+    print("[INFO] You may use any of the following dataset keys in your experiments:", [entry[1] for entry in all_datafiles_path])
+    
\ No newline at end of file
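
For reference, here is a minimal sketch of consuming the files that `organize_dataset.py` produces. It assumes the default `data/metadata/dataset_root.json` location, an `autocap` dataset key, and a `train` split; adjust these names to match the arguments you actually used.

```python
import json
import os

# Defaults produced by organize_dataset.py; adjust if you changed the arguments.
dataset_meta_file = "data/metadata/dataset_root.json"
dataset_key = "autocap"   # e.g. "autocap_subset_1" when --files_per_subset was used
split = "train"

with open(dataset_meta_file) as f:
    dataset_root = json.load(f)

# Each dataset key maps to the root directory of the downloaded files;
# metadata/path holds the per-split datafile lists.
root_dir = dataset_root[dataset_key]
datafile_path = dataset_root["metadata"]["path"][dataset_key][split]

# Every datafile line is a JSON metadata path relative to the dataset root.
with open(datafile_path) as f:
    relative_paths = [line.strip() for line in f if line.strip()]

with open(os.path.join(root_dir, relative_paths[0])) as f:
    record = json.load(f)
print(record.get("title"), record.get("autocap_caption"))
```

This mirrors the structure written by the script: each dataset key maps to the download directory, `metadata.path` holds the per-split datafile lists, and each datafile line is a JSON path relative to that directory.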