From 4aeaec98de75cac9ccc986a8c0ba2e857047b141 Mon Sep 17 00:00:00 2001
From: MoayedHajiAli
Date: Mon, 24 Jun 2024 02:18:56 +0000
Subject: [PATCH] add dataset preperation src

---
 dataset_preperation/.gitignore          |   4 +
 dataset_preperation/README.md           |  56 +++++
 dataset_preperation/download.py         | 272 ++++++++++++++++++++++++
 dataset_preperation/download_manager.py |  22 ++
 dataset_preperation/organize_dataset.py | 135 ++++++++++++
 5 files changed, 489 insertions(+)
 create mode 100644 dataset_preperation/.gitignore
 create mode 100644 dataset_preperation/README.md
 create mode 100644 dataset_preperation/download.py
 create mode 100644 dataset_preperation/download_manager.py
 create mode 100644 dataset_preperation/organize_dataset.py

diff --git a/dataset_preperation/.gitignore b/dataset_preperation/.gitignore
new file mode 100644
index 0000000..84818b6
--- /dev/null
+++ b/dataset_preperation/.gitignore
@@ -0,0 +1,4 @@
+data
+core*
+.vscode
+download_logs.txt
\ No newline at end of file
diff --git a/dataset_preperation/README.md b/dataset_preperation/README.md
new file mode 100644
index 0000000..fa13511
--- /dev/null
+++ b/dataset_preperation/README.md
@@ -0,0 +1,56 @@
+
+# AutoCap Dataset Preparation
+
+## Environment Initialization
+For initializing your environment, please refer to the [general README](../README.md).
+
+## Dataset Download
+- We currently provide the following datasets:
+  * autocap_audioset_vggsounds: containing **444,837** audio-text pairs.
+
+**More datasets are coming soon!**
+
+```shell
+python download.py --save_dir <save_dir> --dataset_name <dataset_name>
+
+# Example
+python download.py --save_dir data/autocap --dataset_name autocap_audioset_vggsounds --audio_only
+```
+By default, the script will download videos along with their metadata.
+
+We provide the following helpful arguments:
+- `--sampling_rate`: Specifies the sampling rate at which the audio files are stored.
+- `--audio_only`: Download only the audio files and discard the videos. This helps save storage space.
+- `--files_per_folder`: Downloaded files are organized into many folders. This argument specifies how many files to store per folder.
+- `--start_idx`, `--end_idx`: Download only a subset of the dataset.
+- `--proxy`: For large downloads, YouTube might block your IP address. You may SSH to another machine through a specific port and provide that port using this argument.
+
+## Dataset Organization
+Once the dataset finishes downloading, run the following script:
+```shell
+python organize_dataset.py --save_dir <save_dir> --dataset_name <dataset_name> --split <split> --files_per_subset <files_per_subset>
+
+# Example
+python organize_dataset.py --save_dir data/autocap --dataset_name autocap --split train
+```
+- If `--files_per_subset` is set to more than one, the dataset keys will be named dataset_name_subset_1, dataset_name_subset_2, etc.
+- The dataset details can be found at `data/metadata/dataset_root.json`.
+- Add the dataset keys under the `data` column in your config file for audio generation and captioning experiments.
+
+## Prepare Your Custom Dataset
+You need to arrange your audio files in one folder using the following structure:
+```
+- Folder
+  - 000000
+    - Id_1.wav
+    - Id_1.json
+    - Id_2.wav
+    - Id_2.json
+  - 000001
+    - Id_3.wav
+    - Id_3.json
+  .
+  .
+```
+- In the JSON files, add metadata such as title, description, video_caption, and gt_audio_caption (see the sketch below).
+- Organize your dataset by following the instructions in [Dataset Organization](#dataset-organization).
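The per-clip JSON files described in the custom-dataset section sit next to each audio file, much like the metadata JSONs that `download.py` produces. Below is a minimal sketch of writing one such record; the `data/my_dataset/000000` folder, the `Id_1` clip id, and all field values are illustrative assumptions, while the `title`, `description`, `video_caption`, and `gt_audio_caption` keys come from the README above.

```python
import json
from pathlib import Path

# Illustrative folder and clip id; replace them with your own layout.
clip_dir = Path("data/my_dataset/000000")
clip_dir.mkdir(parents=True, exist_ok=True)
clip_id = "Id_1"

# Metadata keys named in the README; the values here are placeholders.
metadata = {
    "title": "Street ambience",
    "description": "Traffic recorded from a balcony",
    "video_caption": "Cars pass by on a busy road",
    "gt_audio_caption": "Vehicles drive past while people talk in the background",
}

# The matching Id_1.wav audio file should sit next to this JSON file.
with open(clip_dir / f"{clip_id}.json", "w") as f:
    json.dump(metadata, f, indent=4)
```

Once the folder is populated this way, `organize_dataset.py` can be pointed at it with `--save_dir data/my_dataset`, exactly as in the Dataset Organization section.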
diff --git a/dataset_preperation/download.py b/dataset_preperation/download.py new file mode 100644 index 0000000..07a3531 --- /dev/null +++ b/dataset_preperation/download.py @@ -0,0 +1,272 @@ +import os +import shutil +from tqdm import tqdm +from multiprocessing import Pool, get_context +import yt_dlp +import logging +from io import StringIO +import json +import argparse +from functools import partial +from download_manager import get_dataset_json_file, dataset_urls + +def download_yt_video(entry, + save_dir, + yt_cookie_path=None, + audio_only=False, + proxy=None, + audio_sampling_rate=44100, + resume=True, + files_per_folder=5000): + + video_idx = entry[0] + video_id, intervals = entry[1][0], entry[1][1]['intervals'] + + for file_idx, video_info in enumerate(intervals): + start = video_info['start'] + to = video_info['end'] + autocap_caption = video_info.get('text', None) + subfolder_idx = f'{video_idx // files_per_folder:06}' + st = f'{int(start//3600)}:{int(start//60)-60*int(start//3600)}:{start%60}' + dur = f'{int(to//3600)}:{int(to//60)-60*int(to//3600)}:{to%60}' + + outpath = os.path.join(save_dir, subfolder_idx) + os.makedirs(outpath, exist_ok=True) + + if resume and os.path.isfile(os.path.join(outpath, f'{video_id}_{file_idx:03d}.json')): + continue + else: + ytdl_logger = logging.getLogger() + log_stream = StringIO() + logging.basicConfig(stream=log_stream, level=logging.INFO) + + out_file_ext = 'wav' if audio_only else 'mp4' + format = 'bestaudio/best' if audio_only else 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio' + ydl_opts = { + "logger": ytdl_logger, + 'outtmpl': f"/temps/id_{video_id}_{file_idx:03d}/audio.%(ext)s", + 'format': format, + 'quiet': True, + 'ignoreerrors': False, + # 'write_thumbnail': True, + 'writeinfojson': True, # This will write a separate .info.json with detailed info + # 'writesubtitles': True, # Attempt to download subtitles (transcripts) + # 'writeautomaticsub': True, # Attempt to download automatic subtitles (auto-generated transcripts) + 'force_generic_extractor': True, + 'postprocessor_args': ['-ar', str(audio_sampling_rate)], + 'external_downloader':'ffmpeg', + 'download_ranges': yt_dlp.utils.download_range_func([], [[start, to]]), + 'force-keyframe-at-cuts': True, + 'external_downloader_args':['-loglevel', 'quiet'], + } + if yt_cookie_path is not None: + ydl_opts['cookiefile'] = f'/temps/id_{video_id}_{file_idx:03d}/cookies.txt' + if audio_only: + ydl_opts['postprocessors'] = [{'key': 'FFmpegExtractAudio', + 'preferredcodec': 'wav'}] + else: + ydl_opts['postprocessors'] = [{'key': 'FFmpegVideoConvertor', + 'preferedformat': 'mp4', # Ensure the output is MP4 + }] + if proxy is not None: + ydl_opts['proxy'] = f'socks5://127.0.0.1:{proxy}/' + + url = f'https://www.youtube.com/watch?v={video_id}' + os.makedirs(f'/temps/id_{video_id}_{file_idx:03d}', exist_ok=True) + if yt_cookie_path is not None: + shutil.copy(yt_cookie_path, f'/temps/id_{video_id}_{file_idx:03d}/cookies.txt') + try: + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + file_exist = os.path.isfile(os.path.join(outpath, f'{video_id}_{file_idx:03d}.{out_file_ext}')) + info=ydl.extract_info(url, download=not file_exist) + filename = f'{video_id}_{file_idx:03d}.{out_file_ext}' + jsonname = f'{video_id}_{file_idx:03d}.json' + if not file_exist: + shutil.move(os.path.join(f'/temps/id_{video_id}_{file_idx:03d}',f'audio.{out_file_ext}'), os.path.join(outpath, filename)) + else: + pass + file_meta = {'id':f'{video_id}','path': os.path.join(outpath, filename),'title': info['title'], 
'url':url, 'start': start, 'end': to} + + if autocap_caption is not None: + file_meta['autocap_caption'] = autocap_caption + + # meta data + file_meta['resolution'] = info.get('resolution') + file_meta['fps'] = info.get('fps') + file_meta['aspect_ratio'] = info.get('aspect_ratio') + file_meta['audio_channels'] = info.get('audio_channels') + + file_meta['description'] = info.get('description') + file_meta['uploader'] = info.get('uploader') + file_meta['upload_date'] = info.get('upload_date') + file_meta['duration'] = info.get('duration') + file_meta['view_count'] = info.get('view_count') + file_meta['like_count'] = info.get('like_count') + file_meta['channel_follower_count'] = info.get('channel_follower_count') + file_meta['dislike_count'] = info.get('dislike_count') + file_meta['channel_id'] = info.get('channel_id') + file_meta['channel_url'] = info.get('channel_url') + file_meta['channel_name'] = info.get('uploader') + + print("save meta data for", os.path.join(outpath, jsonname)) + json.dump(file_meta, open(os.path.join(outpath, jsonname),'w')) + os.system(f'rm -rf /temps/id_{video_id}_{file_idx:03d}') + except Exception as e: + os.system(f'rm -rf /temps/id_{video_id}_{file_idx:03d}') + print(f"Error downloading {os.path.join(outpath, f'{video_id}_{file_idx:03d}.json')}:", e) + return f'{url} - ytdl : {log_stream.getvalue()}, system : {str(e)}' + return None + +def update_interval_dict(dict_1, dict_2): + """ + combine two dictionaries, and merge intervals list if it is replicated + """ + for k, v in dict_2.items(): + if k in dict_1: + dict_2[k]['intervals'] += dict_1[k]['intervals'] + + dict_1.update(dict_2) + +def read_video_segments_info(local_input_video_segments, + start_idx=0, + end_idx=int(1e9)): + all_video_segments = {} + with open(local_input_video_segments, 'r') as f: + last_idx = 0 + for idx, json_str in enumerate(tqdm(f, desc="parsing json input")): + if idx > start_idx: + try: + if json_str.endswith('\n'): + json_str = json_str[:-1] + if json_str.endswith(','): + json_str = json_str[:-1] + json_object = json.loads(json_str) + update_interval_dict(all_video_segments, json_object) + except Exception as e: + print("[ERROR] Couldn't parse json string:", json_str) + continue + last_idx += 1 + + if last_idx >= end_idx: + break + + return all_video_segments + +def download_audioset_split(json_file, + save_dir, + yt_cookie_path, + audio_only=False, + proxy_port=None, + audio_sampling_rate=44100, + start_idx=0, + end_idx=int(1e9), + num_processes=os.cpu_count(), + resume=True, + files_per_folder=5000 + ): + + os.makedirs(save_dir, exist_ok=True) + + all_video_segments = read_video_segments_info(json_file, + start_idx=start_idx, + end_idx=end_idx) + + download_audio_split = partial(download_yt_video, + save_dir=save_dir, + yt_cookie_path=yt_cookie_path, + audio_only=audio_only, + proxy=proxy_port, + audio_sampling_rate=audio_sampling_rate, + resume=resume, + files_per_folder=files_per_folder) + + logs = [] + p = get_context("spawn").Pool(num_processes*2) + + # download_audio_split = partial(save_metadata, split=split) # save_metadata + with tqdm(total=len(all_video_segments),leave=False) as pbar: + for log in p.imap_unordered(download_audio_split, enumerate(all_video_segments.items(), start=start_idx)): + logs.append(log) + pbar.update() + p.close() + p.join() + logs = [l for l in logs if l is not None] + open(f'download_logs.txt','w').write('\n'.join(logs)) + +if __name__ == "__main__": + try: + shutil.rmtree('/temps') + except FileNotFoundError: + pass + os.makedirs('/temps', 
exist_ok=True)
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--dataset_name",
+                        type=str,
+                        required=True,
+                        help=f"Provide the dataset name. Available datasets are {dataset_urls.keys()}")
+
+    parser.add_argument("--input_file",
+                        type=str,
+                        default=None,
+                        required=False,
+                        help="Provide the path to the JSON file that contains the dataset information. You may leave it empty to attempt to download the required file from the web")
+
+    parser.add_argument("--save_dir",
+                        type=str,
+                        required=False,
+                        default='data/autocap/videos',
+                        help="where to save the downloaded files")
+
+    parser.add_argument("--audio_only",
+                        required=False,
+                        action='store_true',
+                        help="Enable to only save the wav files and discard the videos")
+
+    parser.add_argument("--cookie_path",
+                        type=str,
+                        required=False,
+                        default=None,
+                        help="Path to your YouTube cookies file")
+
+    parser.add_argument("--sampling_rate",
+                        type=int,
+                        default=44100,
+                        help="Audio sampling rate, default is set to 44.1KHz")
+
+    parser.add_argument("--proxy",
+                        type=str,
+                        default=None,
+                        help="Provide a proxy port to bypass YouTube blocking your IP")
+
+    parser.add_argument("--files_per_folder",
+                        type=int,
+                        default=50000,
+                        help="How many files to store per folder")
+
+    parser.add_argument('--start_idx', '-s',
+                        type=int, default=0,
+                        help="start index of the json objects in the provided file")
+
+    parser.add_argument('--end_idx', '-e', type=int, default=int(1e9),
+                        help="end index of the json objects in the provided file")
+
+    parser.add_argument('--redownload', action='store_true',
+                        help="redownload already downloaded files")
+
+    args = parser.parse_args()
+
+    if args.input_file is None or not os.path.exists(args.input_file):
+        args.input_file = get_dataset_json_file(args.dataset_name, args.input_file, download=True)
+
+    download_audioset_split(json_file=args.input_file,
+                            save_dir=args.save_dir,
+                            audio_only=args.audio_only,
+                            audio_sampling_rate=args.sampling_rate,
+                            yt_cookie_path=args.cookie_path,
+                            proxy_port=args.proxy,
+                            start_idx=args.start_idx,
+                            end_idx=args.end_idx,
+                            resume=not args.redownload,
+                            files_per_folder=args.files_per_folder)
diff --git a/dataset_preperation/download_manager.py b/dataset_preperation/download_manager.py
new file mode 100644
index 0000000..6a78895
--- /dev/null
+++ b/dataset_preperation/download_manager.py
@@ -0,0 +1,22 @@
+import os
+import wget
+
+save_dir = 'data/json_files'
+dataset_urls = {"autocap_audioset_vggsounds":'https://huggingface.co/datasets/mali6/autocap/resolve/main/autocap_audioset_vggsounds.json'}
+
+
+def get_dataset_json_file(dataset_name, dataset_json_file_path=None, download=True):
+    if dataset_json_file_path is None:
+        dataset_json_file_path = os.path.join(save_dir, f"{dataset_name}.json")
+    if os.path.exists(dataset_json_file_path):
+        return dataset_json_file_path
+    elif not download:
+        raise FileNotFoundError(f"[ERROR] Dataset json file does not exist at {dataset_json_file_path}, please use the download flag to attempt to download it from the web or manually download it from https://huggingface.co/datasets/mali6/autocap/")
+    else:
+        os.makedirs(save_dir, exist_ok=True)
+        if dataset_name not in dataset_urls:
+            raise ValueError(f"[ERROR] Dataset {dataset_name} is not recognized and its json file does not exist at {dataset_json_file_path}")
+        wget.download(dataset_urls[dataset_name], dataset_json_file_path)
+        print(f"[INFO] JSON file for dataset {dataset_name} is downloaded at {dataset_json_file_path}")
+        return dataset_json_file_path
+    
\ No newline at end of file
diff --git a/dataset_preperation/organize_dataset.py b/dataset_preperation/organize_dataset.py
new file mode 100644
index 0000000..a2739f3
--- /dev/null
+++ b/dataset_preperation/organize_dataset.py
@@ -0,0 +1,135 @@
+import os
+import shutil
+from tqdm import tqdm
+from multiprocessing import Pool, get_context
+import logging
+from io import StringIO
+import json
+import argparse
+from pathlib import Path
+
+
+def load_json(file_path):
+    try:
+        with open(file_path, 'r') as file:
+            data = json.load(file)  # Attempt to read the JSON data
+
+    except json.JSONDecodeError as e:
+        with open(file_path, 'r') as file:
+            # Read the file content up to the point where the JSON is valid
+            file_content = file.read()
+            valid_json = file_content[:file_content.rfind('}')+1]
+
+        try:
+            data = json.loads(valid_json)  # Reload the valid JSON part
+        except json.JSONDecodeError:
+            print("Failed to recover JSON.")
+            return None
+
+    # Save the cleaned JSON data back to the file
+    if data is not None:
+        with open(file_path, 'w') as new_file:
+            json.dump(data, new_file, indent=4)
+    return data
+
+
+def load_file(fname):
+    with open(fname, "r") as f:
+        return f.read().split('\n')[:-1]
+
+def write_json(my_dict, fname):
+    with open(fname, "w") as json_file:
+        json.dump(my_dict, json_file, indent=4)
+
+def find_json_files(directory):
+    json_files = []
+
+    for root, dirs, files in os.walk(directory):
+        for file in files:
+            if file.endswith('.json'):
+                json_files.append(os.path.join(root, file))
+
+    return json_files
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--save_dir",
+                        type=str,
+                        required=True,
+                        help="directory where the downloaded files were saved")
+
+    parser.add_argument("--dataset_meta_file",
+                        required=False,
+                        type=str,
+                        default='data/metadata/dataset_root.json',
+                        help="path to the dataset root json file where the datafile paths will be stored")
+
+    parser.add_argument("--datafiles_dir",
+                        required=False,
+                        type=str,
+                        default='data/metadata/datafiles/autocap',
+                        help="directory where the datafiles will be stored")
+
+    parser.add_argument("--dataset_name",
+                        type=str,
+                        default='autocap',
+                        help="Name of the compiled dataset")
+
+    parser.add_argument("--files_per_subset",
+                        type=int,
+                        default=-1,
+                        help="How many files to include in each subset. -1 puts all files in a single subset")
+    parser.add_argument("--split",
+                        type=str,
+                        default='train',
+                        help="split of the dataset")
+
+    args = parser.parse_args()
+
+    # initialize all paths
+    Path(args.datafiles_dir).mkdir(parents=True, exist_ok=True)
+    Path(args.dataset_meta_file).parent.mkdir(parents=True, exist_ok=True)
+
+    # find all .json files
+    all_json_files = find_json_files(args.save_dir)
+
+    current_subset = 1
+    current_dataset_name = f"{args.dataset_name}_subset_{current_subset}" if args.files_per_subset > 0 else args.dataset_name
+    current_datafile_path = os.path.join(args.datafiles_dir, f"{current_dataset_name}_{args.split}.txt")
+    current_datafile = open(current_datafile_path, 'w')
+
+    all_datafiles_path = [(args.split, current_dataset_name, current_datafile_path)]
+    for idx, file_path in enumerate(all_json_files):
+        current_datafile.write(f"{os.path.relpath(file_path, args.save_dir)}\n")
+
+        if args.files_per_subset > 0 and (idx + 1) % args.files_per_subset == 0 and (idx+1) < len(all_json_files):
+            current_subset += 1
+            current_dataset_name = f"{args.dataset_name}_subset_{current_subset}"
+
+            # close the current file and open a new one
+            current_datafile.close()
+            current_datafile_path = os.path.join(args.datafiles_dir, f"{current_dataset_name}_{args.split}.txt")
+            all_datafiles_path.append((args.split, current_dataset_name, current_datafile_path))
+            current_datafile = open(current_datafile_path, 'w')
+
+    current_datafile.close()
+
+    # write the dataset root file
+    if os.path.exists(args.dataset_meta_file):
+        dataset_root = load_json(args.dataset_meta_file)
+    else:
+        dataset_root = {"metadata":{"path":{}}}
+
+    # add all datasets
+    for split, dataset_name, datafile_path in all_datafiles_path:
+        dataset_root[dataset_name] = os.path.abspath(args.save_dir)
+        dataset_root['metadata']['path'][dataset_name] = {}
+        dataset_root['metadata']['path'][dataset_name][split] = os.path.abspath(datafile_path)
+        for split_check in ['train', 'test', 'val']:
+            dataset_root['metadata']['path'][dataset_name][split_check] = dataset_root['metadata']['path'][dataset_name].get(split_check, "")
+
+    write_json(dataset_root, args.dataset_meta_file)
+    print("[INFO] Congrats! Done organizing the dataset.")
+    print("[INFO] Please use the following file path as the `metadata_root` in your experiment configurations:", os.path.abspath(args.dataset_meta_file))
+    print("[INFO] You may use any of the following dataset keys in your experiments:", [entry[1] for entry in all_datafiles_path])
+    
\ No newline at end of file
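
For reference, here is a minimal sketch of consuming the files that `organize_dataset.py` produces. It assumes the default `data/metadata/dataset_root.json` location, an `autocap` dataset key, and a `train` split; adjust these names to match the arguments you actually used.

```python
import json
import os

# Defaults produced by organize_dataset.py; adjust if you changed the arguments.
dataset_meta_file = "data/metadata/dataset_root.json"
dataset_key = "autocap"   # e.g. "autocap_subset_1" when --files_per_subset was used
split = "train"

with open(dataset_meta_file) as f:
    dataset_root = json.load(f)

# Each dataset key maps to the root directory of the downloaded files;
# metadata/path holds the per-split datafile lists.
root_dir = dataset_root[dataset_key]
datafile_path = dataset_root["metadata"]["path"][dataset_key][split]

# Every datafile line is a JSON metadata path relative to the dataset root.
with open(datafile_path) as f:
    relative_paths = [line.strip() for line in f if line.strip()]

with open(os.path.join(root_dir, relative_paths[0])) as f:
    record = json.load(f)
print(record.get("title"), record.get("autocap_caption"))
```

This mirrors the structure written by the script: each dataset key maps to the download directory, `metadata.path` holds the per-split datafile lists, and each datafile line is a JSON path relative to that directory.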