From 531ef65f879ef9ebd608f0bf17bc4c5a248e7be6 Mon Sep 17 00:00:00 2001
From: Marc Schreiber
Date: Sun, 25 Nov 2018 21:18:45 +0100
Subject: [PATCH] Add support for CSS10 datasets and improve the Docker image
 to reuse the cached pip install layer

---
 .dockerignore     |  1 +
 README.md         | 13 ++++++++++++-
 cpu.Dockerfile    | 10 +++++++---
 datasets/css10.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 preprocess.py     | 14 ++++++++++++--
 5 files changed, 75 insertions(+), 6 deletions(-)
 create mode 100644 .dockerignore
 create mode 100644 datasets/css10.py

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..2d2ecd6
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1 @@
+.git/
diff --git a/README.md b/README.md
index bc4c391..e3ec6fe 100644
--- a/README.md
+++ b/README.md
@@ -63,6 +63,7 @@ Contributions are accepted! We'd love the communities help in building a better
   * [LJ Speech](https://keithito.com/LJ-Speech-Dataset/) (Public Domain)
   * [Blizzard 2012](http://www.cstr.ed.ac.uk/projects/blizzard/2012/phase_one) (Creative Commons Attribution Share-Alike)
   * [M-ailabs](http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/)
+  * [CSS10: A Collection of Single Speaker Speech Datasets for 10 Languages](https://github.com/Kyubyong/css10)
 
   You can use other datasets if you convert them to the right format. See [TRAINING_DATA.md](TRAINING_DATA.md)
   for more info.
@@ -90,7 +91,17 @@ Contributions are accepted! We'd love the communities help in building a better
       |- lab
       |- wav
   ```
-  
+
+  Alternatively, like this for the CSS10 German dataset (make sure to adjust `text/symbols.py` to cover the dataset's character set):
+  ```
+  tacotron
+    |- css10
+      |- achtgesichterambiwasse
+      |- meisterfloh
+      |- serapionsbruederauswahl
+      |- transcript.txt
+  ```
+
   For M-AILABS follow the directory structure from [here](http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/)
 
 3. **Preprocess the data**
diff --git a/cpu.Dockerfile b/cpu.Dockerfile
index 1fa47bf..45d8579 100644
--- a/cpu.Dockerfile
+++ b/cpu.Dockerfile
@@ -1,8 +1,12 @@
 FROM tensorflow/tensorflow:1.8.0-py3
 
 RUN mkdir /root/mimic2
-COPY . /root/mimic2
 WORKDIR /root/mimic2
 
-RUN pip install --no-cache-dir -r requirements.txt
-ENTRYPOINT [ "/bin/bash" ]
\ No newline at end of file
+COPY requirements.txt /root/mimic2/requirements.txt
+RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt
+RUN apt update && apt install -y ffmpeg
+
+COPY . /root/mimic2
+
+ENTRYPOINT [ "/bin/bash" ]
diff --git a/datasets/css10.py b/datasets/css10.py
new file mode 100644
index 0000000..b8ad08d
--- /dev/null
+++ b/datasets/css10.py
@@ -0,0 +1,43 @@
+from concurrent.futures import ProcessPoolExecutor
+from functools import partial
+import numpy as np
+import os
+
+from util import audio
+
+
+def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
+  '''Preprocesses the CSS10 dataset from a given input path into a given output directory.'''
+  executor = ProcessPoolExecutor(max_workers=num_workers)
+  futures = []
+
+  # Read the transcript file
+  with open(os.path.join(in_dir, 'transcript.txt'), encoding='utf-8') as f:
+    for line in f:
+      parts = line.strip().split('|')
+      path = os.path.join(in_dir, parts[0])
+      text = parts[1]
+      futures.append(executor.submit(partial(_process_utterance, out_dir, parts[0].split('/')[1], path, text)))
+
+  return [future.result() for future in tqdm(futures)]
+
+
+def _process_utterance(out_dir, prompt_id, wav_path, text):
+  # Load the audio to a numpy array:
+  wav = audio.load_wav(wav_path)
+
+  # Compute the linear-scale spectrogram from the wav:
+  spectrogram = audio.spectrogram(wav).astype(np.float32)
+
+  # Compute a mel-scale spectrogram from the wav:
+  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
+
+  # Write the spectrograms to disk:
+  spectrogram_filename = 'css10-spec-%s.npy' % prompt_id
+  mel_filename = 'css10-mel-%s.npy' % prompt_id
+  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
+  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
+
+  # Return a tuple describing this training example:
+  n_frames = spectrogram.shape[1]
+  return (spectrogram_filename, mel_filename, n_frames, text)
diff --git a/preprocess.py b/preprocess.py
index 97f1e48..69c25a7 100644
--- a/preprocess.py
+++ b/preprocess.py
@@ -2,7 +2,7 @@ import os
 from multiprocessing import cpu_count
 from tqdm import tqdm
 
-from datasets import amy, blizzard, ljspeech, kusal, mailabs
+from datasets import amy, blizzard, css10, ljspeech, kusal, mailabs
 from hparams import hparams, hparams_debug_string
 
 
@@ -32,6 +32,14 @@ def preprocess_amy(args):
   write_metadata(metadata, out_dir)
 
 
+def preprocess_css10_de(args):
+  in_dir = os.path.join(args.base_dir, 'css10')
+  out_dir = os.path.join(args.base_dir, args.output)
+  os.makedirs(out_dir, exist_ok=True)
+  metadata = css10.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm)
+  write_metadata(metadata, out_dir)
+
+
 def preprocess_kusal(args):
   in_dir = os.path.join(args.base_dir, 'kusal')
   out_dir = os.path.join(args.base_dir, args.output)
@@ -79,7 +87,7 @@ def main():
   parser.add_argument('--base_dir', default=os.path.expanduser('~/tacotron'))
   parser.add_argument('--output', default='training')
   parser.add_argument(
-    '--dataset', required=True, choices=['amy', 'blizzard', 'ljspeech', 'kusal', 'mailabs']
+    '--dataset', required=True, choices=['amy', 'blizzard', 'css10', 'ljspeech', 'kusal', 'mailabs']
   )
   parser.add_argument('--mailabs_books_dir',
                       help='absolute directory to the books for the mlailabs')
@@ -103,6 +111,8 @@ def main():
     preprocess_amy(args)
   elif args.dataset == 'blizzard':
     preprocess_blizzard(args)
+  elif args.dataset == 'css10':
+    preprocess_css10_de(args)
   elif args.dataset == 'ljspeech':
     preprocess_ljspeech(args)
   elif args.dataset == 'kusal':
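For reference, a quick way to sanity-check the CSS10 layout before running `python preprocess.py --dataset css10 --base_dir ~/tacotron`. This sketch is illustrative only and not part of the change; it simply mirrors what `build_from_path` in `datasets/css10.py` reads (a `css10/transcript.txt` with pipe-separated `<book>/<clip>.wav|text` lines next to the book directories), and `~/tacotron` is just the `--base_dir` default.

```python
# Illustrative sketch only (not part of the patch): verify that the CSS10
# download matches the layout datasets/css10.py expects before preprocessing.
# '~/tacotron' mirrors the --base_dir default in preprocess.py.
import os

base_dir = os.path.expanduser('~/tacotron')
in_dir = os.path.join(base_dir, 'css10')
transcript = os.path.join(in_dir, 'transcript.txt')

with open(transcript, encoding='utf-8') as f:
  for line_number, line in enumerate(f, start=1):
    parts = line.strip().split('|')
    # build_from_path() reads parts[0] as "<book>/<clip>.wav" and parts[1] as the text.
    assert len(parts) >= 2, 'line %d: expected at least "path|text"' % line_number
    assert '/' in parts[0], 'line %d: expected a "<book>/<clip>.wav" path' % line_number
    wav_path = os.path.join(in_dir, parts[0])
    assert os.path.isfile(wav_path), 'line %d: missing wav file %s' % (line_number, wav_path)

print('CSS10 layout looks consistent; run: python preprocess.py --dataset css10')
```

The `<book>/<clip>` check matters because `_process_utterance` derives its output filenames from `parts[0].split('/')[1]`, so a transcript entry without a book prefix would fail during preprocessing.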