From 531ef65f879ef9ebd608f0bf17bc4c5a248e7be6 Mon Sep 17 00:00:00 2001
From: Marc Schreiber
Date: Sun, 25 Nov 2018 21:18:45 +0100
Subject: [PATCH] Add support for CSS10 datasets and improve the Docker image
 to reuse the cached pip install layer

---
 .dockerignore     |  1 +
 README.md         | 13 ++++++++++++-
 cpu.Dockerfile    | 10 +++++++---
 datasets/css10.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 preprocess.py     | 14 ++++++++++++--
 5 files changed, 75 insertions(+), 6 deletions(-)
 create mode 100644 .dockerignore
 create mode 100644 datasets/css10.py

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..2d2ecd6
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1 @@
+.git/
diff --git a/README.md b/README.md
index bc4c391..e3ec6fe 100644
--- a/README.md
+++ b/README.md
@@ -63,6 +63,7 @@ Contributions are accepted! We'd love the communities help in building a better
   * [LJ Speech](https://keithito.com/LJ-Speech-Dataset/) (Public Domain)
   * [Blizzard 2012](http://www.cstr.ed.ac.uk/projects/blizzard/2012/phase_one) (Creative Commons Attribution Share-Alike)
   * [M-ailabs](http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/)
+  * [CSS10: A Collection of Single Speaker Speech Datasets for 10 Languages](https://github.com/Kyubyong/css10)
 
   You can use other datasets if you convert them to the right format. See [TRAINING_DATA.md](TRAINING_DATA.md)
   for more info.
@@ -90,7 +91,17 @@ Contributions are accepted! We'd love the communities help in building a better
       |- lab
       |- wav
   ```
-  
+
+  Alternatively, like this for the CSS10 German dataset (make sure to adjust `text/symbols.py` to cover the dataset's character set):
+  ```
+  tacotron
+    |- css10
+      |- achtgesichterambiwasse
+      |- meisterfloh
+      |- serapionsbruederauswahl
+      |- transcript.txt
+  ```
+
   For M-AILABS follow the directory structure from [here](http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/)
 
 3. **Preprocess the data**
diff --git a/cpu.Dockerfile b/cpu.Dockerfile
index 1fa47bf..45d8579 100644
--- a/cpu.Dockerfile
+++ b/cpu.Dockerfile
@@ -1,8 +1,12 @@
 FROM tensorflow/tensorflow:1.8.0-py3
 
 RUN mkdir /root/mimic2
-COPY . /root/mimic2
 WORKDIR /root/mimic2
 
-RUN pip install --no-cache-dir -r requirements.txt
-ENTRYPOINT [ "/bin/bash" ]
\ No newline at end of file
+COPY requirements.txt /root/mimic2/requirements.txt
+RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt
+RUN apt update && apt install -y ffmpeg
+
+COPY . /root/mimic2
+
+ENTRYPOINT [ "/bin/bash" ]
diff --git a/datasets/css10.py b/datasets/css10.py
new file mode 100644
index 0000000..b8ad08d
--- /dev/null
+++ b/datasets/css10.py
@@ -0,0 +1,43 @@
+from concurrent.futures import ProcessPoolExecutor
+from functools import partial
+import numpy as np
+import os
+
+from util import audio
+
+
+def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
+  '''Preprocesses the CSS10 dataset from a given input path into a given output directory.'''
+  executor = ProcessPoolExecutor(max_workers=num_workers)
+  futures = []
+
+  # Read the transcript file
+  with open(os.path.join(in_dir, 'transcript.txt'), encoding='utf-8') as f:
+    for line in f:
+      parts = line.strip().split('|')
+      path = os.path.join(in_dir, parts[0])
+      text = parts[1]
+      futures.append(executor.submit(partial(_process_utterance, out_dir, parts[0].split('/')[1], path, text)))
+
+  return [future.result() for future in tqdm(futures)]
+
+
+def _process_utterance(out_dir, prompt_id, wav_path, text):
+  # Load the audio to a numpy array:
+  wav = audio.load_wav(wav_path)
+
+  # Compute the linear-scale spectrogram from the wav:
+  spectrogram = audio.spectrogram(wav).astype(np.float32)
+
+  # Compute a mel-scale spectrogram from the wav:
+  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
+
+  # Write the spectrograms to disk:
+  spectrogram_filename = 'css10-spec-%s.npy' % prompt_id
+  mel_filename = 'css10-mel-%s.npy' % prompt_id
+  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
+  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
+
+  # Return a tuple describing this training example:
+  n_frames = spectrogram.shape[1]
+  return (spectrogram_filename, mel_filename, n_frames, text)
diff --git a/preprocess.py b/preprocess.py
index 97f1e48..69c25a7 100644
--- a/preprocess.py
+++ b/preprocess.py
@@ -2,7 +2,7 @@ import os
 from multiprocessing import cpu_count
 from tqdm import tqdm
 
-from datasets import amy, blizzard, ljspeech, kusal, mailabs
+from datasets import amy, blizzard, css10, ljspeech, kusal, mailabs
 from hparams import hparams, hparams_debug_string
 
 
@@ -32,6 +32,14 @@ def preprocess_amy(args):
   write_metadata(metadata, out_dir)
 
 
+def preprocess_css10_de(args):
+  in_dir = os.path.join(args.base_dir, 'css10')
+  out_dir = os.path.join(args.base_dir, args.output)
+  os.makedirs(out_dir, exist_ok=True)
+  metadata = css10.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm)
+  write_metadata(metadata, out_dir)
+
+
 def preprocess_kusal(args):
   in_dir = os.path.join(args.base_dir, 'kusal')
   out_dir = os.path.join(args.base_dir, args.output)
@@ -79,7 +87,7 @@ def main():
   parser.add_argument('--base_dir', default=os.path.expanduser('~/tacotron'))
   parser.add_argument('--output', default='training')
   parser.add_argument(
-    '--dataset', required=True, choices=['amy', 'blizzard', 'ljspeech', 'kusal', 'mailabs']
+    '--dataset', required=True, choices=['amy', 'blizzard', 'css10', 'ljspeech', 'kusal', 'mailabs']
   )
   parser.add_argument('--mailabs_books_dir',
                       help='absolute directory to the books for the mlailabs')
@@ -103,6 +111,8 @@ def main():
     preprocess_amy(args)
   elif args.dataset == 'blizzard':
     preprocess_blizzard(args)
+  elif args.dataset == 'css10':
+    preprocess_css10_de(args)
   elif args.dataset == 'ljspeech':
     preprocess_ljspeech(args)
   elif args.dataset == 'kusal':
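For reference, a quick way to sanity-check the CSS10 layout before running `python preprocess.py --dataset css10 --base_dir ~/tacotron`. This sketch is illustrative only and not part of the change; it simply mirrors what `build_from_path` in `datasets/css10.py` reads (a `css10/transcript.txt` with pipe-separated `<book>/<clip>.wav|text` lines next to the book directories), and `~/tacotron` is just the `--base_dir` default.

```python
# Illustrative sketch only (not part of the patch): verify that the CSS10
# download matches the layout datasets/css10.py expects before preprocessing.
# '~/tacotron' mirrors the --base_dir default in preprocess.py.
import os

base_dir = os.path.expanduser('~/tacotron')
in_dir = os.path.join(base_dir, 'css10')
transcript = os.path.join(in_dir, 'transcript.txt')

with open(transcript, encoding='utf-8') as f:
  for line_number, line in enumerate(f, start=1):
    parts = line.strip().split('|')
    # build_from_path() reads parts[0] as "<book>/<clip>.wav" and parts[1] as the text.
    assert len(parts) >= 2, 'line %d: expected at least "path|text"' % line_number
    assert '/' in parts[0], 'line %d: expected a "<book>/<clip>.wav" path' % line_number
    wav_path = os.path.join(in_dir, parts[0])
    assert os.path.isfile(wav_path), 'line %d: missing wav file %s' % (line_number, wav_path)

print('CSS10 layout looks consistent; run: python preprocess.py --dataset css10')
```

The `<book>/<clip>` check matters because `_process_utterance` derives its output filenames from `parts[0].split('/')[1]`, so a transcript entry without a book prefix would fail during preprocessing.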