Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for CSS10 datasets #24

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.git/
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ Contributions are accepted! We'd love the communities help in building a better
* [LJ Speech](https://keithito.com/LJ-Speech-Dataset/) (Public Domain)
* [Blizzard 2012](http://www.cstr.ed.ac.uk/projects/blizzard/2012/phase_one) (Creative Commons Attribution Share-Alike)
* [M-ailabs](http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/)
* [CSS10: A Collection of Single Speaker Speech Datasets for 10 Languages](https://github.com/Kyubyong/css10)

You can use other datasets if you convert them to the right format. See [TRAINING_DATA.md](TRAINING_DATA.md) for more info.

Expand Down Expand Up @@ -90,7 +91,17 @@ Contributions are accepted! We'd love the communities help in building a better
|- lab
|- wav
```


Alternatively, for the German CSS10 dataset, use the following layout (make sure to adjust `text/symbols.py` so that it covers the dataset's character set):
```
tacotron
|- css10
|- achtgesichterambiwasse
|- meisterfloh
|- serapionsbruederauswahl
|- transcript.txt
```

For M-AILABS follow the directory structure from [here](http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/)

3. **Preprocess the data**
Expand Down
10 changes: 7 additions & 3 deletions cpu.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
FROM tensorflow/tensorflow:1.8.0-py3

RUN mkdir /root/mimic2
COPY . /root/mimic2
WORKDIR /root/mimic2
RUN pip install --no-cache-dir -r requirements.txt

ENTRYPOINT [ "/bin/bash" ]
COPY requirements.txt /root/mimic2/requirements.txt
RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt
RUN apt update && apt install -y ffmpeg

COPY . /root/mimic2

ENTRYPOINT [ "/bin/bash" ]
43 changes: 43 additions & 0 deletions datasets/css10.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import numpy as np
import os

from util import audio


def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
    '''Preprocesses the css10 dataset from a given input path into a given output directory.

    Args:
      in_dir: Directory that contains `transcript.txt` and the audio files it references.
      out_dir: Directory the spectrogram files are written to.
      num_workers: Number of worker processes used for the preprocessing.
      tqdm: Optional wrapper applied to the list of futures (e.g. `tqdm.tqdm`) to show progress.

    Returns:
      A list with one `_process_utterance` result per non-blank transcript line, in
      transcript order.
    '''
    futures = []

    # The context manager shuts the worker pool down once all results are collected,
    # even when reading the transcript or one of the workers raises.
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        # Read the transcript file
        with open(os.path.join(in_dir, 'transcript.txt'), encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines (e.g. trailing newlines) carry no utterance.
                    continue
                parts = line.split('|')
                wav_path = os.path.join(in_dir, parts[0])
                # The first field is read as "<folder>/<file>"; the file part names the outputs.
                prompt_id = parts[0].split('/')[1]
                text = parts[1]
                futures.append(
                    executor.submit(_process_utterance, out_dir, prompt_id, wav_path, text))

        return [future.result() for future in tqdm(futures)]


def _process_utterance(out_dir, prompt_id, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    Writes the linear-scale and the mel-scale spectrogram of the audio to `out_dir`
    and returns a tuple describing the training example.

    Args:
      out_dir: Directory the spectrogram files are written to.
      prompt_id: Identifier of the utterance, used to build the output filenames.
      wav_path: Path of the audio file to load.
      text: Transcript text of the utterance; passed through unchanged.

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple.
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk (both files share the 'css10-' prefix):
    spectrogram_filename = 'css10-spec-%s.npy' % prompt_id
    mel_filename = 'css10-mel-%s.npy' % prompt_id
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example.
    # NOTE(review): presumably axis 1 of the untransposed spectrogram is the frame axis;
    # confirm against util.audio.
    n_frames = spectrogram.shape[1]
    return (spectrogram_filename, mel_filename, n_frames, text)
14 changes: 12 additions & 2 deletions preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
from multiprocessing import cpu_count
from tqdm import tqdm
from datasets import amy, blizzard, ljspeech, kusal, mailabs
from datasets import amy, blizzard, css10, ljspeech, kusal, mailabs
from hparams import hparams, hparams_debug_string


Expand Down Expand Up @@ -32,6 +32,14 @@ def preprocess_amy(args):
write_metadata(metadata, out_dir)


def preprocess_css10_de(args):
    '''Preprocesses the css10 data found below `args.base_dir` and writes its metadata.

    Reads from `<base_dir>/css10`, writes to `<base_dir>/<output>` (created if missing),
    and hands the resulting metadata to `write_metadata`.
    '''
    source_dir = os.path.join(args.base_dir, 'css10')
    target_dir = os.path.join(args.base_dir, args.output)
    os.makedirs(target_dir, exist_ok=True)
    write_metadata(
        css10.build_from_path(source_dir, target_dir, args.num_workers, tqdm=tqdm),
        target_dir,
    )


def preprocess_kusal(args):
in_dir = os.path.join(args.base_dir, 'kusal')
out_dir = os.path.join(args.base_dir, args.output)
Expand Down Expand Up @@ -79,7 +87,7 @@ def main():
parser.add_argument('--base_dir', default=os.path.expanduser('~/tacotron'))
parser.add_argument('--output', default='training')
parser.add_argument(
'--dataset', required=True, choices=['amy', 'blizzard', 'ljspeech', 'kusal', 'mailabs']
'--dataset', required=True, choices=['amy', 'blizzard', 'css10', 'ljspeech', 'kusal', 'mailabs']
)
parser.add_argument('--mailabs_books_dir',
help='absolute directory to the books for the mlailabs')
Expand All @@ -103,6 +111,8 @@ def main():
preprocess_amy(args)
elif args.dataset == 'blizzard':
preprocess_blizzard(args)
elif args.dataset == 'css10':
preprocess_css10_de(args)
elif args.dataset == 'ljspeech':
preprocess_ljspeech(args)
elif args.dataset == 'kusal':
Expand Down