diff --git a/docs/config.toml b/docs/config.toml index f772a16..f614dd7 100644 --- a/docs/config.toml +++ b/docs/config.toml @@ -5,6 +5,7 @@ author = "Ryuichi YAMAMOTO" [params] author = "Ryuichi YAMAMOTO" + project = "wavenet_vocoder" logo = "/images/r9y9.jpg" twitter = "r9y9" github = "r9y9" diff --git a/docs/content/index.md b/docs/content/index.md index 8020074..9c12fdf 100644 --- a/docs/content/index.md +++ b/docs/content/index.md @@ -13,11 +13,12 @@ type = "index" - Github: https://github.com/r9y9/wavenet_vocoder This page provides audio samples for the open source implementation of the **WaveNet (WN)** vocoder. +Text-to-speech samples can be found in the last section. -1. WN conditioned on mel-spectrogram (16-bit linear PCM, 22.5kHz) -2. WN conditioned on mel-spectrogram (8-bit mu-law, 16kHz) -3. WN conditioned on mel-spectrogram and speaker-embedding (16-bit linear PCM, 16kHz) -3. (Not yet) DeepVoice3 + WaveNet vocoder +- WN conditioned on mel-spectrogram (16-bit linear PCM, 22.5kHz) +- WN conditioned on mel-spectrogram (8-bit mu-law, 16kHz) +- WN conditioned on mel-spectrogram and speaker-embedding (16-bit linear PCM, 16kHz) +- Tacotron2 + WN text-to-speech (**New!**) ## WN conditioned on mel-spectrogram (16-bit linear PCM, 22.5kHz) @@ -402,9 +403,175 @@ Your browser does not support the audio element. [^1]: Note that mel-spectrogram used in local conditioning is dependent on speaker characteristics, so we cannot simply change the speaker identity of the generated audio samples using the model. It should work without speaker embedding, but it might have helped training speed. -## DeepVoice3 + WaveNet vocoder +## Tacotron2 + WN text-to-speech + +- Tacotron2: trained 189k steps on LJSpeech dataset ([Pre-trained model](https://www.dropbox.com/s/vx7y4qqs732sqgg/pretrained.tar.gz?dl=0), [Hyper params](https://github.com/r9y9/Tacotron-2/blob/9ce1a0e65b9217cdc19599c192c5cd68b4cece5b/hparams.py)). 
The work has been done by [@Rayhane-mamah](https://github.com/Rayhane-mamah). See https://github.com/Rayhane-mamah/Tacotron-2 for details. +- WaveNet: trained over 1000k steps on LJSpeech dataset ([Pre-trained model](https://www.dropbox.com/s/zdbfprugbagfp2w/20180510_mixture_lj_checkpoint_step000320000_ema.pth?dl=0), [Hyper params](https://www.dropbox.com/s/0vsd7973w20eskz/20180510_mixture_lj_checkpoint_step000320000_ema.json?dl=0)) + + +Scientists at the CERN laboratory say they have discovered a new particle. + + + + +There's a way to measure the acute emotional intelligence that has never gone out of style. + + + + +President Trump met with other leaders at the Group of 20 conference. + + + + +The Senate's bill to repeal and replace the Affordable Care Act is now imperiled. + + + + +Generative adversarial network or variational auto-encoder. + + + + +Basilar membrane and otolaryngology are not auto-correlations. + + + + +He has read the whole thing. + + + + +He reads books. + + + + +Don't desert me here in the desert! + + + + +He thought it was time to present the present. + + + +Thisss isrealy awhsome. + + + + +Punctuation sensitivity, is working. + + + + +Punctuation sensitivity is working. + + + + +The buses aren't the problem, they actually provide a solution. + + + + +The buses aren't the PROBLEM, they actually provide a SOLUTION. + + + + +The quick brown fox jumps over the lazy dog. + + + +Does the quick brown fox jump over the lazy dog? + + + + +Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick? + + + + +She sells sea-shells on the sea-shore. The shells she sells are sea-shells I'm sure. + + + + +The blue lagoon is a nineteen eighty American romance adventure film. 
+ + + + +### On-line demo + +A demonstration notebook, intended to be run on Google Colab, can be found at [Tacotron2 + WaveNet text-to-speech demo +](https://colab.research.google.com/github/r9y9/Colaboratory/blob/master/Tacotron2_and_WaveNet_text_to_speech_demo.ipynb). -TODO ## References @@ -413,3 +580,4 @@ TODO - [Tamamori, Akira, et al. "Speaker-dependent WaveNet vocoder." Proceedings of Interspeech. 2017.](http://www.isca-speech.org/archive/Interspeech_2017/pdfs/0314.PDF) - [Jonathan Shen, Ruoming Pang, Ron J. Weiss, et al, "Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions", arXiv:1712.05884, Dec 2017.](https://arxiv.org/abs/1712.05884) - [Wei Ping, Kainan Peng, Andrew Gibiansky, et al, "Deep Voice 3: 2000-Speaker Neural Text-to-Speech", arXiv:1710.07654, Oct. 2017.](https://arxiv.org/abs/1710.07654) +- [Jonathan Shen, Ruoming Pang, Ron J. Weiss, et al, "Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions", arXiv:1712.05884, Dec 2017.](https://arxiv.org/abs/1712.05884) diff --git a/docs/layouts/partials/header.html b/docs/layouts/partials/header.html index 655176d..e22df5a 100644 --- a/docs/layouts/partials/header.html +++ b/docs/layouts/partials/header.html @@ -6,11 +6,11 @@ - - - - - + + + + +