forked from NVIDIA/tacotron2
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathinference_waveglow.py
110 lines (87 loc) · 3.6 KB
/
inference_waveglow.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# coding: utf-8
"""
Synthesis waveform with Waveglow.

usage: inference_waveglow.py [options] <dst_dir>

options:
    --hparams=<params>             Hyper parameters [default: ].
    -i, --input-file=<p>           Input txt file path.
    -t, --tacotron-checkpoint=<p>  Tacotron Checkpoint Path
    -w, --waveglow-checkpoint=<p>  Waveglow Checkpoint Path
    -d, --denoiser                 Use Waveglow Denoiser
    -h, --help                     Show help message.
"""
from docopt import docopt
from os import makedirs
from os.path import dirname, join, basename, splitext
import torch
import numpy as np
import soundfile as sf
# Tacotron2 modules
from model import Tacotron2
from hparams import create_hparams
from train import load_model
from text import text_to_sequence
import sys
sys.path.append('waveglow/')
from denoiser import Denoiser

# Full-scale value for 16-bit PCM audio (kept for reference).
MAX_WAV_VALUE = 32767.5

# Audio/STFT settings; these must match the values the checkpoints
# were trained with, otherwise synthesis quality degrades.
hparams = create_hparams()
hparams.sampling_rate = 22050
hparams.filter_length = 1024
hparams.hop_length = 256
hparams.win_length = 1024
def load_waveglow(waveglow_checkpoint):
    """Load a trained WaveGlow model and move it to the GPU.

    Args:
        waveglow_checkpoint: Path to a checkpoint that stores the full
            model object under the ``'model'`` key.

    Returns:
        The WaveGlow model on CUDA, switched to eval mode.
    """
    waveglow_model = torch.load(waveglow_checkpoint)['model']
    # Consistency/bug fix: load_tacotron2 puts its model in eval mode but
    # this loader did not — dropout/batch-norm layers must not run in
    # training mode during inference.
    waveglow_model.eval()
    return waveglow_model.cuda()
def load_tacotron2(tacotron_checkpoint_path):
    """Build a Tacotron2 model and load trained weights into it.

    Uses the module-level ``hparams`` (read-only access, so no ``global``
    declaration is needed).

    Args:
        tacotron_checkpoint_path: Path to a checkpoint containing a
            ``'state_dict'`` entry.

    Returns:
        The Tacotron2 model in eval mode.
    """
    model = load_model(hparams)
    model.load_state_dict(torch.load(tacotron_checkpoint_path)['state_dict'])
    model.eval()
    return model
def waveform_generation(text, tacotron_model, waveglow_model, use_denoiser=False):
    """Synthesize an audio waveform for *text* with Tacotron2 + WaveGlow.

    Args:
        text: Input sentence to synthesize.
        tacotron_model: Tacotron2 model (provides mel spectrograms).
        waveglow_model: WaveGlow vocoder model.
        use_denoiser: When True, run the WaveGlow Denoiser over the output.

    Returns:
        1-D numpy array with the generated waveform (first batch item).
    """
    # Encode text to symbol ids and add a batch dimension.
    sequence = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
    # torch.autograd.Variable is deprecated (no-op since PyTorch 0.4);
    # a plain tensor behaves identically here.
    sequence = torch.from_numpy(sequence).cuda().long()
    # No gradients are needed anywhere during inference; wrapping the
    # Tacotron pass as well avoids building an unused autograd graph
    # (the original only wrapped the WaveGlow pass).
    with torch.no_grad():
        # Tacotron Inference
        mel_outputs, mel_outputs_postnet, _, alignments = tacotron_model.inference(sequence)
        # Waveglow Inference
        waveform = waveglow_model.infer(mel_outputs_postnet, sigma=0.666)
        if use_denoiser:
            print("Using denoiser")
            denoiser = Denoiser(waveglow_model)
            waveform = denoiser(waveform, strength=0.01)[:, 0]
    return waveform[0].data.cpu().numpy()
if __name__ == "__main__":
    args = docopt(__doc__)
    print("Command line args:\n", args)
    input_file_path = args["--input-file"]
    tacotron_checkpoint = args["--tacotron-checkpoint"]
    waveglow_checkpoint = args["--waveglow-checkpoint"]
    use_denoiser = args["--denoiser"]
    dst_dir = args["<dst_dir>"]

    # Create output directory
    makedirs(dst_dir, exist_ok=True)

    # Derive a readable sub-directory name from both checkpoint file names.
    checkpoint_wav_name = splitext(basename(waveglow_checkpoint))[0].replace('waveglow_', '')
    checkpoint_taco_name = splitext(basename(tacotron_checkpoint))[0].replace('checkpoint_', '')

    tacotron_model = load_tacotron2(tacotron_checkpoint)
    waveglow_model = load_waveglow(waveglow_checkpoint)

    try:
        with open(input_file_path) as f:
            content = f.read().splitlines()
    except FileNotFoundError:
        print("File {} not found.".format(input_file_path))
        # Bug fix: previously execution fell through after the print and
        # crashed with a NameError on the undefined `content` below.
        sys.exit(1)

    # Create output directory for this checkpoint pair
    subdir = 'samples_waveglow_' + checkpoint_wav_name + '_taco_' + checkpoint_taco_name
    makedirs(join(dst_dir, subdir), exist_ok=True)

    for i, text in enumerate(content):
        print("Generating Waveform " + str(i))
        waveform = waveform_generation(text, tacotron_model, waveglow_model, use_denoiser)
        # save one wav per input line
        output_filepath = join(dst_dir, subdir, "{}.wav".format(i))
        sf.write(output_filepath, waveform, samplerate=hparams.sampling_rate)
        print("Waveform {} OK".format(i))

    print("Finished! Check out {} for generated audio samples.".format(dst_dir))
    sys.exit(0)