# coding: utf-8
"""
Synthesize waveforms with the Griffin-Lim algorithm.

usage: inference_griffin-lim.py [options] <dst_dir>

options:
    --hparams=<params>              Hyper parameters [default: ].
    -i, --input-file=<p>            Input txt file path.
    -t, --tacotron-checkpoint=<p>   Tacotron checkpoint path.
    -h, --help                      Show help message.
"""
from docopt import docopt
from os import makedirs
from os.path import join, basename, splitext
import sys

import torch
import numpy as np
import soundfile as sf

# Tacotron 2 modules
from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT
from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence
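# Audio / STFT settings. These should match the configuration the checkpoint
# was trained with; the values below are the Tacotron 2 defaults for 22.05 kHz
# audio.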
hparams = create_hparams()
hparams.sampling_rate = 22050
hparams.filter_length = 1024
hparams.hop_length = 256
hparams.win_length = 1024
def load_tacotron2(tacotron_checkpoint_path):
    """Build a Tacotron 2 model from hparams and restore its weights."""
    global hparams
    model = load_model(hparams)
    model.load_state_dict(torch.load(tacotron_checkpoint_path)['state_dict'])
    _ = model.eval()
    return model
def waveform_generation(text, tacotron_model):
    """Synthesize a waveform for ``text`` with Tacotron 2 + Griffin-Lim (CUDA assumed)."""
    # Prepare the text input as a batch of size 1
    sequence = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()

    # Tacotron 2 inference: predict mel spectrograms and attention alignments
    mel_outputs, mel_outputs_postnet, _, alignments = tacotron_model.inference(sequence)

    # Griffin-Lim preprocessing
    n_iter = 60
    taco_stft = TacotronSTFT(
        hparams.filter_length,
        hparams.hop_length,
        hparams.win_length,
        sampling_rate=hparams.sampling_rate
    )

    # Approximately invert the mel spectrogram to a linear spectrogram by
    # projecting through the mel filterbank, then rescale
    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling

    # Griffin-Lim phase reconstruction
    waveform = griffin_lim(
        torch.autograd.Variable(spec_from_mel[:, :, :-1]),
        taco_stft.stft_fn, n_iter)
    return waveform[0].data.cpu().numpy()
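# Direct (non-CLI) usage sketch; the checkpoint path and text are hypothetical:
#   model = load_tacotron2('checkpoint_50000')
#   audio = waveform_generation('Hello world.', model)
#   sf.write('hello.wav', audio, samplerate=hparams.sampling_rate)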
if __name__ == "__main__":
args = docopt(__doc__)
print("Command line args:\n", args)
tacotron_checkpoint = args["--tacotron-checkpoint"]
input_file_path = args["--input-file"]
dst_dir = args["<dst_dir>"]
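    # The --hparams option is accepted by docopt but not applied below. If the
    # hparams object exposes a tf.contrib-style ``parse`` method (an assumption
    # about this fork, not verified here), overrides could be applied with:
    #   if args["--hparams"]:
    #       hparams.parse(args["--hparams"])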
    # Create the output directory
    makedirs(dst_dir, exist_ok=True)

    checkpoint_taco_name = splitext(basename(tacotron_checkpoint))[0].replace('checkpoint_', '')
    tacotron_model = load_tacotron2(tacotron_checkpoint)

    # Read the input sentences, one per line
    try:
        with open(input_file_path) as f:
            content = f.read().splitlines()
    except FileNotFoundError:
        print("File {} not found.".format(input_file_path))
        sys.exit(1)

    # Create an output subdirectory named after the checkpoint
    subdir = 'samples_griffinlim' + '_taco_' + checkpoint_taco_name
    makedirs(join(dst_dir, subdir), exist_ok=True)

    for i, text in enumerate(content):
        print("Generating waveform " + str(i))
        waveform = waveform_generation(text, tacotron_model)

        # Save the synthesized audio
        output_filepath = join(dst_dir, subdir, "{}.wav".format(i))
        sf.write(output_filepath, waveform, samplerate=hparams.sampling_rate)
        print("Waveform {} OK".format(i))

    print("Finished! Check out {} for generated audio samples.".format(dst_dir))
    sys.exit(0)