# coding: utf-8
"""
Synthesize waveforms with the Griffin-Lim algorithm.

usage: inference_griffin-lim.py [options] <dst_dir>

options:
    --hparams=<params>              Hyper parameters [default: ].
    -i, --input-file=<p>            Input txt file path.
    -t, --tacotron-checkpoint=<p>   Tacotron checkpoint path.
    -h, --help                      Show help message.
"""
from docopt import docopt
from os import makedirs
from os.path import join, basename, splitext
import sys

import torch
import numpy as np
import soundfile as sf

# Tacotron 2 modules
from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT
from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence
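# Audio / STFT settings. These should match the configuration the checkpoint
# was trained with; the values below are the Tacotron 2 defaults for 22.05 kHz
# audio.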
hparams = create_hparams()
hparams.sampling_rate = 22050
hparams.filter_length = 1024
hparams.hop_length = 256
hparams.win_length = 1024
def load_tacotron2(tacotron_checkpoint_path):
    """Build a Tacotron 2 model from hparams and restore its weights."""
    global hparams
    model = load_model(hparams)
    model.load_state_dict(torch.load(tacotron_checkpoint_path)['state_dict'])
    _ = model.eval()
    return model
def waveform_generation(text, tacotron_model):
    """Synthesize a waveform for ``text`` with Tacotron 2 + Griffin-Lim (CUDA assumed)."""
    # Prepare the text input as a batch of size 1
    sequence = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()

    # Tacotron 2 inference: predict mel spectrograms and attention alignments
    mel_outputs, mel_outputs_postnet, _, alignments = tacotron_model.inference(sequence)

    # Griffin-Lim preprocessing
    n_iter = 60
    taco_stft = TacotronSTFT(
        hparams.filter_length,
        hparams.hop_length,
        hparams.win_length,
        sampling_rate=hparams.sampling_rate
    )

    # Approximately invert the mel spectrogram to a linear spectrogram by
    # projecting through the mel filterbank, then rescale
    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling

    # Griffin-Lim phase reconstruction
    waveform = griffin_lim(
        torch.autograd.Variable(spec_from_mel[:, :, :-1]),
        taco_stft.stft_fn, n_iter)
    return waveform[0].data.cpu().numpy()
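# Direct (non-CLI) usage sketch; the checkpoint path and text are hypothetical:
#   model = load_tacotron2('checkpoint_50000')
#   audio = waveform_generation('Hello world.', model)
#   sf.write('hello.wav', audio, samplerate=hparams.sampling_rate)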
if __name__ == "__main__":
args = docopt(__doc__)
print("Command line args:\n", args)
tacotron_checkpoint = args["--tacotron-checkpoint"]
input_file_path = args["--input-file"]
dst_dir = args["<dst_dir>"]
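    # The --hparams option is accepted by docopt but not applied below. If the
    # hparams object exposes a tf.contrib-style ``parse`` method (an assumption
    # about this fork, not verified here), overrides could be applied with:
    #   if args["--hparams"]:
    #       hparams.parse(args["--hparams"])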
    # Create the output directory
    makedirs(dst_dir, exist_ok=True)

    checkpoint_taco_name = splitext(basename(tacotron_checkpoint))[0].replace('checkpoint_', '')
    tacotron_model = load_tacotron2(tacotron_checkpoint)

    # Read the input sentences, one per line
    try:
        with open(input_file_path) as f:
            content = f.read().splitlines()
    except FileNotFoundError:
        print("File {} not found.".format(input_file_path))
        sys.exit(1)

    # Create an output subdirectory named after the checkpoint
    subdir = 'samples_griffinlim' + '_taco_' + checkpoint_taco_name
    makedirs(join(dst_dir, subdir), exist_ok=True)

    for i, text in enumerate(content):
        print("Generating waveform " + str(i))
        waveform = waveform_generation(text, tacotron_model)

        # Save the synthesized audio
        output_filepath = join(dst_dir, subdir, "{}.wav".format(i))
        sf.write(output_filepath, waveform, samplerate=hparams.sampling_rate)
        print("Waveform {} OK".format(i))

    print("Finished! Check out {} for generated audio samples.".format(dst_dir))
    sys.exit(0)