forked from NVIDIA/tacotron2
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathapp.py
77 lines (61 loc) · 3.56 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import torch
import torchaudio
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
from inference_bigvgan import load_tacotron2, load_bigvgan, bigvgan_prediction
from text import text_to_sequence
import soundfile as sf
# Module-level configuration: checkpoints are loaded once at import time so
# that every request handled by the UI reuses the same model objects.

# NOTE(review): `device` is not referenced anywhere else in the visible code;
# confirm whether the loaders below are expected to honour it.
device="cpu"
# Checkpoint paths handed to the project loaders below.
tacotron_checkpoint = "checkpoints_projeto_museu/checkpoint_471000"
bigvgan_checkpoint = "bigvgan_base_22khz_80band/g_05000000.zip"
# Workaround to load model mapped on GPU
# https://stackoverflow.com/a/61840832
# NOTE(review): no device-remapping code is visible at this point; presumably
# the workaround lives inside the loaders imported from `inference_bigvgan`.
tacotron_model = load_tacotron2(tacotron_checkpoint)
bigvgan_model = load_bigvgan(bigvgan_checkpoint)
def waveform_generation(text, tacotron_model, bigvgan_model):
    """Synthesize a waveform and a mel spectrogram for ``text``.

    The text is turned into a symbol-ID sequence with ``text_to_sequence``
    (using ``basic_cleaners``), fed to ``tacotron_model.inference`` and the
    post-net mel output is vocoded with ``bigvgan_prediction``.

    Args:
        text: Sentence to synthesize.
        tacotron_model: Object exposing ``inference(sequence)`` that returns
            a 4-tuple whose second element is the post-net mel spectrogram.
        bigvgan_model: Vocoder passed through to ``bigvgan_prediction``.

    Returns:
        A ``(waveform, mel)`` tuple: the value returned by
        ``bigvgan_prediction`` and the post-net mel spectrogram converted to
        a NumPy array on the CPU.
    """
    # Prepare text input as a batch containing a single sequence.
    sequence = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]

    # Place the input on the device the acoustic model actually lives on
    # instead of unconditionally calling ``.cuda()``, which fails on
    # CPU-only hosts. If the model exposes no parameters, fall back to CUDA
    # when available (the previous behaviour) and to CPU otherwise.
    try:
        model_device = next(tacotron_model.parameters()).device
    except (AttributeError, StopIteration):
        model_device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
    sequence = torch.from_numpy(sequence).long().to(model_device)

    # Pure inference: no autograd graph is needed for either model.
    with torch.no_grad():
        # Tacotron inference; only the post-net mel spectrogram is used.
        _, mel_outputs_postnet, _, _ = tacotron_model.inference(sequence)
        # BigVGAN inference
        waveform = bigvgan_prediction(bigvgan_model, mel_outputs_postnet)

    return waveform, mel_outputs_postnet.detach().cpu().numpy()
def inference(text):
    """Run text-to-speech for ``text`` and write the results to disk.

    Calls ``waveform_generation`` with the module-level models, renders the
    returned spectrogram to ``test.png`` and writes the waveform to
    ``output_waveglow.wav`` at a 22050 Hz sample rate.

    Args:
        text: Sentence to synthesize.

    Returns:
        A ``(audio_path, image_path)`` tuple of the two files written.
    """
    waveform, spec = waveform_generation(text, tacotron_model, bigvgan_model)
    spec = spec.squeeze()

    # Draw on a dedicated figure and always close it. Using the implicit
    # global pyplot figure would stack a new image on the same axes for
    # every request and never release it.
    fig, ax = plt.subplots()
    try:
        ax.imshow(spec)
        ax.axis('off')
        fig.savefig("test.png", bbox_inches='tight')
    finally:
        plt.close(fig)

    sf.write("output_waveglow.wav", waveform, samplerate=22050, format='wav')
    return "output_waveglow.wav", "test.png"
# ---------------------------------------------------------------------------
# Gradio user interface
# ---------------------------------------------------------------------------

# Output widgets: the synthesized audio file and the spectrogram picture.
audio_output = gr.Audio(type="filepath", label="Audio")
image_output = gr.Image(type="pil", label="Spectrogram")

# Texts shown on the demo page.
title = "TACOTRON 2"
description = "Gradio demo for TACOTRON 2: The Tacotron 2 model for generating mel spectrograms from text. To use it, simply add your text or click on one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/1712.05884' target='_blank'>Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions</a> | <a href='https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2' target='_blank'>Github Repo</a></p>"

# One clickable example sentence (a single text input per example row).
examples = [["Amor é fogo que arde sem se ver, é ferida que dói e não se sente, é um contentamento descontente, é dor que desatina sem doer."]]

# Wire a single text box to `inference`, which yields the audio path and
# the spectrogram image path, in that order.
iface = gr.Interface(
    inference,
    "text",
    [audio_output, image_output],
    title=title,
    description=description,
    article=article,
    examples=examples,
)

# Start the local server without creating a public share link.
iface.launch(share=False)
# NOTE(review): the triple-quoted block below is a bare string literal, so
# none of it is executed. It appears to be an earlier version of the UI setup
# (using `gr.outputs.*` components and `launch(enable_queue=True)`) kept for
# reference only.
'''
title="TACOTRON 2"
description="Gradio demo for TACOTRON 2: The Tacotron 2 model for generating mel spectrograms from text. To use it, simply add you text or click on one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/1712.05884' target='_blank'>Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions</a> | <a href='https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2' target='_blank'>Github Repo</a></p>"
examples=[["Amor é fogo que arde sem se ver, é ferida que dói e não se sente, é um contentamento descontente, é dor que desatina sem doer."]]
gr.Interface(inference,"text",[gr.outputs.Audio(type="file",label="Audio"),gr.outputs.Image(type="file",label="Spectrogram")],title=title,description=description,article=article,examples=examples).launch(enable_queue=True)
gr.Interface(
inference,
"text",
[gr.outputs.Audio(type="file",label="Audio")],title=title,description=description,article=article,examples=examples).launch(enable_queue=True)
'''