# app.py

import torch
import torchaudio
import gradio as gr
import matplotlib.pyplot as plt

import numpy as np
from inference_bigvgan import load_tacotron2, load_bigvgan, bigvgan_prediction
from text import text_to_sequence
import soundfile as sf

# Intended inference device for this demo.
# NOTE(review): nothing in the visible code reads `device` -- confirm whether
# the load_* helpers or the inference path are supposed to honour it.
device="cpu"

# Checkpoint paths handed to the loaders below (relative paths, so they are
# resolved against the process working directory).
tacotron_checkpoint = "checkpoints_projeto_museu/checkpoint_471000"
bigvgan_checkpoint  = "bigvgan_base_22khz_80band/g_05000000.zip"

# Workaround to load model mapped on GPU
# https://stackoverflow.com/a/61840832
# Both models are loaded once at import time and reused by every request.
# NOTE(review): the device remapping itself presumably happens inside
# load_tacotron2 / load_bigvgan -- verify there.
tacotron_model = load_tacotron2(tacotron_checkpoint)
bigvgan_model  = load_bigvgan(bigvgan_checkpoint)

def waveform_generation(text, tacotron_model, bigvgan_model):
    """Synthesize audio and a mel spectrogram for ``text``.

    Args:
        text: Input text. It is converted to a sequence of symbol IDs with
            ``text_to_sequence`` using the ``basic_cleaners`` pipeline.
        tacotron_model: Acoustic model exposing ``inference(sequence)`` that
            returns ``(mel_outputs, mel_outputs_postnet, _, alignments)``.
        bigvgan_model: Vocoder handed to ``bigvgan_prediction`` together with
            the post-net mel spectrogram.

    Returns:
        A tuple ``(waveform, mel)``: ``waveform`` is whatever
        ``bigvgan_prediction`` returns, and ``mel`` is the post-net mel
        spectrogram copied to the CPU as a NumPy array.
    """
    # Place the input on the device the acoustic model actually lives on.
    # The previous hard-coded ``.cuda()`` fails on CPU-only hosts.
    try:
        target_device = next(tacotron_model.parameters()).device
    except (AttributeError, StopIteration):
        # Not an nn.Module, or a module without parameters: stay on the CPU.
        target_device = torch.device("cpu")

    # Prepare text input: add a leading batch axis of size 1.
    token_ids = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
    # torch.autograd.Variable is a deprecated no-op wrapper, so the tensor is
    # used directly.
    sequence = torch.from_numpy(token_ids).long().to(target_device)

    # Nothing is trained here, so run both models without autograd tracking.
    with torch.no_grad():
        # Tacotron inference; only the post-net mel is used downstream.
        _, mel_outputs_postnet, _, _ = tacotron_model.inference(sequence)
        # BigVGAN inference
        waveform = bigvgan_prediction(bigvgan_model, mel_outputs_postnet)

    return waveform, mel_outputs_postnet.detach().to("cpu").numpy()


def inference(text):
    """Gradio callback: synthesize ``text`` and render its mel spectrogram.

    Args:
        text: The text to synthesize.

    Returns:
        A tuple ``(audio_path, image_path)`` with the paths of the written
        WAV file (``"output_waveglow.wav"``) and spectrogram image
        (``"test.png"``). Both files are overwritten on every call.
    """
    waveform, spec = waveform_generation(text, tacotron_model, bigvgan_model)

    # Draw on a dedicated figure and always close it. pyplot's implicit
    # current figure persists between calls, so images from earlier requests
    # would otherwise stay on the shared axes and memory would keep growing.
    fig, ax = plt.subplots()
    try:
        ax.imshow(spec.squeeze())
        ax.axis('off')
        fig.savefig("test.png", bbox_inches='tight')
    finally:
        plt.close(fig)

    # 22050 Hz is the sample rate this demo writes the audio at.
    sf.write("output_waveglow.wav", waveform, samplerate=22050, format='wav')

    return "output_waveglow.wav", "test.png"


# Texts shown on the demo page: heading, intro paragraph and footer links.
title = "TACOTRON 2"
description = (
    "Gradio demo for TACOTRON 2: The Tacotron 2 model for generating mel "
    "spectrograms from text. To use it, simply add your text or click on one "
    "of the examples to load them. Read more at the links below."
)
article = (
    "<p style='text-align: center'>"
    "<a href='https://arxiv.org/abs/1712.05884' target='_blank'>"
    "Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions"
    "</a> | "
    "<a href='https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2' target='_blank'>"
    "Github Repo</a></p>"
)
# One example row; each row holds the value for the single text input.
examples = [
    [
        "Amor é fogo que arde sem se ver, é ferida que dói e não se sente, "
        "é um contentamento descontente, é dor que desatina sem doer."
    ],
]

# Output widgets, in the same order as the tuple returned by `inference`.
audio_output = gr.Audio(label="Audio", type="filepath")
image_output = gr.Image(label="Spectrogram", type="pil")

# Single text box in, audio + spectrogram image out.
iface = gr.Interface(
    fn=inference,
    inputs="text",
    outputs=[audio_output, image_output],
    examples=examples,
    title=title,
    description=description,
    article=article,
)

# Serve locally only; no public share link is requested.
iface.launch(share=False)

# NOTE: The triple-quoted string below is never assigned or executed; it only
# preserves an earlier version of the interface setup for reference. That
# version builds its outputs with `gr.outputs.*` and calls
# `launch(enable_queue=True)` -- presumably an older Gradio API; confirm
# against the installed Gradio version before reviving any of it.
'''
title="TACOTRON 2"
description="Gradio demo for TACOTRON 2: The Tacotron 2 model for generating mel spectrograms from text. To use it, simply add you text or click on one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/1712.05884' target='_blank'>Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions</a> | <a href='https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2' target='_blank'>Github Repo</a></p>"
examples=[["Amor é fogo que arde sem se ver, é ferida que dói e não se sente, é um contentamento descontente, é dor que desatina sem doer."]]
gr.Interface(inference,"text",[gr.outputs.Audio(type="file",label="Audio"),gr.outputs.Image(type="file",label="Spectrogram")],title=title,description=description,article=article,examples=examples).launch(enable_queue=True)

gr.Interface(
    inference,
    "text",
    [gr.outputs.Audio(type="file",label="Audio")],title=title,description=description,article=article,examples=examples).launch(enable_queue=True)
'''