From a69d0a950e34a462c60a842262687c9bc58d2abd Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 20 Oct 2023 15:57:52 +0800 Subject: [PATCH] Add Go API for TTS (#377) --- .github/workflows/test-go-package.yaml | 122 +++++++++++++++++- .github/workflows/test-go.yaml | 36 ++++++ CMakeLists.txt | 2 +- c-api-examples/offline-tts-c-api.c | 2 +- .../non-streaming-decode-files/go.mod | 6 - .../non-streaming-decode-files/go.sum | 35 ----- go-api-examples/non-streaming-tts/go.mod | 3 + go-api-examples/non-streaming-tts/main.go | 61 +++++++++ .../non-streaming-tts/run-vits-ljs.sh | 14 ++ .../non-streaming-tts/run-vits-vctk.sh | 16 +++ .../non-streaming-tts/run-vits-zh-aishell3.sh | 16 +++ .../go.mod | 6 - .../go.sum | 12 -- go-api-examples/streaming-decode-files/go.mod | 6 - go-api-examples/streaming-decode-files/go.sum | 35 ----- .../go/_internal/non-streaming-tts/.gitignore | 5 + scripts/go/_internal/non-streaming-tts/go.mod | 5 + .../go/_internal/non-streaming-tts/main.go | 1 + .../streaming-decode-files/run-paraformer.sh | 1 + .../streaming-decode-files/run-transducer.sh | 1 + scripts/go/sherpa_onnx.go | 112 ++++++++++++++++ sherpa-onnx/c-api/c-api.cc | 6 +- sherpa-onnx/c-api/c-api.h | 5 +- 23 files changed, 400 insertions(+), 108 deletions(-) delete mode 100644 go-api-examples/non-streaming-decode-files/go.sum create mode 100644 go-api-examples/non-streaming-tts/go.mod create mode 100644 go-api-examples/non-streaming-tts/main.go create mode 100755 go-api-examples/non-streaming-tts/run-vits-ljs.sh create mode 100755 go-api-examples/non-streaming-tts/run-vits-vctk.sh create mode 100755 go-api-examples/non-streaming-tts/run-vits-zh-aishell3.sh delete mode 100644 go-api-examples/real-time-speech-recognition-from-microphone/go.sum delete mode 100644 go-api-examples/streaming-decode-files/go.sum create mode 100644 scripts/go/_internal/non-streaming-tts/.gitignore create mode 100644 scripts/go/_internal/non-streaming-tts/go.mod create mode 120000 
scripts/go/_internal/non-streaming-tts/main.go create mode 120000 scripts/go/_internal/streaming-decode-files/run-paraformer.sh create mode 120000 scripts/go/_internal/streaming-decode-files/run-transducer.sh diff --git a/.github/workflows/test-go-package.yaml b/.github/workflows/test-go-package.yaml index 186e197e4..c36842259 100644 --- a/.github/workflows/test-go-package.yaml +++ b/.github/workflows/test-go-package.yaml @@ -39,7 +39,7 @@ jobs: fetch-depth: 0 - uses: actions/setup-go@v4 with: - go-version: '>=1.20' + go-version: '>=1.12' - name: Display go version shell: bash @@ -66,6 +66,121 @@ jobs: run: | gcc --version + - name: Test non-streaming TTS (Linux/macOS) + if: matrix.os != 'windows-latest' + shell: bash + run: | + mkdir tts-waves + cd go-api-examples/non-streaming-tts + ls -lh + go mod tidy + cat go.mod + go build + ls -lh + + git lfs install + + echo "Test vits-ljs" + git clone https://huggingface.co/csukuangfj/vits-ljs + ./run-vits-ljs.sh + rm -rf vits-ljs + + echo "Test vits-vctk" + git clone https://huggingface.co/csukuangfj/vits-vctk + ./run-vits-vctk.sh + rm -rf vits-vctk + + echo "Test vits-zh-aishell3" + git clone https://huggingface.co/csukuangfj/vits-zh-aishell3 + ./run-vits-zh-aishell3.sh + rm -rf vits-zh-aishell3 + + ls -lh *.wav + cp *.wav ../../tts-waves/ + + - name: Test non-streaming TTS (Win64) + if: matrix.os == 'windows-latest' && matrix.arch == 'x64' + shell: bash + run: | + mkdir tts-waves + cd go-api-examples/non-streaming-tts + ls -lh + go mod tidy + cat go.mod + go build + ls -lh + + echo $PWD + ls -lh /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/ + ls -lh /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/* + cp -v /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/sherpa-onnx-go-windows*/lib/x86_64-pc-windows-gnu/*.dll . 
+ ls -lh + + git lfs install + + echo "Test vits-ljs" + git clone https://huggingface.co/csukuangfj/vits-ljs + ./run-vits-ljs.sh + rm -rf vits-ljs + + echo "Test vits-vctk" + git clone https://huggingface.co/csukuangfj/vits-vctk + ./run-vits-vctk.sh + rm -rf vits-vctk + + echo "Test vits-zh-aishell3" + git clone https://huggingface.co/csukuangfj/vits-zh-aishell3 + ./run-vits-zh-aishell3.sh + rm -rf vits-zh-aishell3 + + ls -lh *.wav + cp *.wav ../../tts-waves/ + + - name: Test non-streaming TTS (Win32) + if: matrix.os == 'windows-latest' && matrix.arch == 'x86' + shell: bash + run: | + cd go-api-examples/non-streaming-tts + ls -lh + go mod tidy + cat go.mod + ls -lh + + go env GOARCH + go env + echo "------------------------------" + go env -w GOARCH=386 + go env -w CGO_ENABLED=1 + go env + + go clean + go build + + echo $PWD + ls -lh /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/ + cp -v /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/sherpa-onnx-go-windows*/lib/i686-pc-windows-gnu/*.dll . 
+ ls -lh + + git lfs install + + echo "Test vits-ljs" + git clone https://huggingface.co/csukuangfj/vits-ljs + ./run-vits-ljs.sh + rm -rf vits-ljs + + echo "Test vits-vctk" + git clone https://huggingface.co/csukuangfj/vits-vctk + ./run-vits-vctk.sh + rm -rf vits-vctk + + echo "Test vits-zh-aishell3" + git clone https://huggingface.co/csukuangfj/vits-zh-aishell3 + ./run-vits-zh-aishell3.sh + rm -rf vits-zh-aishell3 + + ls -lh *.wav + cp *.wav ../../tts-waves/ + - name: Test non-streaming decoding files (Linux/macOS) if: matrix.os != 'windows-latest' shell: bash @@ -298,3 +413,8 @@ jobs: git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-paraformer-bilingual-zh-en ./run-paraformer.sh rm -rf sherpa-onnx-streaming-paraformer-bilingual-zh-en + + - uses: actions/upload-artifact@v3 + with: + name: tts-waves + path: tts-waves diff --git a/.github/workflows/test-go.yaml b/.github/workflows/test-go.yaml index a37465ff9..9b2888e0b 100644 --- a/.github/workflows/test-go.yaml +++ b/.github/workflows/test-go.yaml @@ -60,6 +60,42 @@ jobs: go mod tidy go build + - name: Test non-streaming TTS (macOS) + shell: bash + run: | + mkdir tts-waves + + cd scripts/go/_internal/non-streaming-tts/ + ls -lh + go mod tidy + cat go.mod + go build + ls -lh + + git lfs install + + echo "Test vits-ljs" + git clone https://huggingface.co/csukuangfj/vits-ljs + ./run-vits-ljs.sh + rm -rf vits-ljs + + echo "Test vits-vctk" + git clone https://huggingface.co/csukuangfj/vits-vctk + ./run-vits-vctk.sh + rm -rf vits-vctk + + echo "Test vits-zh-aishell3" + git clone https://huggingface.co/csukuangfj/vits-zh-aishell3 + ./run-vits-zh-aishell3.sh + rm -rf vits-zh-aishell3 + + cp *.wav ../../../../tts-waves/ + + - uses: actions/upload-artifact@v3 + with: + name: tts-waves + path: tts-waves + - name: Test non-streaming decoding files (macOS) shell: bash run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index bf4318aab..0e15bd1a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ 
cmake_minimum_required(VERSION 3.13 FATAL_ERROR) project(sherpa-onnx) -set(SHERPA_ONNX_VERSION "1.8.3") +set(SHERPA_ONNX_VERSION "1.8.4") # Disable warning about # diff --git a/c-api-examples/offline-tts-c-api.c b/c-api-examples/offline-tts-c-api.c index 54ba4d099..2b11eb096 100644 --- a/c-api-examples/offline-tts-c-api.c +++ b/c-api-examples/offline-tts-c-api.c @@ -188,7 +188,7 @@ int32_t main(int32_t argc, char *argv[]) { const SherpaOnnxGeneratedAudio *audio = SherpaOnnxOfflineTtsGenerate(tts, text, sid); - SherpaOnnxDestroyOfflineWriteWave(audio, filename); + SherpaOnnxWriteWave(audio->samples, audio->n, audio->sample_rate, filename); SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio); SherpaOnnxDestroyOfflineTts(tts); diff --git a/go-api-examples/non-streaming-decode-files/go.mod b/go-api-examples/non-streaming-decode-files/go.mod index 4f81374ff..c52a51d40 100644 --- a/go-api-examples/non-streaming-decode-files/go.mod +++ b/go-api-examples/non-streaming-decode-files/go.mod @@ -1,9 +1,3 @@ module non-streaming-decode-files go 1.12 - -require ( - github.com/k2-fsa/sherpa-onnx-go v1.7.12-alpha - github.com/spf13/pflag v1.0.5 - github.com/youpy/go-wav v0.3.2 -) diff --git a/go-api-examples/non-streaming-decode-files/go.sum b/go-api-examples/non-streaming-decode-files/go.sum deleted file mode 100644 index 46db02fa2..000000000 --- a/go-api-examples/non-streaming-decode-files/go.sum +++ /dev/null @@ -1,35 +0,0 @@ -github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/google/go-cmp v0.5.6 h1:BKbKCqvP6I+rmFHt06ZmyQtvB8xAkWdhFyr0ZUNZcxQ= -github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/k2-fsa/sherpa-onnx-go v1.7.12-alpha h1:pm9VCFe51c59LilgDmGwKGfGB/TalLJX26LSvjrELTk= -github.com/k2-fsa/sherpa-onnx-go v1.7.12-alpha/go.mod h1:JLAytuKK2r1sPf8BcyaUTFfvmGGTLpbfG9g9x/Rq7GA= 
-github.com/k2-fsa/sherpa-onnx-go-linux v1.7.12 h1:9g6Af3kBtcbDrTH7EqlWB9cSvBsc/xY00r7MeA/qVzo= -github.com/k2-fsa/sherpa-onnx-go-linux v1.7.12/go.mod h1:lHZRU/WtBUJetJVPyXHg092diEWYyIEoaob+LMJKWvo= -github.com/k2-fsa/sherpa-onnx-go-macos v1.7.12-alpha h1:G8B6PaPHTFlbe6YtUFc7/H4rJfzmOJRvEzPJMj4h/w8= -github.com/k2-fsa/sherpa-onnx-go-macos v1.7.12-alpha/go.mod h1:o1Cd6Zy+Tpq3bLAWqBoVcDenxi8HSaSubURtbtIqH2s= -github.com/k2-fsa/sherpa-onnx-go-windows v1.7.12 h1:WudeR8tlCsS5uj0d99jJ+jaKjvyND+aCuajFDE9qEY4= -github.com/k2-fsa/sherpa-onnx-go-windows v1.7.12/go.mod h1:R7JSrFkZGkfM/F/gVSR+yTJ+sPaHhJgdqsB5N7dTU6E= -github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= -github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= -github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= -github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.6.1 h1:hDPOHmpOpP40lSULcqw7IrRb/u7w6RpDC9399XyoNd0= -github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/youpy/go-riff v0.1.0 h1:vZO/37nI4tIET8tQI0Qn0Y79qQh99aEpponTPiPut7k= -github.com/youpy/go-riff v0.1.0/go.mod h1:83nxdDV4Z9RzrTut9losK7ve4hUnxUR8ASSz4BsKXwQ= -github.com/youpy/go-wav v0.3.2 h1:NLM8L/7yZ0Bntadw/0h95OyUsen+DQIVf9gay+SUsMU= -github.com/youpy/go-wav v0.3.2/go.mod h1:0FCieAXAeSdcxFfwLpRuEo0PFmAoc+8NU34h7TUvk50= -github.com/zaf/g711 v0.0.0-20190814101024-76a4a538f52b h1:QqixIpc5WFIqTLxB3Hq8qs0qImAgBdq0p6rq2Qdl634= -github.com/zaf/g711 v0.0.0-20190814101024-76a4a538f52b/go.mod h1:T2h1zV50R/q0CVYnsQOQ6L7P4a2ZxH47ixWcMXFGyx8= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 
h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gotest.tools v2.2.0+incompatible h1:VsBPFP1AI068pPrMxtb/S8Zkgf9xEmTLJjfM+P5UIEo= -gotest.tools v2.2.0+incompatible/go.mod h1:DsYFclhRJ6vuDpmuTbkuFWG+y2sxOXAzmJt81HFBacw= diff --git a/go-api-examples/non-streaming-tts/go.mod b/go-api-examples/non-streaming-tts/go.mod new file mode 100644 index 000000000..54127dc87 --- /dev/null +++ b/go-api-examples/non-streaming-tts/go.mod @@ -0,0 +1,3 @@ +module non-streaming-tts + +go 1.12 diff --git a/go-api-examples/non-streaming-tts/main.go b/go-api-examples/non-streaming-tts/main.go new file mode 100644 index 000000000..a263b42b2 --- /dev/null +++ b/go-api-examples/non-streaming-tts/main.go @@ -0,0 +1,61 @@ +package main + +import ( + sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx" + flag "github.com/spf13/pflag" + "log" +) + +func main() { + log.SetFlags(log.LstdFlags | log.Lmicroseconds) + + config := sherpa.OfflineTtsConfig{} + sid := 0 + filename := "./generated.wav" + + flag.StringVar(&config.Model.Vits.Model, "vits-model", "", "Path to the vits ONNX model") + flag.StringVar(&config.Model.Vits.Lexicon, "vits-lexicon", "", "Path to lexicon.txt") + flag.StringVar(&config.Model.Vits.Tokens, "vits-tokens", "", "Path to tokens.txt") + + flag.Float32Var(&config.Model.Vits.NoiseScale, "vits-noise-scale", 0.667, "noise_scale for VITS") + flag.Float32Var(&config.Model.Vits.NoiseScaleW, "vits-noise-scale-w", 0.8, "noise_scale_w for VITS") + 
flag.Float32Var(&config.Model.Vits.LengthScale, "vits-length-scale", 1.0, "length_scale for VITS. small -> faster in speech speed; large -> slower") + + flag.IntVar(&config.Model.NumThreads, "num-threads", 1, "Number of threads for computing") + flag.IntVar(&config.Model.Debug, "debug", 0, "Whether to show debug message") + flag.StringVar(&config.Model.Provider, "provider", "cpu", "Provider to use") + + flag.IntVar(&sid, "sid", 0, "Speaker ID. Used only for multi-speaker models") + flag.StringVar(&filename, "output-filename", "./generated.wav", "Filename to save the generated audio") + + flag.Parse() + + if len(flag.Args()) != 1 { + log.Fatalf("Please provide the text to generate audios") + } + + text := flag.Arg(0) + + log.Println("Input text:", text) + log.Println("Speaker ID:", sid) + log.Println("Output filename:", filename) + + log.Println("Initializing model (may take several seconds)") + + tts := sherpa.NewOfflineTts(&config) + defer sherpa.DeleteOfflineTts(tts) + + log.Println("Model created!") + + log.Println("Start generating!") + + audio := tts.Generate(text, sid) + + log.Println("Done!") + + ok := audio.Save(filename) + if ok != 1 { + log.Fatalf("Failed to write %s", filename) + } + +} diff --git a/go-api-examples/non-streaming-tts/run-vits-ljs.sh b/go-api-examples/non-streaming-tts/run-vits-ljs.sh new file mode 100755 index 000000000..c0f2a7b12 --- /dev/null +++ b/go-api-examples/non-streaming-tts/run-vits-ljs.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +# please refer to +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#ljspeech-english-single-speaker +# to download the model before you run this script + +./non-streaming-tts \ + --vits-model=./vits-ljs/vits-ljs.onnx \ + --vits-lexicon=./vits-ljs/lexicon.txt \ + --vits-tokens=./vits-ljs/tokens.txt \ + --sid=0 \ + --debug=1 \ + --output-filename=./vits-ljs.wav \ + "Liliana, the most beautiful and lovely assistant of our team!" 
diff --git a/go-api-examples/non-streaming-tts/run-vits-vctk.sh b/go-api-examples/non-streaming-tts/run-vits-vctk.sh new file mode 100755 index 000000000..ceaf60433 --- /dev/null +++ b/go-api-examples/non-streaming-tts/run-vits-vctk.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +# please refer to +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vctk-english-multi-speaker-109-speakers +# to download the model before you run this script + +for sid in 0 10 108; do +./non-streaming-tts \ + --vits-model=./vits-vctk/vits-vctk.onnx \ + --vits-lexicon=./vits-vctk/lexicon.txt \ + --vits-tokens=./vits-vctk/tokens.txt \ + --sid=$sid \ + --debug=1 \ + --output-filename=./kennedy-$sid.wav \ + 'Ask not what your country can do for you; ask what you can do for your country.' +done diff --git a/go-api-examples/non-streaming-tts/run-vits-zh-aishell3.sh b/go-api-examples/non-streaming-tts/run-vits-zh-aishell3.sh new file mode 100755 index 000000000..2f0d5deab --- /dev/null +++ b/go-api-examples/non-streaming-tts/run-vits-zh-aishell3.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +# please refer to +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#aishell3-chinese-multi-speaker-174-speakers +# to download the model before you run this script + +for sid in 10 33 99; do +./non-streaming-tts \ + --vits-model=./vits-zh-aishell3/vits-aishell3.onnx \ + --vits-lexicon=./vits-zh-aishell3/lexicon.txt \ + --vits-tokens=./vits-zh-aishell3/tokens.txt \ + --sid=$sid \ + --debug=1 \ + --output-filename=./liliana-$sid.wav \ + "林美丽最美丽、最漂亮、最可爱!" 
+done diff --git a/go-api-examples/real-time-speech-recognition-from-microphone/go.mod b/go-api-examples/real-time-speech-recognition-from-microphone/go.mod index 13d8f41db..5d6a5b784 100644 --- a/go-api-examples/real-time-speech-recognition-from-microphone/go.mod +++ b/go-api-examples/real-time-speech-recognition-from-microphone/go.mod @@ -1,9 +1,3 @@ module real-time-speech-recognition-from-microphone go 1.12 - -require ( - github.com/gordonklaus/portaudio v0.0.0-20230709114228-aafa478834f5 - github.com/k2-fsa/sherpa-onnx-go v1.7.12-alpha - github.com/spf13/pflag v1.0.5 -) diff --git a/go-api-examples/real-time-speech-recognition-from-microphone/go.sum b/go-api-examples/real-time-speech-recognition-from-microphone/go.sum deleted file mode 100644 index ac7a23bb4..000000000 --- a/go-api-examples/real-time-speech-recognition-from-microphone/go.sum +++ /dev/null @@ -1,12 +0,0 @@ -github.com/gordonklaus/portaudio v0.0.0-20230709114228-aafa478834f5 h1:5AlozfqaVjGYGhms2OsdUyfdJME76E6rx5MdGpjzZpc= -github.com/gordonklaus/portaudio v0.0.0-20230709114228-aafa478834f5/go.mod h1:WY8R6YKlI2ZI3UyzFk7P6yGSuS+hFwNtEzrexRyD7Es= -github.com/k2-fsa/sherpa-onnx-go v1.7.12-alpha h1:pm9VCFe51c59LilgDmGwKGfGB/TalLJX26LSvjrELTk= -github.com/k2-fsa/sherpa-onnx-go v1.7.12-alpha/go.mod h1:JLAytuKK2r1sPf8BcyaUTFfvmGGTLpbfG9g9x/Rq7GA= -github.com/k2-fsa/sherpa-onnx-go-linux v1.7.12 h1:9g6Af3kBtcbDrTH7EqlWB9cSvBsc/xY00r7MeA/qVzo= -github.com/k2-fsa/sherpa-onnx-go-linux v1.7.12/go.mod h1:lHZRU/WtBUJetJVPyXHg092diEWYyIEoaob+LMJKWvo= -github.com/k2-fsa/sherpa-onnx-go-macos v1.7.12-alpha h1:G8B6PaPHTFlbe6YtUFc7/H4rJfzmOJRvEzPJMj4h/w8= -github.com/k2-fsa/sherpa-onnx-go-macos v1.7.12-alpha/go.mod h1:o1Cd6Zy+Tpq3bLAWqBoVcDenxi8HSaSubURtbtIqH2s= -github.com/k2-fsa/sherpa-onnx-go-windows v1.7.12 h1:WudeR8tlCsS5uj0d99jJ+jaKjvyND+aCuajFDE9qEY4= -github.com/k2-fsa/sherpa-onnx-go-windows v1.7.12/go.mod h1:R7JSrFkZGkfM/F/gVSR+yTJ+sPaHhJgdqsB5N7dTU6E= -github.com/spf13/pflag v1.0.5 
h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= -github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= diff --git a/go-api-examples/streaming-decode-files/go.mod b/go-api-examples/streaming-decode-files/go.mod index d8e4837b3..8f30fdafa 100644 --- a/go-api-examples/streaming-decode-files/go.mod +++ b/go-api-examples/streaming-decode-files/go.mod @@ -1,9 +1,3 @@ module streaming-decode-files go 1.12 - -require ( - github.com/k2-fsa/sherpa-onnx-go v1.7.12-alpha - github.com/spf13/pflag v1.0.5 - github.com/youpy/go-wav v0.3.2 -) diff --git a/go-api-examples/streaming-decode-files/go.sum b/go-api-examples/streaming-decode-files/go.sum deleted file mode 100644 index 46db02fa2..000000000 --- a/go-api-examples/streaming-decode-files/go.sum +++ /dev/null @@ -1,35 +0,0 @@ -github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/google/go-cmp v0.5.6 h1:BKbKCqvP6I+rmFHt06ZmyQtvB8xAkWdhFyr0ZUNZcxQ= -github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/k2-fsa/sherpa-onnx-go v1.7.12-alpha h1:pm9VCFe51c59LilgDmGwKGfGB/TalLJX26LSvjrELTk= -github.com/k2-fsa/sherpa-onnx-go v1.7.12-alpha/go.mod h1:JLAytuKK2r1sPf8BcyaUTFfvmGGTLpbfG9g9x/Rq7GA= -github.com/k2-fsa/sherpa-onnx-go-linux v1.7.12 h1:9g6Af3kBtcbDrTH7EqlWB9cSvBsc/xY00r7MeA/qVzo= -github.com/k2-fsa/sherpa-onnx-go-linux v1.7.12/go.mod h1:lHZRU/WtBUJetJVPyXHg092diEWYyIEoaob+LMJKWvo= -github.com/k2-fsa/sherpa-onnx-go-macos v1.7.12-alpha h1:G8B6PaPHTFlbe6YtUFc7/H4rJfzmOJRvEzPJMj4h/w8= -github.com/k2-fsa/sherpa-onnx-go-macos v1.7.12-alpha/go.mod h1:o1Cd6Zy+Tpq3bLAWqBoVcDenxi8HSaSubURtbtIqH2s= -github.com/k2-fsa/sherpa-onnx-go-windows v1.7.12 h1:WudeR8tlCsS5uj0d99jJ+jaKjvyND+aCuajFDE9qEY4= -github.com/k2-fsa/sherpa-onnx-go-windows v1.7.12/go.mod h1:R7JSrFkZGkfM/F/gVSR+yTJ+sPaHhJgdqsB5N7dTU6E= -github.com/pkg/errors v0.9.1 
h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= -github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= -github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= -github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.6.1 h1:hDPOHmpOpP40lSULcqw7IrRb/u7w6RpDC9399XyoNd0= -github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/youpy/go-riff v0.1.0 h1:vZO/37nI4tIET8tQI0Qn0Y79qQh99aEpponTPiPut7k= -github.com/youpy/go-riff v0.1.0/go.mod h1:83nxdDV4Z9RzrTut9losK7ve4hUnxUR8ASSz4BsKXwQ= -github.com/youpy/go-wav v0.3.2 h1:NLM8L/7yZ0Bntadw/0h95OyUsen+DQIVf9gay+SUsMU= -github.com/youpy/go-wav v0.3.2/go.mod h1:0FCieAXAeSdcxFfwLpRuEo0PFmAoc+8NU34h7TUvk50= -github.com/zaf/g711 v0.0.0-20190814101024-76a4a538f52b h1:QqixIpc5WFIqTLxB3Hq8qs0qImAgBdq0p6rq2Qdl634= -github.com/zaf/g711 v0.0.0-20190814101024-76a4a538f52b/go.mod h1:T2h1zV50R/q0CVYnsQOQ6L7P4a2ZxH47ixWcMXFGyx8= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gotest.tools v2.2.0+incompatible 
h1:VsBPFP1AI068pPrMxtb/S8Zkgf9xEmTLJjfM+P5UIEo= -gotest.tools v2.2.0+incompatible/go.mod h1:DsYFclhRJ6vuDpmuTbkuFWG+y2sxOXAzmJt81HFBacw= diff --git a/scripts/go/_internal/non-streaming-tts/.gitignore b/scripts/go/_internal/non-streaming-tts/.gitignore new file mode 100644 index 000000000..78e6b124c --- /dev/null +++ b/scripts/go/_internal/non-streaming-tts/.gitignore @@ -0,0 +1,5 @@ +*.wav +vits-ljs +vits-vctk +vits-zh-aishell3 +non-streaming-tts diff --git a/scripts/go/_internal/non-streaming-tts/go.mod b/scripts/go/_internal/non-streaming-tts/go.mod new file mode 100644 index 000000000..bbb8b070a --- /dev/null +++ b/scripts/go/_internal/non-streaming-tts/go.mod @@ -0,0 +1,5 @@ +module non-streaming-tts + +go 1.12 + +replace github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx => ../ diff --git a/scripts/go/_internal/non-streaming-tts/main.go b/scripts/go/_internal/non-streaming-tts/main.go new file mode 120000 index 000000000..e25d864b3 --- /dev/null +++ b/scripts/go/_internal/non-streaming-tts/main.go @@ -0,0 +1 @@ +../../../../go-api-examples/non-streaming-tts/main.go \ No newline at end of file diff --git a/scripts/go/_internal/streaming-decode-files/run-paraformer.sh b/scripts/go/_internal/streaming-decode-files/run-paraformer.sh new file mode 120000 index 000000000..38a8e8874 --- /dev/null +++ b/scripts/go/_internal/streaming-decode-files/run-paraformer.sh @@ -0,0 +1 @@ +../../../../go-api-examples/streaming-decode-files/run-paraformer.sh \ No newline at end of file diff --git a/scripts/go/_internal/streaming-decode-files/run-transducer.sh b/scripts/go/_internal/streaming-decode-files/run-transducer.sh new file mode 120000 index 000000000..8f523d0e7 --- /dev/null +++ b/scripts/go/_internal/streaming-decode-files/run-transducer.sh @@ -0,0 +1 @@ +../../../../go-api-examples/streaming-decode-files/run-transducer.sh \ No newline at end of file diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go index 96e0db7e8..8503fc6b2 100644 --- 
a/scripts/go/sherpa_onnx.go +++ b/scripts/go/sherpa_onnx.go @@ -28,6 +28,11 @@ Usage examples: Please see https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/streaming-decode-files + 4. Convert text to speech using a non-streaming model + + Please see + https://github.com/k2-fsa/sherpa-onnx/tree/master/go-api-examples/non-streaming-tts + [sherpa-onnx]: https://github.com/k2-fsa/sherpa-onnx [onnxruntime]: https://github.com/microsoft/onnxruntime [Next-gen Kaldi]: https://github.com/k2-fsa/ @@ -488,3 +493,110 @@ func (s *OfflineStream) GetResult() *OfflineRecognizerResult { return result } + +// Configuration for offline/non-streaming text-to-speech (TTS). +// +// Please refer to +// https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html +// to download pre-trained models +type OfflineTtsVitsModelConfig struct { + Model string // Path to the VITS onnx model + Lexicon string // Path to lexicon.txt + Tokens string // Path to tokens.txt + NoiseScale float32 // noise scale for vits models. Please use 0.667 in general + NoiseScaleW float32 // noise scale for vits models. Please use 0.8 in general + LengthScale float32 // Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed +} + +type OfflineTtsModelConfig struct { + Vits OfflineTtsVitsModelConfig + + // Number of threads to use for neural network computation + NumThreads int + + // 1 to print model meta information while loading + Debug int + + // Optional. Valid values: cpu, cuda, coreml + Provider string +} + +type OfflineTtsConfig struct { + Model OfflineTtsModelConfig +} + +type GeneratedAudio struct { + // Normalized samples in the range [-1, 1] + Samples []float32 + + SampleRate int +} + +// The offline tts class. It wraps a pointer from C. +type OfflineTts struct { + impl *C.struct_SherpaOnnxOfflineTts +} + +// Free the internal pointer inside the tts to avoid memory leak. 
+func DeleteOfflineTts(tts *OfflineTts) { + C.SherpaOnnxDestroyOfflineTts(tts.impl) + tts.impl = nil +} + +// The user is responsible to invoke [DeleteOfflineTts]() to free +// the returned tts to avoid memory leak +func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts { + c := C.struct_SherpaOnnxOfflineTtsConfig{} + c.model.vits.model = C.CString(config.Model.Vits.Model) + defer C.free(unsafe.Pointer(c.model.vits.model)) + + c.model.vits.lexicon = C.CString(config.Model.Vits.Lexicon) + defer C.free(unsafe.Pointer(c.model.vits.lexicon)) + + c.model.vits.tokens = C.CString(config.Model.Vits.Tokens) + defer C.free(unsafe.Pointer(c.model.vits.tokens)) + + c.model.vits.noise_scale = C.float(config.Model.Vits.NoiseScale) + c.model.vits.noise_scale_w = C.float(config.Model.Vits.NoiseScaleW) + c.model.vits.length_scale = C.float(config.Model.Vits.LengthScale) + + c.model.num_threads = C.int(config.Model.NumThreads) + c.model.debug = C.int(config.Model.Debug) + + c.model.provider = C.CString(config.Model.Provider) + defer C.free(unsafe.Pointer(c.model.provider)) + + tts := &OfflineTts{} + tts.impl = C.SherpaOnnxCreateOfflineTts(&c) + + return tts +} + +func (tts *OfflineTts) Generate(text string, sid int) *GeneratedAudio { + s := C.CString(text) + defer C.free(unsafe.Pointer(s)) + + audio := C.SherpaOnnxOfflineTtsGenerate(tts.impl, s, C.int(sid)) + defer C.SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio) + + ans := &GeneratedAudio{} + ans.SampleRate = int(audio.sample_rate) + n := int(audio.n) + ans.Samples = make([]float32, n) + samples := (*[1 << 28]C.float)(unsafe.Pointer(audio.samples))[:n:n] + // copy(ans.Samples, samples) + for i := 0; i < n; i++ { + ans.Samples[i] = float32(samples[i]) + } + + return ans +} + +func (audio *GeneratedAudio) Save(filename string) int { + s := C.CString(filename) + defer C.free(unsafe.Pointer(s)) + + ok := int(C.SherpaOnnxWriteWave((*C.float)(&audio.Samples[0]), C.int(len(audio.Samples)), C.int(audio.SampleRate), s)) + + return ok +} 
diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index c07592e7d..9be6ff807 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -595,7 +595,7 @@ SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTtsGeneratedAudio( } } -int32_t SherpaOnnxDestroyOfflineWriteWave(const SherpaOnnxGeneratedAudio *p, - const char *filename) { - return sherpa_onnx::WriteWave(filename, p->sample_rate, p->samples, p->n); +int32_t SherpaOnnxWriteWave(const float *samples, int32_t n, + int32_t sample_rate, const char *filename) { + return sherpa_onnx::WriteWave(filename, sample_rate, samples, n); } diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index 2898df30c..aab342390 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -648,8 +648,9 @@ SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTtsGeneratedAudio( // The saved wave file contains a single channel and has 16-bit samples. // // Return 1 if the write succeeded; return 0 on failure. -SHERPA_ONNX_API int32_t SherpaOnnxDestroyOfflineWriteWave( - const SherpaOnnxGeneratedAudio *p, const char *filename); +SHERPA_ONNX_API int32_t SherpaOnnxWriteWave(const float *samples, int32_t n, + int32_t sample_rate, + const char *filename); #if defined(__GNUC__) #pragma GCC diagnostic pop