From 1169a0dfbebaf86f1ee4a5d9cf1b252fc338e835 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Tue, 23 Jan 2024 00:20:24 +0100 Subject: [PATCH] ready for review --- notebooks/408-openvoice/408-openvoice.ipynb | 614 +++++++++++--------- notebooks/408-openvoice/openvoice_gradio.py | 40 +- notebooks/408-openvoice/openvoice_utils.py | 30 +- 3 files changed, 346 insertions(+), 338 deletions(-) diff --git a/notebooks/408-openvoice/408-openvoice.ipynb b/notebooks/408-openvoice/408-openvoice.ipynb index a16fc386d39..563eba5cbd3 100644 --- a/notebooks/408-openvoice/408-openvoice.ipynb +++ b/notebooks/408-openvoice/408-openvoice.ipynb @@ -8,73 +8,73 @@ ] }, { - "cell_type": "code", - "execution_count": 1, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "#!git clone https://github.com/myshell-ai/OpenVoice" + "OpenVoice is a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.\n", + "\n", + "This notebook provides an example of converting the original OpenVoice model (https://github.com/myshell-ai/OpenVoice) to the OpenVINO IR format for faster inference." ] }, { - "cell_type": "code", - "execution_count": 2, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "# pip install openvino\n", - "\n", - "# todo: unfreeze dependencies\n", - "# %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu \"torch>=2.1.0\" \"torchaudio>=2.1.0\"\n", - "# wavmark==0.0.2 also installs torch" + "Clone the repository and install all the dependencies." ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "# %pip install librosa==0.9.1 \\\n", - "# faster-whisper==0.9.0 \\\n", - "# pydub==0.25.1 \\\n", - "# whisper-timestamped==1.14.2 \\\n", - "# tqdm \\\n", - "# inflect==7.0.0 \\\n", - "# unidecode==1.3.7 \\\n", - "# eng_to_ipa==0.0.2 \\\n", - "# wavmark==0.0.2 \\\n", - "# pypinyin==0.50.0 \\\n", - "# cn2an==0.5.22 \\\n", - "# jieba==0.42.1 \\\n", - "# langid==1.1.6\n", - "# gradio==3.48.0 \\" + "!git clone https://github.com/myshell-ai/OpenVoice" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "# %pip install gradio==3.48.0" + "%pip install openvino==2023.2\n", + "%pip install -q --extra-index-url https://download.pytorch.org/whl/cpu \"torch>=2.1.0\" \"torchaudio>=2.1.0\"\n", + "# note: wavmark==0.0.2 (installed below) also installs torch\n", + "\n", + "# todo: try to unfreeze dependencies\n", + "%pip install librosa==0.9.1 \\\n", + "faster-whisper==0.9.0 \\\n", + "pydub==0.25.1 \\\n", + "whisper-timestamped==1.14.2 \\\n", + "tqdm \\\n", + "inflect==7.0.0 \\\n", + "unidecode==1.3.7 \\\n", + "eng_to_ipa==0.0.2 \\\n", + "wavmark==0.0.2 \\\n", + "pypinyin==0.50.0 \\\n", + "cn2an==0.5.22 \\\n", + "jieba==0.42.1 \\\n", + "langid==1.1.6 \\\n", + "gradio==4.15 \\\n", + "ipywebrtc \\\n", + "ipywidgets" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "import openvino as ov\n", "import os\n", "import torch\n", - "from openvoice_utils import get_tts_forward, 
get_converter_forward, OVOpenVoiceTTS, OVOpenVoiceConvert" + "from openvoice_utils import OVOpenVoiceTTS, OVOpenVoiceConvert" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -94,120 +94,53 @@ } ], "source": [ + "# cd to the original repo to save original data paths and imports\n", "%cd OpenVoice" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "download all resources from HF Hub" + ] + }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2024-01-22 18:47:29-- https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/ZH/checkpoint.pth\n", - "Resolving proxy-dmz.intel.com (proxy-dmz.intel.com)... 10.102.248.16\n", - "Connecting to proxy-dmz.intel.com (proxy-dmz.intel.com)|10.102.248.16|:912... connected.\n", - "Proxy request sent, awaiting response... 302 Found\n", - "Location: https://cdn-lfs-us-1.huggingface.co/repos/c4/4f/c44ff1065a97d8c91e31c6989e0b1f15abb8c70de9951f7f5b9adda9a9c3a4f5/de9fb0eb749f3254130fe0172fcbb20e75f88a9b16b54dd0b73cac0dc40da7d9?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27checkpoint.pth%3B+filename%3D%22checkpoint.pth%22%3B&Expires=1706204855&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNjIwNDg1NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2M0LzRmL2M0NGZmMTA2NWE5N2Q4YzkxZTMxYzY5ODllMGIxZjE1YWJiOGM3MGRlOTk1MWY3ZjViOWFkZGE5YTljM2E0ZjUvZGU5ZmIwZWI3NDlmMzI1NDEzMGZlMDE3MmZjYmIyMGU3NWY4OGE5YjE2YjU0ZGQwYjczY2FjMGRjNDBkYTdkOT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=g05Mq2b4B-jTfmZ1o5wZ67TcWOuqSyGp2CUV27L%7EiahZduyiT1R8LAyvTrrNC5i7s3yJ2xaPytGUXHStac4MB6vklQVSbpmmPBO0nZ9Fi%7EGTFHr5n89XWc1WFu6kR9Wn2PrXwadXB47XNAe-nqmEPI8ppaozpl0QSwbKWV6UT4076foFxvKmVd2tUo9zXfiwQG3JsE1VYCHslkH3idKw7w4GgbzLIKf5j0RSqPCjLSAzWvi1NRXY6WvW2-DfpxF2fldX3f73hQga5PZqvOKpEHmcmyYdjDEnGJZzeuXf8A0GrfbkRII%7Egbmcj106hq0CecrvG1XJGC9acMeeCRAASQ__&Key-Pair-Id=KCD77M1F0VK2B [following]\n", - "--2024-01-22 18:47:29-- https://cdn-lfs-us-1.huggingface.co/repos/c4/4f/c44ff1065a97d8c91e31c6989e0b1f15abb8c70de9951f7f5b9adda9a9c3a4f5/de9fb0eb749f3254130fe0172fcbb20e75f88a9b16b54dd0b73cac0dc40da7d9?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27checkpoint.pth%3B+filename%3D%22checkpoint.pth%22%3B&Expires=1706204855&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNjIwNDg1NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2M0LzRmL2M0NGZmMTA2NWE5N2Q4YzkxZTMxYzY5ODllMGIxZjE1YWJiOGM3MGRlOTk1MWY3ZjViOWFkZGE5YTljM2E0ZjUvZGU5ZmIwZWI3NDlmMzI1NDEzMGZlMDE3MmZjYmIyMGU3NWY4OGE5YjE2YjU0ZGQwYjczY2FjMGRjNDBkYTdkOT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=g05Mq2b4B-jTfmZ1o5wZ67TcWOuqSyGp2CUV27L%7EiahZduyiT1R8LAyvTrrNC5i7s3yJ2xaPytGUXHStac4MB6vklQVSbpmmPBO0nZ9Fi%7EGTFHr5n89XWc1WFu6kR9Wn2PrXwadXB47XNAe-nqmEPI8ppaozpl0QSwbKWV6UT4076foFxvKmVd2tUo9zXfiwQG3JsE1VYCHslkH3idKw7w4GgbzLIKf5j0RSqPCjLSAzWvi1NRXY6WvW2-DfpxF2fldX3f73hQga5PZqvOKpEHmcmyYdjDEnGJZzeuXf8A0GrfbkRII%7Egbmcj106hq0CecrvG1XJGC9acMeeCRAASQ__&Key-Pair-Id=KCD77M1F0VK2B\n", - "Connecting to proxy-dmz.intel.com (proxy-dmz.intel.com)|10.102.248.16|:912... connected.\n", - "Proxy request sent, awaiting response... 
200 OK\n", - "Length: 160467309 (153M) [application/zip]\n", - "Saving to: ‘checkpoints/base_speakers/ZH/checkpoint.pth’\n", - "\n", - "checkpoints/base_sp 100%[===================>] 153,03M 3,98MB/s in 39s \n", - "\n", - "2024-01-22 18:48:08 (3,96 MB/s) - ‘checkpoints/base_speakers/ZH/checkpoint.pth’ saved [160467309/160467309]\n", - "\n", - "--2024-01-22 18:48:08-- https://huggingface.co/myshell-ai/OpenVoice/raw/main/checkpoints/base_speakers/ZH/config.json\n", - "Resolving proxy-dmz.intel.com (proxy-dmz.intel.com)... 10.102.248.16\n", - "Connecting to proxy-dmz.intel.com (proxy-dmz.intel.com)|10.102.248.16|:912... connected.\n", - "Proxy request sent, awaiting response... 200 OK\n", - "Length: 1828 (1,8K) [text/plain]\n", - "Saving to: ‘checkpoints/base_speakers/ZH/config.json’\n", - "\n", - "checkpoints/base_sp 100%[===================>] 1,79K --.-KB/s in 0s \n", - "\n", - "2024-01-22 18:48:09 (5,62 GB/s) - ‘checkpoints/base_speakers/ZH/config.json’ saved [1828/1828]\n", - "\n", - "--2024-01-22 18:48:09-- https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/EN/en_style_se.pth\n", - "Resolving proxy-dmz.intel.com (proxy-dmz.intel.com)... 10.102.248.16\n", - "Connecting to proxy-dmz.intel.com (proxy-dmz.intel.com)|10.102.248.16|:912... connected.\n", - "Proxy request sent, awaiting response... 302 Found\n", - "Location: https://cdn-lfs-us-1.huggingface.co/repos/c4/4f/c44ff1065a97d8c91e31c6989e0b1f15abb8c70de9951f7f5b9adda9a9c3a4f5/6f698153be5004b90a8642d1157c89cae7dd296752a3276450ced6a17b8b98a9?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27en_style_se.pth%3B+filename%3D%22en_style_se.pth%22%3B&Expires=1706204895&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNjIwNDg5NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2M0LzRmL2M0NGZmMTA2NWE5N2Q4YzkxZTMxYzY5ODllMGIxZjE1YWJiOGM3MGRlOTk1MWY3ZjViOWFkZGE5YTljM2E0ZjUvNmY2OTgxNTNiZTUwMDRiOTBhODY0MmQxMTU3Yzg5Y2FlN2RkMjk2NzUyYTMyNzY0NTBjZWQ2YTE3YjhiOThhOT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=ZTLflxpGZhaVLw7m2Z1yazyw9imi1828LU3PHyTaxGdkRRq%7E3JZwA5Uj%7ETuEICCCR0jLjAhKkywWyRQpZg6uhJzAe7vvQvsRJizpj5y9%7E1SsVszgBhkazxdkcxlHyo3kdOKqI0vaPKe9soQxAKq3KYDrc4LwshsIbrumvRmUuwquiVzZeWqKh-ILriFQfoy9gpbyaHWJt4dzeZUcbUOqVUxjgMFVMHWwiACFeFs5ISiA7glH8y4yhR59FfzyvLKoic3wyoQLvW6kvEiDPDrjumk%7EMlYhoWhKbrZrKUaKu%7ELaD57dPorz2P%7E48dCnIXkKmwRUJtSQfTSORLd%7EhVLAnQ__&Key-Pair-Id=KCD77M1F0VK2B [following]\n", - "--2024-01-22 18:48:09-- 
https://cdn-lfs-us-1.huggingface.co/repos/c4/4f/c44ff1065a97d8c91e31c6989e0b1f15abb8c70de9951f7f5b9adda9a9c3a4f5/6f698153be5004b90a8642d1157c89cae7dd296752a3276450ced6a17b8b98a9?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27en_style_se.pth%3B+filename%3D%22en_style_se.pth%22%3B&Expires=1706204895&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNjIwNDg5NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2M0LzRmL2M0NGZmMTA2NWE5N2Q4YzkxZTMxYzY5ODllMGIxZjE1YWJiOGM3MGRlOTk1MWY3ZjViOWFkZGE5YTljM2E0ZjUvNmY2OTgxNTNiZTUwMDRiOTBhODY0MmQxMTU3Yzg5Y2FlN2RkMjk2NzUyYTMyNzY0NTBjZWQ2YTE3YjhiOThhOT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=ZTLflxpGZhaVLw7m2Z1yazyw9imi1828LU3PHyTaxGdkRRq%7E3JZwA5Uj%7ETuEICCCR0jLjAhKkywWyRQpZg6uhJzAe7vvQvsRJizpj5y9%7E1SsVszgBhkazxdkcxlHyo3kdOKqI0vaPKe9soQxAKq3KYDrc4LwshsIbrumvRmUuwquiVzZeWqKh-ILriFQfoy9gpbyaHWJt4dzeZUcbUOqVUxjgMFVMHWwiACFeFs5ISiA7glH8y4yhR59FfzyvLKoic3wyoQLvW6kvEiDPDrjumk%7EMlYhoWhKbrZrKUaKu%7ELaD57dPorz2P%7E48dCnIXkKmwRUJtSQfTSORLd%7EhVLAnQ__&Key-Pair-Id=KCD77M1F0VK2B\n", - "Connecting to proxy-dmz.intel.com (proxy-dmz.intel.com)|10.102.248.16|:912... connected.\n", - "Proxy request sent, awaiting response... 200 OK\n", - "Length: 1783 (1,7K) [application/zip]\n", - "Saving to: ‘checkpoints/base_speakers/EN/en_style_se.pth’\n", - "\n", - "checkpoints/base_sp 100%[===================>] 1,74K --.-KB/s in 0s \n", - "\n", - "2024-01-22 18:48:10 (87,7 MB/s) - ‘checkpoints/base_speakers/EN/en_style_se.pth’ saved [1783/1783]\n", - "\n", - "--2024-01-22 18:48:10-- https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/ZH/zh_default_se.pth\n", - "Resolving proxy-dmz.intel.com (proxy-dmz.intel.com)... 10.102.248.16\n", - "Connecting to proxy-dmz.intel.com (proxy-dmz.intel.com)|10.102.248.16|:912... connected.\n", - "Proxy request sent, awaiting response... 
302 Found\n", - "Location: https://cdn-lfs-us-1.huggingface.co/repos/c4/4f/c44ff1065a97d8c91e31c6989e0b1f15abb8c70de9951f7f5b9adda9a9c3a4f5/3b62e8264962059b8a84dd00b29e2fcccc92f5d3be90eec67dfa082c0cf58ccf?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27zh_default_se.pth%3B+filename%3D%22zh_default_se.pth%22%3B&Expires=1706204895&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNjIwNDg5NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2M0LzRmL2M0NGZmMTA2NWE5N2Q4YzkxZTMxYzY5ODllMGIxZjE1YWJiOGM3MGRlOTk1MWY3ZjViOWFkZGE5YTljM2E0ZjUvM2I2MmU4MjY0OTYyMDU5YjhhODRkZDAwYjI5ZTJmY2NjYzkyZjVkM2JlOTBlZWM2N2RmYTA4MmMwY2Y1OGNjZj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=g85yifinD1PSumgzQizzqdT1D1aHeVX-rhOQ63enKxx%7EjHPMScJ7wX-TxZVhU62KRtBCIExnTslWo%7E2xIHKrCN-4u8UjBxRrURtwrVKaJjqnhcoe2gzVHtlX0w1HYpqPX8LzGhliSWIlLSbcjeeXSMqSKvU7KXj8Bx73aruoz1E-Au6biP3AiWpsPFqyx8XMdjtZzf0m-qrzp4uDGClqr6qtMWuy8hFD4WkhehZ5IUcP5YC81oqCSRk4Hr7yad58Gc0ApsFPKEjtLmY1xmVXJwSsew1xCWMDO4Ca4Fsk9HzOySkmzzW-JRhNefZZZQOhtbpCzNsT1munxY7qa3yIfg__&Key-Pair-Id=KCD77M1F0VK2B [following]\n", - "--2024-01-22 18:48:10-- https://cdn-lfs-us-1.huggingface.co/repos/c4/4f/c44ff1065a97d8c91e31c6989e0b1f15abb8c70de9951f7f5b9adda9a9c3a4f5/3b62e8264962059b8a84dd00b29e2fcccc92f5d3be90eec67dfa082c0cf58ccf?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27zh_default_se.pth%3B+filename%3D%22zh_default_se.pth%22%3B&Expires=1706204895&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNjIwNDg5NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2M0LzRmL2M0NGZmMTA2NWE5N2Q4YzkxZTMxYzY5ODllMGIxZjE1YWJiOGM3MGRlOTk1MWY3ZjViOWFkZGE5YTljM2E0ZjUvM2I2MmU4MjY0OTYyMDU5YjhhODRkZDAwYjI5ZTJmY2NjYzkyZjVkM2JlOTBlZWM2N2RmYTA4MmMwY2Y1OGNjZj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=g85yifinD1PSumgzQizzqdT1D1aHeVX-rhOQ63enKxx%7EjHPMScJ7wX-TxZVhU62KRtBCIExnTslWo%7E2xIHKrCN-4u8UjBxRrURtwrVKaJjqnhcoe2gzVHtlX0w1HYpqPX8LzGhliSWIlLSbcjeeXSMqSKvU7KXj8Bx73aruoz1E-Au6biP3AiWpsPFqyx8XMdjtZzf0m-qrzp4uDGClqr6qtMWuy8hFD4WkhehZ5IUcP5YC81oqCSRk4Hr7yad58Gc0ApsFPKEjtLmY1xmVXJwSsew1xCWMDO4Ca4Fsk9HzOySkmzzW-JRhNefZZZQOhtbpCzNsT1munxY7qa3yIfg__&Key-Pair-Id=KCD77M1F0VK2B\n", - "Connecting to proxy-dmz.intel.com (proxy-dmz.intel.com)|10.102.248.16|:912... connected.\n", - "Proxy request sent, awaiting response... 
200 OK\n", - "Length: 1789 (1,7K) [application/zip]\n", - "Saving to: ‘checkpoints/base_speakers/ZH/zh_default_se.pth’\n", - "\n", - "checkpoints/base_sp 100%[===================>] 1,75K --.-KB/s in 0s \n", - "\n", - "2024-01-22 18:48:10 (87,8 MB/s) - ‘checkpoints/base_speakers/ZH/zh_default_se.pth’ saved [1789/1789]\n", - "\n" - ] - } - ], + "outputs": [], "source": [ - "# !mkdir -p checkpoints/converter/\n", - "# !mkdir -p checkpoints/base_speakers/EN/\n", - "# !mkdir -p checkpoints/base_speakers/ZH/\n", + "!mkdir -p checkpoints/converter/\n", + "!mkdir -p checkpoints/base_speakers/EN/\n", + "!mkdir -p checkpoints/base_speakers/ZH/\n", "\n", - "# !wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/checkpoint.pth -O checkpoints/converter/checkpoint.pth\n", - "# !wget https://huggingface.co/myshell-ai/OpenVoice/raw/main/checkpoints/converter/config.json -O checkpoints/converter/config.json\n", + "!wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/checkpoint.pth -O checkpoints/converter/checkpoint.pth\n", + "!wget https://huggingface.co/myshell-ai/OpenVoice/raw/main/checkpoints/converter/config.json -O checkpoints/converter/config.json\n", "\n", - "# !wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/EN/checkpoint.pth -O checkpoints/base_speakers/EN/checkpoint.pth\n", - "# !wget https://huggingface.co/myshell-ai/OpenVoice/raw/main/checkpoints/base_speakers/EN/config.json -O checkpoints/base_speakers/EN/config.json\n", + "!wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/EN/checkpoint.pth -O checkpoints/base_speakers/EN/checkpoint.pth\n", + "!wget https://huggingface.co/myshell-ai/OpenVoice/raw/main/checkpoints/base_speakers/EN/config.json -O checkpoints/base_speakers/EN/config.json\n", "\n", - "# !wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/ZH/checkpoint.pth -O checkpoints/base_speakers/ZH/checkpoint.pth\n", - "# !wget https://huggingface.co/myshell-ai/OpenVoice/raw/main/checkpoints/base_speakers/ZH/config.json -O checkpoints/base_speakers/ZH/config.json\n", + "!wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/ZH/checkpoint.pth -O checkpoints/base_speakers/ZH/checkpoint.pth\n", + "!wget https://huggingface.co/myshell-ai/OpenVoice/raw/main/checkpoints/base_speakers/ZH/config.json -O checkpoints/base_speakers/ZH/config.json\n", "\n", - "# !wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/EN/en_default_se.pth -O checkpoints/base_speakers/EN/en_default_se.pth\n", - "# !wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/EN/en_style_se.pth -O checkpoints/base_speakers/EN/en_style_se.pth\n", - "# !wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/ZH/zh_default_se.pth -O checkpoints/base_speakers/ZH/zh_default_se.pth" + "!wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/EN/en_default_se.pth -O checkpoints/base_speakers/EN/en_default_se.pth\n", + "!wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/EN/en_style_se.pth -O checkpoints/base_speakers/EN/en_style_se.pth\n", + "!wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/ZH/zh_default_se.pth -O checkpoints/base_speakers/ZH/zh_default_se.pth" ] }, { "cell_type": "code", - "execution_count": 8, + 
"execution_count": 6, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Importing the dtw module. When using in academic works please cite:\n", - " T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.\n", - " J. Stat. Soft., doi:10.18637/jss.v031.i07.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ - "import se_extractor\n", "from api import BaseSpeakerTTS, ToneColorConverter" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -252,7 +185,35 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e0dff3e511e847ce829a8d7bb6ee7943", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Dropdown(options=('CPU', 'GPU', 'AUTO'), value='CPU')" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import ipywidgets as widgets\n", + "\n", + "devices = ['CPU', 'GPU', 'AUTO']\n", + "device = widgets.Dropdown(options=devices, value=devices[0], disabled=False)\n", + "display(device)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -260,183 +221,300 @@ "zh_tts_model = OVOpenVoiceTTS(en_base_speaker_tts, ir_path='zh_openvoice_tts.xml')\n", "color_convert_model = OVOpenVoiceConvert(tone_color_converter, ir_path='openvoice_converter.xml')\n", "\n", - "en_tts_model.compile()\n", - "zh_tts_model.compile()\n", - "color_convert_model.compile()" + "en_tts_model.compile(device.value)\n", + "zh_tts_model.compile(device.value)\n", + "color_convert_model.compile(device.value)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# load speaker embeddings\n", - "en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)\n", - "en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device)\n", - "zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth').to(device)\n", - "\n", - "# source_se = torch.load(f'{ckpt_base}/en_default_se.pth').to(device)\n", - "\n", - "# need to install ffmpeg in the system\n", - "reference_speaker = 'resources/example_reference.mp3'\n", - "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)" + "en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth')\n", + "en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth')\n", + "zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Inference" + "First of all, select the reference tone of voice to which the generated text will be converted: your can select from existing ones or record your own by seleceing 'record_manually'" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages/gradio/components/dropdown.py:90: UserWarning: The `max_choices` parameter is ignored when `multiselect` is False.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Running on local URL: http://0.0.0.0:7860\n", - "\n", - "To create a public link, set `share=True` in 
`launch()`.\n" - ] - }, { "data": { - "text/html": [ - "
" - ], + "application/vnd.jupyter.widget-view+json": { + "model_id": "e3068acf38a94a12ad85c87f346d9a14", + "version_major": 2, + "version_minor": 0 + }, "text/plain": [ - "" + "Dropdown(description='reference voice from which tone color will be copied', options=('resources/example_refer…" ] }, "metadata": {}, "output_type": "display_data" - }, + } + ], + "source": [ + "reference_speakers = [\n", + " 'resources/example_reference.mp3',\n", + " 'resources/demo_speaker0.mp3',\n", + " 'resources/demo_speaker1.mp3',\n", + " 'resources/demo_speaker2.mp3',\n", + " 'record_manually',\n", + "]\n", + "\n", + "ref_speaker = widgets.Dropdown(\n", + " options=reference_speakers,\n", + " value=reference_speakers[0],\n", + " description=\"reference voice from which tone color will be copied\",\n", + " disabled=False,\n", + ")\n", + "\n", + "display(ref_speaker)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "ref_speaker_path = ref_speaker.value\n", + "\n", + "if ref_speaker.value == 'record_manually':\n", + " ref_speaker_path = f'{output_dir}/custom_example_sample.webm'\n", + " from ipywebrtc import AudioRecorder, CameraStream\n", + " camera = CameraStream(constraints={'audio': True,'video':False})\n", + " recorder = AudioRecorder(stream=camera, filename=ref_speaker_path, autosave=True)\n", + " display(recorder)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ { "data": { - "text/plain": [] + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] }, - "execution_count": 14, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" - }, + } + ], + "source": [ + "from IPython.display import Audio\n", + "Audio(ref_speaker_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "/home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages/gradio/components/dropdown.py:90: UserWarning: The `max_choices` parameter is ignored when `multiselect` is False.\n", - " warnings.warn(\n" + "Importing the dtw module. When using in academic works please cite:\n", + " T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.\n", + " J. Stat. 
Soft., doi:10.18637/jss.v031.i07.\n", + "\n" ] - }, + } + ], + "source": [ + "import se_extractor\n", + "target_se, audio_name = se_extractor.get_se(ref_speaker_path, tone_color_converter, target_dir='processed', vad=True) ## ffmpeg must be installed" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Inference" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Detected language:zh\n", - "[(0.43, 13.97), (14.51, 29.65), (29.966, 40.818), (41.038, 54.29), (55.086, 59.218), (59.662, 61.266), (62.51, 67.314), (67.438, 69.042), (69.646, 79.442), (80.334, 86.386), (86.414, 91.090875)]\n", - "after vad: dur = 85.45201814058957\n" + " > Text splitted to sentences.\n", + "OpenVINO toolkit is a comprehensive toolkit for quickly developing applications and solutions that solve a variety of tasks including emulation of human vision,\n", + "automatic speech recognition, natural language processing, recommendation systems, and many others.\n", + " > ===========================\n", + "ˈoʊpən vino* toolkit* ɪz ə ˌkɑmpɹiˈhɛnsɪv toolkit* fəɹ kˈwɪkli dɪˈvɛləpɪŋ ˌæpləˈkeɪʃənz ənd səˈluʃənz ðət sɑɫv ə vəɹˈaɪəti əv tæsks ˌɪnˈkludɪŋ ˌɛmjəˈleɪʃən əv ˈjumən ˈvɪʒən,\n", + " length:173\n", + " length:173\n", + "ˌɔtəˈmætɪk spitʃ ˌɹɛkɪgˈnɪʃən, ˈnætʃəɹəɫ ˈlæŋgwɪdʒ ˈpɹɑsɛsɪŋ, ˌɹɛkəmənˈdeɪʃən ˈsɪstəmz, ənd ˈmɛni ˈəðəɹz.\n", + " length:105\n", + " length:105\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Building prefix dict from the default dictionary ...\n" + "/home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages/wavmark/models/my_model.py:25: UserWarning: istft will require a complex-valued input tensor in a future PyTorch release. Matching the output from stft with return_complex=True. 
(Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:978.)\n", " return torch.istft(signal_wmd_fft, n_fft=self.n_fft, hop_length=self.hop_length, window=window,\n" ] } ], "source": [ "save_path = f'{output_dir}/output_en_default.wav'\n", "\n", "text = \"\"\"\n", "OpenVINO toolkit is a comprehensive toolkit for quickly developing applications and solutions that solve \n", "a variety of tasks including emulation of human vision, automatic speech recognition, natural language processing, \n", "recommendation systems, and many others.\n", "\"\"\"\n", "\n", "src_path = f'{output_dir}/tmp.wav'\n", "en_tts_model.tts(text, src_path, speaker='default', language='English', speed=1.0)\n", "\n", "color_convert_model.convert(\n", " audio_src_path=src_path, \n", " src_se=en_source_default_se, \n", " tgt_se=target_se, \n", " output_path=save_path,\n", " message=\"@MyShell\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Audio(src_path)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Audio(save_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Run OpenVoice Gradio online app\n", "We can also use the [Gradio](https://www.gradio.app/) app to run TTS and voice tone conversion online." ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages/gradio/components/dropdown.py:90: UserWarning: The `max_choices` parameter is ignored when `multiselect` is False.\n", " warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Running on local URL: http://0.0.0.0:7860\n", "\n", "To create a public link, set `share=True` in `launch()`.\n" ] }, { "data": { "text/html": [ "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Detected language:zh\n", - "[(0.11, 6.016)]\n", - "after vad: dur = 5.906\n", - " > Text splitted to sentences.\n", - "今天天气真好, 我们一起出去吃饭吧.\n", - " > ===========================\n", - "tʃ⁼in→tʰjɛn→tʰjɛn→tʃʰi↓ ts`⁼ən→ xɑʊ↓↑, wo↓↑mən i↓tʃʰi↓↑ ts`ʰu→tʃʰɥ↓ ts`ʰɹ`→fan↓ p⁼a.\n", - " length:85\n", - " length:85\n", - "Detected language:en\n", - " > Text splitted to sentences.\n", - "i just wanted to check how my voice is copied\n", - " > ===========================\n", - "aɪ dʒɪst ˈwɔntɪd tɪ tʃɛk haʊ maɪ vɔɪs ɪz ˈkɑpid.\n", - " length:48\n", - " length:48\n", - "Detected language:en\n", - "[(0.0, 8.05), (8.782, 12.85)]\n", - "after vad: dur = 12.118\n", - " > Text splitted to sentences.\n", - "i just wanted to check how my voice is copied\n", - " > ===========================\n", - "aɪ dʒɪst ˈwɔntɪd tɪ tʃɛk haʊ maɪ vɔɪs ɪz ˈkɑpid.\n", - " length:48\n", - " length:48\n", - "Detected language:en\n", - " > Text splitted to sentences.\n", - "i just wanted to check how my voice is copied by this model so that i can easily make a fake voice and vocalize my diaries,\n", - "write about news, about a lot of sex and vice versa\n", - " > ===========================\n", - "aɪ dʒɪst ˈwɔntɪd tɪ tʃɛk haʊ maɪ vɔɪs ɪz ˈkɑpid baɪ ðɪs ˈmɑdəɫ soʊ ðət aɪ kən ˈizəli meɪk ə feɪk vɔɪs ənd ˈvoʊkəˌlaɪz maɪ ˈdaɪəɹiz,\n", - " length:131\n", - " length:131\n", - "ɹaɪt əˈbaʊt nuz, əˈbaʊt ə lɔt əv sɛks ənd vaɪs ˈvəɹsə.\n", - " length:54\n", - " length:54\n", - "Detected language:ru\n" - ] + "data": { + "text/plain": [] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "# %pip install gradio==4.15\n", "from openvoice_gradio import get_demo\n", "\n", "demo = get_demo(output_dir, color_convert_model, en_tts_model, zh_tts_model, en_source_default_se, en_source_style_se, zh_source_se)\n", @@ -446,58 +524,20 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "save_path = f'{output_dir}/output_en_default.wav'\n", - "\n", - "# Run the base speaker tts\n", - "text = \"i am well aware of my capabilities?!! audio is generated by OpenVoice. 
OpenVoice can achieve multi-lingual voice cloning by simply replace the base speaker.\" \\\n", "\"We provide an example with a Chinese base speaker here and we encourage the readers to try demo_part2.ipynb for a detailed demo.\" \\\n", "\"Our online English classes feature lots of useful learning materials and activities to help you develop your reading skills with confidence in a safe and inclusive learning environment.\" \\\n", "\"Practise reading with your classmates in live group classes, get reading support from a personal tutor in one-to-one lessons or practise reading by yourself at your own speed with a self-study course.\"\n", "\n", "src_path = f'{output_dir}/tmp.wav'\n", "# base_speaker_tts.tts(text, src_path, speaker='default', language='English', speed=1.0, ov_model=tts_model.compiled_model)\n", "tts_model.tts(text, src_path, speaker='default', language='English', speed=1.0)\n", "\n", "# Run the tone color converter\n", "encode_message = \"@MyShell\"\n", "\n", "color_convert_model.convert(\n", " audio_src_path=src_path, \n", " src_se=source_se, \n", " tgt_se=target_se, \n", " output_path=save_path,\n", " message=encode_message)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# !pip install ipywebrtc ipywidgets\n", "\n", "from ipywebrtc import AudioRecorder, CameraStream\n", "from IPython.display import Audio\n", "\n", "camera = CameraStream(constraints={'audio': True,'video':False})\n", "recorder = AudioRecorder(stream=camera)\n", "recorder" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Closing server running on port: 7860\n" ] } ], "source": [ "# Run this cell to stop the Gradio interface\n", "demo.close()" ] } ], diff --git a/notebooks/408-openvoice/openvoice_gradio.py b/notebooks/408-openvoice/openvoice_gradio.py index 80a65cb7b5f..95628c0f28e 100644 --- a/notebooks/408-openvoice/openvoice_gradio.py +++ b/notebooks/408-openvoice/openvoice_gradio.py @@ -1,12 +1,9 @@ import os import torch -import argparse import gradio as gr import langid import se_extractor -from api import BaseSpeakerTTS, ToneColorConverter -# This online demo mainly supports English and Chinese supported_languages = ['zh', 'en'] def build_predict(output_dir, tone_color_converter, en_tts_model, zh_tts_model, en_source_default_se, en_source_style_se, zh_source_se): @@ -15,9 +12,7 @@ def predict(prompt, style, audio_file_pth, agree): return predict def predict_impl(prompt, style, audio_file_pth, agree, output_dir, tone_color_converter, en_tts_model, zh_tts_model, en_source_default_se, en_source_style_se, zh_source_se): - # initialize a empty info text_hint = '' - # agree with the terms if agree == False: text_hint += '[ERROR] Please accept the Terms & Condition!\n' gr.Warning("Please accept the Terms & Condition!") @@ -27,7 +22,6 @@ def predict_impl(prompt, style, audio_file_pth, agree, output_dir, tone_color_co None, ) - # first detect the input language language_predicted = langid.classify(prompt)[0].strip() print(f"Detected language:{language_predicted}") @@ -40,7 +34,6 @@ def predict_impl(prompt, style, audio_file_pth, agree, output_dir, tone_color_co return ( text_hint, None, - None, ) if language_predicted == "zh": @@ -53,7 +46,6 @@ def 
predict_impl(prompt, style, audio_file_pth, agree, output_dir, tone_color_co return ( text_hint, None, - None, ) else: @@ -63,13 +55,13 @@ def predict_impl(prompt, style, audio_file_pth, agree, output_dir, tone_color_co else: source_se = en_source_style_se language = 'English' - if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']: - text_hint += f"[ERROR] The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']\n" - gr.Warning(f"The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']") + supported_styles = ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'] + if style not in supported_styles: + text_hint += f"[ERROR] The style {style} is not supported for English, which should be in {*supported_styles,}\n" + gr.Warning(f"The style {style} is not supported for English, which should be in {*supported_styles,}") return ( text_hint, None, - None, ) speaker_wav = audio_file_pth @@ -80,7 +72,6 @@ def predict_impl(prompt, style, audio_file_pth, agree, output_dir, tone_color_co return ( text_hint, None, - None, ) if len(prompt) > 200: text_hint += f"[ERROR] Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo and try for your usage \n" @@ -90,7 +81,6 @@ def predict_impl(prompt, style, audio_file_pth, agree, output_dir, tone_color_co return ( text_hint, None, - None, ) # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference @@ -104,14 +94,12 @@ def predict_impl(prompt, style, audio_file_pth, agree, output_dir, tone_color_co return ( text_hint, None, - None, ) src_path = f'{output_dir}/tmp.wav' tts_model.tts(prompt, src_path, speaker=style, language=language) save_path = f'{output_dir}/output.wav' - # Run the tone color converter encode_message = "@MyShell" tone_color_converter.convert( audio_src_path=src_path, @@ -125,15 +113,18 @@ def predict_impl(prompt, style, audio_file_pth, agree, output_dir, tone_color_co return ( text_hint, save_path, - speaker_wav, ) - +description = """ + # OpenVoice accelerated by OpenVINO: + + a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set. +""" content = """
- If the generated voice does not sound like the reference voice, please refer to this QnA. For multi-lingual & cross-lingual examples, please refer to this jupyter notebook. - This online demo mainly supports English. The default style also supports Chinese. But OpenVoice can adapt to any other language as long as a base speaker is provided. +If the generated voice does not sound like the reference voice, please refer to this QnA. For multi-lingual & cross-lingual examples, please refer to this jupyter notebook. +This online demo mainly supports English. The default style also supports Chinese. But OpenVoice can adapt to any other language as long as a base speaker is provided.
""" wrapped_markdown_content = f"
{content}
" @@ -162,7 +153,8 @@ def predict_impl(prompt, style, audio_file_pth, agree, output_dir, tone_color_co def get_demo(output_dir, tone_color_converter, en_tts_model, zh_tts_model, en_source_default_se, en_source_style_se, zh_source_se): with gr.Blocks(analytics_enabled=False) as demo: - + with gr.Row(): + gr.Markdown(description) with gr.Row(): gr.HTML(wrapped_markdown_content) @@ -198,7 +190,7 @@ def get_demo(output_dir, tone_color_converter, en_tts_model, zh_tts_model, en_so with gr.Column(): out_text_gr = gr.Text(label="Info") audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True) - ref_audio_gr = gr.Audio(label="Reference Audio Used") + # ref_audio_gr = gr.Audio(label="Reference Audio Used") predict = build_predict( output_dir, tone_color_converter, @@ -212,8 +204,8 @@ def get_demo(output_dir, tone_color_converter, en_tts_model, zh_tts_model, en_so gr.Examples(examples, label="Examples", inputs=[input_text_gr, style_gr, ref_gr, tos_gr], - outputs=[out_text_gr, audio_gr, ref_audio_gr], + outputs=[out_text_gr, audio_gr], fn=predict, cache_examples=False,) - tts_button.click(predict, [input_text_gr, style_gr, ref_gr, tos_gr], outputs=[out_text_gr, audio_gr, ref_audio_gr]) + tts_button.click(predict, [input_text_gr, style_gr, ref_gr, tos_gr], outputs=[out_text_gr, audio_gr]) return demo diff --git a/notebooks/408-openvoice/openvoice_utils.py b/notebooks/408-openvoice/openvoice_utils.py index 76912c097f9..f5fd1cbfcc9 100644 --- a/notebooks/408-openvoice/openvoice_utils.py +++ b/notebooks/408-openvoice/openvoice_utils.py @@ -1,5 +1,4 @@ -from api import OpenVoiceBaseClass -from mel_processing import spectrogram_torch +from OpenVoice.mel_processing import spectrogram_torch import torch import librosa import openvino as ov @@ -8,30 +7,6 @@ import soundfile - -def get_tts_forward(base_class: OpenVoiceBaseClass): - for par in base_class.model.parameters(): - par.requires_grad = False - - speed = 1.0 - kwargs = dict(noise_scale = 0.667, length_scale = 1.0 / speed, noise_scale_w = 0.6, sdp_ratio = 0.2) - - def tts_forward_wrapper(x, x_lengths, sid): - return base_class.model.infer(x, x_lengths, sid, - noise_scale=kwargs['noise_scale'], - length_scale=kwargs['length_scale'], - noise_scale_w=kwargs['noise_scale_w'], - sdp_ratio=kwargs['sdp_ratio']) - return tts_forward_wrapper - -def get_converter_forward(base_class: OpenVoiceBaseClass): - for par in base_class.model.parameters(): - par.requires_grad = False - def converter_forward_wrapper(y, y_lengths, sid_src, sid_tgt): - return base_class.model.voice_conversion(y, y_lengths, sid_src, sid_tgt, tau=0.3) - return converter_forward_wrapper - - class OVOpenVoiceTTS(torch.nn.Module): def __init__(self, tts_model, noise_scale = 0.667, noise_scale_w = 0.6, speed = 1, sdp_ratio = 0.2, ir_path='openvoice_tts.xml'): super().__init__() @@ -97,6 +72,7 @@ def tts(self, text, output_path, speaker, language='English', speed=1.0): else: soundfile.write(output_path, audio, tts_model.hps.data.sampling_rate) + class OVOpenVoiceConvert(torch.nn.Module): def __init__(self, voice_conversion_model, tau=0.3, ir_path='openvoice_converter.xml'): super().__init__() @@ -107,7 +83,7 @@ def __init__(self, voice_conversion_model, tau=0.3, ir_path='openvoice_converter tau = tau, ) - def get_example_input(): + def get_example_input(self): y = torch.randn([1, 513, 238], dtype=torch.float32) y_lengths = torch.LongTensor([y.size(-1)]) target_se = torch.randn(*(1, 256, 1))