From 0df1b5f36cf95480068b1f6b35d32c7a47c4b71b Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Mon, 22 Jan 2024 19:04:02 +0100 Subject: [PATCH] successully run gradio --- notebooks/408-openvoice/408-openvoice.ipynb | 394 ++++++++++++++------ notebooks/408-openvoice/openvoice_gradio.py | 219 +++++++++++ notebooks/408-openvoice/openvoice_utils.py | 2 + 3 files changed, 502 insertions(+), 113 deletions(-) create mode 100644 notebooks/408-openvoice/openvoice_gradio.py diff --git a/notebooks/408-openvoice/408-openvoice.ipynb b/notebooks/408-openvoice/408-openvoice.ipynb index c68b7a05b18..a16fc386d39 100644 --- a/notebooks/408-openvoice/408-openvoice.ipynb +++ b/notebooks/408-openvoice/408-openvoice.ipynb @@ -21,24 +21,6 @@ "execution_count": 2, "metadata": {}, "outputs": [], - "source": [ - "# !mkdir -p OpenVoice/checkpoints/converter/\n", - "# !mkdir -p OpenVoice/checkpoints/base_speakers/EN/\n", - "\n", - "# !wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/checkpoint.pth -O checkpoints/converter/checkpoint.pth\n", - "# !wget https://huggingface.co/myshell-ai/OpenVoice/raw/main/checkpoints/converter/config.json -O checkpoints/converter/config.json\n", - "\n", - "# !wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/EN/checkpoint.pth -O checkpoints/base_speakers/EN/checkpoint.pth\n", - "# !wget https://huggingface.co/myshell-ai/OpenVoice/raw/main/checkpoints/base_speakers/EN/config.json -O checkpoints/base_speakers/EN/config.json\n", - "\n", - "# !wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/EN/en_default_se.pth -O checkpoints/base_speakers/EN/en_default_se.pth" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], "source": [ "# pip install openvino\n", "\n", @@ -49,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -69,6 +51,15 @@ "# gradio==3.48.0 \\" ] }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# %pip install gradio==3.48.0" + ] + }, { "cell_type": "code", "execution_count": 5, @@ -110,6 +101,93 @@ "cell_type": "code", "execution_count": 7, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-01-22 18:47:29-- https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/ZH/checkpoint.pth\n", + "Resolving proxy-dmz.intel.com (proxy-dmz.intel.com)... 10.102.248.16\n", + "Connecting to proxy-dmz.intel.com (proxy-dmz.intel.com)|10.102.248.16|:912... connected.\n", + "Proxy request sent, awaiting response... 
302 Found\n", + "Location: https://cdn-lfs-us-1.huggingface.co/repos/c4/4f/c44ff1065a97d8c91e31c6989e0b1f15abb8c70de9951f7f5b9adda9a9c3a4f5/de9fb0eb749f3254130fe0172fcbb20e75f88a9b16b54dd0b73cac0dc40da7d9?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27checkpoint.pth%3B+filename%3D%22checkpoint.pth%22%3B&Expires=1706204855&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNjIwNDg1NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2M0LzRmL2M0NGZmMTA2NWE5N2Q4YzkxZTMxYzY5ODllMGIxZjE1YWJiOGM3MGRlOTk1MWY3ZjViOWFkZGE5YTljM2E0ZjUvZGU5ZmIwZWI3NDlmMzI1NDEzMGZlMDE3MmZjYmIyMGU3NWY4OGE5YjE2YjU0ZGQwYjczY2FjMGRjNDBkYTdkOT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=g05Mq2b4B-jTfmZ1o5wZ67TcWOuqSyGp2CUV27L%7EiahZduyiT1R8LAyvTrrNC5i7s3yJ2xaPytGUXHStac4MB6vklQVSbpmmPBO0nZ9Fi%7EGTFHr5n89XWc1WFu6kR9Wn2PrXwadXB47XNAe-nqmEPI8ppaozpl0QSwbKWV6UT4076foFxvKmVd2tUo9zXfiwQG3JsE1VYCHslkH3idKw7w4GgbzLIKf5j0RSqPCjLSAzWvi1NRXY6WvW2-DfpxF2fldX3f73hQga5PZqvOKpEHmcmyYdjDEnGJZzeuXf8A0GrfbkRII%7Egbmcj106hq0CecrvG1XJGC9acMeeCRAASQ__&Key-Pair-Id=KCD77M1F0VK2B [following]\n", + "--2024-01-22 18:47:29-- https://cdn-lfs-us-1.huggingface.co/repos/c4/4f/c44ff1065a97d8c91e31c6989e0b1f15abb8c70de9951f7f5b9adda9a9c3a4f5/de9fb0eb749f3254130fe0172fcbb20e75f88a9b16b54dd0b73cac0dc40da7d9?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27checkpoint.pth%3B+filename%3D%22checkpoint.pth%22%3B&Expires=1706204855&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNjIwNDg1NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2M0LzRmL2M0NGZmMTA2NWE5N2Q4YzkxZTMxYzY5ODllMGIxZjE1YWJiOGM3MGRlOTk1MWY3ZjViOWFkZGE5YTljM2E0ZjUvZGU5ZmIwZWI3NDlmMzI1NDEzMGZlMDE3MmZjYmIyMGU3NWY4OGE5YjE2YjU0ZGQwYjczY2FjMGRjNDBkYTdkOT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=g05Mq2b4B-jTfmZ1o5wZ67TcWOuqSyGp2CUV27L%7EiahZduyiT1R8LAyvTrrNC5i7s3yJ2xaPytGUXHStac4MB6vklQVSbpmmPBO0nZ9Fi%7EGTFHr5n89XWc1WFu6kR9Wn2PrXwadXB47XNAe-nqmEPI8ppaozpl0QSwbKWV6UT4076foFxvKmVd2tUo9zXfiwQG3JsE1VYCHslkH3idKw7w4GgbzLIKf5j0RSqPCjLSAzWvi1NRXY6WvW2-DfpxF2fldX3f73hQga5PZqvOKpEHmcmyYdjDEnGJZzeuXf8A0GrfbkRII%7Egbmcj106hq0CecrvG1XJGC9acMeeCRAASQ__&Key-Pair-Id=KCD77M1F0VK2B\n", + "Connecting to proxy-dmz.intel.com (proxy-dmz.intel.com)|10.102.248.16|:912... connected.\n", + "Proxy request sent, awaiting response... 200 OK\n", + "Length: 160467309 (153M) [application/zip]\n", + "Saving to: ‘checkpoints/base_speakers/ZH/checkpoint.pth’\n", + "\n", + "checkpoints/base_sp 100%[===================>] 153,03M 3,98MB/s in 39s \n", + "\n", + "2024-01-22 18:48:08 (3,96 MB/s) - ‘checkpoints/base_speakers/ZH/checkpoint.pth’ saved [160467309/160467309]\n", + "\n", + "--2024-01-22 18:48:08-- https://huggingface.co/myshell-ai/OpenVoice/raw/main/checkpoints/base_speakers/ZH/config.json\n", + "Resolving proxy-dmz.intel.com (proxy-dmz.intel.com)... 10.102.248.16\n", + "Connecting to proxy-dmz.intel.com (proxy-dmz.intel.com)|10.102.248.16|:912... connected.\n", + "Proxy request sent, awaiting response... 
200 OK\n", + "Length: 1828 (1,8K) [text/plain]\n", + "Saving to: ‘checkpoints/base_speakers/ZH/config.json’\n", + "\n", + "checkpoints/base_sp 100%[===================>] 1,79K --.-KB/s in 0s \n", + "\n", + "2024-01-22 18:48:09 (5,62 GB/s) - ‘checkpoints/base_speakers/ZH/config.json’ saved [1828/1828]\n", + "\n", + "--2024-01-22 18:48:09-- https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/EN/en_style_se.pth\n", + "Resolving proxy-dmz.intel.com (proxy-dmz.intel.com)... 10.102.248.16\n", + "Connecting to proxy-dmz.intel.com (proxy-dmz.intel.com)|10.102.248.16|:912... connected.\n", + "Proxy request sent, awaiting response... 302 Found\n", + "Location: https://cdn-lfs-us-1.huggingface.co/repos/c4/4f/c44ff1065a97d8c91e31c6989e0b1f15abb8c70de9951f7f5b9adda9a9c3a4f5/6f698153be5004b90a8642d1157c89cae7dd296752a3276450ced6a17b8b98a9?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27en_style_se.pth%3B+filename%3D%22en_style_se.pth%22%3B&Expires=1706204895&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNjIwNDg5NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2M0LzRmL2M0NGZmMTA2NWE5N2Q4YzkxZTMxYzY5ODllMGIxZjE1YWJiOGM3MGRlOTk1MWY3ZjViOWFkZGE5YTljM2E0ZjUvNmY2OTgxNTNiZTUwMDRiOTBhODY0MmQxMTU3Yzg5Y2FlN2RkMjk2NzUyYTMyNzY0NTBjZWQ2YTE3YjhiOThhOT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=ZTLflxpGZhaVLw7m2Z1yazyw9imi1828LU3PHyTaxGdkRRq%7E3JZwA5Uj%7ETuEICCCR0jLjAhKkywWyRQpZg6uhJzAe7vvQvsRJizpj5y9%7E1SsVszgBhkazxdkcxlHyo3kdOKqI0vaPKe9soQxAKq3KYDrc4LwshsIbrumvRmUuwquiVzZeWqKh-ILriFQfoy9gpbyaHWJt4dzeZUcbUOqVUxjgMFVMHWwiACFeFs5ISiA7glH8y4yhR59FfzyvLKoic3wyoQLvW6kvEiDPDrjumk%7EMlYhoWhKbrZrKUaKu%7ELaD57dPorz2P%7E48dCnIXkKmwRUJtSQfTSORLd%7EhVLAnQ__&Key-Pair-Id=KCD77M1F0VK2B [following]\n", + "--2024-01-22 18:48:09-- https://cdn-lfs-us-1.huggingface.co/repos/c4/4f/c44ff1065a97d8c91e31c6989e0b1f15abb8c70de9951f7f5b9adda9a9c3a4f5/6f698153be5004b90a8642d1157c89cae7dd296752a3276450ced6a17b8b98a9?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27en_style_se.pth%3B+filename%3D%22en_style_se.pth%22%3B&Expires=1706204895&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNjIwNDg5NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2M0LzRmL2M0NGZmMTA2NWE5N2Q4YzkxZTMxYzY5ODllMGIxZjE1YWJiOGM3MGRlOTk1MWY3ZjViOWFkZGE5YTljM2E0ZjUvNmY2OTgxNTNiZTUwMDRiOTBhODY0MmQxMTU3Yzg5Y2FlN2RkMjk2NzUyYTMyNzY0NTBjZWQ2YTE3YjhiOThhOT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=ZTLflxpGZhaVLw7m2Z1yazyw9imi1828LU3PHyTaxGdkRRq%7E3JZwA5Uj%7ETuEICCCR0jLjAhKkywWyRQpZg6uhJzAe7vvQvsRJizpj5y9%7E1SsVszgBhkazxdkcxlHyo3kdOKqI0vaPKe9soQxAKq3KYDrc4LwshsIbrumvRmUuwquiVzZeWqKh-ILriFQfoy9gpbyaHWJt4dzeZUcbUOqVUxjgMFVMHWwiACFeFs5ISiA7glH8y4yhR59FfzyvLKoic3wyoQLvW6kvEiDPDrjumk%7EMlYhoWhKbrZrKUaKu%7ELaD57dPorz2P%7E48dCnIXkKmwRUJtSQfTSORLd%7EhVLAnQ__&Key-Pair-Id=KCD77M1F0VK2B\n", + "Connecting to proxy-dmz.intel.com (proxy-dmz.intel.com)|10.102.248.16|:912... connected.\n", + "Proxy request sent, awaiting response... 
200 OK\n", + "Length: 1783 (1,7K) [application/zip]\n", + "Saving to: ‘checkpoints/base_speakers/EN/en_style_se.pth’\n", + "\n", + "checkpoints/base_sp 100%[===================>] 1,74K --.-KB/s in 0s \n", + "\n", + "2024-01-22 18:48:10 (87,7 MB/s) - ‘checkpoints/base_speakers/EN/en_style_se.pth’ saved [1783/1783]\n", + "\n", + "--2024-01-22 18:48:10-- https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/ZH/zh_default_se.pth\n", + "Resolving proxy-dmz.intel.com (proxy-dmz.intel.com)... 10.102.248.16\n", + "Connecting to proxy-dmz.intel.com (proxy-dmz.intel.com)|10.102.248.16|:912... connected.\n", + "Proxy request sent, awaiting response... 302 Found\n", + "Location: https://cdn-lfs-us-1.huggingface.co/repos/c4/4f/c44ff1065a97d8c91e31c6989e0b1f15abb8c70de9951f7f5b9adda9a9c3a4f5/3b62e8264962059b8a84dd00b29e2fcccc92f5d3be90eec67dfa082c0cf58ccf?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27zh_default_se.pth%3B+filename%3D%22zh_default_se.pth%22%3B&Expires=1706204895&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNjIwNDg5NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2M0LzRmL2M0NGZmMTA2NWE5N2Q4YzkxZTMxYzY5ODllMGIxZjE1YWJiOGM3MGRlOTk1MWY3ZjViOWFkZGE5YTljM2E0ZjUvM2I2MmU4MjY0OTYyMDU5YjhhODRkZDAwYjI5ZTJmY2NjYzkyZjVkM2JlOTBlZWM2N2RmYTA4MmMwY2Y1OGNjZj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=g85yifinD1PSumgzQizzqdT1D1aHeVX-rhOQ63enKxx%7EjHPMScJ7wX-TxZVhU62KRtBCIExnTslWo%7E2xIHKrCN-4u8UjBxRrURtwrVKaJjqnhcoe2gzVHtlX0w1HYpqPX8LzGhliSWIlLSbcjeeXSMqSKvU7KXj8Bx73aruoz1E-Au6biP3AiWpsPFqyx8XMdjtZzf0m-qrzp4uDGClqr6qtMWuy8hFD4WkhehZ5IUcP5YC81oqCSRk4Hr7yad58Gc0ApsFPKEjtLmY1xmVXJwSsew1xCWMDO4Ca4Fsk9HzOySkmzzW-JRhNefZZZQOhtbpCzNsT1munxY7qa3yIfg__&Key-Pair-Id=KCD77M1F0VK2B [following]\n", + "--2024-01-22 18:48:10-- https://cdn-lfs-us-1.huggingface.co/repos/c4/4f/c44ff1065a97d8c91e31c6989e0b1f15abb8c70de9951f7f5b9adda9a9c3a4f5/3b62e8264962059b8a84dd00b29e2fcccc92f5d3be90eec67dfa082c0cf58ccf?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27zh_default_se.pth%3B+filename%3D%22zh_default_se.pth%22%3B&Expires=1706204895&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNjIwNDg5NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2M0LzRmL2M0NGZmMTA2NWE5N2Q4YzkxZTMxYzY5ODllMGIxZjE1YWJiOGM3MGRlOTk1MWY3ZjViOWFkZGE5YTljM2E0ZjUvM2I2MmU4MjY0OTYyMDU5YjhhODRkZDAwYjI5ZTJmY2NjYzkyZjVkM2JlOTBlZWM2N2RmYTA4MmMwY2Y1OGNjZj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=g85yifinD1PSumgzQizzqdT1D1aHeVX-rhOQ63enKxx%7EjHPMScJ7wX-TxZVhU62KRtBCIExnTslWo%7E2xIHKrCN-4u8UjBxRrURtwrVKaJjqnhcoe2gzVHtlX0w1HYpqPX8LzGhliSWIlLSbcjeeXSMqSKvU7KXj8Bx73aruoz1E-Au6biP3AiWpsPFqyx8XMdjtZzf0m-qrzp4uDGClqr6qtMWuy8hFD4WkhehZ5IUcP5YC81oqCSRk4Hr7yad58Gc0ApsFPKEjtLmY1xmVXJwSsew1xCWMDO4Ca4Fsk9HzOySkmzzW-JRhNefZZZQOhtbpCzNsT1munxY7qa3yIfg__&Key-Pair-Id=KCD77M1F0VK2B\n", + "Connecting to proxy-dmz.intel.com (proxy-dmz.intel.com)|10.102.248.16|:912... connected.\n", + "Proxy request sent, awaiting response... 
200 OK\n", + "Length: 1789 (1,7K) [application/zip]\n", + "Saving to: ‘checkpoints/base_speakers/ZH/zh_default_se.pth’\n", + "\n", + "checkpoints/base_sp 100%[===================>] 1,75K --.-KB/s in 0s \n", + "\n", + "2024-01-22 18:48:10 (87,8 MB/s) - ‘checkpoints/base_speakers/ZH/zh_default_se.pth’ saved [1789/1789]\n", + "\n" + ] + } + ], + "source": [ + "# !mkdir -p checkpoints/converter/\n", + "# !mkdir -p checkpoints/base_speakers/EN/\n", + "# !mkdir -p checkpoints/base_speakers/ZH/\n", + "\n", + "# !wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/checkpoint.pth -O checkpoints/converter/checkpoint.pth\n", + "# !wget https://huggingface.co/myshell-ai/OpenVoice/raw/main/checkpoints/converter/config.json -O checkpoints/converter/config.json\n", + "\n", + "# !wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/EN/checkpoint.pth -O checkpoints/base_speakers/EN/checkpoint.pth\n", + "# !wget https://huggingface.co/myshell-ai/OpenVoice/raw/main/checkpoints/base_speakers/EN/config.json -O checkpoints/base_speakers/EN/config.json\n", + "\n", + "# !wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/ZH/checkpoint.pth -O checkpoints/base_speakers/ZH/checkpoint.pth\n", + "# !wget https://huggingface.co/myshell-ai/OpenVoice/raw/main/checkpoints/base_speakers/ZH/config.json -O checkpoints/base_speakers/ZH/config.json\n", + "\n", + "# !wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/EN/en_default_se.pth -O checkpoints/base_speakers/EN/en_default_se.pth\n", + "# !wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/EN/en_style_se.pth -O checkpoints/base_speakers/EN/en_style_se.pth\n", + "# !wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/ZH/zh_default_se.pth -O checkpoints/base_speakers/ZH/zh_default_se.pth" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -129,7 +207,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -138,20 +216,27 @@ "text": [ "Loaded checkpoint 'checkpoints/base_speakers/EN/checkpoint.pth'\n", "missing/unexpected keys: [] []\n", + "Loaded checkpoint 'checkpoints/base_speakers/ZH/checkpoint.pth'\n", + "missing/unexpected keys: [] []\n", "Loaded checkpoint 'checkpoints/converter/checkpoint.pth'\n", "missing/unexpected keys: [] []\n" ] } ], "source": [ - "ckpt_base = 'checkpoints/base_speakers/EN'\n", - "ckpt_converter = 'checkpoints/converter'\n", + "en_ckpt_base = 'checkpoints/base_speakers/EN'\n", + "zh_ckpt_base = 'checkpoints/base_speakers/ZH'\n", + "\n", "device=\"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n", "output_dir = 'outputs'\n", "\n", - "base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)\n", - "base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')\n", + "en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)\n", + "en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')\n", + "\n", + "zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)\n", + "zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth')\n", "\n", + "ckpt_converter = 'checkpoints/converter'\n", "tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)\n", 
"tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')\n", "\n", @@ -159,35 +244,43 @@ ] }, { - "cell_type": "code", - "execution_count": 9, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "source_se = torch.load(f'{ckpt_base}/en_default_se.pth').to(device)\n", - "\n", - "# need to install ffmpeg in the system\n", - "reference_speaker = 'resources/example_reference.mp3'\n", - "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)" + "### Convert models to OpenVINO" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 12, "metadata": {}, + "outputs": [], "source": [ - "### Convert models to OpenVINO" + "en_tts_model = OVOpenVoiceTTS(en_base_speaker_tts, ir_path='en_openvoice_tts.xml')\n", + "zh_tts_model = OVOpenVoiceTTS(en_base_speaker_tts, ir_path='zh_openvoice_tts.xml')\n", + "color_convert_model = OVOpenVoiceConvert(tone_color_converter, ir_path='openvoice_converter.xml')\n", + "\n", + "en_tts_model.compile()\n", + "zh_tts_model.compile()\n", + "color_convert_model.compile()" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "tts_model = OVOpenVoiceTTS(base_speaker_tts)\n", - "color_convert_model = OVOpenVoiceConvert(tone_color_converter)\n", - "tts_model.compile()\n", - "color_convert_model.compile()" + "# load speaker embeddings\n", + "en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)\n", + "en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device)\n", + "zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth').to(device)\n", + "\n", + "# source_se = torch.load(f'{ckpt_base}/en_default_se.pth').to(device)\n", + "\n", + "# need to install ffmpeg in the system\n", + "reference_speaker = 'resources/example_reference.mp3'\n", + "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)" ] }, { @@ -199,35 +292,95 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages/gradio/components/dropdown.py:90: UserWarning: The `max_choices` parameter is ignored when `multiselect` is False.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running on local URL: http://0.0.0.0:7860\n", + "\n", + "To create a public link, set `share=True` in `launch()`.\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages/gradio/components/dropdown.py:90: UserWarning: The `max_choices` parameter is ignored when `multiselect` is False.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Detected language:zh\n", + "[(0.43, 13.97), (14.51, 29.65), (29.966, 40.818), (41.038, 54.29), (55.086, 59.218), (59.662, 61.266), (62.51, 67.314), (67.438, 69.042), (69.646, 79.442), (80.334, 86.386), (86.414, 91.090875)]\n", + "after vad: dur = 85.45201814058957\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Building prefix dict from the default dictionary ...\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ " > Text splitted to sentences.\n", - "i am well aware of my capabilities? ! ! audio is generated by OpenVoice.\n", - "OpenVoice can achieve multi-lingual voice cloning by simply replace the base speaker.\n", - "We provide an example with a Chinese base speaker here and we encourage the readers to try demo_part2.\n", - "ipynb for a detailed demo. Our online English classes feature lots of useful learning materials and activities to help you develop your reading skills with confidence in a safe and inclusive learning environment.\n", - "Practise reading with your classmates in live group classes, get reading support from a personal tutor in one-to-one lessons or practise reading by yourself at your own speed with a self-study course.\n", - " > ===========================\n", - "aɪ æm wɛɫ əˈwɛɹ əv maɪ ˌkeɪpəˈbɪlətiz? ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs.\n", - " length:80\n", - " length:80\n", - "ˈoʊpən vɔɪs kən əˈtʃiv multi-linguaɫ* vɔɪs ˈkloʊnɪŋ baɪ ˈsɪmpli ˌɹiˈpleɪs ðə beɪs ˈspikəɹ.\n", - " length:90\n", - " length:90\n", - "wi pɹəˈvaɪd ən ɪgˈzæmpəɫ wɪθ ə tʃaɪˈniz beɪs ˈspikəɹ hiɹ ənd wi ɪnˈkəɹədʒ ðə ˈɹidəɹz tɪ tɹaɪ demo_paɹttwo*.\n", - " length:107\n", - " length:107\n", - "ipynb* fəɹ ə dɪˈteɪɫd ˈdɛmoʊ. ɑɹ ˈɔnˌlaɪn ˈɪŋlɪʃ ˈklæsɪz ˈfitʃəɹ lɑts əv ˈjusfəɫ ˈləɹnɪŋ məˈtɪɹiəɫz ənd ækˈtɪvɪtiz tɪ hɛɫp ju dɪˈvɛləp jʊɹ ˈɹɛdɪŋ skɪɫz wɪθ ˈkɑnfədɛns ɪn ə seɪf ənd ˌɪnˈklusɪv ˈləɹnɪŋ ɪnˈvaɪɹənmənt.\n", - " length:214\n", - " length:214\n", - "ˈpɹæktɪs ˈɹɛdɪŋ wɪθ jʊɹ ˈklæsˌmeɪts ɪn lɪv gɹup ˈklæsɪz, gɪt ˈɹɛdɪŋ səˈpɔɹt fɹəm ə ˈpəɹsɪnəɫ ˈtutəɹ ɪn one-to-one* ˈlɛsənz əɹ ˈpɹæktɪs ˈɹɛdɪŋ baɪ ˈjɔɹsɛɫf æt jʊɹ oʊn spid wɪθ ə self-study* kɔɹs.\n", - " length:194\n", - " length:194\n" + "今天天气真好, 我们一起出去吃饭吧.\n", + " > ===========================\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Dumping model to file cache /tmp/jieba.cache\n", + "Loading model cost 0.271 seconds.\n", + "Prefix dict has been built successfully.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tʃ⁼in→tʰjɛn→tʰjɛn→tʃʰi↓ ts`⁼ən→ xɑʊ↓↑, wo↓↑mən i↓tʃʰi↓↑ ts`ʰu→tʃʰɥ↓ ts`ʰɹ`→fan↓ p⁼a.\n", + " length:85\n", + " length:85\n" ] }, { @@ -237,8 +390,65 @@ "/home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages/wavmark/models/my_model.py:25: UserWarning: istft will require a complex-valued input tensor in a future PyTorch release. Matching the output from stft with return_complex=True. 
(Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:978.)\n", " return torch.istft(signal_wmd_fft, n_fft=self.n_fft, hop_length=self.hop_length, window=window,\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Detected language:zh\n", + "[(0.11, 6.016)]\n", + "after vad: dur = 5.906\n", + " > Text splitted to sentences.\n", + "今天天气真好, 我们一起出去吃饭吧.\n", + " > ===========================\n", + "tʃ⁼in→tʰjɛn→tʰjɛn→tʃʰi↓ ts`⁼ən→ xɑʊ↓↑, wo↓↑mən i↓tʃʰi↓↑ ts`ʰu→tʃʰɥ↓ ts`ʰɹ`→fan↓ p⁼a.\n", + " length:85\n", + " length:85\n", + "Detected language:en\n", + " > Text splitted to sentences.\n", + "i just wanted to check how my voice is copied\n", + " > ===========================\n", + "aɪ dʒɪst ˈwɔntɪd tɪ tʃɛk haʊ maɪ vɔɪs ɪz ˈkɑpid.\n", + " length:48\n", + " length:48\n", + "Detected language:en\n", + "[(0.0, 8.05), (8.782, 12.85)]\n", + "after vad: dur = 12.118\n", + " > Text splitted to sentences.\n", + "i just wanted to check how my voice is copied\n", + " > ===========================\n", + "aɪ dʒɪst ˈwɔntɪd tɪ tʃɛk haʊ maɪ vɔɪs ɪz ˈkɑpid.\n", + " length:48\n", + " length:48\n", + "Detected language:en\n", + " > Text splitted to sentences.\n", + "i just wanted to check how my voice is copied by this model so that i can easily make a fake voice and vocalize my diaries,\n", + "write about news, about a lot of sex and vice versa\n", + " > ===========================\n", + "aɪ dʒɪst ˈwɔntɪd tɪ tʃɛk haʊ maɪ vɔɪs ɪz ˈkɑpid baɪ ðɪs ˈmɑdəɫ soʊ ðət aɪ kən ˈizəli meɪk ə feɪk vɔɪs ənd ˈvoʊkəˌlaɪz maɪ ˈdaɪəɹiz,\n", + " length:131\n", + " length:131\n", + "ɹaɪt əˈbaʊt nuz, əˈbaʊt ə lɔt əv sɛks ənd vaɪs ˈvəɹsə.\n", + " length:54\n", + " length:54\n", + "Detected language:ru\n" + ] } ], + "source": [ + "# %pip install gradio==4.15\n", + "from openvoice_gradio import get_demo\n", + "\n", + "demo = get_demo(output_dir, color_convert_model, en_tts_model, zh_tts_model, en_source_default_se, en_source_style_se, zh_source_se)\n", + "demo.queue(max_size=2)\n", + "demo.launch(server_name=\"0.0.0.0\", server_port=7860)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "save_path = f'{output_dir}/output_en_default.wav'\n", "\n", @@ -254,13 +464,6 @@ "\n", "# Run the tone color converter\n", "encode_message = \"@MyShell\"\n", - "# tone_color_converter.convert(\n", - "# audio_src_path=src_path, \n", - "# src_se=source_se, \n", - "# tgt_se=target_se, \n", - "# output_path=save_path,\n", - "# message=encode_message,\n", - "# ov_model=color_convert_model.compiled_model)\n", "\n", "color_convert_model.convert(\n", " audio_src_path=src_path, \n", @@ -272,64 +475,29 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ + "# !pip install ipywebrtc ipywidgets\n", + "\n", + "from ipywebrtc import AudioRecorder, CameraStream\n", "from IPython.display import Audio\n", "\n", - "Audio(filename=save_path)" + "camera = CameraStream(constraints={'audio': True,'video':False})\n", + "recorder = AudioRecorder(stream=camera)\n", + "recorder" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "baef2c4b0b6f49679947f128bd0dbc47", - "version_major": 2, - "version_minor": 0 - }, - 
"text/plain": [ - "AudioRecorder(audio=Audio(value=b'', format='webm'), stream=CameraStream(constraints={'audio': True, 'video': …" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "# !pip install ipywebrtc ipywidgets\n", - "\n", - "from ipywebrtc import AudioRecorder, CameraStream\n", "from IPython.display import Audio\n", "\n", - "camera = CameraStream(constraints={'audio': True,'video':False})\n", - "recorder = AudioRecorder(stream=camera)\n", - "recorder" + "Audio(filename=save_path)" ] } ], diff --git a/notebooks/408-openvoice/openvoice_gradio.py b/notebooks/408-openvoice/openvoice_gradio.py new file mode 100644 index 00000000000..80a65cb7b5f --- /dev/null +++ b/notebooks/408-openvoice/openvoice_gradio.py @@ -0,0 +1,219 @@ +import os +import torch +import argparse +import gradio as gr +import langid +import se_extractor +from api import BaseSpeakerTTS, ToneColorConverter + +# This online demo mainly supports English and Chinese +supported_languages = ['zh', 'en'] + +def build_predict(output_dir, tone_color_converter, en_tts_model, zh_tts_model, en_source_default_se, en_source_style_se, zh_source_se): + def predict(prompt, style, audio_file_pth, agree): + return predict_impl(prompt, style, audio_file_pth, agree, output_dir, tone_color_converter, en_tts_model, zh_tts_model, en_source_default_se, en_source_style_se, zh_source_se) + return predict + +def predict_impl(prompt, style, audio_file_pth, agree, output_dir, tone_color_converter, en_tts_model, zh_tts_model, en_source_default_se, en_source_style_se, zh_source_se): + # initialize a empty info + text_hint = '' + # agree with the terms + if agree == False: + text_hint += '[ERROR] Please accept the Terms & Condition!\n' + gr.Warning("Please accept the Terms & Condition!") + return ( + text_hint, + None, + None, + ) + + # first detect the input language + language_predicted = langid.classify(prompt)[0].strip() + print(f"Detected language:{language_predicted}") + + if language_predicted not in supported_languages: + text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n" + gr.Warning( + f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}" + ) + + return ( + text_hint, + None, + None, + ) + + if language_predicted == "zh": + tts_model = zh_tts_model + source_se = zh_source_se + language = 'Chinese' + if style not in ['default']: + text_hint += f"[ERROR] The style {style} is not supported for Chinese, which should be in ['default']\n" + gr.Warning(f"The style {style} is not supported for Chinese, which should be in ['default']") + return ( + text_hint, + None, + None, + ) + + else: + tts_model = en_tts_model + if style == 'default': + source_se = en_source_default_se + else: + source_se = en_source_style_se + language = 'English' + if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']: + text_hint += f"[ERROR] The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']\n" + gr.Warning(f"The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']") + return ( + text_hint, + None, + None, + ) + + speaker_wav = audio_file_pth + + if 
len(prompt) < 2:
+        text_hint += f"[ERROR] Please give a longer prompt text \n"
+        gr.Warning("Please give a longer prompt text")
+        return (
+            text_hint,
+            None,
+            None,
+        )
+    if len(prompt) > 200:
+        text_hint += f"[ERROR] Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo and try for your usage \n"
+        gr.Warning(
+            "Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo for your usage"
+        )
+        return (
+            text_hint,
+            None,
+            None,
+        )
+
+    # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
+    try:
+        target_se, audio_name = se_extractor.get_se(speaker_wav, tone_color_converter.voice_conversion_model, target_dir='processed', vad=True)
+    except Exception as e:
+        text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
+        gr.Warning(
+            f"[ERROR] Get target tone color error {str(e)} \n"
+        )
+        return (
+            text_hint,
+            None,
+            None,
+        )
+
+    src_path = f'{output_dir}/tmp.wav'
+    tts_model.tts(prompt, src_path, speaker=style, language=language)
+
+    save_path = f'{output_dir}/output.wav'
+    # Run the tone color converter
+    encode_message = "@MyShell"
+    tone_color_converter.convert(
+        audio_src_path=src_path,
+        src_se=source_se,
+        tgt_se=target_se,
+        output_path=save_path,
+        message=encode_message)
+
+    text_hint += f'''Get response successfully \n'''
+
+    return (
+        text_hint,
+        save_path,
+        speaker_wav,
+    )
+
+
+
+content = """
+
+ If the generated voice does not sound like the reference voice, please refer to this QnA. For multi-lingual & cross-lingual examples, please refer to this jupyter notebook. + This online demo mainly supports English. The default style also supports Chinese. But OpenVoice can adapt to any other language as long as a base speaker is provided. +
+""" +wrapped_markdown_content = f"
<div>{content}</div>
" + + +examples = [ + [ + "今天天气真好,我们一起出去吃饭吧。", + 'default', + "resources/demo_speaker1.mp3", + True, + ],[ + "This audio is generated by open voice with a half-performance model.", + 'whispering', + "resources/demo_speaker2.mp3", + True, + ], + [ + "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.", + 'sad', + "resources/demo_speaker0.mp3", + True, + ], +] + +def get_demo(output_dir, tone_color_converter, en_tts_model, zh_tts_model, en_source_default_se, en_source_style_se, zh_source_se): + with gr.Blocks(analytics_enabled=False) as demo: + + + with gr.Row(): + gr.HTML(wrapped_markdown_content) + + with gr.Row(): + with gr.Column(): + input_text_gr = gr.Textbox( + label="Text Prompt", + info="One or two sentences at a time is better. Up to 200 text characters.", + value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.", + ) + style_gr = gr.Dropdown( + label="Style", + info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)", + choices=['default', 'whispering', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'], + max_choices=1, + value="default", + ) + ref_gr = gr.Audio( + label="Reference Audio", + # info="Click on the ✎ button to upload your own target speaker audio", + type="filepath", + value="resources/demo_speaker2.mp3", + ) + tos_gr = gr.Checkbox( + label="Agree", + value=False, + info="I agree to the terms of the cc-by-nc-4.0 license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE", + ) + + tts_button = gr.Button("Send", elem_id="send-btn", visible=True) + + + with gr.Column(): + out_text_gr = gr.Text(label="Info") + audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True) + ref_audio_gr = gr.Audio(label="Reference Audio Used") + predict = build_predict( + output_dir, + tone_color_converter, + en_tts_model, + zh_tts_model, + en_source_default_se, + en_source_style_se, + zh_source_se + ) + + gr.Examples(examples, + label="Examples", + inputs=[input_text_gr, style_gr, ref_gr, tos_gr], + outputs=[out_text_gr, audio_gr, ref_audio_gr], + fn=predict, + cache_examples=False,) + tts_button.click(predict, [input_text_gr, style_gr, ref_gr, tos_gr], outputs=[out_text_gr, audio_gr, ref_audio_gr]) + return demo diff --git a/notebooks/408-openvoice/openvoice_utils.py b/notebooks/408-openvoice/openvoice_utils.py index 283443d0164..76912c097f9 100644 --- a/notebooks/408-openvoice/openvoice_utils.py +++ b/notebooks/408-openvoice/openvoice_utils.py @@ -64,6 +64,7 @@ def compile(self, ov_device='CPU'): self.ov_tts = core.read_model(self.ir_path) else: self.ov_tts = ov.convert_model(self, example_input=self.get_example_input()) + ov.save_model(self.ov_tts, self.ir_path) self.compiled_model = core.compile_model(self.ov_tts, ov_device) @@ -119,6 +120,7 @@ def compile(self, ov_device='CPU'): self.ov_voice_conversion = core.read_model(self.ir_path) else: self.ov_voice_conversion = ov.convert_model(self, example_input=self.get_example_input()) + ov.save_model(self.ov_voice_conversion, self.ir_path) self.compiled_model = core.compile_model(self.ov_voice_conversion, ov_device)