diff --git a/notebooks/284-openvoice/284-openvoice-quantization.ipynb b/notebooks/284-openvoice/284-openvoice-quantization.ipynb
new file mode 100644
index 00000000000..6f359fa0ac4
--- /dev/null
+++ b/notebooks/284-openvoice/284-openvoice-quantization.ipynb
@@ -0,0 +1,804 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### TODO: unfinished"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# !git clone https://github.com/myshell-ai/OpenVoice"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import torch\n",
+ "import openvino as ov\n",
+ "core = ov.Core()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'/home/epavel/devel/openvino/tools/mo/openvino/__init__.py'"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ov.__file__"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/epavel/devel/openvino_notebooks/notebooks/280-openvoice/OpenVoice\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/epavel/opt/envs/py310-openvoice/lib/python3.10/site-packages/IPython/core/magics/osm.py:417: UserWarning: using dhist requires you to install the `pickleshare` library.\n",
+ " self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# cd to the original repo to save original data paths and imports\n",
+ "%cd OpenVoice"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "download all resources from HF Hub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from notebook_utils import download_file\n",
+ "base_url = 'https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/'\n",
+ "\n",
+ "CKPT_BASE_PATH = '../checkpoints/'\n",
+ "en_ckpt_path = f'{CKPT_BASE_PATH}/base_speakers/EN/'\n",
+ "zh_ckpt_path = f'{CKPT_BASE_PATH}/base_speakers/ZH/'\n",
+ "converter_path = f'{CKPT_BASE_PATH}/converter/'"
+ ]
+ },
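+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The checkpoints referenced above are not fetched by this notebook yet. The cell below is a minimal sketch that downloads them with `download_file`, mirroring the download step of the companion `284-openvoice.ipynb` notebook; skip it if the files are already in place."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "download_file(base_url + 'converter/checkpoint.pth', directory=converter_path)\n",
+ "download_file(base_url + 'converter/config.json', directory=converter_path)\n",
+ "\n",
+ "download_file(base_url + 'base_speakers/EN/checkpoint.pth', directory=en_ckpt_path)\n",
+ "download_file(base_url + 'base_speakers/EN/config.json', directory=en_ckpt_path)\n",
+ "download_file(base_url + 'base_speakers/EN/en_default_se.pth', directory=en_ckpt_path)\n",
+ "download_file(base_url + 'base_speakers/EN/en_style_se.pth', directory=en_ckpt_path)\n",
+ "\n",
+ "download_file(base_url + 'base_speakers/ZH/zh_default_se.pth', directory=zh_ckpt_path)"
+ ]
+ },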
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Importing the dtw module. When using in academic works please cite:\n",
+ " T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.\n",
+ " J. Stat. Soft., doi:10.18637/jss.v031.i07.\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "from api import BaseSpeakerTTS, ToneColorConverter, OpenVoiceBaseClass\n",
+ "import se_extractor"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/epavel/opt/envs/py310-openvoice/lib/python3.10/site-packages/torch/nn/utils/weight_norm.py:30: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n",
+ " warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loaded checkpoint '../checkpoints//base_speakers/EN//checkpoint.pth'\n",
+ "missing/unexpected keys: [] []\n",
+ "Loaded checkpoint '../checkpoints//converter//checkpoint.pth'\n",
+ "missing/unexpected keys: [] []\n"
+ ]
+ }
+ ],
+ "source": [
+ "device=\"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
+ "\n",
+ "en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_path}/config.json', device=device)\n",
+ "en_base_speaker_tts.load_ckpt(f'{en_ckpt_path}/checkpoint.pth')\n",
+ "tone_color_converter = ToneColorConverter(f'{converter_path}/config.json', device=device)\n",
+ "tone_color_converter.load_ckpt(f'{converter_path}/checkpoint.pth')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Convert models to OpenVINO"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To convert to OpenVino IR format first we need to get acceptable pytorch nn.Module object. \n",
+ "\n",
+ "Both ToneColorConverter, BaseSpeakerTTS instead of using self.forward as the main entry point use custom methods infer and convert_voice respectively, therefore need to wrap them with a custom class that is inherited from torch.nn.Module. \n",
+ "\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tts_kwargs = dict(noise_scale = 0.667, noise_scale_w = 0.6, speed = 1.0, sdp_ratio = 0.2)\n",
+ "voice_convert_kwargs = dict(tau=0.3)\n",
+ "\n",
+ "class OVOpenVoiceBase(torch.nn.Module):\n",
+ " def __init__(self, voice_model: OpenVoiceBaseClass, **kwargs):\n",
+ " super().__init__()\n",
+ " self.voice_model = voice_model\n",
+ " self.default_kwargs = kwargs\n",
+ " for par in voice_model.model.parameters():\n",
+ " par.requires_grad = False\n",
+ " \n",
+ "class OVOpenVoiceTTS(OVOpenVoiceBase):\n",
+ " def get_example_input(self):\n",
+ " stn_tst = self.voice_model.get_text('this is original text', self.voice_model.hps, False)\n",
+ " x_tst = stn_tst.unsqueeze(0)\n",
+ " x_tst_lengths = torch.LongTensor([stn_tst.size(0)])\n",
+ " speaker_id = torch.LongTensor([1])\n",
+ " return (x_tst, x_tst_lengths, speaker_id)\n",
+ "\n",
+ " def forward(self, x, x_lengths, sid):\n",
+ " return self.voice_model.model.infer(x, x_lengths, sid, **self.default_kwargs)\n",
+ " \n",
+ "class OVOpenVoiceConverter(OVOpenVoiceBase):\n",
+ " def get_example_input(self):\n",
+ " y = torch.randn([1, 513, 238], dtype=torch.float32)\n",
+ " y_lengths = torch.LongTensor([y.size(-1)])\n",
+ " target_se = torch.randn(*(1, 256, 1))\n",
+ " source_se = torch.randn(*(1, 256, 1))\n",
+ " return (y, y_lengths, source_se, target_se)\n",
+ " \n",
+ " def forward(self, y, y_lengths, sid_src, sid_tgt):\n",
+ " return self.voice_model.model.voice_conversion(y, y_lengths, sid_src, sid_tgt, **self.default_kwargs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Convert to OpenVino IR and save to IRs_path folder for the future use. If IRs already exist skip conversion and read them directly"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "IRS_PATH = '../openvino_irs/'\n",
+ "TTS_IR = f'{IRS_PATH}/openvoice_tts.xml'\n",
+ "VOICE_CONVERTER_IR = f'{IRS_PATH}/openvoice_tone_conversion.xml'\n",
+ "\n",
+ "if not os.path.exists(TTS_IR):\n",
+ " pt_tts = OVOpenVoiceTTS(en_base_speaker_tts)\n",
+ " ov_tts = ov.convert_model(pt_tts, example_input=pt_tts.get_example_input())\n",
+ " ov.save_model(ov_tts, TTS_IR)\n",
+ "else:\n",
+ " ov_tts = core.read_model(TTS_IR)\n",
+ "\n",
+ "if not os.path.exists(VOICE_CONVERTER_IR):\n",
+ " pt_voice_converter = OVOpenVoiceConverter(tone_color_converter)\n",
+ " ov_voice_conversion = ov.convert_model(pt_voice_converter, example_input=pt_voice_converter.get_example_input())\n",
+ " ov.save_model(ov_voice_conversion, VOICE_CONVERTER_IR)\n",
+ "else:\n",
+ " ov_voice_conversion = core.read_model(VOICE_CONVERTER_IR)"
+ ]
+ },
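+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Optionally, inspect the inputs of the converted (or loaded) models as a quick sanity check that they correspond to the example inputs used for tracing. This cell is only illustrative and can be skipped."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# print the input signatures of both converted models\n",
+ "print('TTS model inputs:', ov_tts.inputs)\n",
+ "print('Voice conversion model inputs:', ov_voice_conversion.inputs)"
+ ]
+ },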
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "cc2edc77f9f84caaa59401228083910b",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Dropdown(description='Device:', options=('CPU', 'GPU', 'AUTO'), value='CPU')"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import ipywidgets as widgets\n",
+ "\n",
+ "core = ov.Core()\n",
+ "device = widgets.Dropdown(\n",
+ " options=core.available_devices + [\"AUTO\"],\n",
+ " value='CPU',\n",
+ " description='Device:',\n",
+ " disabled=False,\n",
+ ")\n",
+ "device"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "First of all, select the reference tone of voice to which the generated text will be converted: your can select from existing ones or record your own by seleceing 'record_manually'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "dd541f3cd2284cf39829cd654f36b292",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Dropdown(description='reference voice from which tone color will be copied', options=('resources/example_refer…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "reference_speakers = [\n",
+ " 'resources/example_reference.mp3',\n",
+ " 'resources/demo_speaker0.mp3',\n",
+ " 'resources/demo_speaker1.mp3',\n",
+ " 'resources/demo_speaker2.mp3',\n",
+ " 'record_manually',\n",
+ "]\n",
+ "\n",
+ "ref_speaker = widgets.Dropdown(\n",
+ " options=reference_speakers,\n",
+ " value=reference_speakers[0],\n",
+ " description=\"reference voice from which tone color will be copied\",\n",
+ " disabled=False,\n",
+ ")\n",
+ "\n",
+ "display(ref_speaker)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "output_dir = '../outputs/'\n",
+ "os.makedirs(output_dir, exist_ok=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ref_speaker_path = ref_speaker.value\n",
+ "\n",
+ "if ref_speaker.value == 'record_manually':\n",
+ " ref_speaker_path = f'{output_dir}/custom_example_sample.webm'\n",
+ " from ipywebrtc import AudioRecorder, CameraStream\n",
+ " camera = CameraStream(constraints={'audio': True,'video':False})\n",
+ " recorder = AudioRecorder(stream=camera, filename=ref_speaker_path, autosave=True)\n",
+ " display(recorder)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from IPython.display import Audio\n",
+ "Audio(ref_speaker_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/epavel/opt/envs/py310-openvoice/lib/python3.10/site-packages/librosa/util/decorators.py:88: UserWarning: PySoundFile failed. Trying audioread instead.\n",
+ " return f(*args, **kwargs)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# load speaker embeddings\n",
+ "en_source_default_se = torch.load(f'{en_ckpt_path}/en_default_se.pth')\n",
+ "en_source_style_se = torch.load(f'{en_ckpt_path}/en_style_se.pth')\n",
+ "zh_source_se = torch.load(f'{zh_ckpt_path}/zh_default_se.pth')\n",
+ "\n",
+ "target_se, audio_name = se_extractor.get_se(ref_speaker_path, tone_color_converter, target_dir='processed', vad=True) ## ffmpeg must be installed\n",
+ "\n",
+ "custom_se = se_extractor.get_se('/home/epavel/my_base_voice.m4a', tone_color_converter, target_dir='processed', vad=True)[0] ## ffmpeg must be installed"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Inference"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "There are pre and post processings that are not traceable and could not be offloaded to OpenVINO, instead of writing such processing ourselves we will rely on the already existing ones. We just replace infer and voice conversion functions of OpenVoiceBaseClass so that the the most computationally expensive part is done in OpenVINO."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_pathched_infer(ov_model: ov.Model, device_name: str = device.value) -> callable:\n",
+ " compiled_model = core.compile_model(ov_model, device_name)\n",
+ " \n",
+ " def infer_impl(x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., sdp_ratio=0.2, max_len=None):\n",
+ " # todo: assert that other params match to compiled ones\n",
+ " ov_output = compiled_model((x, x_lengths, sid))\n",
+ " return (torch.tensor(ov_output[0]), )\n",
+ " return infer_impl\n",
+ "\n",
+ "def get_patched_voice_conversion(ov_model: ov.Model, device_name: str = device.value) -> callable:\n",
+ " compiled_model = core.compile_model(ov_model, device_name)\n",
+ "\n",
+ " def voice_conversion_impl(y, y_lengths, sid_src, sid_tgt, tau=1.0):\n",
+ " # todo: assert that tau matches to compiled ones\n",
+ " ov_output = compiled_model((y, y_lengths, sid_src, sid_tgt))\n",
+ " return (torch.tensor(ov_output[0]), )\n",
+ " return voice_conversion_impl\n",
+ "\n",
+ "en_base_speaker_tts.model.infer = get_pathched_infer(ov_tts)\n",
+ "tone_color_converter.model.voice_conversion = get_patched_voice_conversion(ov_voice_conversion)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "article = \"\"\"\n",
+ "The work chronicles the Napoleonic era within Russia, \n",
+ "notably detailing the French invasion of Russia and its aftermath. \n",
+ "The book highlights the impact of Napoleon on Tsarist society \n",
+ "through five interlocking narratives following different \n",
+ "Russian aristocratic families. Portions of an earlier version, \n",
+ "titled The Year 1805, were serialized in The Russian Messenger \n",
+ "from 1865 to 1867 before the novel was published in its entirety in 1869.\n",
+ "Tolstoy said that the best Russian literature does not conform \n",
+ "to standards and hence hesitated to classify War and Peace, \n",
+ "saying it is \"not a novel, even less is it a poem, \n",
+ "and still less a historical chronicle\". Large sections, \n",
+ "especially the later chapters, are philosophical discussions rather \n",
+ "than narrative. He regarded Anna Karenina as his first true novel.\n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Text splitted to sentences.\n",
+ "The work chronicles the Napoleonic era within Russia, notably detailing the French invasion of Russia and its aftermath.\n",
+ "The book highlights the impact of Napoleon on Tsarist society through five interlocking narratives following different Russian aristocratic families.\n",
+ "Portions of an earlier version, titled The Year 1805, were serialized in The Russian Messenger from 1865 to 1867 before the novel was published in its entirety in 1869.\n",
+ "Tolstoy said that the best Russian literature does not conform to standards and hence hesitated to classify War and Peace,\n",
+ "saying it is not a novel, even less is it a poem,\n",
+ "and still less a historical chronicle. Large sections, especially the later chapters,\n",
+ "are philosophical discussions rather than narrative. He regarded Anna Karenina as his first true novel.\n",
+ " > ===========================\n"
+ ]
+ }
+ ],
+ "source": [
+ "lines = en_base_speaker_tts.split_sentences_into_pieces(article, 'EN')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pt_device = 'cpu'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino\n"
+ ]
+ }
+ ],
+ "source": [
+ "import nncf\n",
+ "import re\n",
+ "mark = 'EN'\n",
+ "speaker='default'\n",
+ "\n",
+ "def transform_fn(line: str):\n",
+ " t = re.sub(r'([a-z])([A-Z])', r'\\1 \\2', line)\n",
+ " t = f'[{mark}]{t}[{mark}]'\n",
+ " stn_tst = en_base_speaker_tts.get_text(t, en_base_speaker_tts.hps, False)\n",
+ " speaker_id = en_base_speaker_tts.hps.speakers[speaker]\n",
+ " x_tst = stn_tst.unsqueeze(0).to(pt_device)\n",
+ " x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(pt_device)\n",
+ " sid = torch.LongTensor([speaker_id]).to(pt_device)\n",
+ " return (x_tst, x_tst_lengths, sid)\n",
+ "\n",
+ "calibration_dataset = nncf.Dataset(lines, transform_fn)"
+ ]
+ },
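+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "NNCF collects calibration statistics from the samples produced by `transform_fn`, so every sample must follow the same `(x, x_lengths, sid)` layout as the inputs of the converted TTS model. The cell below is an optional check of that assumption on the first sentence."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# each calibration sample should be a (x_tst, x_tst_lengths, sid) tuple of tensors\n",
+ "sample = transform_fn(lines[0])\n",
+ "print([tuple(t.shape) for t in sample])"
+ ]
+ },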
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ignored_scope = nncf.IgnoredScope(\n",
+ " types=[\"Multiply\", \"Subtract\", \"Sigmoid\"], # ignore operations\n",
+ ")\n",
+ "\n",
+ "# quantized_model = nncf.quantize(ov_tts, calibration_dataset, ignored_scope=ignored_scope)\n",
+ "# ov.save_model(quantized_model, '../openvino_irs/openvoice_tts_quantized.xml')\n",
+ "\n",
+ "quantized_model = ov.Core().read_model('../openvino_irs/openvoice_tts_quantized.xml')"
+ ]
+ },
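+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a rough indicator of the effect of quantization, we can compare the on-disk sizes of the original and the INT8 TTS IRs. This is only a sketch and assumes that both models (their `.bin` weight files) have been saved under `IRS_PATH`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pathlib import Path\n",
+ "\n",
+ "# weights of an OpenVINO IR are stored in the .bin file next to the .xml\n",
+ "fp_bin = Path(TTS_IR).with_suffix('.bin')\n",
+ "int8_bin = Path('../openvino_irs/openvoice_tts_quantized.xml').with_suffix('.bin')\n",
+ "\n",
+ "if fp_bin.exists() and int8_bin.exists():\n",
+ "    print(f'Original IR size:  {fp_bin.stat().st_size / 2**20:.2f} MB')\n",
+ "    print(f'Quantized IR size: {int8_bin.stat().st_size / 2**20:.2f} MB')\n",
+ "    print(f'Compression ratio: {fp_bin.stat().st_size / int8_bin.stat().st_size:.2f}x')"
+ ]
+ },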
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "en_base_speaker_tts.model.infer = get_pathched_infer(quantized_model)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'/home/epavel/devel/openvino/tools/mo/openvino/__init__.py'"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ov.__file__"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Text splitted to sentences.\n",
+ "OpenVINO toolkit is a comprehensive toolkit for quickly developing applications and solutions that solve a variety of tasks including emulation of human vision,\n",
+ "automatic speech recognition, natural language processing, recommendation systems, and many others.\n",
+ " > ===========================\n",
+ "ˈoʊpən vino* toolkit* ɪz ə ˌkɑmpɹiˈhɛnsɪv toolkit* fəɹ kˈwɪkli dɪˈvɛləpɪŋ ˌæpləˈkeɪʃənz ənd səˈluʃənz ðət sɑɫv ə vəɹˈaɪəti əv tæsks ˌɪnˈkludɪŋ ˌɛmjəˈleɪʃən əv ˈjumən ˈvɪʒən,\n",
+ " length:173\n",
+ " length:173\n"
+ ]
+ },
+ {
+ "ename": "RuntimeError",
+ "evalue": "Exception from src/inference/src/infer_request.cpp:233:\nException from src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc.h:89:\nParameterMismatch: Can not clone with new dims. Descriptor's shape: {0 - ?, 192, 1} is incompatible with provided dimensions: {1, 192, 347}.\n\n",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[26], line 10\u001b[0m\n\u001b[1;32m 3\u001b[0m text \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;124mOpenVINO toolkit is a comprehensive toolkit for quickly developing applications and solutions that solve \u001b[39m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;124ma variety of tasks including emulation of human vision, automatic speech recognition, natural language processing, \u001b[39m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;124mrecommendation systems, and many others.\u001b[39m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[1;32m 9\u001b[0m src_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00moutput_dir\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/tmp.wav\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m---> 10\u001b[0m \u001b[43men_base_speaker_tts\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtts\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msrc_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mspeaker\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mdefault\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlanguage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mEnglish\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mspeed\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1.0\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;66;03m# src_path = '/home/epavel/my_base_voice.m4a'\u001b[39;00m\n\u001b[1;32m 13\u001b[0m tone_color_converter\u001b[38;5;241m.\u001b[39mconvert(\n\u001b[1;32m 14\u001b[0m audio_src_path\u001b[38;5;241m=\u001b[39msrc_path, \n\u001b[1;32m 15\u001b[0m src_se\u001b[38;5;241m=\u001b[39men_source_default_se, \n\u001b[1;32m 16\u001b[0m tgt_se\u001b[38;5;241m=\u001b[39mtarget_se, \n\u001b[1;32m 17\u001b[0m output_path\u001b[38;5;241m=\u001b[39msave_path,\n\u001b[1;32m 18\u001b[0m message\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m@MyShell\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+ "File \u001b[0;32m~/devel/openvino_notebooks/notebooks/280-openvoice/OpenVoice/api.py:90\u001b[0m, in \u001b[0;36mBaseSpeakerTTS.tts\u001b[0;34m(self, text, output_path, speaker, language, speed)\u001b[0m\n\u001b[1;32m 88\u001b[0m x_tst_lengths \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mLongTensor([stn_tst\u001b[38;5;241m.\u001b[39msize(\u001b[38;5;241m0\u001b[39m)])\u001b[38;5;241m.\u001b[39mto(device)\n\u001b[1;32m 89\u001b[0m sid \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mLongTensor([speaker_id])\u001b[38;5;241m.\u001b[39mto(device)\n\u001b[0;32m---> 90\u001b[0m audio \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minfer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx_tst\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mx_tst_lengths\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msid\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnoise_scale\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.667\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnoise_scale_w\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.6\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 91\u001b[0m \u001b[43m \u001b[49m\u001b[43mlength_scale\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1.0\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m/\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mspeed\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;241m0\u001b[39m][\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mdata\u001b[38;5;241m.\u001b[39mcpu()\u001b[38;5;241m.\u001b[39mfloat()\u001b[38;5;241m.\u001b[39mnumpy()\n\u001b[1;32m 92\u001b[0m audio_list\u001b[38;5;241m.\u001b[39mappend(audio)\n\u001b[1;32m 93\u001b[0m audio \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maudio_numpy_concat(audio_list, sr\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhps\u001b[38;5;241m.\u001b[39mdata\u001b[38;5;241m.\u001b[39msampling_rate, speed\u001b[38;5;241m=\u001b[39mspeed)\n",
+ "Cell \u001b[0;32mIn[15], line 6\u001b[0m, in \u001b[0;36mget_pathched_infer..infer_impl\u001b[0;34m(x, x_lengths, sid, noise_scale, length_scale, noise_scale_w, sdp_ratio, max_len)\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minfer_impl\u001b[39m(x, x_lengths, sid\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, noise_scale\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, length_scale\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, noise_scale_w\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1.\u001b[39m, sdp_ratio\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.2\u001b[39m, max_len\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# todo: assert that other params match to compiled ones\u001b[39;00m\n\u001b[0;32m----> 6\u001b[0m ov_output \u001b[38;5;241m=\u001b[39m \u001b[43mcompiled_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mx_lengths\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msid\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39mtensor(ov_output[\u001b[38;5;241m0\u001b[39m]), )\n",
+ "File \u001b[0;32m~/devel/openvino/bin/intel64/Release/python/openvino/runtime/ie_api.py:365\u001b[0m, in \u001b[0;36mCompiledModel.__call__\u001b[0;34m(self, inputs, share_inputs, share_outputs, decode_strings)\u001b[0m\n\u001b[1;32m 362\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_infer_request \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 363\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_infer_request \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcreate_infer_request()\n\u001b[0;32m--> 365\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_infer_request\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minfer\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 366\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 367\u001b[0m \u001b[43m \u001b[49m\u001b[43mshare_inputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mshare_inputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 368\u001b[0m \u001b[43m \u001b[49m\u001b[43mshare_outputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mshare_outputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 369\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecode_strings\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecode_strings\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 370\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m~/devel/openvino/bin/intel64/Release/python/openvino/runtime/ie_api.py:132\u001b[0m, in \u001b[0;36mInferRequest.infer\u001b[0;34m(self, inputs, share_inputs, share_outputs, decode_strings)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minfer\u001b[39m(\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 57\u001b[0m inputs: Any \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 61\u001b[0m decode_strings: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 62\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m OVDict:\n\u001b[1;32m 63\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Infers specified input(s) in synchronous mode.\u001b[39;00m\n\u001b[1;32m 64\u001b[0m \n\u001b[1;32m 65\u001b[0m \u001b[38;5;124;03m Blocks all methods of InferRequest while request is running.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[38;5;124;03m :rtype: OVDict\u001b[39;00m\n\u001b[1;32m 131\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 132\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m OVDict(\u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minfer\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_data_dispatch\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 133\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 134\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 135\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_shared\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mshare_inputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 136\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mshare_outputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mshare_outputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_strings\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecode_strings\u001b[49m\u001b[43m)\u001b[49m)\n",
+ "\u001b[0;31mRuntimeError\u001b[0m: Exception from src/inference/src/infer_request.cpp:233:\nException from src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc.h:89:\nParameterMismatch: Can not clone with new dims. Descriptor's shape: {0 - ?, 192, 1} is incompatible with provided dimensions: {1, 192, 347}.\n\n"
+ ]
+ }
+ ],
+ "source": [
+ "save_path = f'{output_dir}/output_en_default.wav'\n",
+ "\n",
+ "text = \"\"\"\n",
+ "OpenVINO toolkit is a comprehensive toolkit for quickly developing applications and solutions that solve \n",
+ "a variety of tasks including emulation of human vision, automatic speech recognition, natural language processing, \n",
+ "recommendation systems, and many others.\n",
+ "\"\"\"\n",
+ "\n",
+ "src_path = f'{output_dir}/tmp.wav'\n",
+ "en_base_speaker_tts.tts(text, src_path, speaker='default', language='English', speed=1.0)\n",
+ "# src_path = '/home/epavel/my_base_voice.m4a'\n",
+ "\n",
+ "tone_color_converter.convert(\n",
+ " audio_src_path=src_path, \n",
+ " src_se=en_source_default_se, \n",
+ " tgt_se=target_se, \n",
+ " output_path=save_path,\n",
+ " message=\"@MyShell\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "Audio(src_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "Audio(save_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Run OpenVoice Gradio online app\n",
+ "We can also use [Gradio](https://www.gradio.app/) app to run TTS and voice tone conversion online."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages/gradio/components/dropdown.py:90: UserWarning: The `max_choices` parameter is ignored when `multiselect` is False.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Running on local URL: http://0.0.0.0:7860\n",
+ "\n",
+ "To create a public link, set `share=True` in `launch()`.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ ""
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": []
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from openvoice_gradio import get_demo\n",
+ "\n",
+ "demo = get_demo(output_dir, color_convert_model, en_tts_model, zh_tts_model, en_source_default_se, en_source_style_se, zh_source_se)\n",
+ "demo.queue(max_size=2)\n",
+ "demo.launch(server_name=\"0.0.0.0\", server_port=7860)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Closing server running on port: 7860\n"
+ ]
+ }
+ ],
+ "source": [
+ "# please run this cell for stopping gradio interface\n",
+ "demo.close()\n",
+ "\n",
+ "# clean up \n",
+ "# import shutil\n",
+ "# shutil.rmtree(CKPT_BASE_PATH)\n",
+ "# shutil.rmtree(IRS_PATH)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/284-openvoice/284-openvoice.ipynb b/notebooks/284-openvoice/284-openvoice.ipynb
new file mode 100644
index 00000000000..def99340b5b
--- /dev/null
+++ b/notebooks/284-openvoice/284-openvoice.ipynb
@@ -0,0 +1,863 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Introduction"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "OpenVoice is a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.\n",
+ "\n",
+ "This notebooks provides example of converting original OpenVoice model (https://github.com/myshell-ai/OpenVoice) to OpenVINO IR format for faster inference."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "clone the repository and install all dependencies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/epavel/devel/openvino_notebooks/notebooks/280-openvoice/OpenVoice\n"
+ ]
+ }
+ ],
+ "source": [
+ "from pathlib import Path\n",
+ "\n",
+ "repo_dir = Path(\"OpenVoice\")\n",
+ "\n",
+ "if not repo_dir.exists():\n",
+ " !git clone https://github.com/myshell-ai/OpenVoice\n",
+ "\n",
+ "# cd to the original repo to save original data paths and imports\n",
+ "%cd $repo_dir"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Requirement already satisfied: openvino>=2023.3 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (2023.3.0)\n",
+ "Requirement already satisfied: librosa>=0.9.1 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (0.9.1)\n",
+ "Requirement already satisfied: wavmark>=0.0.3 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (0.0.3)\n",
+ "Requirement already satisfied: faster-whisper>=0.9.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (0.9.0)\n",
+ "Requirement already satisfied: pydub>=0.25.1 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (0.25.1)\n",
+ "Requirement already satisfied: whisper-timestamped>=1.14.2 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (1.14.2)\n",
+ "Requirement already satisfied: tqdm in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (4.66.1)\n",
+ "Requirement already satisfied: inflect>=7.0.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (7.0.0)\n",
+ "Requirement already satisfied: unidecode>=1.3.7 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (1.3.7)\n",
+ "Requirement already satisfied: eng_to_ipa>=0.0.2 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (0.0.2)\n",
+ "Requirement already satisfied: pypinyin>=0.50.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (0.50.0)\n",
+ "Requirement already satisfied: cn2an>=0.5.22 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (0.5.22)\n",
+ "Requirement already satisfied: jieba>=0.42.1 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (0.42.1)\n",
+ "Requirement already satisfied: langid>=1.1.6 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (1.1.6)\n",
+ "Requirement already satisfied: gradio>=4.15 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (4.15.0)\n",
+ "Requirement already satisfied: ipywebrtc in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (0.6.0)\n",
+ "Requirement already satisfied: numpy>=1.16.6 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from openvino>=2023.3) (1.26.3)\n",
+ "Requirement already satisfied: openvino-telemetry>=2023.2.1 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from openvino>=2023.3) (2023.2.1)\n",
+ "Requirement already satisfied: audioread>=2.1.5 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from librosa>=0.9.1) (3.0.1)\n",
+ "Requirement already satisfied: scipy>=1.2.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from librosa>=0.9.1) (1.11.4)\n",
+ "Requirement already satisfied: scikit-learn>=0.19.1 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from librosa>=0.9.1) (1.4.0)\n",
+ "Requirement already satisfied: joblib>=0.14 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from librosa>=0.9.1) (1.3.2)\n",
+ "Requirement already satisfied: decorator>=4.0.10 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from librosa>=0.9.1) (5.1.1)\n",
+ "Requirement already satisfied: resampy>=0.2.2 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from librosa>=0.9.1) (0.4.2)\n",
+ "Requirement already satisfied: numba>=0.45.1 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from librosa>=0.9.1) (0.58.1)\n",
+ "Requirement already satisfied: soundfile>=0.10.2 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from librosa>=0.9.1) (0.12.1)\n",
+ "Requirement already satisfied: pooch>=1.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from librosa>=0.9.1) (1.8.0)\n",
+ "Requirement already satisfied: packaging>=20.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from librosa>=0.9.1) (23.2)\n",
+ "Requirement already satisfied: huggingface-hub in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from wavmark>=0.0.3) (0.20.3)\n",
+ "Requirement already satisfied: torch in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from wavmark>=0.0.3) (1.13.1)\n",
+ "Requirement already satisfied: torchaudio in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from wavmark>=0.0.3) (0.13.1)\n",
+ "Requirement already satisfied: av==10.* in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from faster-whisper>=0.9.0) (10.0.0)\n",
+ "Requirement already satisfied: ctranslate2<4,>=3.17 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from faster-whisper>=0.9.0) (3.24.0)\n",
+ "Requirement already satisfied: tokenizers<0.15,>=0.13 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from faster-whisper>=0.9.0) (0.13.3)\n",
+ "Requirement already satisfied: onnxruntime<2,>=1.14 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from faster-whisper>=0.9.0) (1.16.3)\n",
+ "Requirement already satisfied: Cython in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from whisper-timestamped>=1.14.2) (3.0.8)\n",
+ "Requirement already satisfied: dtw-python in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from whisper-timestamped>=1.14.2) (1.3.1)\n",
+ "Requirement already satisfied: openai-whisper in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from whisper-timestamped>=1.14.2) (20231117)\n",
+ "Requirement already satisfied: pydantic>=1.9.1 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from inflect>=7.0.0) (2.5.3)\n",
+ "Requirement already satisfied: typing-extensions in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from inflect>=7.0.0) (4.9.0)\n",
+ "Requirement already satisfied: setuptools>=47.3.1 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from cn2an>=0.5.22) (69.0.3)\n",
+ "Requirement already satisfied: proces>=0.1.3 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from cn2an>=0.5.22) (0.1.7)\n",
+ "Requirement already satisfied: aiofiles<24.0,>=22.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from gradio>=4.15) (23.2.1)\n",
+ "Requirement already satisfied: altair<6.0,>=4.2.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from gradio>=4.15) (5.2.0)\n",
+ "Requirement already satisfied: fastapi in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from gradio>=4.15) (0.109.0)\n",
+ "Requirement already satisfied: ffmpy in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from gradio>=4.15) (0.3.1)\n",
+ "Requirement already satisfied: gradio-client==0.8.1 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from gradio>=4.15) (0.8.1)\n",
+ "Requirement already satisfied: httpx in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from gradio>=4.15) (0.26.0)\n",
+ "Requirement already satisfied: importlib-resources<7.0,>=1.3 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from gradio>=4.15) (6.1.1)\n",
+ "Requirement already satisfied: jinja2<4.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from gradio>=4.15) (3.1.3)\n",
+ "Requirement already satisfied: markupsafe~=2.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from gradio>=4.15) (2.1.3)\n",
+ "Requirement already satisfied: matplotlib~=3.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from gradio>=4.15) (3.8.2)\n",
+ "Requirement already satisfied: orjson~=3.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from gradio>=4.15) (3.9.12)\n",
+ "Requirement already satisfied: pandas<3.0,>=1.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from gradio>=4.15) (2.1.4)\n",
+ "Requirement already satisfied: pillow<11.0,>=8.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from gradio>=4.15) (10.2.0)\n",
+ "Requirement already satisfied: python-multipart in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from gradio>=4.15) (0.0.6)\n",
+ "Requirement already satisfied: pyyaml<7.0,>=5.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from gradio>=4.15) (6.0.1)\n",
+ "Requirement already satisfied: ruff>=0.1.7 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from gradio>=4.15) (0.1.14)\n",
+ "Requirement already satisfied: semantic-version~=2.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from gradio>=4.15) (2.10.0)\n",
+ "Requirement already satisfied: tomlkit==0.12.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from gradio>=4.15) (0.12.0)\n",
+ "Requirement already satisfied: typer<1.0,>=0.9 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from typer[all]<1.0,>=0.9->gradio>=4.15) (0.9.0)\n",
+ "Requirement already satisfied: uvicorn>=0.14.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from gradio>=4.15) (0.26.0)\n",
+ "Requirement already satisfied: fsspec in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from gradio-client==0.8.1->gradio>=4.15) (2023.12.2)\n",
+ "Requirement already satisfied: websockets<12.0,>=10.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from gradio-client==0.8.1->gradio>=4.15) (11.0.3)\n",
+ "Requirement already satisfied: jsonschema>=3.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from altair<6.0,>=4.2.0->gradio>=4.15) (4.21.1)\n",
+ "Requirement already satisfied: toolz in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from altair<6.0,>=4.2.0->gradio>=4.15) (0.12.0)\n",
+ "Requirement already satisfied: filelock in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from huggingface-hub->wavmark>=0.0.3) (3.13.1)\n",
+ "Requirement already satisfied: requests in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from huggingface-hub->wavmark>=0.0.3) (2.31.0)\n",
+ "Requirement already satisfied: contourpy>=1.0.1 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from matplotlib~=3.0->gradio>=4.15) (1.2.0)\n",
+ "Requirement already satisfied: cycler>=0.10 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from matplotlib~=3.0->gradio>=4.15) (0.12.1)\n",
+ "Requirement already satisfied: fonttools>=4.22.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from matplotlib~=3.0->gradio>=4.15) (4.47.2)\n",
+ "Requirement already satisfied: kiwisolver>=1.3.1 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from matplotlib~=3.0->gradio>=4.15) (1.4.5)\n",
+ "Requirement already satisfied: pyparsing>=2.3.1 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from matplotlib~=3.0->gradio>=4.15) (3.1.1)\n",
+ "Requirement already satisfied: python-dateutil>=2.7 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from matplotlib~=3.0->gradio>=4.15) (2.8.2)\n",
+ "Requirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from numba>=0.45.1->librosa>=0.9.1) (0.41.1)\n",
+ "Requirement already satisfied: coloredlogs in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from onnxruntime<2,>=1.14->faster-whisper>=0.9.0) (15.0.1)\n",
+ "Requirement already satisfied: flatbuffers in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from onnxruntime<2,>=1.14->faster-whisper>=0.9.0) (23.5.26)\n",
+ "Requirement already satisfied: protobuf in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from onnxruntime<2,>=1.14->faster-whisper>=0.9.0) (4.25.2)\n",
+ "Requirement already satisfied: sympy in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from onnxruntime<2,>=1.14->faster-whisper>=0.9.0) (1.12)\n",
+ "Requirement already satisfied: pytz>=2020.1 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from pandas<3.0,>=1.0->gradio>=4.15) (2023.3.post1)\n",
+ "Requirement already satisfied: tzdata>=2022.1 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from pandas<3.0,>=1.0->gradio>=4.15) (2023.4)\n",
+ "Requirement already satisfied: platformdirs>=2.5.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from pooch>=1.0->librosa>=0.9.1) (4.1.0)\n",
+ "Requirement already satisfied: annotated-types>=0.4.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from pydantic>=1.9.1->inflect>=7.0.0) (0.6.0)\n",
+ "Requirement already satisfied: pydantic-core==2.14.6 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from pydantic>=1.9.1->inflect>=7.0.0) (2.14.6)\n",
+ "Requirement already satisfied: threadpoolctl>=2.0.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from scikit-learn>=0.19.1->librosa>=0.9.1) (3.2.0)\n",
+ "Requirement already satisfied: cffi>=1.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from soundfile>=0.10.2->librosa>=0.9.1) (1.16.0)\n",
+ "Requirement already satisfied: click<9.0.0,>=7.1.1 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from typer<1.0,>=0.9->typer[all]<1.0,>=0.9->gradio>=4.15) (8.1.7)\n",
+ "Requirement already satisfied: colorama<0.5.0,>=0.4.3 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from typer[all]<1.0,>=0.9->gradio>=4.15) (0.4.6)\n",
+ "Requirement already satisfied: shellingham<2.0.0,>=1.3.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from typer[all]<1.0,>=0.9->gradio>=4.15) (1.5.4)\n",
+ "Requirement already satisfied: rich<14.0.0,>=10.11.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from typer[all]<1.0,>=0.9->gradio>=4.15) (13.7.0)\n",
+ "Requirement already satisfied: h11>=0.8 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from uvicorn>=0.14.0->gradio>=4.15) (0.14.0)\n",
+ "Requirement already satisfied: starlette<0.36.0,>=0.35.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from fastapi->gradio>=4.15) (0.35.1)\n",
+ "Requirement already satisfied: anyio in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from httpx->gradio>=4.15) (4.2.0)\n",
+ "Requirement already satisfied: certifi in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from httpx->gradio>=4.15) (2023.11.17)\n",
+ "Requirement already satisfied: httpcore==1.* in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from httpx->gradio>=4.15) (1.0.2)\n",
+ "Requirement already satisfied: idna in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from httpx->gradio>=4.15) (3.6)\n",
+ "Requirement already satisfied: sniffio in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from httpx->gradio>=4.15) (1.3.0)\n",
+ "Requirement already satisfied: triton<3,>=2.0.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from openai-whisper->whisper-timestamped>=1.14.2) (2.2.0)\n",
+ "Requirement already satisfied: more-itertools in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from openai-whisper->whisper-timestamped>=1.14.2) (10.2.0)\n",
+ "Requirement already satisfied: tiktoken in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from openai-whisper->whisper-timestamped>=1.14.2) (0.5.2)\n",
+ "Requirement already satisfied: nvidia-cuda-runtime-cu11==11.7.99 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from torch->wavmark>=0.0.3) (11.7.99)\n",
+ "Requirement already satisfied: nvidia-cudnn-cu11==8.5.0.96 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from torch->wavmark>=0.0.3) (8.5.0.96)\n",
+ "Requirement already satisfied: nvidia-cublas-cu11==11.10.3.66 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from torch->wavmark>=0.0.3) (11.10.3.66)\n",
+ "Requirement already satisfied: nvidia-cuda-nvrtc-cu11==11.7.99 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from torch->wavmark>=0.0.3) (11.7.99)\n",
+ "Requirement already satisfied: wheel in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from nvidia-cublas-cu11==11.10.3.66->torch->wavmark>=0.0.3) (0.42.0)\n",
+ "Requirement already satisfied: pycparser in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from cffi>=1.0->soundfile>=0.10.2->librosa>=0.9.1) (2.21)\n",
+ "Requirement already satisfied: attrs>=22.2.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio>=4.15) (23.2.0)\n",
+ "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio>=4.15) (2023.12.1)\n",
+ "Requirement already satisfied: referencing>=0.28.4 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio>=4.15) (0.32.1)\n",
+ "Requirement already satisfied: rpds-py>=0.7.1 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio>=4.15) (0.17.1)\n",
+ "Requirement already satisfied: six>=1.5 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib~=3.0->gradio>=4.15) (1.16.0)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from requests->huggingface-hub->wavmark>=0.0.3) (3.3.2)\n",
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from requests->huggingface-hub->wavmark>=0.0.3) (2.1.0)\n",
+ "Requirement already satisfied: markdown-it-py>=2.2.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from rich<14.0.0,>=10.11.0->typer[all]<1.0,>=0.9->gradio>=4.15) (2.2.0)\n",
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from rich<14.0.0,>=10.11.0->typer[all]<1.0,>=0.9->gradio>=4.15) (2.17.2)\n",
+ "Requirement already satisfied: exceptiongroup>=1.0.2 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from anyio->httpx->gradio>=4.15) (1.2.0)\n",
+ "Requirement already satisfied: humanfriendly>=9.1 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from coloredlogs->onnxruntime<2,>=1.14->faster-whisper>=0.9.0) (10.0)\n",
+ "Requirement already satisfied: mpmath>=0.19 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from sympy->onnxruntime<2,>=1.14->faster-whisper>=0.9.0) (1.3.0)\n",
+ "Requirement already satisfied: regex>=2022.1.18 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from tiktoken->openai-whisper->whisper-timestamped>=1.14.2) (2023.12.25)\n",
+ "Requirement already satisfied: mdurl~=0.1 in /home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages (from markdown-it-py>=2.2.0->rich<14.0.0,>=10.11.0->typer[all]<1.0,>=0.9->gradio>=4.15) (0.1.2)\n",
+ "Note: you may need to restart the kernel to use updated packages.\n"
+ ]
+ }
+ ],
+ "source": [
+ "%pip install \"openvino>=2023.3\" \\\n",
+ "\"librosa>=0.9.1\" \\\n",
+ "\"wavmark>=0.0.3\" \\\n",
+ "\"faster-whisper>=0.9.0\" \\\n",
+ "\"pydub>=0.25.1\" \\\n",
+ "\"whisper-timestamped>=1.14.2\" \\\n",
+ "\"tqdm\" \\\n",
+ "\"inflect>=7.0.0\" \\\n",
+ "\"unidecode>=1.3.7\" \\\n",
+ "\"eng_to_ipa>=0.0.2\" \\\n",
+ "\"pypinyin>=0.50.0\" \\\n",
+ "\"cn2an>=0.5.22\" \\\n",
+ "\"jieba>=0.42.1\" \\\n",
+ "\"langid>=1.1.6\" \\\n",
+ "\"gradio>=4.15\" \\\n",
+ "\"ipywebrtc\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Importing the dtw module. When using in academic works please cite:\n",
+ " T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.\n",
+ " J. Stat. Soft., doi:10.18637/jss.v031.i07.\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "import torch\n",
+ "import openvino as ov\n",
+ "core = ov.Core()\n",
+ "\n",
+ "from api import BaseSpeakerTTS, ToneColorConverter, OpenVoiceBaseClass\n",
+ "import se_extractor\n",
+ "from notebook_utils import download_file"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "download all resources from HF Hub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "base_url = 'https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/'\n",
+ "\n",
+ "CKPT_BASE_PATH = '../checkpoints/'\n",
+ "en_ckpt_path = f'{CKPT_BASE_PATH}/base_speakers/EN/'\n",
+ "zh_ckpt_path = f'{CKPT_BASE_PATH}/base_speakers/ZH/'\n",
+ "converter_path = f'{CKPT_BASE_PATH}/converter/'\n",
+ "\n",
+ "enable_chineese_lang = True"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "'../checkpoints/converter/checkpoint.pth' already exists.\n",
+ "'../checkpoints/converter/config.json' already exists.\n",
+ "'../checkpoints/base_speakers/EN/checkpoint.pth' already exists.\n",
+ "'../checkpoints/base_speakers/EN/config.json' already exists.\n",
+ "'../checkpoints/base_speakers/ZH/checkpoint.pth' already exists.\n",
+ "'../checkpoints/base_speakers/ZH/config.json' already exists.\n",
+ "'../checkpoints/base_speakers/EN/en_default_se.pth' already exists.\n",
+ "'../checkpoints/base_speakers/EN/en_style_se.pth' already exists.\n",
+ "'../checkpoints/base_speakers/ZH/zh_default_se.pth' already exists.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "PosixPath('/home/epavel/devel/openvino_notebooks/notebooks/280-openvoice/checkpoints/base_speakers/ZH/zh_default_se.pth')"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "download_file(base_url + 'converter/checkpoint.pth', directory=converter_path)\n",
+ "download_file(base_url + 'converter/config.json', directory=converter_path)\n",
+ "\n",
+ "download_file(base_url + 'base_speakers/EN/checkpoint.pth', directory=en_ckpt_path)\n",
+ "download_file(base_url + 'base_speakers/EN/config.json', directory=en_ckpt_path)\n",
+ "\n",
+ "if enable_chinese_lang:\n",
+ " download_file(base_url + 'base_speakers/ZH/checkpoint.pth', directory=zh_ckpt_path)\n",
+ " download_file(base_url + 'base_speakers/ZH/config.json', directory=zh_ckpt_path)\n",
+ "\n",
+ "download_file(base_url + 'base_speakers/EN/en_default_se.pth', directory=en_ckpt_path)\n",
+ "download_file(base_url + 'base_speakers/EN/en_style_se.pth', directory=en_ckpt_path)\n",
+ "download_file(base_url + 'base_speakers/ZH/zh_default_se.pth', directory=zh_ckpt_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loaded checkpoint '../checkpoints//base_speakers/EN//checkpoint.pth'\n",
+ "missing/unexpected keys: [] []\n",
+ "Loaded checkpoint '../checkpoints//base_speakers/ZH//checkpoint.pth'\n",
+ "missing/unexpected keys: [] []\n",
+ "Loaded checkpoint '../checkpoints//converter//checkpoint.pth'\n",
+ "missing/unexpected keys: [] []\n"
+ ]
+ }
+ ],
+ "source": [
+ "pt_device = \"cpu\"\n",
+ "\n",
+ "en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_path}/config.json', device=pt_device)\n",
+ "en_base_speaker_tts.load_ckpt(f'{en_ckpt_path}/checkpoint.pth')\n",
+ "\n",
+ "if enable_chinese_lang:\n",
+ " zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_path}/config.json', device=pt_device)\n",
+ " zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_path}/checkpoint.pth')\n",
+ "\n",
+ "tone_color_converter = ToneColorConverter(f'{converter_path}/config.json', device=pt_device)\n",
+ "tone_color_converter.load_ckpt(f'{converter_path}/checkpoint.pth')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Convert models to OpenVINO"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To convert a model to OpenVINO IR format, we first need a suitable PyTorch `nn.Module` object.\n",
+ "\n",
+ "Instead of `self.forward`, `BaseSpeakerTTS` and `ToneColorConverter` use the custom methods `infer` and `voice_conversion` as their main entry points, so we wrap them in small classes inherited from `torch.nn.Module` that expose these methods through `forward`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tts_kwargs = dict(noise_scale=0.667, noise_scale_w=0.6, length_scale=1.0, sdp_ratio=0.2)\n",
+ "voice_convert_kwargs = dict(tau=0.3)\n",
+ "\n",
+ "class OVOpenVoiceBase(torch.nn.Module):\n",
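+ "    # Base wrapper: stores the OpenVoice model and freezes its weights so it can be traced\n",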
+ " def __init__(self, voice_model: OpenVoiceBaseClass, **kwargs):\n",
+ " super().__init__()\n",
+ " self.voice_model = voice_model\n",
+ " self.default_kwargs = kwargs\n",
+ " for par in voice_model.model.parameters():\n",
+ " par.requires_grad = False\n",
+ " \n",
+ "class OVOpenVoiceTTS(OVOpenVoiceBase):\n",
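+ "    # Traces BaseSpeakerTTS: forward() calls the underlying model's infer() with the kwargs fixed at construction time\n",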
+ " def get_example_input(self):\n",
+ " stn_tst = self.voice_model.get_text('this is original text', self.voice_model.hps, False)\n",
+ " x_tst = stn_tst.unsqueeze(0)\n",
+ " x_tst_lengths = torch.LongTensor([stn_tst.size(0)])\n",
+ " speaker_id = torch.LongTensor([1])\n",
+ " return (x_tst, x_tst_lengths, speaker_id)\n",
+ "\n",
+ " def forward(self, x, x_lengths, sid):\n",
+ " return self.voice_model.model.infer(x, x_lengths, sid, **self.default_kwargs)\n",
+ " \n",
+ "class OVOpenVoiceConverter(OVOpenVoiceBase):\n",
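+ "    # Traces the tone color converter: forward() calls the underlying model's voice_conversion()\n",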
+ " def get_example_input(self):\n",
+ " y = torch.randn([1, 513, 238], dtype=torch.float32)\n",
+ " y_lengths = torch.LongTensor([y.size(-1)])\n",
+ "        target_se = torch.randn(1, 256, 1)\n",
+ "        source_se = torch.randn(1, 256, 1)\n",
+ " return (y, y_lengths, source_se, target_se)\n",
+ " \n",
+ " def forward(self, y, y_lengths, sid_src, sid_tgt):\n",
+ " return self.voice_model.model.voice_conversion(y, y_lengths, sid_src, sid_tgt, **self.default_kwargs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Convert the models to OpenVINO IR and save them to the `IRS_PATH` folder for future use. If the IRs already exist, skip conversion and read them directly."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "IRS_PATH = '../openvino_irs'\n",
+ "EN_TTS_IR = f'{IRS_PATH}/openvoice_en_tts.xml'\n",
+ "ZH_TTS_IR = f'{IRS_PATH}/openvoice_zh_tts.xml'\n",
+ "VOICE_CONVERTER_IR = f'{IRS_PATH}/openvoice_tone_conversion.xml'\n",
+ "os.makedirs(IRS_PATH, exist_ok=True)\n",
+ "\n",
+ "paths = [EN_TTS_IR, VOICE_CONVERTER_IR]\n",
+ "models = [OVOpenVoiceTTS(en_base_speaker_tts, **tts_kwargs), OVOpenVoiceConverter(tone_color_converter, **voice_convert_kwargs)]\n",
+ "if enable_chinese_lang:\n",
+ "    models.append(OVOpenVoiceTTS(zh_base_speaker_tts, **tts_kwargs))\n",
+ "    paths.append(ZH_TTS_IR)\n",
+ "\n",
+ "ov_models = []\n",
+ "for model, path in zip(models, paths):\n",
+ "    # convert only once: reuse the saved IR on subsequent runs\n",
+ "    if not os.path.exists(path):\n",
+ "        ov_model = ov.convert_model(model, example_input=model.get_example_input())\n",
+ "        ov.save_model(ov_model, path)\n",
+ "    else:\n",
+ "        ov_model = core.read_model(path)\n",
+ "    ov_models.append(ov_model)\n",
+ "\n",
+ "ov_en_tts, ov_voice_conversion = ov_models[:2]\n",
+ "if enable_chinese_lang:\n",
+ "    ov_zh_tts = ov_models[-1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "e1dc6e5d28004123adc33206338a51a5",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Dropdown(description='Device:', index=1, options=('CPU', 'GPU', 'AUTO'), value='GPU')"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import ipywidgets as widgets\n",
+ "\n",
+ "core = ov.Core()\n",
+ "device = widgets.Dropdown(\n",
+ " options=core.available_devices + [\"AUTO\"],\n",
+ "    value='AUTO',\n",
+ " description='Device:',\n",
+ " disabled=False,\n",
+ ")\n",
+ "device"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "First of all, select the reference tone of voice to which the generated text will be converted: you can select from the existing examples or record your own by selecting 'record_manually'."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "0ce97f9b4d924d5d89174cc69a936cd8",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Dropdown(description='reference voice from which tone color will be copied', options=('resources/example_refer…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "reference_speakers = [\n",
+ " 'resources/example_reference.mp3',\n",
+ " 'resources/demo_speaker0.mp3',\n",
+ " 'resources/demo_speaker1.mp3',\n",
+ " 'resources/demo_speaker2.mp3',\n",
+ " 'record_manually',\n",
+ "]\n",
+ "\n",
+ "ref_speaker = widgets.Dropdown(\n",
+ " options=reference_speakers,\n",
+ " value=reference_speakers[0],\n",
+ " description=\"reference voice from which tone color will be copied\",\n",
+ " disabled=False,\n",
+ ")\n",
+ "\n",
+ "display(ref_speaker)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "output_dir = '../outputs/'\n",
+ "os.makedirs(output_dir, exist_ok=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ref_speaker_path = ref_speaker.value\n",
+ "\n",
+ "if ref_speaker.value == 'record_manually':\n",
+ " ref_speaker_path = f'{output_dir}/custom_example_sample.webm'\n",
+ " from ipywebrtc import AudioRecorder, CameraStream\n",
+ " camera = CameraStream(constraints={'audio': True,'video':False})\n",
+ " recorder = AudioRecorder(stream=camera, filename=ref_speaker_path, autosave=True)\n",
+ " display(recorder)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from IPython.display import Audio\n",
+ "Audio(ref_speaker_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages/librosa/util/decorators.py:88: UserWarning: PySoundFile failed. Trying audioread instead.\n",
+ " return f(*args, **kwargs)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# load speaker embeddings\n",
+ "en_source_default_se = torch.load(f'{en_ckpt_path}/en_default_se.pth')\n",
+ "en_source_style_se = torch.load(f'{en_ckpt_path}/en_style_se.pth')\n",
+ "zh_source_se = torch.load(f'{zh_ckpt_path}/zh_default_se.pth')\n",
+ "\n",
+ "target_se, audio_name = se_extractor.get_se(ref_speaker_path, tone_color_converter, target_dir='processed', vad=True)  # ffmpeg must be installed"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Inference"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "There are pre- and post-processing steps that are not traceable and cannot be offloaded to OpenVINO. Instead of reimplementing them ourselves, we rely on the existing ones and only replace the `infer` and `voice_conversion` methods of `OpenVoiceBaseClass`, so that the most computationally expensive part is executed in OpenVINO."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_patched_infer(ov_model: ov.Model, device_name: str = device.value) -> callable:\n",
+ "    compiled_model = core.compile_model(ov_model, device_name)\n",
+ "\n",
+ "    def infer_impl(x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., sdp_ratio=0.2, max_len=None):\n",
+ "        # note: generation parameters (noise_scale, length_scale, ...) were fixed at conversion time via tts_kwargs and are ignored here\n",
+ "        ov_output = compiled_model((x, x_lengths, sid))\n",
+ "        return (torch.tensor(ov_output[0]), )\n",
+ "    return infer_impl\n",
+ "\n",
+ "def get_patched_voice_conversion(ov_model: ov.Model, device_name: str = device.value) -> callable:\n",
+ "    compiled_model = core.compile_model(ov_model, device_name)\n",
+ "\n",
+ "    def voice_conversion_impl(y, y_lengths, sid_src, sid_tgt, tau=1.0):\n",
+ "        # note: tau was fixed at conversion time via voice_convert_kwargs and is ignored here\n",
+ "        ov_output = compiled_model((y, y_lengths, sid_src, sid_tgt))\n",
+ "        return (torch.tensor(ov_output[0]), )\n",
+ "    return voice_conversion_impl\n",
+ "\n",
+ "en_base_speaker_tts.model.infer = get_patched_infer(ov_en_tts)\n",
+ "tone_color_converter.model.voice_conversion = get_patched_voice_conversion(ov_voice_conversion)\n",
+ "if enable_chinese_lang:\n",
+ "    zh_base_speaker_tts.model.infer = get_patched_infer(ov_zh_tts)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Text splitted to sentences.\n",
+ "OpenVINO toolkit is a comprehensive toolkit for quickly developing applications and solutions that solve a variety of tasks including emulation of human vision,\n",
+ "automatic speech recognition, natural language processing, recommendation systems, and many others.\n",
+ " > ===========================\n",
+ "ˈoʊpən vino* toolkit* ɪz ə ˌkɑmpɹiˈhɛnsɪv toolkit* fəɹ kˈwɪkli dɪˈvɛləpɪŋ ˌæpləˈkeɪʃənz ənd səˈluʃənz ðət sɑɫv ə vəɹˈaɪəti əv tæsks ˌɪnˈkludɪŋ ˌɛmjəˈleɪʃən əv ˈjumən ˈvɪʒən,\n",
+ " length:173\n",
+ " length:173\n",
+ "ˌɔtəˈmætɪk spitʃ ˌɹɛkɪgˈnɪʃən, ˈnætʃəɹəɫ ˈlæŋgwɪdʒ ˈpɹɑsɛsɪŋ, ˌɹɛkəmənˈdeɪʃən ˈsɪstəmz, ənd ˈmɛni ˈəðəɹz.\n",
+ " length:105\n",
+ " length:105\n"
+ ]
+ }
+ ],
+ "source": [
+ "save_path = f'{output_dir}/output_en_default.wav'\n",
+ "\n",
+ "text = \"\"\"\n",
+ "OpenVINO toolkit is a comprehensive toolkit for quickly developing applications and solutions that solve \n",
+ "a variety of tasks including emulation of human vision, automatic speech recognition, natural language processing, \n",
+ "recommendation systems, and many others.\n",
+ "\"\"\"\n",
+ "\n",
+ "src_path = f'{output_dir}/tmp.wav'\n",
+ "en_base_speaker_tts.tts(text, src_path, speaker='default', language='English', speed=1.0)\n",
+ "\n",
+ "tone_color_converter.convert(\n",
+ " audio_src_path=src_path, \n",
+ " src_se=en_source_default_se, \n",
+ " tgt_se=target_se, \n",
+ " output_path=save_path, \n",
+ " message=\"@MyShell\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "Audio(src_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "Audio(save_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Run OpenVoice Gradio online app\n",
+ "We can also use [Gradio](https://www.gradio.app/) app to run TTS and voice tone conversion online."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages/gradio/components/dropdown.py:90: UserWarning: The `max_choices` parameter is ignored when `multiselect` is False.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Running on local URL: http://0.0.0.0:7860\n",
+ "\n",
+ "To create a public link, set `share=True` in `launch()`.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ ""
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": []
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from openvoice_gradio import get_demo\n",
+ "\n",
+ "demo = get_demo(output_dir, tone_color_converter, en_base_speaker_tts, zh_base_speaker_tts, en_source_default_se, en_source_style_se, zh_source_se)\n",
+ "demo.queue(max_size=2)\n",
+ "demo.launch(server_name=\"0.0.0.0\", server_port=7860)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Closing server running on port: 7860\n"
+ ]
+ }
+ ],
+ "source": [
+ "# please run this cell for stopping gradio interface\n",
+ "demo.close()\n",
+ "\n",
+ "# clean up \n",
+ "# import shutil\n",
+ "# shutil.rmtree(CKPT_BASE_PATH)\n",
+ "# shutil.rmtree(IRS_PATH)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/284-openvoice/README.md b/notebooks/284-openvoice/README.md
new file mode 100644
index 00000000000..5f7a44d542f
--- /dev/null
+++ b/notebooks/284-openvoice/README.md
@@ -0,0 +1,30 @@
+# Voice tone cloning with OpenVoice and OpenVINO
+
+[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/openvinotoolkit/openvino_notebooks/HEAD?filepath=notebooks%2F284-openvoice%2F284-openvoice.ipynb)
+[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/openvinotoolkit/openvino_notebooks/blob/main/notebooks/284-openvoice/284-openvoice.ipynb)
+
+
+![OpenVoice scheme](openvoice_scheme.png)
+
+[OpenVoice](https://github.com/myshell-ai/OpenVoice) is a versatile instant voice-cloning approach that needs only a brief audio snippet from the reference speaker to replicate its voice tone and generate speech in various languages. OpenVoice represents a significant advancement with three main features: (i) high-quality tone color replication across multiple languages and accents; (ii) fine-grained control over voice styles, including emotions and accents, as well as other parameters such as rhythm, pauses, and intonation; (iii) zero-shot cross-lingual voice cloning, which eliminates the need for the generated speech and the reference speech to be part of a massive-speaker multilingual training dataset.
+
+More details about the model can be found on the [project web page](https://research.myshell.ai/open-voice), in the [paper](https://arxiv.org/abs/2312.01479), and in the official [repository](https://github.com/myshell-ai/OpenVoice).
+
+In this tutorial, we explore how to convert and run OpenVoice using OpenVINO.
+
+## Notebook Contents
+
+This notebook demonstrates voice tone cloning with [OpenVoice](https://github.com/myshell-ai/OpenVoice) in OpenVINO.
+
+The tutorial consists of the following steps:
+- Install prerequisites
+- Load the PyTorch model
+- Convert the model to OpenVINO Intermediate Representation (IR) format (see the sketch below)
+- Run OpenVINO model inference on a single example
+- Launch the interactive demo
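+
+The conversion step boils down to wrapping the PyTorch model in a `torch.nn.Module`, tracing it with `ov.convert_model`, and compiling the resulting IR. A minimal sketch of this step (assuming the `OVOpenVoiceTTS` wrapper and the `en_base_speaker_tts` object defined in the notebook; the output path is illustrative):
+
+```python
+import openvino as ov
+
+core = ov.Core()
+
+# wrap the PyTorch model so that its infer() entry point is exposed through forward()
+wrapped_tts = OVOpenVoiceTTS(en_base_speaker_tts)
+
+# trace the wrapper with an example input and save the IR for later reuse
+ov_tts = ov.convert_model(wrapped_tts, example_input=wrapped_tts.get_example_input())
+ov.save_model(ov_tts, "openvino_irs/openvoice_en_tts.xml")
+
+# compile for the selected device ("CPU", "GPU" or "AUTO")
+compiled_tts = core.compile_model(ov_tts, "AUTO")
+```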
+
+## Installation Instructions
+
+This is a self-contained example that relies solely on its own code.
+We recommend running the notebook in a virtual environment. You only need a Jupyter server to start.
+For details, please refer to [Installation Guide](../../README.md).
diff --git a/notebooks/408-openvoice/openvoice_gradio.py b/notebooks/284-openvoice/openvoice_gradio.py
similarity index 100%
rename from notebooks/408-openvoice/openvoice_gradio.py
rename to notebooks/284-openvoice/openvoice_gradio.py
diff --git a/notebooks/284-openvoice/openvoice_scheme.png b/notebooks/284-openvoice/openvoice_scheme.png
new file mode 100644
index 00000000000..7f3c45eb5cb
Binary files /dev/null and b/notebooks/284-openvoice/openvoice_scheme.png differ
diff --git a/notebooks/408-openvoice/408-openvoice.ipynb b/notebooks/408-openvoice/408-openvoice.ipynb
deleted file mode 100644
index 563eba5cbd3..00000000000
--- a/notebooks/408-openvoice/408-openvoice.ipynb
+++ /dev/null
@@ -1,565 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Introduction"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "OpenVoice is a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.\n",
- "\n",
- "This notebooks provides example of converting original OpenVoice model (https://github.com/myshell-ai/OpenVoice) to OpenVINO IR format for faster inference."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "clone the repository and install all dependencies"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "!git clone https://github.com/myshell-ai/OpenVoice"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "%pip install openvino==2023.2\n",
- "%pip install -q --extra-index-url https://download.pytorch.org/whl/cpu \"torch>=2.1.0\" \"torchaudio>=2.1.0\"\n",
- "%pip install wavmark also installs torch\n",
- "\n",
- "# todo: try to unfreeze dependencies\n",
- "%pip install librosa==0.9.1 \\\n",
- "faster-whisper==0.9.0 \\\n",
- "pydub==0.25.1 \\\n",
- "whisper-timestamped==1.14.2 \\\n",
- "tqdm \\\n",
- "inflect==7.0.0 \\\n",
- "unidecode==1.3.7 \\\n",
- "eng_to_ipa==0.0.2 \\\n",
- "wavmark==0.0.2 \\\n",
- "pypinyin==0.50.0 \\\n",
- "cn2an==0.5.22 \\\n",
- "jieba==0.42.1 \\\n",
- "langid==1.1.6\n",
- "gradio==4.15 \\\n",
- "ipywebrtc \\\n",
- "ipywidgets \\"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "import os\n",
- "import torch\n",
- "from openvoice_utils import OVOpenVoiceTTS, OVOpenVoiceConvert"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "/home/epavel/devel/openvino_notebooks/notebooks/408-openvoice/OpenVoice\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages/IPython/core/magics/osm.py:417: UserWarning: using dhist requires you to install the `pickleshare` library.\n",
- " self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n"
- ]
- }
- ],
- "source": [
- "# cd to the original repo to save original data paths and imports\n",
- "%cd OpenVoice"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "download all resources from HF Hub"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "!mkdir -p checkpoints/converter/\n",
- "!mkdir -p checkpoints/base_speakers/EN/\n",
- "!mkdir -p checkpoints/base_speakers/ZH/\n",
- "\n",
- "!wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/checkpoint.pth -O checkpoints/converter/checkpoint.pth\n",
- "!wget https://huggingface.co/myshell-ai/OpenVoice/raw/main/checkpoints/converter/config.json -O checkpoints/converter/config.json\n",
- "\n",
- "!wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/EN/checkpoint.pth -O checkpoints/base_speakers/EN/checkpoint.pth\n",
- "!wget https://huggingface.co/myshell-ai/OpenVoice/raw/main/checkpoints/base_speakers/EN/config.json -O checkpoints/base_speakers/EN/config.json\n",
- "\n",
- "!wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/ZH/checkpoint.pth -O checkpoints/base_speakers/ZH/checkpoint.pth\n",
- "!wget https://huggingface.co/myshell-ai/OpenVoice/raw/main/checkpoints/base_speakers/ZH/config.json -O checkpoints/base_speakers/ZH/config.json\n",
- "\n",
- "!wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/EN/en_default_se.pth -O checkpoints/base_speakers/EN/en_default_se.pth\n",
- "!wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/EN/en_style_se.pth -O checkpoints/base_speakers/EN/en_style_se.pth\n",
- "!wget https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/base_speakers/ZH/zh_default_se.pth -O checkpoints/base_speakers/ZH/zh_default_se.pth"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [],
- "source": [
- "from api import BaseSpeakerTTS, ToneColorConverter"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Loaded checkpoint 'checkpoints/base_speakers/EN/checkpoint.pth'\n",
- "missing/unexpected keys: [] []\n",
- "Loaded checkpoint 'checkpoints/base_speakers/ZH/checkpoint.pth'\n",
- "missing/unexpected keys: [] []\n",
- "Loaded checkpoint 'checkpoints/converter/checkpoint.pth'\n",
- "missing/unexpected keys: [] []\n"
- ]
- }
- ],
- "source": [
- "en_ckpt_base = 'checkpoints/base_speakers/EN'\n",
- "zh_ckpt_base = 'checkpoints/base_speakers/ZH'\n",
- "\n",
- "device=\"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
- "output_dir = 'outputs'\n",
- "\n",
- "en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)\n",
- "en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')\n",
- "\n",
- "zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)\n",
- "zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth')\n",
- "\n",
- "ckpt_converter = 'checkpoints/converter'\n",
- "tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)\n",
- "tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')\n",
- "\n",
- "os.makedirs(output_dir, exist_ok=True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Convert models to OpenVINO"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "e0dff3e511e847ce829a8d7bb6ee7943",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Dropdown(options=('CPU', 'GPU', 'AUTO'), value='CPU')"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "import ipywidgets as widgets\n",
- "\n",
- "devices = ['CPU', 'GPU', 'AUTO']\n",
- "device = widgets.Dropdown(options=devices, value=devices[0], disabled=False)\n",
- "display(device)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [],
- "source": [
- "en_tts_model = OVOpenVoiceTTS(en_base_speaker_tts, ir_path='en_openvoice_tts.xml')\n",
- "zh_tts_model = OVOpenVoiceTTS(en_base_speaker_tts, ir_path='zh_openvoice_tts.xml')\n",
- "color_convert_model = OVOpenVoiceConvert(tone_color_converter, ir_path='openvoice_converter.xml')\n",
- "\n",
- "en_tts_model.compile(device.value)\n",
- "zh_tts_model.compile(device.value)\n",
- "color_convert_model.compile(device.value)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "# load speaker embeddings\n",
- "en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth')\n",
- "en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth')\n",
- "zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "First of all, select the reference tone of voice to which the generated text will be converted: your can select from existing ones or record your own by seleceing 'record_manually'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "e3068acf38a94a12ad85c87f346d9a14",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Dropdown(description='reference voice from which tone color will be copied', options=('resources/example_refer…"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "reference_speakers = [\n",
- " 'resources/example_reference.mp3',\n",
- " 'resources/demo_speaker0.mp3',\n",
- " 'resources/demo_speaker1.mp3',\n",
- " 'resources/demo_speaker2.mp3',\n",
- " 'record_manually',\n",
- "]\n",
- "\n",
- "ref_speaker = widgets.Dropdown(\n",
- " options=reference_speakers,\n",
- " value=reference_speakers[0],\n",
- " description=\"reference voice from which tone color will be copied\",\n",
- " disabled=False,\n",
- ")\n",
- "\n",
- "display(ref_speaker)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [],
- "source": [
- "ref_speaker_path = ref_speaker.value\n",
- "\n",
- "if ref_speaker.value == 'record_manually':\n",
- " ref_speaker_path = f'{output_dir}/custom_example_sample.webm'\n",
- " from ipywebrtc import AudioRecorder, CameraStream\n",
- " camera = CameraStream(constraints={'audio': True,'video':False})\n",
- " recorder = AudioRecorder(stream=camera, filename=ref_speaker_path, autosave=True)\n",
- " display(recorder)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from IPython.display import Audio\n",
- "Audio(ref_speaker_path)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Importing the dtw module. When using in academic works please cite:\n",
- " T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.\n",
- " J. Stat. Soft., doi:10.18637/jss.v031.i07.\n",
- "\n"
- ]
- }
- ],
- "source": [
- "import se_extractor\n",
- "target_se, audio_name = se_extractor.get_se(ref_speaker_path, tone_color_converter, target_dir='processed', vad=True) ## ffmpeg must be installed"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Inference"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " > Text splitted to sentences.\n",
- "OpenVINO toolkit is a comprehensive toolkit for quickly developing applications and solutions that solve a variety of tasks including emulation of human vision,\n",
- "automatic speech recognition, natural language processing, recommendation systems, and many others.\n",
- " > ===========================\n",
- "ˈoʊpən vino* toolkit* ɪz ə ˌkɑmpɹiˈhɛnsɪv toolkit* fəɹ kˈwɪkli dɪˈvɛləpɪŋ ˌæpləˈkeɪʃənz ənd səˈluʃənz ðət sɑɫv ə vəɹˈaɪəti əv tæsks ˌɪnˈkludɪŋ ˌɛmjəˈleɪʃən əv ˈjumən ˈvɪʒən,\n",
- " length:173\n",
- " length:173\n",
- "ˌɔtəˈmætɪk spitʃ ˌɹɛkɪgˈnɪʃən, ˈnætʃəɹəɫ ˈlæŋgwɪdʒ ˈpɹɑsɛsɪŋ, ˌɹɛkəmənˈdeɪʃən ˈsɪstəmz, ənd ˈmɛni ˈəðəɹz.\n",
- " length:105\n",
- " length:105\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages/wavmark/models/my_model.py:25: UserWarning: istft will require a complex-valued input tensor in a future PyTorch release. Matching the output from stft with return_complex=True. (Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:978.)\n",
- " return torch.istft(signal_wmd_fft, n_fft=self.n_fft, hop_length=self.hop_length, window=window,\n"
- ]
- }
- ],
- "source": [
- "save_path = f'{output_dir}/output_en_default.wav'\n",
- "\n",
- "text = \"\"\"\n",
- "OpenVINO toolkit is a comprehensive toolkit for quickly developing applications and solutions that solve \n",
- "a variety of tasks including emulation of human vision, automatic speech recognition, natural language processing, \n",
- "recommendation systems, and many others.\n",
- "\"\"\"\n",
- "\n",
- "src_path = f'{output_dir}/tmp.wav'\n",
- "en_tts_model.tts(text, src_path, speaker='default', language='English', speed=1.0)\n",
- "\n",
- "color_convert_model.convert(\n",
- " audio_src_path=src_path, \n",
- " src_se=en_source_default_se, \n",
- " tgt_se=target_se, \n",
- " output_path=save_path,\n",
- " message=\"@MyShell\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 16,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Audio(src_path)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Audio(save_path)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Run OpenVoice Gradio online app\n",
- "We can also use [Gradio](https://www.gradio.app/) app to run TTS and voice tone conversion online."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/home/epavel/devel/openvino_notebooks/.venv/lib/python3.10/site-packages/gradio/components/dropdown.py:90: UserWarning: The `max_choices` parameter is ignored when `multiselect` is False.\n",
- " warnings.warn(\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Running on local URL: http://0.0.0.0:7860\n",
- "\n",
- "To create a public link, set `share=True` in `launch()`.\n"
- ]
- },
- {
- "data": {
- "text/html": [
- ""
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": []
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from openvoice_gradio import get_demo\n",
- "\n",
- "demo = get_demo(output_dir, color_convert_model, en_tts_model, zh_tts_model, en_source_default_se, en_source_style_se, zh_source_se)\n",
- "demo.queue(max_size=2)\n",
- "demo.launch(server_name=\"0.0.0.0\", server_port=7860)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Closing server running on port: 7860\n"
- ]
- }
- ],
- "source": [
- "# please run this cell for stopping gradio interface\n",
- "demo.close()"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.12"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/notebooks/408-openvoice/openvoice_utils.py b/notebooks/408-openvoice/openvoice_utils.py
deleted file mode 100644
index f5fd1cbfcc9..00000000000
--- a/notebooks/408-openvoice/openvoice_utils.py
+++ /dev/null
@@ -1,131 +0,0 @@
-from OpenVoice.mel_processing import spectrogram_torch
-import torch
-import librosa
-import openvino as ov
-import os
-import re
-import soundfile
-
-
-class OVOpenVoiceTTS(torch.nn.Module):
- def __init__(self, tts_model, noise_scale = 0.667, noise_scale_w = 0.6, speed = 1, sdp_ratio = 0.2, ir_path='openvoice_tts.xml'):
- super().__init__()
- self.tts_model = tts_model
- self.ir_path = ir_path
-
- self.default_kwargs = dict(
- noise_scale = noise_scale,
- noise_scale_w = noise_scale_w,
- length_scale = 1 / speed,
- sdp_ratio = sdp_ratio
- )
-
-
- def forward(self, x, x_lengths, sid):
- for par in self.tts_model.model.parameters():
- par.requires_grad = False
- return self.tts_model.model.infer(x, x_lengths, sid, **self.default_kwargs)
-
- def get_example_input(self):
- stn_tst = self.tts_model.get_text('this is original text', self.tts_model.hps, False)
- x_tst = stn_tst.unsqueeze(0)
- x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
- speaker_id = torch.LongTensor([1])
- return (x_tst, x_tst_lengths, speaker_id)
-
- def compile(self, ov_device='CPU'):
- core = ov.Core()
- if os.path.exists(self.ir_path):
- self.ov_tts = core.read_model(self.ir_path)
- else:
- self.ov_tts = ov.convert_model(self, example_input=self.get_example_input())
- ov.save_model(self.ov_tts, self.ir_path)
-
- self.compiled_model = core.compile_model(self.ov_tts, ov_device)
-
- def tts(self, text, output_path, speaker, language='English', speed=1.0):
- tts_model = self.tts_model
-
- mark = tts_model.language_marks.get(language.lower(), None)
- assert mark is not None, f"language {language} is not supported"
-
- texts = tts_model.split_sentences_into_pieces(text, mark)
-
- audio_list = []
- for t in texts:
- t = re.sub(r'([a-z])([A-Z])', r'\1 \2', t)
- t = f'[{mark}]{t}[{mark}]'
- stn_tst = tts_model.get_text(t, tts_model.hps, False)
- device = tts_model.device
- speaker_id = tts_model.hps.speakers[speaker]
- with torch.no_grad():
- x_tst = stn_tst.unsqueeze(0).to(device)
- x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
- sid = torch.LongTensor([speaker_id]).to(device)
- # call OpenVino instead of torch
- audio = self.compiled_model(((x_tst, x_tst_lengths, sid)))[0][0, 0]
- audio_list.append(audio)
- audio = tts_model.audio_numpy_concat(audio_list, sr=tts_model.hps.data.sampling_rate, speed=speed)
-
- if output_path is None:
- return audio
- else:
- soundfile.write(output_path, audio, tts_model.hps.data.sampling_rate)
-
-
-class OVOpenVoiceConvert(torch.nn.Module):
- def __init__(self, voice_conversion_model, tau=0.3, ir_path='openvoice_converter.xml'):
- super().__init__()
- self.voice_conversion_model = voice_conversion_model
- self.ir_path = ir_path
-
- self.default_kwargs = dict(
- tau = tau,
- )
-
- def get_example_input(self):
- y = torch.randn([1, 513, 238], dtype=torch.float32)
- y_lengths = torch.LongTensor([y.size(-1)])
- target_se = torch.randn(*(1, 256, 1))
- source_se = torch.randn(*(1, 256, 1))
- return (y, y_lengths, source_se, target_se)
-
- def compile(self, ov_device='CPU'):
- core = ov.Core()
- if os.path.exists(self.ir_path):
- self.ov_voice_conversion = core.read_model(self.ir_path)
- else:
- self.ov_voice_conversion = ov.convert_model(self, example_input=self.get_example_input())
- ov.save_model(self.ov_voice_conversion, self.ir_path)
-
- self.compiled_model = core.compile_model(self.ov_voice_conversion, ov_device)
-
- def forward(self, y, y_lengths, sid_src, sid_tgt):
- for par in self.voice_conversion_model.model.parameters():
- par.requires_grad = False
- return self.voice_conversion_model.model.infer(y, y_lengths, sid_src, sid_tgt, **self.default_kwargs)
-
- def convert(self, audio_src_path, src_se, tgt_se, output_path=None, tau=0.3, message="default"):
- model = self.voice_conversion_model
-
- hps = model.hps
- # load audio
- audio, sample_rate = librosa.load(audio_src_path, sr=hps.data.sampling_rate)
- audio = torch.tensor(audio).float()
-
- with torch.no_grad():
- y = torch.FloatTensor(audio).to(model.device)
- y = y.unsqueeze(0)
- spec = spectrogram_torch(y, hps.data.filter_length,
- hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
- center=False).to(model.device)
- spec_lengths = torch.LongTensor([spec.size(-1)]).to(model.device)
- audio = model.model.voice_conversion(spec, spec_lengths, sid_src=src_se, sid_tgt=tgt_se, tau=tau)[0][
- 0, 0].data.cpu().float().numpy()
- # call OpenVino instead of torch
- audio = self.compiled_model((spec, spec_lengths, src_se, tgt_se))[0][0, 0]
- audio = model.add_watermark(audio, message)
- if output_path is None:
- return audio
- else:
- soundfile.write(output_path, audio, hps.data.sampling_rate)