From c918016170f9d807d6434cebeef206653393cbb0 Mon Sep 17 00:00:00 2001
From: boboiazumi
Date: Sat, 23 Sep 2023 21:50:23 +0700
Subject: [PATCH] New Colab Notebook

---
 ERPISI_EXPERIMENT.ipynb | 1 +
 README.md               | 3 +++
 2 files changed, 4 insertions(+)
 create mode 100644 ERPISI_EXPERIMENT.ipynb

diff --git a/ERPISI_EXPERIMENT.ipynb b/ERPISI_EXPERIMENT.ipynb
new file mode 100644
index 0000000..4e963fc
--- /dev/null
+++ b/ERPISI_EXPERIMENT.ipynb
@@ -0,0 +1 @@
+{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"collapsed_sections":["HU3kYTEpJJkn"],"gpuType":"T4","mount_file_id":"19V2KQOuaXWsM7gzEDTPJrmrYDI1R7K1T","authorship_tag":"ABX9TyMAEEfwzNIrJ713bsaO6OzK"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU"},"cells":[{"cell_type":"markdown","source":["# AICoverGen Interactive"],"metadata":{"id":"Drxtf339IMwx"}},{"cell_type":"markdown","source":["# Installation"],"metadata":{"id":"HU3kYTEpJJkn"}},{"cell_type":"code","source":["!pip install deemix\n","!pip install fairseq==0.12.2\n","!pip install faiss-cpu==1.7.3\n","!pip install \"ffmpeg-python>=0.2.0\"\n","!pip install gradio==3.39.0\n","!pip install lib==4.0.0\n","!pip install librosa==0.9.1\n","!pip install numpy==1.23.5\n","!pip install onnxruntime_gpu\n","!pip install \"praat-parselmouth>=0.4.2\"\n","!pip install pedalboard==0.7.7\n","!pip install pydub==0.25.1\n","!pip install pyworld==0.3.4\n","!pip install Requests==2.31.0\n","!pip install scipy==1.11.1\n","!pip install soundfile==0.12.1\n","!pip install torch==2.0.1+cu118 --find-links https://download.pytorch.org/whl/torch_stable.html\n","!pip install torchcrepe==0.0.20\n","!pip install tqdm==4.65.0\n","!pip install yt_dlp==2023.7.6\n","!pip install sox==1.4.1\n","!apt install sox"],"metadata":{"id":"aLbMvCFiJYpc"},"execution_count":null,"outputs":[]},
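{"cell_type":"code","source":["#@title Sanity check\n","# Optional quick check, assuming the pinned installs above: verify that the\n","# versions AICoverGen depends on resolved as pinned and that the Colab GPU\n","# runtime (T4 in the notebook metadata) is visible to torch. Any mismatch\n","# here usually means the install cell needs a clean re-run.\n","import torch, librosa, numpy\n","\n","print('torch  ', torch.__version__)    # expected: 2.0.1+cu118\n","print('librosa', librosa.__version__)  # expected: 0.9.1\n","print('numpy  ', numpy.__version__)    # expected: 1.23.5\n","assert torch.cuda.is_available(), 'No GPU: Runtime > Change runtime type > T4 GPU'\n","print('GPU    ', torch.cuda.get_device_name(0))\n"],"metadata":{},"execution_count":null,"outputs":[]},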
{"cell_type":"markdown","source":["# Infer Pack\n"],"metadata":{"id":"YX4zV-12IVgn"}},{"cell_type":"code","source":["#@title Commons\n","import math\n","import numpy as np\n","import torch\n","from torch import nn\n","from torch.nn import functional as F\n","\n","\n","def init_weights(m, mean=0.0, std=0.01):\n"," classname = m.__class__.__name__\n"," if classname.find(\"Conv\") != -1:\n"," m.weight.data.normal_(mean, std)\n","\n","\n","def get_padding(kernel_size, dilation=1):\n"," return int((kernel_size * dilation - dilation) / 2)\n","\n","\n","def convert_pad_shape(pad_shape):\n"," l = pad_shape[::-1]\n"," pad_shape = [item for sublist in l for item in sublist]\n"," return pad_shape\n","\n","\n","def kl_divergence(m_p, logs_p, m_q, logs_q):\n"," \"\"\"KL(P||Q)\"\"\"\n"," kl = (logs_q - logs_p) - 0.5\n"," kl += (\n"," 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)\n"," )\n"," 
return kl\n","\n","\n","def rand_gumbel(shape):\n"," \"\"\"Sample from the Gumbel distribution, protect from overflows.\"\"\"\n"," uniform_samples = torch.rand(shape) * 0.99998 + 0.00001\n"," return -torch.log(-torch.log(uniform_samples))\n","\n","\n","def rand_gumbel_like(x):\n"," g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)\n"," return g\n","\n","\n","def slice_segments(x, ids_str, segment_size=4):\n"," ret = torch.zeros_like(x[:, :, :segment_size])\n"," for i in range(x.size(0)):\n"," idx_str = ids_str[i]\n"," idx_end = idx_str + segment_size\n"," ret[i] = x[i, :, idx_str:idx_end]\n"," return ret\n","\n","\n","def slice_segments2(x, ids_str, segment_size=4):\n"," ret = torch.zeros_like(x[:, :segment_size])\n"," for i in range(x.size(0)):\n"," idx_str = ids_str[i]\n"," idx_end = idx_str + segment_size\n"," ret[i] = x[i, idx_str:idx_end]\n"," return ret\n","\n","\n","def rand_slice_segments(x, x_lengths=None, segment_size=4):\n"," b, d, t = x.size()\n"," if x_lengths is None:\n"," x_lengths = t\n"," ids_str_max = x_lengths - segment_size + 1\n"," ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)\n"," ret = slice_segments(x, ids_str, segment_size)\n"," return ret, ids_str\n","\n","\n","def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):\n"," position = torch.arange(length, dtype=torch.float)\n"," num_timescales = channels // 2\n"," log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (\n"," num_timescales - 1\n"," )\n"," inv_timescales = min_timescale * torch.exp(\n"," torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment\n"," )\n"," scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)\n"," signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)\n"," signal = F.pad(signal, [0, 0, 0, channels % 2])\n"," signal = signal.view(1, channels, length)\n"," return signal\n","\n","\n","def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):\n"," b, channels, length = x.size()\n"," signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)\n"," return x + signal.to(dtype=x.dtype, device=x.device)\n","\n","\n","def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):\n"," b, channels, length = x.size()\n"," signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)\n"," return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)\n","\n","\n","def subsequent_mask(length):\n"," mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)\n"," return mask\n","\n","\n","@torch.jit.script\n","def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):\n"," n_channels_int = n_channels[0]\n"," in_act = input_a + input_b\n"," t_act = torch.tanh(in_act[:, :n_channels_int, :])\n"," s_act = torch.sigmoid(in_act[:, n_channels_int:, :])\n"," acts = t_act * s_act\n"," return acts\n","\n","\n","def shift_1d(x):\n"," x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]\n"," return x\n","\n","\n","def sequence_mask(length, max_length=None):\n"," if max_length is None:\n"," max_length = length.max()\n"," x = torch.arange(max_length, dtype=length.dtype, device=length.device)\n"," return x.unsqueeze(0) < length.unsqueeze(1)\n","\n","\n","def generate_path(duration, mask):\n"," \"\"\"\n"," duration: [b, 1, t_x]\n"," mask: [b, 1, t_y, t_x]\n"," \"\"\"\n"," device = duration.device\n","\n"," b, _, t_y, t_x = mask.shape\n"," cum_duration = torch.cumsum(duration, -1)\n","\n"," cum_duration_flat = cum_duration.view(b * t_x)\n"," path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)\n"," path = path.view(b, t_x, t_y)\n"," path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]\n"," path = path.unsqueeze(1).transpose(2, 3) * mask\n"," return path\n","\n","\n","def clip_grad_value_(parameters, clip_value, norm_type=2):\n"," if isinstance(parameters, torch.Tensor):\n"," parameters = [parameters]\n"," parameters = list(filter(lambda p: p.grad is not None, parameters))\n"," norm_type = float(norm_type)\n"," if clip_value is not None:\n"," clip_value = float(clip_value)\n","\n"," total_norm = 0\n"," for p in parameters:\n"," param_norm = p.grad.data.norm(norm_type)\n"," total_norm += param_norm.item() ** norm_type\n"," if clip_value is not None:\n"," p.grad.data.clamp_(min=-clip_value, max=clip_value)\n"," total_norm = total_norm ** (1.0 / norm_type)\n"," return total_norm"],"metadata":{"id":"IpatwiH6Jble"},"execution_count":null,"outputs":[]},
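{"cell_type":"code","source":["#@title Commons smoke test\n","# Optional illustration, not part of the original pipeline: exercises two of\n","# the helpers above on toy tensors so their semantics are easy to verify.\n","# sequence_mask turns per-item lengths into boolean masks; generate_path\n","# expands per-token durations into a monotonic alignment path, as used by\n","# VITS-style models.\n","lengths = torch.tensor([3, 5])\n","print(sequence_mask(lengths))  # shape (2, 5): True up to each length\n","\n","duration = torch.tensor([[[1.0, 2.0, 1.0]]])  # [b=1, 1, t_x=3], sums to t_y=4\n","mask = torch.ones(1, 1, 4, 3)                 # [b, 1, t_y, t_x]\n","print(generate_path(duration, mask)[0, 0])    # (t_y, t_x): one-hot token per frame\n"],"metadata":{},"execution_count":null,"outputs":[]},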
{"cell_type":"code","source":["#@title Modules\n","\n","import copy\n","import math\n","import numpy as np\n","import scipy\n","import torch\n","from torch import nn\n","from torch.nn import functional as F\n","\n","from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d\n","from torch.nn.utils import weight_norm, remove_weight_norm\n","\n","LRELU_SLOPE = 0.1\n","\n","\n","class LayerNorm(nn.Module):\n"," def __init__(self, channels, eps=1e-5):\n"," super().__init__()\n"," self.channels = channels\n"," self.eps = eps\n","\n"," self.gamma = nn.Parameter(torch.ones(channels))\n"," self.beta = nn.Parameter(torch.zeros(channels))\n","\n"," def forward(self, x):\n"," x = x.transpose(1, -1)\n"," x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)\n"," return x.transpose(1, -1)\n","\n","\n","class ConvReluNorm(nn.Module):\n"," def __init__(\n"," self,\n"," in_channels,\n"," hidden_channels,\n"," out_channels,\n"," kernel_size,\n"," n_layers,\n"," p_dropout,\n"," ):\n"," super().__init__()\n"," self.in_channels = in_channels\n"," self.hidden_channels = hidden_channels\n"," self.out_channels = out_channels\n"," self.kernel_size = kernel_size\n"," self.n_layers = n_layers\n"," self.p_dropout = p_dropout\n"," assert n_layers > 1, \"Number of layers should be larger than 1.\"\n","\n"," self.conv_layers = nn.ModuleList()\n"," self.norm_layers = nn.ModuleList()\n"," self.conv_layers.append(\n"," nn.Conv1d(\n"," in_channels, hidden_channels, kernel_size, padding=kernel_size // 2\n"," )\n"," )\n"," self.norm_layers.append(LayerNorm(hidden_channels))\n"," self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))\n"," for _ in range(n_layers - 1):\n"," self.conv_layers.append(\n"," nn.Conv1d(\n"," hidden_channels,\n"," hidden_channels,\n"," kernel_size,\n"," padding=kernel_size // 2,\n"," )\n"," )\n"," self.norm_layers.append(LayerNorm(hidden_channels))\n"," self.proj = nn.Conv1d(hidden_channels, out_channels, 1)\n"," self.proj.weight.data.zero_()\n"," self.proj.bias.data.zero_()\n","\n"," def forward(self, x, x_mask):\n"," x_org = x\n"," for i in range(self.n_layers):\n"," x = self.conv_layers[i](x * x_mask)\n"," x = self.norm_layers[i](x)\n"," x = self.relu_drop(x)\n"," x = x_org + self.proj(x)\n"," return x * x_mask\n","\n","\n","class DDSConv(nn.Module):\n"," \"\"\"\n"," Dilated and Depth-Separable 
Convolution\n"," \"\"\"\n","\n"," def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):\n"," super().__init__()\n"," self.channels = channels\n"," self.kernel_size = kernel_size\n"," self.n_layers = n_layers\n"," self.p_dropout = p_dropout\n","\n"," self.drop = nn.Dropout(p_dropout)\n"," self.convs_sep = nn.ModuleList()\n"," self.convs_1x1 = nn.ModuleList()\n"," self.norms_1 = nn.ModuleList()\n"," self.norms_2 = nn.ModuleList()\n"," for i in range(n_layers):\n"," dilation = kernel_size**i\n"," padding = (kernel_size * dilation - dilation) // 2\n"," self.convs_sep.append(\n"," nn.Conv1d(\n"," channels,\n"," channels,\n"," kernel_size,\n"," groups=channels,\n"," dilation=dilation,\n"," padding=padding,\n"," )\n"," )\n"," self.convs_1x1.append(nn.Conv1d(channels, channels, 1))\n"," self.norms_1.append(LayerNorm(channels))\n"," self.norms_2.append(LayerNorm(channels))\n","\n"," def forward(self, x, x_mask, g=None):\n"," if g is not None:\n"," x = x + g\n"," for i in range(self.n_layers):\n"," y = self.convs_sep[i](x * x_mask)\n"," y = self.norms_1[i](y)\n"," y = F.gelu(y)\n"," y = self.convs_1x1[i](y)\n"," y = self.norms_2[i](y)\n"," y = F.gelu(y)\n"," y = self.drop(y)\n"," x = x + y\n"," return x * x_mask\n","\n","\n","class WN(torch.nn.Module):\n"," def __init__(\n"," self,\n"," hidden_channels,\n"," kernel_size,\n"," dilation_rate,\n"," n_layers,\n"," gin_channels=0,\n"," p_dropout=0,\n"," ):\n"," super(WN, self).__init__()\n"," assert kernel_size % 2 == 1\n"," self.hidden_channels = hidden_channels\n"," self.kernel_size = (kernel_size,)\n"," self.dilation_rate = dilation_rate\n"," self.n_layers = n_layers\n"," self.gin_channels = gin_channels\n"," self.p_dropout = p_dropout\n","\n"," self.in_layers = torch.nn.ModuleList()\n"," self.res_skip_layers = torch.nn.ModuleList()\n"," self.drop = nn.Dropout(p_dropout)\n","\n"," if gin_channels != 0:\n"," cond_layer = torch.nn.Conv1d(\n"," gin_channels, 2 * hidden_channels * n_layers, 1\n"," )\n"," self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name=\"weight\")\n","\n"," for i in range(n_layers):\n"," dilation = dilation_rate**i\n"," padding = int((kernel_size * dilation - dilation) / 2)\n"," in_layer = torch.nn.Conv1d(\n"," hidden_channels,\n"," 2 * hidden_channels,\n"," kernel_size,\n"," dilation=dilation,\n"," padding=padding,\n"," )\n"," in_layer = torch.nn.utils.weight_norm(in_layer, name=\"weight\")\n"," self.in_layers.append(in_layer)\n","\n"," # last one is not necessary\n"," if i < n_layers - 1:\n"," res_skip_channels = 2 * hidden_channels\n"," else:\n"," res_skip_channels = hidden_channels\n","\n"," res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)\n"," res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name=\"weight\")\n"," self.res_skip_layers.append(res_skip_layer)\n","\n"," def forward(self, x, x_mask, g=None, **kwargs):\n"," output = torch.zeros_like(x)\n"," n_channels_tensor = torch.IntTensor([self.hidden_channels])\n","\n"," if g is not None:\n"," g = self.cond_layer(g)\n","\n"," for i in range(self.n_layers):\n"," x_in = self.in_layers[i](x)\n"," if g is not None:\n"," cond_offset = i * 2 * self.hidden_channels\n"," g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]\n"," else:\n"," g_l = torch.zeros_like(x_in)\n","\n"," acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)\n"," acts = self.drop(acts)\n","\n"," res_skip_acts = self.res_skip_layers[i](acts)\n"," if i < self.n_layers - 1:\n"," res_acts = res_skip_acts[:, : self.hidden_channels, 
:]\n"," x = (x + res_acts) * x_mask\n"," output = output + res_skip_acts[:, self.hidden_channels :, :]\n"," else:\n"," output = output + res_skip_acts\n"," return output * x_mask\n","\n"," def remove_weight_norm(self):\n"," if self.gin_channels != 0:\n"," torch.nn.utils.remove_weight_norm(self.cond_layer)\n"," for l in self.in_layers:\n"," torch.nn.utils.remove_weight_norm(l)\n"," for l in self.res_skip_layers:\n"," torch.nn.utils.remove_weight_norm(l)\n","\n","\n","class ResBlock1(torch.nn.Module):\n"," def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):\n"," super(ResBlock1, self).__init__()\n"," self.convs1 = nn.ModuleList(\n"," [\n"," weight_norm(\n"," Conv1d(\n"," channels,\n"," channels,\n"," kernel_size,\n"," 1,\n"," dilation=dilation[0],\n"," padding=get_padding(kernel_size, dilation[0]),\n"," )\n"," ),\n"," weight_norm(\n"," Conv1d(\n"," channels,\n"," channels,\n"," kernel_size,\n"," 1,\n"," dilation=dilation[1],\n"," padding=get_padding(kernel_size, dilation[1]),\n"," )\n"," ),\n"," weight_norm(\n"," Conv1d(\n"," channels,\n"," channels,\n"," kernel_size,\n"," 1,\n"," dilation=dilation[2],\n"," padding=get_padding(kernel_size, dilation[2]),\n"," )\n"," ),\n"," ]\n"," )\n"," self.convs1.apply(init_weights)\n","\n"," self.convs2 = nn.ModuleList(\n"," [\n"," weight_norm(\n"," Conv1d(\n"," channels,\n"," channels,\n"," kernel_size,\n"," 1,\n"," dilation=1,\n"," padding=get_padding(kernel_size, 1),\n"," )\n"," ),\n"," weight_norm(\n"," Conv1d(\n"," channels,\n"," channels,\n"," kernel_size,\n"," 1,\n"," dilation=1,\n"," padding=get_padding(kernel_size, 1),\n"," )\n"," ),\n"," weight_norm(\n"," Conv1d(\n"," channels,\n"," channels,\n"," kernel_size,\n"," 1,\n"," dilation=1,\n"," padding=get_padding(kernel_size, 1),\n"," )\n"," ),\n"," ]\n"," )\n"," self.convs2.apply(init_weights)\n","\n"," def forward(self, x, x_mask=None):\n"," for c1, c2 in zip(self.convs1, self.convs2):\n"," xt = F.leaky_relu(x, LRELU_SLOPE)\n"," if x_mask is not None:\n"," xt = xt * x_mask\n"," xt = c1(xt)\n"," xt = F.leaky_relu(xt, LRELU_SLOPE)\n"," if x_mask is not None:\n"," xt = xt * x_mask\n"," xt = c2(xt)\n"," x = xt + x\n"," if x_mask is not None:\n"," x = x * x_mask\n"," return x\n","\n"," def remove_weight_norm(self):\n"," for l in self.convs1:\n"," remove_weight_norm(l)\n"," for l in self.convs2:\n"," remove_weight_norm(l)\n","\n","\n","class ResBlock2(torch.nn.Module):\n"," def __init__(self, channels, kernel_size=3, dilation=(1, 3)):\n"," super(ResBlock2, self).__init__()\n"," self.convs = nn.ModuleList(\n"," [\n"," weight_norm(\n"," Conv1d(\n"," channels,\n"," channels,\n"," kernel_size,\n"," 1,\n"," dilation=dilation[0],\n"," padding=get_padding(kernel_size, dilation[0]),\n"," )\n"," ),\n"," weight_norm(\n"," Conv1d(\n"," channels,\n"," channels,\n"," kernel_size,\n"," 1,\n"," dilation=dilation[1],\n"," padding=get_padding(kernel_size, dilation[1]),\n"," )\n"," ),\n"," ]\n"," )\n"," self.convs.apply(init_weights)\n","\n"," def forward(self, x, x_mask=None):\n"," for c in self.convs:\n"," xt = F.leaky_relu(x, LRELU_SLOPE)\n"," if x_mask is not None:\n"," xt = xt * x_mask\n"," xt = c(xt)\n"," x = xt + x\n"," if x_mask is not None:\n"," x = x * x_mask\n"," return x\n","\n"," def remove_weight_norm(self):\n"," for l in self.convs:\n"," remove_weight_norm(l)\n","\n","\n","class Log(nn.Module):\n"," def forward(self, x, x_mask, reverse=False, **kwargs):\n"," if not reverse:\n"," y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask\n"," logdet = torch.sum(-y, [1, 2])\n"," return y, logdet\n"," 
else:\n"," x = torch.exp(x) * x_mask\n"," return x\n","\n","\n","class Flip(nn.Module):\n"," def forward(self, x, *args, reverse=False, **kwargs):\n"," x = torch.flip(x, [1])\n"," if not reverse:\n"," logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)\n"," return x, logdet\n"," else:\n"," return x\n","\n","\n","class ElementwiseAffine(nn.Module):\n"," def __init__(self, channels):\n"," super().__init__()\n"," self.channels = channels\n"," self.m = nn.Parameter(torch.zeros(channels, 1))\n"," self.logs = nn.Parameter(torch.zeros(channels, 1))\n","\n"," def forward(self, x, x_mask, reverse=False, **kwargs):\n"," if not reverse:\n"," y = self.m + torch.exp(self.logs) * x\n"," y = y * x_mask\n"," logdet = torch.sum(self.logs * x_mask, [1, 2])\n"," return y, logdet\n"," else:\n"," x = (x - self.m) * torch.exp(-self.logs) * x_mask\n"," return x\n","\n","\n","class ResidualCouplingLayer(nn.Module):\n"," def __init__(\n"," self,\n"," channels,\n"," hidden_channels,\n"," kernel_size,\n"," dilation_rate,\n"," n_layers,\n"," p_dropout=0,\n"," gin_channels=0,\n"," mean_only=False,\n"," ):\n"," assert channels % 2 == 0, \"channels should be divisible by 2\"\n"," super().__init__()\n"," self.channels = channels\n"," self.hidden_channels = hidden_channels\n"," self.kernel_size = kernel_size\n"," self.dilation_rate = dilation_rate\n"," self.n_layers = n_layers\n"," self.half_channels = channels // 2\n"," self.mean_only = mean_only\n","\n"," self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)\n"," self.enc = WN(\n"," hidden_channels,\n"," kernel_size,\n"," dilation_rate,\n"," n_layers,\n"," p_dropout=p_dropout,\n"," gin_channels=gin_channels,\n"," )\n"," self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)\n"," self.post.weight.data.zero_()\n"," self.post.bias.data.zero_()\n","\n"," def forward(self, x, x_mask, g=None, reverse=False):\n"," x0, x1 = torch.split(x, [self.half_channels] * 2, 1)\n"," h = self.pre(x0) * x_mask\n"," h = self.enc(h, x_mask, g=g)\n"," stats = self.post(h) * x_mask\n"," if not self.mean_only:\n"," m, logs = torch.split(stats, [self.half_channels] * 2, 1)\n"," else:\n"," m = stats\n"," logs = torch.zeros_like(m)\n","\n"," if not reverse:\n"," x1 = m + x1 * torch.exp(logs) * x_mask\n"," x = torch.cat([x0, x1], 1)\n"," logdet = torch.sum(logs, [1, 2])\n"," return x, logdet\n"," else:\n"," x1 = (x1 - m) * torch.exp(-logs) * x_mask\n"," x = torch.cat([x0, x1], 1)\n"," return x\n","\n"," def remove_weight_norm(self):\n"," self.enc.remove_weight_norm()\n","\n","\n","class ConvFlow(nn.Module):\n"," def __init__(\n"," self,\n"," in_channels,\n"," filter_channels,\n"," kernel_size,\n"," n_layers,\n"," num_bins=10,\n"," tail_bound=5.0,\n"," ):\n"," super().__init__()\n"," self.in_channels = in_channels\n"," self.filter_channels = filter_channels\n"," self.kernel_size = kernel_size\n"," self.n_layers = n_layers\n"," self.num_bins = num_bins\n"," self.tail_bound = tail_bound\n"," self.half_channels = in_channels // 2\n","\n"," self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)\n"," self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)\n"," self.proj = nn.Conv1d(\n"," filter_channels, self.half_channels * (num_bins * 3 - 1), 1\n"," )\n"," self.proj.weight.data.zero_()\n"," self.proj.bias.data.zero_()\n","\n"," def forward(self, x, x_mask, g=None, reverse=False):\n"," x0, x1 = torch.split(x, [self.half_channels] * 2, 1)\n"," h = self.pre(x0)\n"," h = self.convs(h, x_mask, g=g)\n"," h = self.proj(h) * 
x_mask\n","\n"," b, c, t = x0.shape\n"," h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]\n","\n"," unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)\n"," unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(\n"," self.filter_channels\n"," )\n"," unnormalized_derivatives = h[..., 2 * self.num_bins :]\n","\n"," x1, logabsdet = piecewise_rational_quadratic_transform(\n"," x1,\n"," unnormalized_widths,\n"," unnormalized_heights,\n"," unnormalized_derivatives,\n"," inverse=reverse,\n"," tails=\"linear\",\n"," tail_bound=self.tail_bound,\n"," )\n","\n"," x = torch.cat([x0, x1], 1) * x_mask\n"," logdet = torch.sum(logabsdet * x_mask, [1, 2])\n"," if not reverse:\n"," return x, logdet\n"," else:\n"," return x"],"metadata":{"id":"4LE18t3DJva0"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#@title Attentions\n","import copy\n","import math\n","import numpy as np\n","import torch\n","from torch import nn\n","from torch.nn import functional as F\n","\n","class Encoder(nn.Module):\n"," def __init__(\n"," self,\n"," hidden_channels,\n"," filter_channels,\n"," n_heads,\n"," n_layers,\n"," kernel_size=1,\n"," p_dropout=0.0,\n"," window_size=10,\n"," **kwargs\n"," ):\n"," super().__init__()\n"," self.hidden_channels = hidden_channels\n"," self.filter_channels = filter_channels\n"," self.n_heads = n_heads\n"," self.n_layers = n_layers\n"," self.kernel_size = kernel_size\n"," self.p_dropout = p_dropout\n"," self.window_size = window_size\n","\n"," self.drop = nn.Dropout(p_dropout)\n"," self.attn_layers = nn.ModuleList()\n"," self.norm_layers_1 = nn.ModuleList()\n"," self.ffn_layers = nn.ModuleList()\n"," self.norm_layers_2 = nn.ModuleList()\n"," for i in range(self.n_layers):\n"," self.attn_layers.append(\n"," MultiHeadAttention(\n"," hidden_channels,\n"," hidden_channels,\n"," n_heads,\n"," p_dropout=p_dropout,\n"," window_size=window_size,\n"," )\n"," )\n"," self.norm_layers_1.append(LayerNorm(hidden_channels))\n"," self.ffn_layers.append(\n"," FFN(\n"," hidden_channels,\n"," hidden_channels,\n"," filter_channels,\n"," kernel_size,\n"," p_dropout=p_dropout,\n"," )\n"," )\n"," self.norm_layers_2.append(LayerNorm(hidden_channels))\n","\n"," def forward(self, x, x_mask):\n"," attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)\n"," x = x * x_mask\n"," for i in range(self.n_layers):\n"," y = self.attn_layers[i](x, x, attn_mask)\n"," y = self.drop(y)\n"," x = self.norm_layers_1[i](x + y)\n","\n"," y = self.ffn_layers[i](x, x_mask)\n"," y = self.drop(y)\n"," x = self.norm_layers_2[i](x + y)\n"," x = x * x_mask\n"," return x\n","\n","\n","class Decoder(nn.Module):\n"," def __init__(\n"," self,\n"," hidden_channels,\n"," filter_channels,\n"," n_heads,\n"," n_layers,\n"," kernel_size=1,\n"," p_dropout=0.0,\n"," proximal_bias=False,\n"," proximal_init=True,\n"," **kwargs\n"," ):\n"," super().__init__()\n"," self.hidden_channels = hidden_channels\n"," self.filter_channels = filter_channels\n"," self.n_heads = n_heads\n"," self.n_layers = n_layers\n"," self.kernel_size = kernel_size\n"," self.p_dropout = p_dropout\n"," self.proximal_bias = proximal_bias\n"," self.proximal_init = proximal_init\n","\n"," self.drop = nn.Dropout(p_dropout)\n"," self.self_attn_layers = nn.ModuleList()\n"," self.norm_layers_0 = nn.ModuleList()\n"," self.encdec_attn_layers = nn.ModuleList()\n"," self.norm_layers_1 = nn.ModuleList()\n"," self.ffn_layers = nn.ModuleList()\n"," self.norm_layers_2 = nn.ModuleList()\n"," for i in 
range(self.n_layers):\n"," self.self_attn_layers.append(\n"," MultiHeadAttention(\n"," hidden_channels,\n"," hidden_channels,\n"," n_heads,\n"," p_dropout=p_dropout,\n"," proximal_bias=proximal_bias,\n"," proximal_init=proximal_init,\n"," )\n"," )\n"," self.norm_layers_0.append(LayerNorm(hidden_channels))\n"," self.encdec_attn_layers.append(\n"," MultiHeadAttention(\n"," hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout\n"," )\n"," )\n"," self.norm_layers_1.append(LayerNorm(hidden_channels))\n"," self.ffn_layers.append(\n"," FFN(\n"," hidden_channels,\n"," hidden_channels,\n"," filter_channels,\n"," kernel_size,\n"," p_dropout=p_dropout,\n"," causal=True,\n"," )\n"," )\n"," self.norm_layers_2.append(LayerNorm(hidden_channels))\n","\n"," def forward(self, x, x_mask, h, h_mask):\n"," \"\"\"\n"," x: decoder input\n"," h: encoder output\n"," \"\"\"\n"," self_attn_mask = subsequent_mask(x_mask.size(2)).to(\n"," device=x.device, dtype=x.dtype\n"," )\n"," encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)\n"," x = x * x_mask\n"," for i in range(self.n_layers):\n"," y = self.self_attn_layers[i](x, x, self_attn_mask)\n"," y = self.drop(y)\n"," x = self.norm_layers_0[i](x + y)\n","\n"," y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)\n"," y = self.drop(y)\n"," x = self.norm_layers_1[i](x + y)\n","\n"," y = self.ffn_layers[i](x, x_mask)\n"," y = self.drop(y)\n"," x = self.norm_layers_2[i](x + y)\n"," x = x * x_mask\n"," return x\n","\n","\n","class MultiHeadAttention(nn.Module):\n"," def __init__(\n"," self,\n"," channels,\n"," out_channels,\n"," n_heads,\n"," p_dropout=0.0,\n"," window_size=None,\n"," heads_share=True,\n"," block_length=None,\n"," proximal_bias=False,\n"," proximal_init=False,\n"," ):\n"," super().__init__()\n"," assert channels % n_heads == 0\n","\n"," self.channels = channels\n"," self.out_channels = out_channels\n"," self.n_heads = n_heads\n"," self.p_dropout = p_dropout\n"," self.window_size = window_size\n"," self.heads_share = heads_share\n"," self.block_length = block_length\n"," self.proximal_bias = proximal_bias\n"," self.proximal_init = proximal_init\n"," self.attn = None\n","\n"," self.k_channels = channels // n_heads\n"," self.conv_q = nn.Conv1d(channels, channels, 1)\n"," self.conv_k = nn.Conv1d(channels, channels, 1)\n"," self.conv_v = nn.Conv1d(channels, channels, 1)\n"," self.conv_o = nn.Conv1d(channels, out_channels, 1)\n"," self.drop = nn.Dropout(p_dropout)\n","\n"," if window_size is not None:\n"," n_heads_rel = 1 if heads_share else n_heads\n"," rel_stddev = self.k_channels**-0.5\n"," self.emb_rel_k = nn.Parameter(\n"," torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)\n"," * rel_stddev\n"," )\n"," self.emb_rel_v = nn.Parameter(\n"," torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)\n"," * rel_stddev\n"," )\n","\n"," nn.init.xavier_uniform_(self.conv_q.weight)\n"," nn.init.xavier_uniform_(self.conv_k.weight)\n"," nn.init.xavier_uniform_(self.conv_v.weight)\n"," if proximal_init:\n"," with torch.no_grad():\n"," self.conv_k.weight.copy_(self.conv_q.weight)\n"," self.conv_k.bias.copy_(self.conv_q.bias)\n","\n"," def forward(self, x, c, attn_mask=None):\n"," q = self.conv_q(x)\n"," k = self.conv_k(c)\n"," v = self.conv_v(c)\n","\n"," x, self.attn = self.attention(q, k, v, mask=attn_mask)\n","\n"," x = self.conv_o(x)\n"," return x\n","\n"," def attention(self, query, key, value, mask=None):\n"," # reshape [b, d, t] -> [b, n_h, t, d_k]\n"," b, d, t_s, t_t = (*key.size(), query.size(2))\n"," query = query.view(b, 
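# d is split into n_heads heads of k_channels each\n","            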
self.n_heads, self.k_channels, t_t).transpose(2, 3)\n"," key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)\n"," value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)\n","\n"," scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))\n"," if self.window_size is not None:\n"," assert (\n"," t_s == t_t\n"," ), \"Relative attention is only available for self-attention.\"\n"," key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)\n"," rel_logits = self._matmul_with_relative_keys(\n"," query / math.sqrt(self.k_channels), key_relative_embeddings\n"," )\n"," scores_local = self._relative_position_to_absolute_position(rel_logits)\n"," scores = scores + scores_local\n"," if self.proximal_bias:\n"," assert t_s == t_t, \"Proximal bias is only available for self-attention.\"\n"," scores = scores + self._attention_bias_proximal(t_s).to(\n"," device=scores.device, dtype=scores.dtype\n"," )\n"," if mask is not None:\n"," scores = scores.masked_fill(mask == 0, -1e4)\n"," if self.block_length is not None:\n"," assert (\n"," t_s == t_t\n"," ), \"Local attention is only available for self-attention.\"\n"," block_mask = (\n"," torch.ones_like(scores)\n"," .triu(-self.block_length)\n"," .tril(self.block_length)\n"," )\n"," scores = scores.masked_fill(block_mask == 0, -1e4)\n"," p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]\n"," p_attn = self.drop(p_attn)\n"," output = torch.matmul(p_attn, value)\n"," if self.window_size is not None:\n"," relative_weights = self._absolute_position_to_relative_position(p_attn)\n"," value_relative_embeddings = self._get_relative_embeddings(\n"," self.emb_rel_v, t_s\n"," )\n"," output = output + self._matmul_with_relative_values(\n"," relative_weights, value_relative_embeddings\n"," )\n"," output = (\n"," output.transpose(2, 3).contiguous().view(b, d, t_t)\n"," ) # [b, n_h, t_t, d_k] -> [b, d, t_t]\n"," return output, p_attn\n","\n"," def _matmul_with_relative_values(self, x, y):\n"," \"\"\"\n"," x: [b, h, l, m]\n"," y: [h or 1, m, d]\n"," ret: [b, h, l, d]\n"," \"\"\"\n"," ret = torch.matmul(x, y.unsqueeze(0))\n"," return ret\n","\n"," def _matmul_with_relative_keys(self, x, y):\n"," \"\"\"\n"," x: [b, h, l, d]\n"," y: [h or 1, m, d]\n"," ret: [b, h, l, m]\n"," \"\"\"\n"," ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))\n"," return ret\n","\n"," def _get_relative_embeddings(self, relative_embeddings, length):\n"," max_relative_position = 2 * self.window_size + 1\n"," # Pad first before slice to avoid using cond ops.\n"," pad_length = max(length - (self.window_size + 1), 0)\n"," slice_start_position = max((self.window_size + 1) - length, 0)\n"," slice_end_position = slice_start_position + 2 * length - 1\n"," if pad_length > 0:\n"," padded_relative_embeddings = F.pad(\n"," relative_embeddings,\n"," convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),\n"," )\n"," else:\n"," padded_relative_embeddings = relative_embeddings\n"," used_relative_embeddings = padded_relative_embeddings[\n"," :, slice_start_position:slice_end_position\n"," ]\n"," return used_relative_embeddings\n","\n"," def _relative_position_to_absolute_position(self, x):\n"," \"\"\"\n"," x: [b, h, l, 2*l-1]\n"," ret: [b, h, l, l]\n"," \"\"\"\n"," batch, heads, length, _ = x.size()\n"," # Concat columns of pad to shift from relative to absolute indexing.\n"," x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))\n","\n"," # Concat extra elements so to add up to shape (len+1, 2*len-1).\n"," x_flat 
= x.view([batch, heads, length * 2 * length])\n"," x_flat = F.pad(\n"," x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])\n"," )\n","\n"," # Reshape and slice out the padded elements.\n"," x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[\n"," :, :, :length, length - 1 :\n"," ]\n"," return x_final\n","\n"," def _absolute_position_to_relative_position(self, x):\n"," \"\"\"\n"," x: [b, h, l, l]\n"," ret: [b, h, l, 2*l-1]\n"," \"\"\"\n"," batch, heads, length, _ = x.size()\n"," # padd along column\n"," x = F.pad(\n"," x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])\n"," )\n"," x_flat = x.view([batch, heads, length**2 + length * (length - 1)])\n"," # add 0's in the beginning that will skew the elements after reshape\n"," x_flat = F.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]]))\n"," x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]\n"," return x_final\n","\n"," def _attention_bias_proximal(self, length):\n"," \"\"\"Bias for self-attention to encourage attention to close positions.\n"," Args:\n"," length: an integer scalar.\n"," Returns:\n"," a Tensor with shape [1, 1, length, length]\n"," \"\"\"\n"," r = torch.arange(length, dtype=torch.float32)\n"," diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)\n"," return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)\n","\n","\n","class FFN(nn.Module):\n"," def __init__(\n"," self,\n"," in_channels,\n"," out_channels,\n"," filter_channels,\n"," kernel_size,\n"," p_dropout=0.0,\n"," activation=None,\n"," causal=False,\n"," ):\n"," super().__init__()\n"," self.in_channels = in_channels\n"," self.out_channels = out_channels\n"," self.filter_channels = filter_channels\n"," self.kernel_size = kernel_size\n"," self.p_dropout = p_dropout\n"," self.activation = activation\n"," self.causal = causal\n","\n"," if causal:\n"," self.padding = self._causal_padding\n"," else:\n"," self.padding = self._same_padding\n","\n"," self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)\n"," self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)\n"," self.drop = nn.Dropout(p_dropout)\n","\n"," def forward(self, x, x_mask):\n"," x = self.conv_1(self.padding(x * x_mask))\n"," if self.activation == \"gelu\":\n"," x = x * torch.sigmoid(1.702 * x)\n"," else:\n"," x = torch.relu(x)\n"," x = self.drop(x)\n"," x = self.conv_2(self.padding(x * x_mask))\n"," return x * x_mask\n","\n"," def _causal_padding(self, x):\n"," if self.kernel_size == 1:\n"," return x\n"," pad_l = self.kernel_size - 1\n"," pad_r = 0\n"," padding = [[0, 0], [0, 0], [pad_l, pad_r]]\n"," x = F.pad(x, convert_pad_shape(padding))\n"," return x\n","\n"," def _same_padding(self, x):\n"," if self.kernel_size == 1:\n"," return x\n"," pad_l = (self.kernel_size - 1) // 2\n"," pad_r = self.kernel_size // 2\n"," padding = [[0, 0], [0, 0], [pad_l, pad_r]]\n"," x = F.pad(x, convert_pad_shape(padding))\n"," return x"],"metadata":{"id":"kOERIARzKp3y"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#@title Models\n","import math, pdb, os\n","from time import time as ttime\n","import torch\n","from torch import nn\n","from torch.nn import functional as F\n","from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d\n","from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm\n","import numpy as np\n","\n","\n","class TextEncoder256(nn.Module):\n"," def __init__(\n"," self,\n"," out_channels,\n"," hidden_channels,\n"," filter_channels,\n"," n_heads,\n"," 
n_layers,\n"," kernel_size,\n"," p_dropout,\n"," f0=True,\n"," ):\n"," super().__init__()\n"," self.out_channels = out_channels\n"," self.hidden_channels = hidden_channels\n"," self.filter_channels = filter_channels\n"," self.n_heads = n_heads\n"," self.n_layers = n_layers\n"," self.kernel_size = kernel_size\n"," self.p_dropout = p_dropout\n"," self.emb_phone = nn.Linear(256, hidden_channels)\n"," self.lrelu = nn.LeakyReLU(0.1, inplace=True)\n"," if f0 == True:\n"," self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256\n"," self.encoder = Encoder(\n"," hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout\n"," )\n"," self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)\n","\n"," def forward(self, phone, pitch, lengths):\n"," if pitch == None:\n"," x = self.emb_phone(phone)\n"," else:\n"," x = self.emb_phone(phone) + self.emb_pitch(pitch)\n"," x = x * math.sqrt(self.hidden_channels) # [b, t, h]\n"," x = self.lrelu(x)\n"," x = torch.transpose(x, 1, -1) # [b, h, t]\n"," x_mask = torch.unsqueeze(sequence_mask(lengths, x.size(2)), 1).to(\n"," x.dtype\n"," )\n"," x = self.encoder(x * x_mask, x_mask)\n"," stats = self.proj(x) * x_mask\n","\n"," m, logs = torch.split(stats, self.out_channels, dim=1)\n"," return m, logs, x_mask\n","\n","\n","class TextEncoder768(nn.Module):\n"," def __init__(\n"," self,\n"," out_channels,\n"," hidden_channels,\n"," filter_channels,\n"," n_heads,\n"," n_layers,\n"," kernel_size,\n"," p_dropout,\n"," f0=True,\n"," ):\n"," super().__init__()\n"," self.out_channels = out_channels\n"," self.hidden_channels = hidden_channels\n"," self.filter_channels = filter_channels\n"," self.n_heads = n_heads\n"," self.n_layers = n_layers\n"," self.kernel_size = kernel_size\n"," self.p_dropout = p_dropout\n"," self.emb_phone = nn.Linear(768, hidden_channels)\n"," self.lrelu = nn.LeakyReLU(0.1, inplace=True)\n"," if f0 == True:\n"," self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256\n"," self.encoder = Encoder(\n"," hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout\n"," )\n"," self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)\n","\n"," def forward(self, phone, pitch, lengths):\n"," if pitch == None:\n"," x = self.emb_phone(phone)\n"," else:\n"," x = self.emb_phone(phone) + self.emb_pitch(pitch)\n"," x = x * math.sqrt(self.hidden_channels) # [b, t, h]\n"," x = self.lrelu(x)\n"," x = torch.transpose(x, 1, -1) # [b, h, t]\n"," x_mask = torch.unsqueeze(sequence_mask(lengths, x.size(2)), 1).to(\n"," x.dtype\n"," )\n"," x = self.encoder(x * x_mask, x_mask)\n"," stats = self.proj(x) * x_mask\n","\n"," m, logs = torch.split(stats, self.out_channels, dim=1)\n"," return m, logs, x_mask\n","\n","\n","class ResidualCouplingBlock(nn.Module):\n"," def __init__(\n"," self,\n"," channels,\n"," hidden_channels,\n"," kernel_size,\n"," dilation_rate,\n"," n_layers,\n"," n_flows=4,\n"," gin_channels=0,\n"," ):\n"," super().__init__()\n"," self.channels = channels\n"," self.hidden_channels = hidden_channels\n"," self.kernel_size = kernel_size\n"," self.dilation_rate = dilation_rate\n"," self.n_layers = n_layers\n"," self.n_flows = n_flows\n"," self.gin_channels = gin_channels\n","\n"," self.flows = nn.ModuleList()\n"," for i in range(n_flows):\n"," self.flows.append(\n"," ResidualCouplingLayer(\n"," channels,\n"," hidden_channels,\n"," kernel_size,\n"," dilation_rate,\n"," n_layers,\n"," gin_channels=gin_channels,\n"," mean_only=True,\n"," )\n"," )\n"," self.flows.append(Flip())\n","\n"," def forward(self, x, x_mask, 
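# g: optional global (e.g. speaker) conditioning; reverse=True inverts the flow chain\n","                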
g=None, reverse=False):\n"," if not reverse:\n"," for flow in self.flows:\n"," x, _ = flow(x, x_mask, g=g, reverse=reverse)\n"," else:\n"," for flow in reversed(self.flows):\n"," x = flow(x, x_mask, g=g, reverse=reverse)\n"," return x\n","\n"," def remove_weight_norm(self):\n"," for i in range(self.n_flows):\n"," self.flows[i * 2].remove_weight_norm()\n","\n","\n","class PosteriorEncoder(nn.Module):\n"," def __init__(\n"," self,\n"," in_channels,\n"," out_channels,\n"," hidden_channels,\n"," kernel_size,\n"," dilation_rate,\n"," n_layers,\n"," gin_channels=0,\n"," ):\n"," super().__init__()\n"," self.in_channels = in_channels\n"," self.out_channels = out_channels\n"," self.hidden_channels = hidden_channels\n"," self.kernel_size = kernel_size\n"," self.dilation_rate = dilation_rate\n"," self.n_layers = n_layers\n"," self.gin_channels = gin_channels\n","\n"," self.pre = nn.Conv1d(in_channels, hidden_channels, 1)\n"," self.enc = WN(\n"," hidden_channels,\n"," kernel_size,\n"," dilation_rate,\n"," n_layers,\n"," gin_channels=gin_channels,\n"," )\n"," self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)\n","\n"," def forward(self, x, x_lengths, g=None):\n"," x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(\n"," x.dtype\n"," )\n"," x = self.pre(x) * x_mask\n"," x = self.enc(x, x_mask, g=g)\n"," stats = self.proj(x) * x_mask\n"," m, logs = torch.split(stats, self.out_channels, dim=1)\n"," z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask\n"," return z, m, logs, x_mask\n","\n"," def remove_weight_norm(self):\n"," self.enc.remove_weight_norm()\n","\n","\n","class Generator(torch.nn.Module):\n"," def __init__(\n"," self,\n"," initial_channel,\n"," resblock,\n"," resblock_kernel_sizes,\n"," resblock_dilation_sizes,\n"," upsample_rates,\n"," upsample_initial_channel,\n"," upsample_kernel_sizes,\n"," gin_channels=0,\n"," ):\n"," super(Generator, self).__init__()\n"," self.num_kernels = len(resblock_kernel_sizes)\n"," self.num_upsamples = len(upsample_rates)\n"," self.conv_pre = Conv1d(\n"," initial_channel, upsample_initial_channel, 7, 1, padding=3\n"," )\n"," resblock = ResBlock1 if resblock == \"1\" else ResBlock2\n","\n"," self.ups = nn.ModuleList()\n"," for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):\n"," self.ups.append(\n"," weight_norm(\n"," ConvTranspose1d(\n"," upsample_initial_channel // (2**i),\n"," upsample_initial_channel // (2 ** (i + 1)),\n"," k,\n"," u,\n"," padding=(k - u) // 2,\n"," )\n"," )\n"," )\n","\n"," self.resblocks = nn.ModuleList()\n"," for i in range(len(self.ups)):\n"," ch = upsample_initial_channel // (2 ** (i + 1))\n"," for j, (k, d) in enumerate(\n"," zip(resblock_kernel_sizes, resblock_dilation_sizes)\n"," ):\n"," self.resblocks.append(resblock(ch, k, d))\n","\n"," self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)\n"," self.ups.apply(init_weights)\n","\n"," if gin_channels != 0:\n"," self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)\n","\n"," def forward(self, x, g=None):\n"," x = self.conv_pre(x)\n"," if g is not None:\n"," x = x + self.cond(g)\n","\n"," for i in range(self.num_upsamples):\n"," x = F.leaky_relu(x, LRELU_SLOPE)\n"," x = self.ups[i](x)\n"," xs = None\n"," for j in range(self.num_kernels):\n"," if xs is None:\n"," xs = self.resblocks[i * self.num_kernels + j](x)\n"," else:\n"," xs += self.resblocks[i * self.num_kernels + j](x)\n"," x = xs / self.num_kernels\n"," x = F.leaky_relu(x)\n"," x = self.conv_post(x)\n"," x = torch.tanh(x)\n","\n"," return x\n","\n"," def 
remove_weight_norm(self):\n",
"        for l in self.ups:\n",
"            remove_weight_norm(l)\n",
"        for l in self.resblocks:\n",
"            l.remove_weight_norm()\n",
"\n",
"\n",
"class SineGen(torch.nn.Module):\n",
"    \"\"\"Definition of sine generator\n",
"    SineGen(samp_rate, harmonic_num = 0,\n",
"            sine_amp = 0.1, noise_std = 0.003,\n",
"            voiced_threshold = 0,\n",
"            flag_for_pulse=False)\n",
"    samp_rate: sampling rate in Hz\n",
"    harmonic_num: number of harmonic overtones (default 0)\n",
"    sine_amp: amplitude of sine-waveform (default 0.1)\n",
"    noise_std: std of Gaussian noise (default 0.003)\n",
"    voiced_threshold: F0 threshold for U/V classification (default 0)\n",
"    flag_for_pulse: this SineGen is used inside PulseGen (default False)\n",
"    Note: when flag_for_pulse is True, the first time step of a voiced\n",
"    segment is always sin(np.pi) or cos(0)\n",
"    \"\"\"\n",
"\n",
"    def __init__(\n",
"        self,\n",
"        samp_rate,\n",
"        harmonic_num=0,\n",
"        sine_amp=0.1,\n",
"        noise_std=0.003,\n",
"        voiced_threshold=0,\n",
"        flag_for_pulse=False,\n",
"    ):\n",
"        super(SineGen, self).__init__()\n",
"        self.sine_amp = sine_amp\n",
"        self.noise_std = noise_std\n",
"        self.harmonic_num = harmonic_num\n",
"        self.dim = self.harmonic_num + 1\n",
"        self.sampling_rate = samp_rate\n",
"        self.voiced_threshold = voiced_threshold\n",
"\n",
"    def _f02uv(self, f0):\n",
"        # generate uv signal\n",
"        uv = torch.ones_like(f0)\n",
"        uv = uv * (f0 > self.voiced_threshold)\n",
"        return uv\n",
"\n",
"    def forward(self, f0, upp):\n",
"        \"\"\"sine_tensor, uv = forward(f0)\n",
"        input F0: tensor(batchsize=1, length, dim=1)\n",
"        f0 for unvoiced steps should be 0\n",
"        output sine_tensor: tensor(batchsize=1, length, dim)\n",
"        output uv: tensor(batchsize=1, length, 1)\n",
"        \"\"\"\n",
"        with torch.no_grad():\n",
"            f0 = f0[:, None].transpose(1, 2)\n",
"            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)\n",
"            # fundamental component\n",
"            f0_buf[:, :, 0] = f0[:, :, 0]\n",
"            for idx in np.arange(self.harmonic_num):\n",
"                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (\n",
"                    idx + 2\n",
"                )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic\n",
"            rad_values = (f0_buf / self.sampling_rate) % 1  # the % 1 means the harmonic products cannot be optimized away in post-processing\n",
"            rand_ini = torch.rand(\n",
"                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device\n",
"            )\n",
"            rand_ini[:, 0] = 0\n",
"            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini\n",
"            tmp_over_one = torch.cumsum(rad_values, 1)  # % 1  # a % 1 here would keep the cumsum below from being optimized\n",
"            tmp_over_one *= upp\n",
"            tmp_over_one = F.interpolate(\n",
"                tmp_over_one.transpose(2, 1),\n",
"                scale_factor=upp,\n",
"                mode=\"linear\",\n",
"                align_corners=True,\n",
"            ).transpose(2, 1)\n",
"            rad_values = F.interpolate(\n",
"                rad_values.transpose(2, 1), scale_factor=upp, mode=\"nearest\"\n",
"            ).transpose(\n",
"                2, 1\n",
"            )\n",
"            tmp_over_one %= 1\n",
"            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0\n",
"            cumsum_shift = torch.zeros_like(rad_values)\n",
"            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0\n",
"            sine_waves = torch.sin(\n",
"                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi\n",
"            )\n",
"            sine_waves = sine_waves * self.sine_amp\n",
"            uv = self._f02uv(f0)\n",
"            uv = F.interpolate(\n",
"                uv.transpose(2, 1), scale_factor=upp, mode=\"nearest\"\n",
"            ).transpose(2, 1)\n",
"            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3\n",
"            noise = noise_amp * torch.randn_like(sine_waves)\n",
"            sine_waves = sine_waves * uv + noise\n",
"            return sine_waves, uv, noise\n",
"\n",
"\n",
"class SourceModuleHnNSF(torch.nn.Module):\n",
"    \"\"\"SourceModule for hn-nsf\n",
"    SourceModule(sampling_rate, 
harmonic_num=0, sine_amp=0.1,\n",
"                 add_noise_std=0.003, voiced_threshod=0)\n",
"    sampling_rate: sampling_rate in Hz\n",
"    harmonic_num: number of harmonic above F0 (default: 0)\n",
"    sine_amp: amplitude of sine source signal (default: 0.1)\n",
"    add_noise_std: std of additive Gaussian noise (default: 0.003)\n",
"        note that amplitude of noise in unvoiced is decided\n",
"        by sine_amp\n",
"    voiced_threshold: threshold to set U/V given F0 (default: 0)\n",
"    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)\n",
"    F0_sampled (batchsize, length, 1)\n",
"    Sine_source (batchsize, length, 1)\n",
"    noise_source (batchsize, length, 1)\n",
"    uv (batchsize, length, 1)\n",
"    \"\"\"\n",
"\n",
"    def __init__(\n",
"        self,\n",
"        sampling_rate,\n",
"        harmonic_num=0,\n",
"        sine_amp=0.1,\n",
"        add_noise_std=0.003,\n",
"        voiced_threshod=0,\n",
"        is_half=True,\n",
"    ):\n",
"        super(SourceModuleHnNSF, self).__init__()\n",
"\n",
"        self.sine_amp = sine_amp\n",
"        self.noise_std = add_noise_std\n",
"        self.is_half = is_half\n",
"        # to produce sine waveforms\n",
"        self.l_sin_gen = SineGen(\n",
"            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod\n",
"        )\n",
"\n",
"        # to merge source harmonics into a single excitation\n",
"        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)\n",
"        self.l_tanh = torch.nn.Tanh()\n",
"\n",
"    def forward(self, x, upp=None):\n",
"        sine_wavs, uv, _ = self.l_sin_gen(x, upp)\n",
"        if self.is_half:\n",
"            sine_wavs = sine_wavs.half()\n",
"        sine_merge = self.l_tanh(self.l_linear(sine_wavs))\n",
"        return sine_merge, None, None  # noise, uv\n",
"\n",
"\n",
"class GeneratorNSF(torch.nn.Module):\n",
"    def __init__(\n",
"        self,\n",
"        initial_channel,\n",
"        resblock,\n",
"        resblock_kernel_sizes,\n",
"        resblock_dilation_sizes,\n",
"        upsample_rates,\n",
"        upsample_initial_channel,\n",
"        upsample_kernel_sizes,\n",
"        gin_channels,\n",
"        sr,\n",
"        is_half=False,\n",
"    ):\n",
"        super(GeneratorNSF, self).__init__()\n",
"        self.num_kernels = len(resblock_kernel_sizes)\n",
"        self.num_upsamples = len(upsample_rates)\n",
"\n",
"        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))\n",
"        self.m_source = SourceModuleHnNSF(\n",
"            sampling_rate=sr, harmonic_num=0, is_half=is_half\n",
"        )\n",
"        self.noise_convs = nn.ModuleList()\n",
"        self.conv_pre = Conv1d(\n",
"            initial_channel, upsample_initial_channel, 7, 1, padding=3\n",
"        )\n",
"        resblock = ResBlock1 if resblock == \"1\" else ResBlock2\n",
"\n",
"        self.ups = nn.ModuleList()\n",
"        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):\n",
"            c_cur = upsample_initial_channel // (2 ** (i + 1))\n",
"            self.ups.append(\n",
"                weight_norm(\n",
"                    ConvTranspose1d(\n",
"                        upsample_initial_channel // (2**i),\n",
"                        upsample_initial_channel // (2 ** (i + 1)),\n",
"                        k,\n",
"                        u,\n",
"                        padding=(k - u) // 2,\n",
"                    )\n",
"                )\n",
"            )\n",
"            if i + 1 < len(upsample_rates):\n",
"                stride_f0 = np.prod(upsample_rates[i + 1 :])\n",
"                self.noise_convs.append(\n",
"                    Conv1d(\n",
"                        1,\n",
"                        c_cur,\n",
"                        kernel_size=stride_f0 * 2,\n",
"                        stride=stride_f0,\n",
"                        padding=stride_f0 // 2,\n",
"                    )\n",
"                )\n",
"            else:\n",
"                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))\n",
"\n",
"        self.resblocks = nn.ModuleList()\n",
"        for i in range(len(self.ups)):\n",
"            ch = upsample_initial_channel // (2 ** (i + 1))\n",
"            for j, (k, d) in enumerate(\n",
"                zip(resblock_kernel_sizes, resblock_dilation_sizes)\n",
"            ):\n",
"                self.resblocks.append(resblock(ch, k, d))\n",
"\n",
"        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)\n",
"        self.ups.apply(init_weights)\n",
"\n",
"        if gin_channels != 0:\n",
"            self.cond = nn.Conv1d(gin_channels, 
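# 1x1 conv mapping the speaker embedding onto the pre-upsample feature width\n","                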
upsample_initial_channel, 1)\n",
"\n",
"        self.upp = np.prod(upsample_rates)\n",
"\n",
"    def forward(self, x, f0, g=None):\n",
"        har_source, noi_source, uv = self.m_source(f0, self.upp)\n",
"        har_source = har_source.transpose(1, 2)\n",
"        x = self.conv_pre(x)\n",
"        if g is not None:\n",
"            x = x + self.cond(g)\n",
"\n",
"        for i in range(self.num_upsamples):\n",
"            x = F.leaky_relu(x, LRELU_SLOPE)\n",
"            x = self.ups[i](x)\n",
"            x_source = self.noise_convs[i](har_source)\n",
"            x = x + x_source\n",
"            xs = None\n",
"            for j in range(self.num_kernels):\n",
"                if xs is None:\n",
"                    xs = self.resblocks[i * self.num_kernels + j](x)\n",
"                else:\n",
"                    xs += self.resblocks[i * self.num_kernels + j](x)\n",
"            x = xs / self.num_kernels\n",
"        x = F.leaky_relu(x)\n",
"        x = self.conv_post(x)\n",
"        x = torch.tanh(x)\n",
"        return x\n",
"\n",
"    def remove_weight_norm(self):\n",
"        for l in self.ups:\n",
"            remove_weight_norm(l)\n",
"        for l in self.resblocks:\n",
"            l.remove_weight_norm()\n",
"\n",
"\n",
"sr2sr = {\n",
"    \"32k\": 32000,\n",
"    \"40k\": 40000,\n",
"    \"48k\": 48000,\n",
"}\n",
"\n",
"\n",
"class SynthesizerTrnMs256NSFsid(nn.Module):\n",
"    def __init__(\n",
"        self,\n",
"        spec_channels,\n",
"        segment_size,\n",
"        inter_channels,\n",
"        hidden_channels,\n",
"        filter_channels,\n",
"        n_heads,\n",
"        n_layers,\n",
"        kernel_size,\n",
"        p_dropout,\n",
"        resblock,\n",
"        resblock_kernel_sizes,\n",
"        resblock_dilation_sizes,\n",
"        upsample_rates,\n",
"        upsample_initial_channel,\n",
"        upsample_kernel_sizes,\n",
"        spk_embed_dim,\n",
"        gin_channels,\n",
"        sr,\n",
"        **kwargs\n",
"    ):\n",
"        super().__init__()\n",
"        if isinstance(sr, str):\n",
"            sr = sr2sr[sr]\n",
"        self.spec_channels = spec_channels\n",
"        self.inter_channels = inter_channels\n",
"        self.hidden_channels = hidden_channels\n",
"        self.filter_channels = filter_channels\n",
"        self.n_heads = n_heads\n",
"        self.n_layers = n_layers\n",
"        self.kernel_size = kernel_size\n",
"        self.p_dropout = p_dropout\n",
"        self.resblock = resblock\n",
"        self.resblock_kernel_sizes = resblock_kernel_sizes\n",
"        self.resblock_dilation_sizes = resblock_dilation_sizes\n",
"        self.upsample_rates = upsample_rates\n",
"        self.upsample_initial_channel = upsample_initial_channel\n",
"        self.upsample_kernel_sizes = upsample_kernel_sizes\n",
"        self.segment_size = segment_size\n",
"        self.gin_channels = gin_channels\n",
"        # self.hop_length = hop_length#\n",
"        self.spk_embed_dim = spk_embed_dim\n",
"        self.enc_p = TextEncoder256(\n",
"            inter_channels,\n",
"            hidden_channels,\n",
"            filter_channels,\n",
"            n_heads,\n",
"            n_layers,\n",
"            kernel_size,\n",
"            p_dropout,\n",
"        )\n",
"        self.dec = GeneratorNSF(\n",
"            inter_channels,\n",
"            resblock,\n",
"            resblock_kernel_sizes,\n",
"            resblock_dilation_sizes,\n",
"            upsample_rates,\n",
"            upsample_initial_channel,\n",
"            upsample_kernel_sizes,\n",
"            gin_channels=gin_channels,\n",
"            sr=sr,\n",
"            is_half=kwargs[\"is_half\"],\n",
"        )\n",
"        self.enc_q = PosteriorEncoder(\n",
"            spec_channels,\n",
"            inter_channels,\n",
"            hidden_channels,\n",
"            5,\n",
"            1,\n",
"            16,\n",
"            gin_channels=gin_channels,\n",
"        )\n",
"        self.flow = ResidualCouplingBlock(\n",
"            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels\n",
"        )\n",
"        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)\n",
"        print(\"gin_channels:\", gin_channels, \"self.spk_embed_dim:\", self.spk_embed_dim)\n",
"\n",
"    def remove_weight_norm(self):\n",
"        self.dec.remove_weight_norm()\n",
"        self.flow.remove_weight_norm()\n",
"        self.enc_q.remove_weight_norm()\n",
"\n",
"    def forward(\n",
"        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds\n",
"    ):  # ds is the speaker id, shape [bs, 1]\n",
"        # print(1,pitch.shape)#[bs,t]\n",
"        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is t, broadcast over time\n",
"        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)\n",
"        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)\n",
"        z_p = self.flow(z, y_mask, g=g)\n",
"        z_slice, ids_slice = rand_slice_segments(\n",
"            z, y_lengths, self.segment_size\n",
"        )\n",
"        # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)\n",
"        pitchf = slice_segments2(pitchf, ids_slice, self.segment_size)\n",
"        # print(-2,pitchf.shape,z_slice.shape)\n",
"        o = self.dec(z_slice, pitchf, g=g)\n",
"        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)\n",
"\n",
"    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):\n",
"        g = self.emb_g(sid).unsqueeze(-1)\n",
"        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)\n",
"        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask\n",
"        z = self.flow(z_p, x_mask, g=g, reverse=True)\n",
"        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)\n",
"        return o, x_mask, (z, z_p, m_p, logs_p)\n",
"\n",
"\n",
"class SynthesizerTrnMs768NSFsid(nn.Module):\n",
"    def __init__(\n",
"        self,\n",
"        spec_channels,\n",
"        segment_size,\n",
"        inter_channels,\n",
"        hidden_channels,\n",
"        filter_channels,\n",
"        n_heads,\n",
"        n_layers,\n",
"        kernel_size,\n",
"        p_dropout,\n",
"        resblock,\n",
"        resblock_kernel_sizes,\n",
"        resblock_dilation_sizes,\n",
"        upsample_rates,\n",
"        upsample_initial_channel,\n",
"        upsample_kernel_sizes,\n",
"        spk_embed_dim,\n",
"        gin_channels,\n",
"        sr,\n",
"        **kwargs\n",
"    ):\n",
"        super().__init__()\n",
"        if isinstance(sr, str):\n",
"            sr = sr2sr[sr]\n",
"        self.spec_channels = spec_channels\n",
"        self.inter_channels = inter_channels\n",
"        self.hidden_channels = hidden_channels\n",
"        self.filter_channels = filter_channels\n",
"        self.n_heads = n_heads\n",
"        self.n_layers = n_layers\n",
"        self.kernel_size = kernel_size\n",
"        self.p_dropout = p_dropout\n",
"        self.resblock = resblock\n",
"        self.resblock_kernel_sizes = resblock_kernel_sizes\n",
"        self.resblock_dilation_sizes = resblock_dilation_sizes\n",
"        self.upsample_rates = upsample_rates\n",
"        self.upsample_initial_channel = upsample_initial_channel\n",
"        self.upsample_kernel_sizes = upsample_kernel_sizes\n",
"        self.segment_size = segment_size\n",
"        self.gin_channels = gin_channels\n",
"        # self.hop_length = hop_length#\n",
"        self.spk_embed_dim = spk_embed_dim\n",
"        self.enc_p = TextEncoder768(\n",
"            inter_channels,\n",
"            hidden_channels,\n",
"            filter_channels,\n",
"            n_heads,\n",
"            n_layers,\n",
"            kernel_size,\n",
"            p_dropout,\n",
"        )\n",
"        self.dec = GeneratorNSF(\n",
"            inter_channels,\n",
"            resblock,\n",
"            resblock_kernel_sizes,\n",
"            resblock_dilation_sizes,\n",
"            upsample_rates,\n",
"            upsample_initial_channel,\n",
"            upsample_kernel_sizes,\n",
"            gin_channels=gin_channels,\n",
"            sr=sr,\n",
"            is_half=kwargs[\"is_half\"],\n",
"        )\n",
"        self.enc_q = PosteriorEncoder(\n",
"            spec_channels,\n",
"            inter_channels,\n",
"            hidden_channels,\n",
"            5,\n",
"            1,\n",
"            16,\n",
"            gin_channels=gin_channels,\n",
"        )\n",
"        self.flow = ResidualCouplingBlock(\n",
"            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels\n",
"        )\n",
"        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)\n",
"        print(\"gin_channels:\", gin_channels, \"self.spk_embed_dim:\", self.spk_embed_dim)\n",
"\n",
"    def remove_weight_norm(self):\n",
"        self.dec.remove_weight_norm()\n",
"        self.flow.remove_weight_norm()\n",
"        self.enc_q.remove_weight_norm()\n",
"\n",
"    def forward(\n",
"        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds\n",
"    ):  # ds is the speaker id, shape [bs, 1]\n",
"        # print(1,pitch.shape)#[bs,t]\n",
"        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is t, broadcast over time\n",
"        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)\n",
"        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)\n",
"        z_p = self.flow(z, y_mask, g=g)\n",
"        z_slice, ids_slice = rand_slice_segments(\n",
"            z, y_lengths, self.segment_size\n",
"        )\n",
"        # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)\n",
"        pitchf = slice_segments2(pitchf, ids_slice, self.segment_size)\n",
"        # print(-2,pitchf.shape,z_slice.shape)\n",
"        o = self.dec(z_slice, pitchf, g=g)\n",
"        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)\n",
"\n",
"    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):\n",
"        g = self.emb_g(sid).unsqueeze(-1)\n",
"        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)\n",
"        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask\n",
"        z = self.flow(z_p, x_mask, g=g, reverse=True)\n",
"        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)\n",
"        return o, x_mask, (z, z_p, m_p, logs_p)\n",
"\n",
"\n",
"class SynthesizerTrnMs256NSFsid_nono(nn.Module):\n",
"    def __init__(\n",
"        self,\n",
"        spec_channels,\n",
"        segment_size,\n",
"        inter_channels,\n",
"        hidden_channels,\n",
"        filter_channels,\n",
"        n_heads,\n",
"        n_layers,\n",
"        kernel_size,\n",
"        p_dropout,\n",
"        resblock,\n",
"        resblock_kernel_sizes,\n",
"        resblock_dilation_sizes,\n",
"        upsample_rates,\n",
"        upsample_initial_channel,\n",
"        upsample_kernel_sizes,\n",
"        spk_embed_dim,\n",
"        gin_channels,\n",
"        sr=None,\n",
"        **kwargs\n",
"    ):\n",
"        super().__init__()\n",
"        self.spec_channels = spec_channels\n",
"        self.inter_channels = inter_channels\n",
"        self.hidden_channels = hidden_channels\n",
"        self.filter_channels = filter_channels\n",
"        self.n_heads = n_heads\n",
"        self.n_layers = n_layers\n",
"        self.kernel_size = kernel_size\n",
"        self.p_dropout = p_dropout\n",
"        self.resblock = resblock\n",
"        self.resblock_kernel_sizes = resblock_kernel_sizes\n",
"        self.resblock_dilation_sizes = resblock_dilation_sizes\n",
"        self.upsample_rates = upsample_rates\n",
"        self.upsample_initial_channel = upsample_initial_channel\n",
"        self.upsample_kernel_sizes = upsample_kernel_sizes\n",
"        self.segment_size = segment_size\n",
"        self.gin_channels = gin_channels\n",
"        # self.hop_length = hop_length#\n",
"        self.spk_embed_dim = spk_embed_dim\n",
"        self.enc_p = TextEncoder256(\n",
"            inter_channels,\n",
"            hidden_channels,\n",
"            filter_channels,\n",
"            n_heads,\n",
"            n_layers,\n",
"            kernel_size,\n",
"            p_dropout,\n",
"            f0=False,\n",
"        )\n",
"        self.dec = Generator(\n",
"            inter_channels,\n",
"            resblock,\n",
"            resblock_kernel_sizes,\n",
"            resblock_dilation_sizes,\n",
"            upsample_rates,\n",
"            upsample_initial_channel,\n",
"            upsample_kernel_sizes,\n",
"            gin_channels=gin_channels,\n",
"        )\n",
"        self.enc_q = PosteriorEncoder(\n",
"            spec_channels,\n",
"            inter_channels,\n",
"            hidden_channels,\n",
"            5,\n",
"            1,\n",
"            16,\n",
"            gin_channels=gin_channels,\n",
"        )\n",
"        self.flow = ResidualCouplingBlock(\n",
"            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels\n",
"        )\n",
"        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)\n",
"        print(\"gin_channels:\", gin_channels, \"self.spk_embed_dim:\", self.spk_embed_dim)\n",
"\n",
"    def remove_weight_norm(self):\n",
"        self.dec.remove_weight_norm()\n",
"        self.flow.remove_weight_norm()\n",
"        self.enc_q.remove_weight_norm()\n",
"\n",
"    def forward(self, phone, phone_lengths, y, y_lengths, ds):  # ds is the speaker id, shape [bs, 1]\n",
"        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is t, broadcast over time\n",
"        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)\n",
"        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)\n",
"        z_p = self.flow(z, y_mask, g=g)\n",
"        z_slice, ids_slice = rand_slice_segments(\n",
"            z, y_lengths, self.segment_size\n",
"        )\n",
"        o = self.dec(z_slice, g=g)\n",
"        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)\n",
"\n",
"    def infer(self, phone, phone_lengths, sid, max_len=None):\n",
"        g = self.emb_g(sid).unsqueeze(-1)\n",
"        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)\n",
"        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask\n",
"        z = self.flow(z_p, x_mask, g=g, reverse=True)\n",
"        o = self.dec((z * x_mask)[:, :, :max_len], g=g)\n",
"        return o, x_mask, (z, z_p, m_p, logs_p)\n",
"\n",
"\n",
"class SynthesizerTrnMs768NSFsid_nono(nn.Module):\n",
"    def __init__(\n",
"        self,\n",
"        spec_channels,\n",
"        segment_size,\n",
"        inter_channels,\n",
"        hidden_channels,\n",
"        filter_channels,\n",
"        n_heads,\n",
"        n_layers,\n",
"        kernel_size,\n",
"        p_dropout,\n",
"        resblock,\n",
"        resblock_kernel_sizes,\n",
"        resblock_dilation_sizes,\n",
"        upsample_rates,\n",
"        upsample_initial_channel,\n",
"        upsample_kernel_sizes,\n",
"        spk_embed_dim,\n",
"        gin_channels,\n",
"        sr=None,\n",
"        **kwargs\n",
"    ):\n",
"        super().__init__()\n",
"        self.spec_channels = spec_channels\n",
"        self.inter_channels = inter_channels\n",
"        self.hidden_channels = hidden_channels\n",
"        self.filter_channels = filter_channels\n",
"        self.n_heads = n_heads\n",
"        self.n_layers = n_layers\n",
"        self.kernel_size = kernel_size\n",
"        self.p_dropout = p_dropout\n",
"        self.resblock = resblock\n",
"        self.resblock_kernel_sizes = resblock_kernel_sizes\n",
"        self.resblock_dilation_sizes = resblock_dilation_sizes\n",
"        self.upsample_rates = upsample_rates\n",
"        self.upsample_initial_channel = upsample_initial_channel\n",
"        self.upsample_kernel_sizes = upsample_kernel_sizes\n",
"        self.segment_size = segment_size\n",
"        self.gin_channels = gin_channels\n",
"        # self.hop_length = hop_length#\n",
"        self.spk_embed_dim = spk_embed_dim\n",
"        self.enc_p = TextEncoder768(\n",
"            inter_channels,\n",
"            hidden_channels,\n",
"            filter_channels,\n",
"            n_heads,\n",
"            n_layers,\n",
"            kernel_size,\n",
"            p_dropout,\n",
"            f0=False,\n",
"        )\n",
"        self.dec = Generator(\n",
"            inter_channels,\n",
"            resblock,\n",
"            resblock_kernel_sizes,\n",
"            resblock_dilation_sizes,\n",
"            upsample_rates,\n",
"            upsample_initial_channel,\n",
"            upsample_kernel_sizes,\n",
"            gin_channels=gin_channels,\n",
"        )\n",
"        self.enc_q = PosteriorEncoder(\n",
"            spec_channels,\n",
"            inter_channels,\n",
"            hidden_channels,\n",
"            5,\n",
"            1,\n",
"            16,\n",
"            gin_channels=gin_channels,\n",
"        )\n",
"        self.flow = ResidualCouplingBlock(\n",
"            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels\n",
"        )\n",
"        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)\n",
"        print(\"gin_channels:\", gin_channels, \"self.spk_embed_dim:\", self.spk_embed_dim)\n",
"\n",
"    def remove_weight_norm(self):\n",
"        self.dec.remove_weight_norm()\n",
"        self.flow.remove_weight_norm()\n",
"        self.enc_q.remove_weight_norm()\n",
"\n",
"    def forward(self, phone, phone_lengths, y, y_lengths, ds):  # ds is the speaker id, shape [bs, 1]\n",
"        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is t, broadcast over time\n",
"        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)\n",
"        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)\n",
"        z_p = self.flow(z, y_mask, g=g)\n",
"        z_slice, ids_slice = rand_slice_segments(\n",
"            z, y_lengths, self.segment_size\n",
"        )\n",
"        o = self.dec(z_slice, g=g)\n",
"        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)\n",
"\n",
"    def infer(self, 
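# sid: speaker id; max_len optionally truncates the decoded output frames\n","              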
phone, phone_lengths, sid, max_len=None):\n"," g = self.emb_g(sid).unsqueeze(-1)\n"," m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)\n"," z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask\n"," z = self.flow(z_p, x_mask, g=g, reverse=True)\n"," o = self.dec((z * x_mask)[:, :, :max_len], g=g)\n"," return o, x_mask, (z, z_p, m_p, logs_p)\n","\n","\n","class MultiPeriodDiscriminator(torch.nn.Module):\n"," def __init__(self, use_spectral_norm=False):\n"," super(MultiPeriodDiscriminator, self).__init__()\n"," periods = [2, 3, 5, 7, 11, 17]\n"," # periods = [3, 5, 7, 11, 17, 23, 37]\n","\n"," discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]\n"," discs = discs + [\n"," DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods\n"," ]\n"," self.discriminators = nn.ModuleList(discs)\n","\n"," def forward(self, y, y_hat):\n"," y_d_rs = [] #\n"," y_d_gs = []\n"," fmap_rs = []\n"," fmap_gs = []\n"," for i, d in enumerate(self.discriminators):\n"," y_d_r, fmap_r = d(y)\n"," y_d_g, fmap_g = d(y_hat)\n"," # for j in range(len(fmap_r)):\n"," # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)\n"," y_d_rs.append(y_d_r)\n"," y_d_gs.append(y_d_g)\n"," fmap_rs.append(fmap_r)\n"," fmap_gs.append(fmap_g)\n","\n"," return y_d_rs, y_d_gs, fmap_rs, fmap_gs\n","\n","\n","class MultiPeriodDiscriminatorV2(torch.nn.Module):\n"," def __init__(self, use_spectral_norm=False):\n"," super(MultiPeriodDiscriminatorV2, self).__init__()\n"," # periods = [2, 3, 5, 7, 11, 17]\n"," periods = [2, 3, 5, 7, 11, 17, 23, 37]\n","\n"," discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]\n"," discs = discs + [\n"," DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods\n"," ]\n"," self.discriminators = nn.ModuleList(discs)\n","\n"," def forward(self, y, y_hat):\n"," y_d_rs = [] #\n"," y_d_gs = []\n"," fmap_rs = []\n"," fmap_gs = []\n"," for i, d in enumerate(self.discriminators):\n"," y_d_r, fmap_r = d(y)\n"," y_d_g, fmap_g = d(y_hat)\n"," # for j in range(len(fmap_r)):\n"," # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)\n"," y_d_rs.append(y_d_r)\n"," y_d_gs.append(y_d_g)\n"," fmap_rs.append(fmap_r)\n"," fmap_gs.append(fmap_g)\n","\n"," return y_d_rs, y_d_gs, fmap_rs, fmap_gs\n","\n","\n","class DiscriminatorS(torch.nn.Module):\n"," def __init__(self, use_spectral_norm=False):\n"," super(DiscriminatorS, self).__init__()\n"," norm_f = weight_norm if use_spectral_norm == False else spectral_norm\n"," self.convs = nn.ModuleList(\n"," [\n"," norm_f(Conv1d(1, 16, 15, 1, padding=7)),\n"," norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),\n"," norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),\n"," norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),\n"," norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),\n"," norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),\n"," ]\n"," )\n"," self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))\n","\n"," def forward(self, x):\n"," fmap = []\n","\n"," for l in self.convs:\n"," x = l(x)\n"," x = F.leaky_relu(x, LRELU_SLOPE)\n"," fmap.append(x)\n"," x = self.conv_post(x)\n"," fmap.append(x)\n"," x = torch.flatten(x, 1, -1)\n","\n"," return x, fmap\n","\n","\n","class DiscriminatorP(torch.nn.Module):\n"," def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):\n"," super(DiscriminatorP, self).__init__()\n"," self.period = period\n"," self.use_spectral_norm = use_spectral_norm\n"," norm_f = weight_norm if use_spectral_norm == False else 
spectral_norm\n"," self.convs = nn.ModuleList(\n"," [\n"," norm_f(\n"," Conv2d(\n"," 1,\n"," 32,\n"," (kernel_size, 1),\n"," (stride, 1),\n"," padding=(get_padding(kernel_size, 1), 0),\n"," )\n"," ),\n"," norm_f(\n"," Conv2d(\n"," 32,\n"," 128,\n"," (kernel_size, 1),\n"," (stride, 1),\n"," padding=(get_padding(kernel_size, 1), 0),\n"," )\n"," ),\n"," norm_f(\n"," Conv2d(\n"," 128,\n"," 512,\n"," (kernel_size, 1),\n"," (stride, 1),\n"," padding=(get_padding(kernel_size, 1), 0),\n"," )\n"," ),\n"," norm_f(\n"," Conv2d(\n"," 512,\n"," 1024,\n"," (kernel_size, 1),\n"," (stride, 1),\n"," padding=(get_padding(kernel_size, 1), 0),\n"," )\n"," ),\n"," norm_f(\n"," Conv2d(\n"," 1024,\n"," 1024,\n"," (kernel_size, 1),\n"," 1,\n"," padding=(get_padding(kernel_size, 1), 0),\n"," )\n"," ),\n"," ]\n"," )\n"," self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))\n","\n"," def forward(self, x):\n"," fmap = []\n","\n"," # 1d to 2d\n"," b, c, t = x.shape\n"," if t % self.period != 0: # pad first\n"," n_pad = self.period - (t % self.period)\n"," x = F.pad(x, (0, n_pad), \"reflect\")\n"," t = t + n_pad\n"," x = x.view(b, c, t // self.period, self.period)\n","\n"," for l in self.convs:\n"," x = l(x)\n"," x = F.leaky_relu(x, LRELU_SLOPE)\n"," fmap.append(x)\n"," x = self.conv_post(x)\n"," fmap.append(x)\n"," x = torch.flatten(x, 1, -1)\n","\n"," return x, fmap"],"metadata":{"id":"MtiZdZTWMTDE"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#@title Models_Onnx\n","import math, pdb, os\n","from time import time as ttime\n","import torch\n","from torch import nn\n","from torch.nn import functional as F\n","from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d\n","from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm\n","import numpy as np\n","\n","\n","class TextEncoder256(nn.Module):\n"," def __init__(\n"," self,\n"," out_channels,\n"," hidden_channels,\n"," filter_channels,\n"," n_heads,\n"," n_layers,\n"," kernel_size,\n"," p_dropout,\n"," f0=True,\n"," ):\n"," super().__init__()\n"," self.out_channels = out_channels\n"," self.hidden_channels = hidden_channels\n"," self.filter_channels = filter_channels\n"," self.n_heads = n_heads\n"," self.n_layers = n_layers\n"," self.kernel_size = kernel_size\n"," self.p_dropout = p_dropout\n"," self.emb_phone = nn.Linear(256, hidden_channels)\n"," self.lrelu = nn.LeakyReLU(0.1, inplace=True)\n"," if f0 == True:\n"," self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256\n"," self.encoder = Encoder(\n"," hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout\n"," )\n"," self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)\n","\n"," def forward(self, phone, pitch, lengths):\n"," if pitch == None:\n"," x = self.emb_phone(phone)\n"," else:\n"," x = self.emb_phone(phone) + self.emb_pitch(pitch)\n"," x = x * math.sqrt(self.hidden_channels) # [b, t, h]\n"," x = self.lrelu(x)\n"," x = torch.transpose(x, 1, -1) # [b, h, t]\n"," x_mask = torch.unsqueeze(sequence_mask(lengths, x.size(2)), 1).to(\n"," x.dtype\n"," )\n"," x = self.encoder(x * x_mask, x_mask)\n"," stats = self.proj(x) * x_mask\n","\n"," m, logs = torch.split(stats, self.out_channels, dim=1)\n"," return m, logs, x_mask\n","\n","\n","class TextEncoder768(nn.Module):\n"," def __init__(\n"," self,\n"," out_channels,\n"," hidden_channels,\n"," filter_channels,\n"," n_heads,\n"," n_layers,\n"," kernel_size,\n"," p_dropout,\n"," f0=True,\n"," ):\n"," super().__init__()\n"," self.out_channels = out_channels\n"," 
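# mirrors TextEncoder256 above, but emb_phone expects 768-dim phone features\n","        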
self.hidden_channels = hidden_channels\n"," self.filter_channels = filter_channels\n"," self.n_heads = n_heads\n"," self.n_layers = n_layers\n"," self.kernel_size = kernel_size\n"," self.p_dropout = p_dropout\n"," self.emb_phone = nn.Linear(768, hidden_channels)\n"," self.lrelu = nn.LeakyReLU(0.1, inplace=True)\n"," if f0 == True:\n"," self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256\n"," self.encoder = Encoder(\n"," hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout\n"," )\n"," self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)\n","\n"," def forward(self, phone, pitch, lengths):\n"," if pitch == None:\n"," x = self.emb_phone(phone)\n"," else:\n"," x = self.emb_phone(phone) + self.emb_pitch(pitch)\n"," x = x * math.sqrt(self.hidden_channels) # [b, t, h]\n"," x = self.lrelu(x)\n"," x = torch.transpose(x, 1, -1) # [b, h, t]\n"," x_mask = torch.unsqueeze(sequence_mask(lengths, x.size(2)), 1).to(\n"," x.dtype\n"," )\n"," x = self.encoder(x * x_mask, x_mask)\n"," stats = self.proj(x) * x_mask\n","\n"," m, logs = torch.split(stats, self.out_channels, dim=1)\n"," return m, logs, x_mask\n","\n","\n","class ResidualCouplingBlock(nn.Module):\n"," def __init__(\n"," self,\n"," channels,\n"," hidden_channels,\n"," kernel_size,\n"," dilation_rate,\n"," n_layers,\n"," n_flows=4,\n"," gin_channels=0,\n"," ):\n"," super().__init__()\n"," self.channels = channels\n"," self.hidden_channels = hidden_channels\n"," self.kernel_size = kernel_size\n"," self.dilation_rate = dilation_rate\n"," self.n_layers = n_layers\n"," self.n_flows = n_flows\n"," self.gin_channels = gin_channels\n","\n"," self.flows = nn.ModuleList()\n"," for i in range(n_flows):\n"," self.flows.append(\n"," ResidualCouplingLayer(\n"," channels,\n"," hidden_channels,\n"," kernel_size,\n"," dilation_rate,\n"," n_layers,\n"," gin_channels=gin_channels,\n"," mean_only=True,\n"," )\n"," )\n"," self.flows.append(Flip())\n","\n"," def forward(self, x, x_mask, g=None, reverse=False):\n"," if not reverse:\n"," for flow in self.flows:\n"," x, _ = flow(x, x_mask, g=g, reverse=reverse)\n"," else:\n"," for flow in reversed(self.flows):\n"," x = flow(x, x_mask, g=g, reverse=reverse)\n"," return x\n","\n"," def remove_weight_norm(self):\n"," for i in range(self.n_flows):\n"," self.flows[i * 2].remove_weight_norm()\n","\n","\n","class PosteriorEncoder(nn.Module):\n"," def __init__(\n"," self,\n"," in_channels,\n"," out_channels,\n"," hidden_channels,\n"," kernel_size,\n"," dilation_rate,\n"," n_layers,\n"," gin_channels=0,\n"," ):\n"," super().__init__()\n"," self.in_channels = in_channels\n"," self.out_channels = out_channels\n"," self.hidden_channels = hidden_channels\n"," self.kernel_size = kernel_size\n"," self.dilation_rate = dilation_rate\n"," self.n_layers = n_layers\n"," self.gin_channels = gin_channels\n","\n"," self.pre = nn.Conv1d(in_channels, hidden_channels, 1)\n"," self.enc = WN(\n"," hidden_channels,\n"," kernel_size,\n"," dilation_rate,\n"," n_layers,\n"," gin_channels=gin_channels,\n"," )\n"," self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)\n","\n"," def forward(self, x, x_lengths, g=None):\n"," x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(\n"," x.dtype\n"," )\n"," x = self.pre(x) * x_mask\n"," x = self.enc(x, x_mask, g=g)\n"," stats = self.proj(x) * x_mask\n"," m, logs = torch.split(stats, self.out_channels, dim=1)\n"," z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask\n"," return z, m, logs, x_mask\n","\n"," def remove_weight_norm(self):\n"," 
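# strip the weight_norm reparameterization from the WN encoder before inference or export\n","        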
self.enc.remove_weight_norm()\n","\n","\n","class Generator(torch.nn.Module):\n","    def __init__(\n","        self,\n","        initial_channel,\n","        resblock,\n","        resblock_kernel_sizes,\n","        resblock_dilation_sizes,\n","        upsample_rates,\n","        upsample_initial_channel,\n","        upsample_kernel_sizes,\n","        gin_channels=0,\n","    ):\n","        super(Generator, self).__init__()\n","        self.num_kernels = len(resblock_kernel_sizes)\n","        self.num_upsamples = len(upsample_rates)\n","        self.conv_pre = Conv1d(\n","            initial_channel, upsample_initial_channel, 7, 1, padding=3\n","        )\n","        resblock = ResBlock1 if resblock == \"1\" else ResBlock2\n","\n","        self.ups = nn.ModuleList()\n","        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):\n","            self.ups.append(\n","                weight_norm(\n","                    ConvTranspose1d(\n","                        upsample_initial_channel // (2**i),\n","                        upsample_initial_channel // (2 ** (i + 1)),\n","                        k,\n","                        u,\n","                        padding=(k - u) // 2,\n","                    )\n","                )\n","            )\n","\n","        self.resblocks = nn.ModuleList()\n","        for i in range(len(self.ups)):\n","            ch = upsample_initial_channel // (2 ** (i + 1))\n","            for j, (k, d) in enumerate(\n","                zip(resblock_kernel_sizes, resblock_dilation_sizes)\n","            ):\n","                self.resblocks.append(resblock(ch, k, d))\n","\n","        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)\n","        self.ups.apply(init_weights)\n","\n","        if gin_channels != 0:\n","            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)\n","\n","    def forward(self, x, g=None):\n","        x = self.conv_pre(x)\n","        if g is not None:\n","            x = x + self.cond(g)\n","\n","        for i in range(self.num_upsamples):\n","            x = F.leaky_relu(x, LRELU_SLOPE)\n","            x = self.ups[i](x)\n","            xs = None\n","            for j in range(self.num_kernels):\n","                if xs is None:\n","                    xs = self.resblocks[i * self.num_kernels + j](x)\n","                else:\n","                    xs += self.resblocks[i * self.num_kernels + j](x)\n","            x = xs / self.num_kernels\n","        x = F.leaky_relu(x)\n","        x = self.conv_post(x)\n","        x = torch.tanh(x)\n","\n","        return x\n","\n","    def remove_weight_norm(self):\n","        for l in self.ups:\n","            remove_weight_norm(l)\n","        for l in self.resblocks:\n","            l.remove_weight_norm()\n","\n","\n","class SineGen(torch.nn.Module):\n","    \"\"\"Definition of sine generator\n","    SineGen(samp_rate, harmonic_num = 0,\n","            sine_amp = 0.1, noise_std = 0.003,\n","            voiced_threshold = 0,\n","            flag_for_pulse=False)\n","    samp_rate: sampling rate in Hz\n","    harmonic_num: number of harmonic overtones (default 0)\n","    sine_amp: amplitude of sine waveform (default 0.1)\n","    noise_std: std of Gaussian noise (default 0.003)\n","    voiced_threshold: F0 threshold for U/V classification (default 0)\n","    flag_for_pulse: this SineGen is used inside PulseGen (default False)\n","    Note: when flag_for_pulse is True, the first time step of a voiced\n","    segment is always sin(np.pi) or cos(0)\n","    \"\"\"\n","\n","    def __init__(\n","        self,\n","        samp_rate,\n","        harmonic_num=0,\n","        sine_amp=0.1,\n","        noise_std=0.003,\n","        voiced_threshold=0,\n","        flag_for_pulse=False,\n","    ):\n","        super(SineGen, self).__init__()\n","        self.sine_amp = sine_amp\n","        self.noise_std = noise_std\n","        self.harmonic_num = harmonic_num\n","        self.dim = self.harmonic_num + 1\n","        self.sampling_rate = samp_rate\n","        self.voiced_threshold = voiced_threshold\n","\n","    def _f02uv(self, f0):\n","        # generate uv signal\n","        uv = torch.ones_like(f0)\n","        uv = uv * (f0 > self.voiced_threshold)\n","        return uv\n","\n","    def forward(self, f0, upp):\n","        \"\"\"sine_tensor, uv = forward(f0)\n","        input F0: tensor(batchsize=1, length, dim=1)\n","        f0 for unvoiced steps should 
be 0\n","        output sine_tensor: tensor(batchsize=1, length, dim)\n","        output uv: tensor(batchsize=1, length, 1)\n","        \"\"\"\n","        with torch.no_grad():\n","            f0 = f0[:, None].transpose(1, 2)\n","            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)\n","            # fundamental component\n","            f0_buf[:, :, 0] = f0[:, :, 0]\n","            for idx in np.arange(self.harmonic_num):\n","                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (\n","                    idx + 2\n","                )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic\n","            rad_values = (f0_buf / self.sampling_rate) % 1  # the % 1 means the n_har products cannot be optimized away in post-processing\n","            rand_ini = torch.rand(\n","                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device\n","            )\n","            rand_ini[:, 0] = 0\n","            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini\n","            tmp_over_one = torch.cumsum(rad_values, 1)  # % 1  # taking % 1 here would keep the cumsum below from being optimized\n","            tmp_over_one *= upp\n","            tmp_over_one = F.interpolate(\n","                tmp_over_one.transpose(2, 1),\n","                scale_factor=upp,\n","                mode=\"linear\",\n","                align_corners=True,\n","            ).transpose(2, 1)\n","            rad_values = F.interpolate(\n","                rad_values.transpose(2, 1), scale_factor=upp, mode=\"nearest\"\n","            ).transpose(\n","                2, 1\n","            )  #######\n","            tmp_over_one %= 1\n","            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0\n","            cumsum_shift = torch.zeros_like(rad_values)\n","            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0\n","            sine_waves = torch.sin(\n","                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi\n","            )\n","            sine_waves = sine_waves * self.sine_amp\n","            uv = self._f02uv(f0)\n","            uv = F.interpolate(\n","                uv.transpose(2, 1), scale_factor=upp, mode=\"nearest\"\n","            ).transpose(2, 1)\n","            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3\n","            noise = noise_amp * torch.randn_like(sine_waves)\n","            sine_waves = sine_waves * uv + noise\n","            return sine_waves, uv, noise\n","\n","\n","class SourceModuleHnNSF(torch.nn.Module):\n","    \"\"\"SourceModule for hn-nsf\n","    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,\n","                 add_noise_std=0.003, voiced_threshod=0)\n","    sampling_rate: sampling_rate in Hz\n","    harmonic_num: number of harmonic above F0 (default: 0)\n","    sine_amp: amplitude of sine source signal (default: 0.1)\n","    add_noise_std: std of additive Gaussian noise (default: 0.003)\n","        note that amplitude of noise in unvoiced is decided\n","        by sine_amp\n","    voiced_threshold: threshold to set U/V given F0 (default: 0)\n","    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)\n","    F0_sampled (batchsize, length, 1)\n","    Sine_source (batchsize, length, 1)\n","    noise_source (batchsize, length, 1)\n","    uv (batchsize, length, 1)\n","    \"\"\"\n","\n","    def __init__(\n","        self,\n","        sampling_rate,\n","        harmonic_num=0,\n","        sine_amp=0.1,\n","        add_noise_std=0.003,\n","        voiced_threshod=0,\n","        is_half=True,\n","    ):\n","        super(SourceModuleHnNSF, self).__init__()\n","\n","        self.sine_amp = sine_amp\n","        self.noise_std = add_noise_std\n","        self.is_half = is_half\n","        # to produce sine waveforms\n","        self.l_sin_gen = SineGen(\n","            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod\n","        )\n","\n","        # to merge source harmonics into a single excitation\n","        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)\n","        self.l_tanh = torch.nn.Tanh()\n","\n","    def forward(self, x, upp=None):\n","        sine_wavs, uv, _ = self.l_sin_gen(x, upp)\n","        if self.is_half:\n","            sine_wavs = sine_wavs.half()\n","        sine_merge = self.l_tanh(self.l_linear(sine_wavs))\n","        return sine_merge, None, None  # noise, uv\n","\n","\n","class 
GeneratorNSF(torch.nn.Module):\n"," def __init__(\n"," self,\n"," initial_channel,\n"," resblock,\n"," resblock_kernel_sizes,\n"," resblock_dilation_sizes,\n"," upsample_rates,\n"," upsample_initial_channel,\n"," upsample_kernel_sizes,\n"," gin_channels,\n"," sr,\n"," is_half=False,\n"," ):\n"," super(GeneratorNSF, self).__init__()\n"," self.num_kernels = len(resblock_kernel_sizes)\n"," self.num_upsamples = len(upsample_rates)\n","\n"," self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))\n"," self.m_source = SourceModuleHnNSF(\n"," sampling_rate=sr, harmonic_num=0, is_half=is_half\n"," )\n"," self.noise_convs = nn.ModuleList()\n"," self.conv_pre = Conv1d(\n"," initial_channel, upsample_initial_channel, 7, 1, padding=3\n"," )\n"," resblock = ResBlock1 if resblock == \"1\" else ResBlock2\n","\n"," self.ups = nn.ModuleList()\n"," for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):\n"," c_cur = upsample_initial_channel // (2 ** (i + 1))\n"," self.ups.append(\n"," weight_norm(\n"," ConvTranspose1d(\n"," upsample_initial_channel // (2**i),\n"," upsample_initial_channel // (2 ** (i + 1)),\n"," k,\n"," u,\n"," padding=(k - u) // 2,\n"," )\n"," )\n"," )\n"," if i + 1 < len(upsample_rates):\n"," stride_f0 = np.prod(upsample_rates[i + 1 :])\n"," self.noise_convs.append(\n"," Conv1d(\n"," 1,\n"," c_cur,\n"," kernel_size=stride_f0 * 2,\n"," stride=stride_f0,\n"," padding=stride_f0 // 2,\n"," )\n"," )\n"," else:\n"," self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))\n","\n"," self.resblocks = nn.ModuleList()\n"," for i in range(len(self.ups)):\n"," ch = upsample_initial_channel // (2 ** (i + 1))\n"," for j, (k, d) in enumerate(\n"," zip(resblock_kernel_sizes, resblock_dilation_sizes)\n"," ):\n"," self.resblocks.append(resblock(ch, k, d))\n","\n"," self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)\n"," self.ups.apply(init_weights)\n","\n"," if gin_channels != 0:\n"," self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)\n","\n"," self.upp = np.prod(upsample_rates)\n","\n"," def forward(self, x, f0, g=None):\n"," har_source, noi_source, uv = self.m_source(f0, self.upp)\n"," har_source = har_source.transpose(1, 2)\n"," x = self.conv_pre(x)\n"," if g is not None:\n"," x = x + self.cond(g)\n","\n"," for i in range(self.num_upsamples):\n"," x = F.leaky_relu(x, LRELU_SLOPE)\n"," x = self.ups[i](x)\n"," x_source = self.noise_convs[i](har_source)\n"," x = x + x_source\n"," xs = None\n"," for j in range(self.num_kernels):\n"," if xs is None:\n"," xs = self.resblocks[i * self.num_kernels + j](x)\n"," else:\n"," xs += self.resblocks[i * self.num_kernels + j](x)\n"," x = xs / self.num_kernels\n"," x = F.leaky_relu(x)\n"," x = self.conv_post(x)\n"," x = torch.tanh(x)\n"," return x\n","\n"," def remove_weight_norm(self):\n"," for l in self.ups:\n"," remove_weight_norm(l)\n"," for l in self.resblocks:\n"," l.remove_weight_norm()\n","\n","\n","sr2sr = {\n"," \"32k\": 32000,\n"," \"40k\": 40000,\n"," \"48k\": 48000,\n","}\n","\n","\n","class SynthesizerTrnMsNSFsidM(nn.Module):\n"," def __init__(\n"," self,\n"," spec_channels,\n"," segment_size,\n"," inter_channels,\n"," hidden_channels,\n"," filter_channels,\n"," n_heads,\n"," n_layers,\n"," kernel_size,\n"," p_dropout,\n"," resblock,\n"," resblock_kernel_sizes,\n"," resblock_dilation_sizes,\n"," upsample_rates,\n"," upsample_initial_channel,\n"," upsample_kernel_sizes,\n"," spk_embed_dim,\n"," gin_channels,\n"," sr,\n"," **kwargs\n"," ):\n"," super().__init__()\n"," if type(sr) == type(\"strr\"):\n"," 
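# map a sample-rate string such as \"40k\" to its integer value in Hz via sr2sr\n","            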
sr = sr2sr[sr]\n"," self.spec_channels = spec_channels\n"," self.inter_channels = inter_channels\n"," self.hidden_channels = hidden_channels\n"," self.filter_channels = filter_channels\n"," self.n_heads = n_heads\n"," self.n_layers = n_layers\n"," self.kernel_size = kernel_size\n"," self.p_dropout = p_dropout\n"," self.resblock = resblock\n"," self.resblock_kernel_sizes = resblock_kernel_sizes\n"," self.resblock_dilation_sizes = resblock_dilation_sizes\n"," self.upsample_rates = upsample_rates\n"," self.upsample_initial_channel = upsample_initial_channel\n"," self.upsample_kernel_sizes = upsample_kernel_sizes\n"," self.segment_size = segment_size\n"," self.gin_channels = gin_channels\n"," # self.hop_length = hop_length#\n"," self.spk_embed_dim = spk_embed_dim\n"," if self.gin_channels == 256:\n"," self.enc_p = TextEncoder256(\n"," inter_channels,\n"," hidden_channels,\n"," filter_channels,\n"," n_heads,\n"," n_layers,\n"," kernel_size,\n"," p_dropout,\n"," )\n"," else:\n"," self.enc_p = TextEncoder768(\n"," inter_channels,\n"," hidden_channels,\n"," filter_channels,\n"," n_heads,\n"," n_layers,\n"," kernel_size,\n"," p_dropout,\n"," )\n"," self.dec = GeneratorNSF(\n"," inter_channels,\n"," resblock,\n"," resblock_kernel_sizes,\n"," resblock_dilation_sizes,\n"," upsample_rates,\n"," upsample_initial_channel,\n"," upsample_kernel_sizes,\n"," gin_channels=gin_channels,\n"," sr=sr,\n"," is_half=kwargs[\"is_half\"],\n"," )\n"," self.enc_q = PosteriorEncoder(\n"," spec_channels,\n"," inter_channels,\n"," hidden_channels,\n"," 5,\n"," 1,\n"," 16,\n"," gin_channels=gin_channels,\n"," )\n"," self.flow = ResidualCouplingBlock(\n"," inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels\n"," )\n"," self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)\n"," self.speaker_map = None\n"," print(\"gin_channels:\", gin_channels, \"self.spk_embed_dim:\", self.spk_embed_dim)\n","\n"," def remove_weight_norm(self):\n"," self.dec.remove_weight_norm()\n"," self.flow.remove_weight_norm()\n"," self.enc_q.remove_weight_norm()\n","\n"," def construct_spkmixmap(self, n_speaker):\n"," self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels))\n"," for i in range(n_speaker):\n"," self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]]))\n"," self.speaker_map = self.speaker_map.unsqueeze(0)\n","\n"," def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None):\n"," if self.speaker_map is not None: # [N, S] * [S, B, 1, H]\n"," g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1]\n"," g = g * self.speaker_map # [N, S, B, 1, H]\n"," g = torch.sum(g, dim=1) # [N, 1, B, 1, H]\n"," g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N]\n"," else:\n"," g = g.unsqueeze(0)\n"," g = self.emb_g(g).transpose(1, 2)\n","\n"," m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)\n"," z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask\n"," z = self.flow(z_p, x_mask, g=g, reverse=True)\n"," o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)\n"," return o\n","\n","\n","class MultiPeriodDiscriminator(torch.nn.Module):\n"," def __init__(self, use_spectral_norm=False):\n"," super(MultiPeriodDiscriminator, self).__init__()\n"," periods = [2, 3, 5, 7, 11, 17]\n"," # periods = [3, 5, 7, 11, 17, 23, 37]\n","\n"," discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]\n"," discs = discs + [\n"," DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods\n"," ]\n"," self.discriminators = nn.ModuleList(discs)\n","\n"," def forward(self, y, y_hat):\n"," 
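# run real (y) and generated (y_hat) audio through every sub-discriminator,\n","        # collecting per-layer feature maps (fmap) used by feature-matching losses\n","        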
y_d_rs = [] #\n"," y_d_gs = []\n"," fmap_rs = []\n"," fmap_gs = []\n"," for i, d in enumerate(self.discriminators):\n"," y_d_r, fmap_r = d(y)\n"," y_d_g, fmap_g = d(y_hat)\n"," # for j in range(len(fmap_r)):\n"," # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)\n"," y_d_rs.append(y_d_r)\n"," y_d_gs.append(y_d_g)\n"," fmap_rs.append(fmap_r)\n"," fmap_gs.append(fmap_g)\n","\n"," return y_d_rs, y_d_gs, fmap_rs, fmap_gs\n","\n","\n","class MultiPeriodDiscriminatorV2(torch.nn.Module):\n"," def __init__(self, use_spectral_norm=False):\n"," super(MultiPeriodDiscriminatorV2, self).__init__()\n"," # periods = [2, 3, 5, 7, 11, 17]\n"," periods = [2, 3, 5, 7, 11, 17, 23, 37]\n","\n"," discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]\n"," discs = discs + [\n"," DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods\n"," ]\n"," self.discriminators = nn.ModuleList(discs)\n","\n"," def forward(self, y, y_hat):\n"," y_d_rs = [] #\n"," y_d_gs = []\n"," fmap_rs = []\n"," fmap_gs = []\n"," for i, d in enumerate(self.discriminators):\n"," y_d_r, fmap_r = d(y)\n"," y_d_g, fmap_g = d(y_hat)\n"," # for j in range(len(fmap_r)):\n"," # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)\n"," y_d_rs.append(y_d_r)\n"," y_d_gs.append(y_d_g)\n"," fmap_rs.append(fmap_r)\n"," fmap_gs.append(fmap_g)\n","\n"," return y_d_rs, y_d_gs, fmap_rs, fmap_gs\n","\n","\n","class DiscriminatorS(torch.nn.Module):\n"," def __init__(self, use_spectral_norm=False):\n"," super(DiscriminatorS, self).__init__()\n"," norm_f = weight_norm if use_spectral_norm == False else spectral_norm\n"," self.convs = nn.ModuleList(\n"," [\n"," norm_f(Conv1d(1, 16, 15, 1, padding=7)),\n"," norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),\n"," norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),\n"," norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),\n"," norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),\n"," norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),\n"," ]\n"," )\n"," self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))\n","\n"," def forward(self, x):\n"," fmap = []\n","\n"," for l in self.convs:\n"," x = l(x)\n"," x = F.leaky_relu(x, LRELU_SLOPE)\n"," fmap.append(x)\n"," x = self.conv_post(x)\n"," fmap.append(x)\n"," x = torch.flatten(x, 1, -1)\n","\n"," return x, fmap\n","\n","\n","class DiscriminatorP(torch.nn.Module):\n"," def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):\n"," super(DiscriminatorP, self).__init__()\n"," self.period = period\n"," self.use_spectral_norm = use_spectral_norm\n"," norm_f = weight_norm if use_spectral_norm == False else spectral_norm\n"," self.convs = nn.ModuleList(\n"," [\n"," norm_f(\n"," Conv2d(\n"," 1,\n"," 32,\n"," (kernel_size, 1),\n"," (stride, 1),\n"," padding=(get_padding(kernel_size, 1), 0),\n"," )\n"," ),\n"," norm_f(\n"," Conv2d(\n"," 32,\n"," 128,\n"," (kernel_size, 1),\n"," (stride, 1),\n"," padding=(get_padding(kernel_size, 1), 0),\n"," )\n"," ),\n"," norm_f(\n"," Conv2d(\n"," 128,\n"," 512,\n"," (kernel_size, 1),\n"," (stride, 1),\n"," padding=(get_padding(kernel_size, 1), 0),\n"," )\n"," ),\n"," norm_f(\n"," Conv2d(\n"," 512,\n"," 1024,\n"," (kernel_size, 1),\n"," (stride, 1),\n"," padding=(get_padding(kernel_size, 1), 0),\n"," )\n"," ),\n"," norm_f(\n"," Conv2d(\n"," 1024,\n"," 1024,\n"," (kernel_size, 1),\n"," 1,\n"," padding=(get_padding(kernel_size, 1), 0),\n"," )\n"," ),\n"," ]\n"," )\n"," self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))\n","\n"," def forward(self, 
x):\n"," fmap = []\n","\n"," # 1d to 2d\n"," b, c, t = x.shape\n"," if t % self.period != 0: # pad first\n"," n_pad = self.period - (t % self.period)\n"," x = F.pad(x, (0, n_pad), \"reflect\")\n"," t = t + n_pad\n"," x = x.view(b, c, t // self.period, self.period)\n","\n"," for l in self.convs:\n"," x = l(x)\n"," x = F.leaky_relu(x, LRELU_SLOPE)\n"," fmap.append(x)\n"," x = self.conv_post(x)\n"," fmap.append(x)\n"," x = torch.flatten(x, 1, -1)\n","\n"," return x, fmap"],"metadata":{"id":"O09z3RXFMnT3"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#@title Models_Onnx_Moess\n","import math, pdb, os\n","from time import time as ttime\n","import torch\n","from torch import nn\n","from torch.nn import functional as F\n","from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d\n","from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm\n","import numpy as np\n","\n","\n","class TextEncoder256(nn.Module):\n"," def __init__(\n"," self,\n"," out_channels,\n"," hidden_channels,\n"," filter_channels,\n"," n_heads,\n"," n_layers,\n"," kernel_size,\n"," p_dropout,\n"," f0=True,\n"," ):\n"," super().__init__()\n"," self.out_channels = out_channels\n"," self.hidden_channels = hidden_channels\n"," self.filter_channels = filter_channels\n"," self.n_heads = n_heads\n"," self.n_layers = n_layers\n"," self.kernel_size = kernel_size\n"," self.p_dropout = p_dropout\n"," self.emb_phone = nn.Linear(256, hidden_channels)\n"," self.lrelu = nn.LeakyReLU(0.1, inplace=True)\n"," if f0 == True:\n"," self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256\n"," self.encoder = Encoder(\n"," hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout\n"," )\n"," self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)\n","\n"," def forward(self, phone, pitch, lengths):\n"," if pitch == None:\n"," x = self.emb_phone(phone)\n"," else:\n"," x = self.emb_phone(phone) + self.emb_pitch(pitch)\n"," x = x * math.sqrt(self.hidden_channels) # [b, t, h]\n"," x = self.lrelu(x)\n"," x = torch.transpose(x, 1, -1) # [b, h, t]\n"," x_mask = torch.unsqueeze(sequence_mask(lengths, x.size(2)), 1).to(\n"," x.dtype\n"," )\n"," x = self.encoder(x * x_mask, x_mask)\n"," stats = self.proj(x) * x_mask\n","\n"," m, logs = torch.split(stats, self.out_channels, dim=1)\n"," return m, logs, x_mask\n","\n","\n","class TextEncoder256Sim(nn.Module):\n"," def __init__(\n"," self,\n"," out_channels,\n"," hidden_channels,\n"," filter_channels,\n"," n_heads,\n"," n_layers,\n"," kernel_size,\n"," p_dropout,\n"," f0=True,\n"," ):\n"," super().__init__()\n"," self.out_channels = out_channels\n"," self.hidden_channels = hidden_channels\n"," self.filter_channels = filter_channels\n"," self.n_heads = n_heads\n"," self.n_layers = n_layers\n"," self.kernel_size = kernel_size\n"," self.p_dropout = p_dropout\n"," self.emb_phone = nn.Linear(256, hidden_channels)\n"," self.lrelu = nn.LeakyReLU(0.1, inplace=True)\n"," if f0 == True:\n"," self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256\n"," self.encoder = Encoder(\n"," hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout\n"," )\n"," self.proj = nn.Conv1d(hidden_channels, out_channels, 1)\n","\n"," def forward(self, phone, pitch, lengths):\n"," if pitch == None:\n"," x = self.emb_phone(phone)\n"," else:\n"," x = self.emb_phone(phone) + self.emb_pitch(pitch)\n"," x = x * math.sqrt(self.hidden_channels) # [b, t, h]\n"," x = self.lrelu(x)\n"," x = torch.transpose(x, 1, -1) # [b, h, t]\n"," x_mask = 
torch.unsqueeze(sequence_mask(lengths, x.size(2)), 1).to(\n"," x.dtype\n"," )\n"," x = self.encoder(x * x_mask, x_mask)\n"," x = self.proj(x) * x_mask\n"," return x, x_mask\n","\n","\n","class ResidualCouplingBlock(nn.Module):\n"," def __init__(\n"," self,\n"," channels,\n"," hidden_channels,\n"," kernel_size,\n"," dilation_rate,\n"," n_layers,\n"," n_flows=4,\n"," gin_channels=0,\n"," ):\n"," super().__init__()\n"," self.channels = channels\n"," self.hidden_channels = hidden_channels\n"," self.kernel_size = kernel_size\n"," self.dilation_rate = dilation_rate\n"," self.n_layers = n_layers\n"," self.n_flows = n_flows\n"," self.gin_channels = gin_channels\n","\n"," self.flows = nn.ModuleList()\n"," for i in range(n_flows):\n"," self.flows.append(\n"," ResidualCouplingLayer(\n"," channels,\n"," hidden_channels,\n"," kernel_size,\n"," dilation_rate,\n"," n_layers,\n"," gin_channels=gin_channels,\n"," mean_only=True,\n"," )\n"," )\n"," self.flows.append(Flip())\n","\n"," def forward(self, x, x_mask, g=None, reverse=False):\n"," if not reverse:\n"," for flow in self.flows:\n"," x, _ = flow(x, x_mask, g=g, reverse=reverse)\n"," else:\n"," for flow in reversed(self.flows):\n"," x = flow(x, x_mask, g=g, reverse=reverse)\n"," return x\n","\n"," def remove_weight_norm(self):\n"," for i in range(self.n_flows):\n"," self.flows[i * 2].remove_weight_norm()\n","\n","\n","class PosteriorEncoder(nn.Module):\n"," def __init__(\n"," self,\n"," in_channels,\n"," out_channels,\n"," hidden_channels,\n"," kernel_size,\n"," dilation_rate,\n"," n_layers,\n"," gin_channels=0,\n"," ):\n"," super().__init__()\n"," self.in_channels = in_channels\n"," self.out_channels = out_channels\n"," self.hidden_channels = hidden_channels\n"," self.kernel_size = kernel_size\n"," self.dilation_rate = dilation_rate\n"," self.n_layers = n_layers\n"," self.gin_channels = gin_channels\n","\n"," self.pre = nn.Conv1d(in_channels, hidden_channels, 1)\n"," self.enc = WN(\n"," hidden_channels,\n"," kernel_size,\n"," dilation_rate,\n"," n_layers,\n"," gin_channels=gin_channels,\n"," )\n"," self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)\n","\n"," def forward(self, x, x_lengths, g=None):\n"," x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(\n"," x.dtype\n"," )\n"," x = self.pre(x) * x_mask\n"," x = self.enc(x, x_mask, g=g)\n"," stats = self.proj(x) * x_mask\n"," m, logs = torch.split(stats, self.out_channels, dim=1)\n"," z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask\n"," return z, m, logs, x_mask\n","\n"," def remove_weight_norm(self):\n"," self.enc.remove_weight_norm()\n","\n","\n","class Generator(torch.nn.Module):\n"," def __init__(\n"," self,\n"," initial_channel,\n"," resblock,\n"," resblock_kernel_sizes,\n"," resblock_dilation_sizes,\n"," upsample_rates,\n"," upsample_initial_channel,\n"," upsample_kernel_sizes,\n"," gin_channels=0,\n"," ):\n"," super(Generator, self).__init__()\n"," self.num_kernels = len(resblock_kernel_sizes)\n"," self.num_upsamples = len(upsample_rates)\n"," self.conv_pre = Conv1d(\n"," initial_channel, upsample_initial_channel, 7, 1, padding=3\n"," )\n"," resblock = ResBlock1 if resblock == \"1\" else ResBlock2\n","\n"," self.ups = nn.ModuleList()\n"," for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):\n"," self.ups.append(\n"," weight_norm(\n"," ConvTranspose1d(\n"," upsample_initial_channel // (2**i),\n"," upsample_initial_channel // (2 ** (i + 1)),\n"," k,\n"," u,\n"," padding=(k - u) // 2,\n"," )\n"," )\n"," )\n","\n"," self.resblocks = 
nn.ModuleList()\n","        for i in range(len(self.ups)):\n","            ch = upsample_initial_channel // (2 ** (i + 1))\n","            for j, (k, d) in enumerate(\n","                zip(resblock_kernel_sizes, resblock_dilation_sizes)\n","            ):\n","                self.resblocks.append(resblock(ch, k, d))\n","\n","        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)\n","        self.ups.apply(init_weights)\n","\n","        if gin_channels != 0:\n","            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)\n","\n","    def forward(self, x, g=None):\n","        x = self.conv_pre(x)\n","        if g is not None:\n","            x = x + self.cond(g)\n","\n","        for i in range(self.num_upsamples):\n","            x = F.leaky_relu(x, LRELU_SLOPE)\n","            x = self.ups[i](x)\n","            xs = None\n","            for j in range(self.num_kernels):\n","                if xs is None:\n","                    xs = self.resblocks[i * self.num_kernels + j](x)\n","                else:\n","                    xs += self.resblocks[i * self.num_kernels + j](x)\n","            x = xs / self.num_kernels\n","        x = F.leaky_relu(x)\n","        x = self.conv_post(x)\n","        x = torch.tanh(x)\n","\n","        return x\n","\n","    def remove_weight_norm(self):\n","        for l in self.ups:\n","            remove_weight_norm(l)\n","        for l in self.resblocks:\n","            l.remove_weight_norm()\n","\n","\n","class SineGen(torch.nn.Module):\n","    \"\"\"Definition of sine generator\n","    SineGen(samp_rate, harmonic_num = 0,\n","            sine_amp = 0.1, noise_std = 0.003,\n","            voiced_threshold = 0,\n","            flag_for_pulse=False)\n","    samp_rate: sampling rate in Hz\n","    harmonic_num: number of harmonic overtones (default 0)\n","    sine_amp: amplitude of sine waveform (default 0.1)\n","    noise_std: std of Gaussian noise (default 0.003)\n","    voiced_threshold: F0 threshold for U/V classification (default 0)\n","    flag_for_pulse: this SineGen is used inside PulseGen (default False)\n","    Note: when flag_for_pulse is True, the first time step of a voiced\n","    segment is always sin(np.pi) or cos(0)\n","    \"\"\"\n","\n","    def __init__(\n","        self,\n","        samp_rate,\n","        harmonic_num=0,\n","        sine_amp=0.1,\n","        noise_std=0.003,\n","        voiced_threshold=0,\n","        flag_for_pulse=False,\n","    ):\n","        super(SineGen, self).__init__()\n","        self.sine_amp = sine_amp\n","        self.noise_std = noise_std\n","        self.harmonic_num = harmonic_num\n","        self.dim = self.harmonic_num + 1\n","        self.sampling_rate = samp_rate\n","        self.voiced_threshold = voiced_threshold\n","\n","    def _f02uv(self, f0):\n","        # generate uv signal\n","        uv = torch.ones_like(f0)\n","        uv = uv * (f0 > self.voiced_threshold)\n","        return uv\n","\n","    def forward(self, f0, upp):\n","        \"\"\"sine_tensor, uv = forward(f0)\n","        input F0: tensor(batchsize=1, length, dim=1)\n","        f0 for unvoiced steps should be 0\n","        output sine_tensor: tensor(batchsize=1, length, dim)\n","        output uv: tensor(batchsize=1, length, 1)\n","        \"\"\"\n","        with torch.no_grad():\n","            f0 = f0[:, None].transpose(1, 2)\n","            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)\n","            # fundamental component\n","            f0_buf[:, :, 0] = f0[:, :, 0]\n","            for idx in np.arange(self.harmonic_num):\n","                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (\n","                    idx + 2\n","                )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic\n","            rad_values = (f0_buf / self.sampling_rate) % 1  # the % 1 means the n_har products cannot be optimized away in post-processing\n","            rand_ini = torch.rand(\n","                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device\n","            )\n","            rand_ini[:, 0] = 0\n","            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini\n","            tmp_over_one = torch.cumsum(rad_values, 1)  # % 1  # taking % 1 here would keep the cumsum below from being optimized\n","            tmp_over_one *= upp\n","            tmp_over_one = F.interpolate(\n","                tmp_over_one.transpose(2, 1),\n","                scale_factor=upp,\n","                
mode=\"linear\",\n","                align_corners=True,\n","            ).transpose(2, 1)\n","            rad_values = F.interpolate(\n","                rad_values.transpose(2, 1), scale_factor=upp, mode=\"nearest\"\n","            ).transpose(\n","                2, 1\n","            )  #######\n","            tmp_over_one %= 1\n","            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0\n","            cumsum_shift = torch.zeros_like(rad_values)\n","            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0\n","            sine_waves = torch.sin(\n","                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi\n","            )\n","            sine_waves = sine_waves * self.sine_amp\n","            uv = self._f02uv(f0)\n","            uv = F.interpolate(\n","                uv.transpose(2, 1), scale_factor=upp, mode=\"nearest\"\n","            ).transpose(2, 1)\n","            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3\n","            noise = noise_amp * torch.randn_like(sine_waves)\n","            sine_waves = sine_waves * uv + noise\n","            return sine_waves, uv, noise\n","\n","\n","class SourceModuleHnNSF(torch.nn.Module):\n","    \"\"\"SourceModule for hn-nsf\n","    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,\n","                 add_noise_std=0.003, voiced_threshod=0)\n","    sampling_rate: sampling_rate in Hz\n","    harmonic_num: number of harmonic above F0 (default: 0)\n","    sine_amp: amplitude of sine source signal (default: 0.1)\n","    add_noise_std: std of additive Gaussian noise (default: 0.003)\n","        note that amplitude of noise in unvoiced is decided\n","        by sine_amp\n","    voiced_threshold: threshold to set U/V given F0 (default: 0)\n","    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)\n","    F0_sampled (batchsize, length, 1)\n","    Sine_source (batchsize, length, 1)\n","    noise_source (batchsize, length, 1)\n","    uv (batchsize, length, 1)\n","    \"\"\"\n","\n","    def __init__(\n","        self,\n","        sampling_rate,\n","        harmonic_num=0,\n","        sine_amp=0.1,\n","        add_noise_std=0.003,\n","        voiced_threshod=0,\n","        is_half=True,\n","    ):\n","        super(SourceModuleHnNSF, self).__init__()\n","\n","        self.sine_amp = sine_amp\n","        self.noise_std = add_noise_std\n","        self.is_half = is_half\n","        # to produce sine waveforms\n","        self.l_sin_gen = SineGen(\n","            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod\n","        )\n","\n","        # to merge source harmonics into a single excitation\n","        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)\n","        self.l_tanh = torch.nn.Tanh()\n","\n","    def forward(self, x, upp=None):\n","        sine_wavs, uv, _ = self.l_sin_gen(x, upp)\n","        if self.is_half:\n","            sine_wavs = sine_wavs.half()\n","        sine_merge = self.l_tanh(self.l_linear(sine_wavs))\n","        return sine_merge, None, None  # noise, uv\n","\n","\n","class GeneratorNSF(torch.nn.Module):\n","    def __init__(\n","        self,\n","        initial_channel,\n","        resblock,\n","        resblock_kernel_sizes,\n","        resblock_dilation_sizes,\n","        upsample_rates,\n","        upsample_initial_channel,\n","        upsample_kernel_sizes,\n","        gin_channels,\n","        sr,\n","        is_half=False,\n","    ):\n","        super(GeneratorNSF, self).__init__()\n","        self.num_kernels = len(resblock_kernel_sizes)\n","        self.num_upsamples = len(upsample_rates)\n","\n","        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))\n","        self.m_source = SourceModuleHnNSF(\n","            sampling_rate=sr, harmonic_num=0, is_half=is_half\n","        )\n","        self.noise_convs = nn.ModuleList()\n","        self.conv_pre = Conv1d(\n","            initial_channel, upsample_initial_channel, 7, 1, padding=3\n","        )\n","        resblock = ResBlock1 if resblock == \"1\" else ResBlock2\n","\n","        self.ups = nn.ModuleList()\n","        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):\n","            c_cur = upsample_initial_channel // (2 ** (i 
+ 1))\n"," self.ups.append(\n"," weight_norm(\n"," ConvTranspose1d(\n"," upsample_initial_channel // (2**i),\n"," upsample_initial_channel // (2 ** (i + 1)),\n"," k,\n"," u,\n"," padding=(k - u) // 2,\n"," )\n"," )\n"," )\n"," if i + 1 < len(upsample_rates):\n"," stride_f0 = np.prod(upsample_rates[i + 1 :])\n"," self.noise_convs.append(\n"," Conv1d(\n"," 1,\n"," c_cur,\n"," kernel_size=stride_f0 * 2,\n"," stride=stride_f0,\n"," padding=stride_f0 // 2,\n"," )\n"," )\n"," else:\n"," self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))\n","\n"," self.resblocks = nn.ModuleList()\n"," for i in range(len(self.ups)):\n"," ch = upsample_initial_channel // (2 ** (i + 1))\n"," for j, (k, d) in enumerate(\n"," zip(resblock_kernel_sizes, resblock_dilation_sizes)\n"," ):\n"," self.resblocks.append(resblock(ch, k, d))\n","\n"," self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)\n"," self.ups.apply(init_weights)\n","\n"," if gin_channels != 0:\n"," self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)\n","\n"," self.upp = np.prod(upsample_rates)\n","\n"," def forward(self, x, f0, g=None):\n"," har_source, noi_source, uv = self.m_source(f0, self.upp)\n"," har_source = har_source.transpose(1, 2)\n"," x = self.conv_pre(x)\n"," if g is not None:\n"," x = x + self.cond(g)\n","\n"," for i in range(self.num_upsamples):\n"," x = F.leaky_relu(x, LRELU_SLOPE)\n"," x = self.ups[i](x)\n"," x_source = self.noise_convs[i](har_source)\n"," x = x + x_source\n"," xs = None\n"," for j in range(self.num_kernels):\n"," if xs is None:\n"," xs = self.resblocks[i * self.num_kernels + j](x)\n"," else:\n"," xs += self.resblocks[i * self.num_kernels + j](x)\n"," x = xs / self.num_kernels\n"," x = F.leaky_relu(x)\n"," x = self.conv_post(x)\n"," x = torch.tanh(x)\n"," return x\n","\n"," def remove_weight_norm(self):\n"," for l in self.ups:\n"," remove_weight_norm(l)\n"," for l in self.resblocks:\n"," l.remove_weight_norm()\n","\n","\n","sr2sr = {\n"," \"32k\": 32000,\n"," \"40k\": 40000,\n"," \"48k\": 48000,\n","}\n","\n","\n","class SynthesizerTrnMs256NSFsidM(nn.Module):\n"," def __init__(\n"," self,\n"," spec_channels,\n"," segment_size,\n"," inter_channels,\n"," hidden_channels,\n"," filter_channels,\n"," n_heads,\n"," n_layers,\n"," kernel_size,\n"," p_dropout,\n"," resblock,\n"," resblock_kernel_sizes,\n"," resblock_dilation_sizes,\n"," upsample_rates,\n"," upsample_initial_channel,\n"," upsample_kernel_sizes,\n"," spk_embed_dim,\n"," gin_channels,\n"," sr,\n"," **kwargs\n"," ):\n"," super().__init__()\n"," if type(sr) == type(\"strr\"):\n"," sr = sr2sr[sr]\n"," self.spec_channels = spec_channels\n"," self.inter_channels = inter_channels\n"," self.hidden_channels = hidden_channels\n"," self.filter_channels = filter_channels\n"," self.n_heads = n_heads\n"," self.n_layers = n_layers\n"," self.kernel_size = kernel_size\n"," self.p_dropout = p_dropout\n"," self.resblock = resblock\n"," self.resblock_kernel_sizes = resblock_kernel_sizes\n"," self.resblock_dilation_sizes = resblock_dilation_sizes\n"," self.upsample_rates = upsample_rates\n"," self.upsample_initial_channel = upsample_initial_channel\n"," self.upsample_kernel_sizes = upsample_kernel_sizes\n"," self.segment_size = segment_size\n"," self.gin_channels = gin_channels\n"," # self.hop_length = hop_length#\n"," self.spk_embed_dim = spk_embed_dim\n"," self.enc_p = TextEncoder256(\n"," inter_channels,\n"," hidden_channels,\n"," filter_channels,\n"," n_heads,\n"," n_layers,\n"," kernel_size,\n"," p_dropout,\n"," )\n"," self.dec = GeneratorNSF(\n"," 
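# NSF decoder: upsamples the latent while injecting an F0-driven harmonic source\n","            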
inter_channels,\n","            resblock,\n","            resblock_kernel_sizes,\n","            resblock_dilation_sizes,\n","            upsample_rates,\n","            upsample_initial_channel,\n","            upsample_kernel_sizes,\n","            gin_channels=gin_channels,\n","            sr=sr,\n","            is_half=kwargs[\"is_half\"],\n","        )\n","        self.enc_q = PosteriorEncoder(\n","            spec_channels,\n","            inter_channels,\n","            hidden_channels,\n","            5,\n","            1,\n","            16,\n","            gin_channels=gin_channels,\n","        )\n","        self.flow = ResidualCouplingBlock(\n","            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels\n","        )\n","        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)\n","        print(\"gin_channels:\", gin_channels, \"self.spk_embed_dim:\", self.spk_embed_dim)\n","\n","    def remove_weight_norm(self):\n","        self.dec.remove_weight_norm()\n","        self.flow.remove_weight_norm()\n","        self.enc_q.remove_weight_norm()\n","\n","    def forward(self, phone, phone_lengths, pitch, nsff0, sid, rnd, max_len=None):\n","        g = self.emb_g(sid).unsqueeze(-1)\n","        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)\n","        z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask\n","        z = self.flow(z_p, x_mask, g=g, reverse=True)\n","        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)\n","        return o\n","\n","\n","class SynthesizerTrnMs256NSFsid_sim(nn.Module):\n","    \"\"\"\n","    Synthesizer for Training\n","    \"\"\"\n","\n","    def __init__(\n","        self,\n","        spec_channels,\n","        segment_size,\n","        inter_channels,\n","        hidden_channels,\n","        filter_channels,\n","        n_heads,\n","        n_layers,\n","        kernel_size,\n","        p_dropout,\n","        resblock,\n","        resblock_kernel_sizes,\n","        resblock_dilation_sizes,\n","        upsample_rates,\n","        upsample_initial_channel,\n","        upsample_kernel_sizes,\n","        spk_embed_dim,\n","        # hop_length,\n","        gin_channels=0,\n","        use_sdp=True,\n","        **kwargs\n","    ):\n","        super().__init__()\n","        self.spec_channels = spec_channels\n","        self.inter_channels = inter_channels\n","        self.hidden_channels = hidden_channels\n","        self.filter_channels = filter_channels\n","        self.n_heads = n_heads\n","        self.n_layers = n_layers\n","        self.kernel_size = kernel_size\n","        self.p_dropout = p_dropout\n","        self.resblock = resblock\n","        self.resblock_kernel_sizes = resblock_kernel_sizes\n","        self.resblock_dilation_sizes = resblock_dilation_sizes\n","        self.upsample_rates = upsample_rates\n","        self.upsample_initial_channel = upsample_initial_channel\n","        self.upsample_kernel_sizes = upsample_kernel_sizes\n","        self.segment_size = segment_size\n","        self.gin_channels = gin_channels\n","        # self.hop_length = hop_length#\n","        self.spk_embed_dim = spk_embed_dim\n","        self.enc_p = TextEncoder256Sim(\n","            inter_channels,\n","            hidden_channels,\n","            filter_channels,\n","            n_heads,\n","            n_layers,\n","            kernel_size,\n","            p_dropout,\n","        )\n","        self.dec = GeneratorNSF(\n","            inter_channels,\n","            resblock,\n","            resblock_kernel_sizes,\n","            resblock_dilation_sizes,\n","            upsample_rates,\n","            upsample_initial_channel,\n","            upsample_kernel_sizes,\n","            gin_channels=gin_channels,\n","            is_half=kwargs[\"is_half\"],\n","        )\n","\n","        self.flow = ResidualCouplingBlock(\n","            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels\n","        )\n","        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)\n","        print(\"gin_channels:\", gin_channels, \"self.spk_embed_dim:\", self.spk_embed_dim)\n","\n","    def remove_weight_norm(self):\n","        self.dec.remove_weight_norm()\n","        self.flow.remove_weight_norm()\n","        # NOTE: this class defines no self.enc_q, so the next call would raise AttributeError\n","        self.enc_q.remove_weight_norm()\n","\n","    def forward(\n","        self, phone, phone_lengths, pitch, pitchf, ds, max_len=None\n","    ):  # y (the spec) is no longer needed now\n","        
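# ds is the speaker id; its embedding becomes the global condition g\n","        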
g = self.emb_g(ds.unsqueeze(0)).unsqueeze(-1)  # [b, 256, 1]  # the trailing 1 is t, broadcast over time\n","        x, x_mask = self.enc_p(phone, pitch, phone_lengths)\n","        x = self.flow(x, x_mask, g=g, reverse=True)\n","        o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g)\n","        return o\n","\n","\n","class MultiPeriodDiscriminator(torch.nn.Module):\n","    def __init__(self, use_spectral_norm=False):\n","        super(MultiPeriodDiscriminator, self).__init__()\n","        periods = [2, 3, 5, 7, 11, 17]\n","        # periods = [3, 5, 7, 11, 17, 23, 37]\n","\n","        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]\n","        discs = discs + [\n","            DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods\n","        ]\n","        self.discriminators = nn.ModuleList(discs)\n","\n","    def forward(self, y, y_hat):\n","        y_d_rs = []  #\n","        y_d_gs = []\n","        fmap_rs = []\n","        fmap_gs = []\n","        for i, d in enumerate(self.discriminators):\n","            y_d_r, fmap_r = d(y)\n","            y_d_g, fmap_g = d(y_hat)\n","            # for j in range(len(fmap_r)):\n","            #     print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)\n","            y_d_rs.append(y_d_r)\n","            y_d_gs.append(y_d_g)\n","            fmap_rs.append(fmap_r)\n","            fmap_gs.append(fmap_g)\n","\n","        return y_d_rs, y_d_gs, fmap_rs, fmap_gs\n","\n","\n","class DiscriminatorS(torch.nn.Module):\n","    def __init__(self, use_spectral_norm=False):\n","        super(DiscriminatorS, self).__init__()\n","        norm_f = weight_norm if use_spectral_norm == False else spectral_norm\n","        self.convs = nn.ModuleList(\n","            [\n","                norm_f(Conv1d(1, 16, 15, 1, padding=7)),\n","                norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),\n","                norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),\n","                norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),\n","                norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),\n","                norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),\n","            ]\n","        )\n","        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))\n","\n","    def forward(self, x):\n","        fmap = []\n","\n","        for l in self.convs:\n","            x = l(x)\n","            x = F.leaky_relu(x, LRELU_SLOPE)\n","            fmap.append(x)\n","        x = self.conv_post(x)\n","        fmap.append(x)\n","        x = torch.flatten(x, 1, -1)\n","\n","        return x, fmap\n","\n","\n","class DiscriminatorP(torch.nn.Module):\n","    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):\n","        super(DiscriminatorP, self).__init__()\n","        self.period = period\n","        self.use_spectral_norm = use_spectral_norm\n","        norm_f = weight_norm if use_spectral_norm == False else spectral_norm\n","        self.convs = nn.ModuleList(\n","            [\n","                norm_f(\n","                    Conv2d(\n","                        1,\n","                        32,\n","                        (kernel_size, 1),\n","                        (stride, 1),\n","                        padding=(get_padding(kernel_size, 1), 0),\n","                    )\n","                ),\n","                norm_f(\n","                    Conv2d(\n","                        32,\n","                        128,\n","                        (kernel_size, 1),\n","                        (stride, 1),\n","                        padding=(get_padding(kernel_size, 1), 0),\n","                    )\n","                ),\n","                norm_f(\n","                    Conv2d(\n","                        128,\n","                        512,\n","                        (kernel_size, 1),\n","                        (stride, 1),\n","                        padding=(get_padding(kernel_size, 1), 0),\n","                    )\n","                ),\n","                norm_f(\n","                    Conv2d(\n","                        512,\n","                        1024,\n","                        (kernel_size, 1),\n","                        (stride, 1),\n","                        padding=(get_padding(kernel_size, 1), 0),\n","                    )\n","                ),\n","                norm_f(\n","                    Conv2d(\n","                        1024,\n","                        1024,\n","                        (kernel_size, 1),\n","                        1,\n","                        padding=(get_padding(kernel_size, 1), 0),\n","                    )\n","                ),\n","            ]\n","        )\n","        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))\n","\n","    def forward(self, x):\n","        fmap = []\n","\n","        # 1d to 2d\n","        b, c, t = x.shape\n","        if t % self.period != 0:  # pad first\n","            n_pad = self.period - (t % self.period)\n","            x = F.pad(x, (0, n_pad), \"reflect\")\n","            t 
= t + n_pad\n"," x = x.view(b, c, t // self.period, self.period)\n","\n"," for l in self.convs:\n"," x = l(x)\n"," x = F.leaky_relu(x, LRELU_SLOPE)\n"," fmap.append(x)\n"," x = self.conv_post(x)\n"," fmap.append(x)\n"," x = torch.flatten(x, 1, -1)\n","\n"," return x, fmap"],"metadata":{"id":"316I36iLM9Ab"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#@title Transforms\n","import torch\n","from torch.nn import functional as F\n","\n","import numpy as np\n","\n","\n","DEFAULT_MIN_BIN_WIDTH = 1e-3\n","DEFAULT_MIN_BIN_HEIGHT = 1e-3\n","DEFAULT_MIN_DERIVATIVE = 1e-3\n","\n","\n","def piecewise_rational_quadratic_transform(\n"," inputs,\n"," unnormalized_widths,\n"," unnormalized_heights,\n"," unnormalized_derivatives,\n"," inverse=False,\n"," tails=None,\n"," tail_bound=1.0,\n"," min_bin_width=DEFAULT_MIN_BIN_WIDTH,\n"," min_bin_height=DEFAULT_MIN_BIN_HEIGHT,\n"," min_derivative=DEFAULT_MIN_DERIVATIVE,\n","):\n"," if tails is None:\n"," spline_fn = rational_quadratic_spline\n"," spline_kwargs = {}\n"," else:\n"," spline_fn = unconstrained_rational_quadratic_spline\n"," spline_kwargs = {\"tails\": tails, \"tail_bound\": tail_bound}\n","\n"," outputs, logabsdet = spline_fn(\n"," inputs=inputs,\n"," unnormalized_widths=unnormalized_widths,\n"," unnormalized_heights=unnormalized_heights,\n"," unnormalized_derivatives=unnormalized_derivatives,\n"," inverse=inverse,\n"," min_bin_width=min_bin_width,\n"," min_bin_height=min_bin_height,\n"," min_derivative=min_derivative,\n"," **spline_kwargs\n"," )\n"," return outputs, logabsdet\n","\n","\n","def searchsorted(bin_locations, inputs, eps=1e-6):\n"," bin_locations[..., -1] += eps\n"," return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1\n","\n","\n","def unconstrained_rational_quadratic_spline(\n"," inputs,\n"," unnormalized_widths,\n"," unnormalized_heights,\n"," unnormalized_derivatives,\n"," inverse=False,\n"," tails=\"linear\",\n"," tail_bound=1.0,\n"," min_bin_width=DEFAULT_MIN_BIN_WIDTH,\n"," min_bin_height=DEFAULT_MIN_BIN_HEIGHT,\n"," min_derivative=DEFAULT_MIN_DERIVATIVE,\n","):\n"," inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)\n"," outside_interval_mask = ~inside_interval_mask\n","\n"," outputs = torch.zeros_like(inputs)\n"," logabsdet = torch.zeros_like(inputs)\n","\n"," if tails == \"linear\":\n"," unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))\n"," constant = np.log(np.exp(1 - min_derivative) - 1)\n"," unnormalized_derivatives[..., 0] = constant\n"," unnormalized_derivatives[..., -1] = constant\n","\n"," outputs[outside_interval_mask] = inputs[outside_interval_mask]\n"," logabsdet[outside_interval_mask] = 0\n"," else:\n"," raise RuntimeError(\"{} tails are not implemented.\".format(tails))\n","\n"," (\n"," outputs[inside_interval_mask],\n"," logabsdet[inside_interval_mask],\n"," ) = rational_quadratic_spline(\n"," inputs=inputs[inside_interval_mask],\n"," unnormalized_widths=unnormalized_widths[inside_interval_mask, :],\n"," unnormalized_heights=unnormalized_heights[inside_interval_mask, :],\n"," unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],\n"," inverse=inverse,\n"," left=-tail_bound,\n"," right=tail_bound,\n"," bottom=-tail_bound,\n"," top=tail_bound,\n"," min_bin_width=min_bin_width,\n"," min_bin_height=min_bin_height,\n"," min_derivative=min_derivative,\n"," )\n","\n"," return outputs, logabsdet\n","\n","\n","def rational_quadratic_spline(\n"," inputs,\n"," unnormalized_widths,\n"," unnormalized_heights,\n"," 
unnormalized_derivatives,\n"," inverse=False,\n"," left=0.0,\n"," right=1.0,\n"," bottom=0.0,\n"," top=1.0,\n"," min_bin_width=DEFAULT_MIN_BIN_WIDTH,\n"," min_bin_height=DEFAULT_MIN_BIN_HEIGHT,\n"," min_derivative=DEFAULT_MIN_DERIVATIVE,\n","):\n"," if torch.min(inputs) < left or torch.max(inputs) > right:\n"," raise ValueError(\"Input to a transform is not within its domain\")\n","\n"," num_bins = unnormalized_widths.shape[-1]\n","\n"," if min_bin_width * num_bins > 1.0:\n"," raise ValueError(\"Minimal bin width too large for the number of bins\")\n"," if min_bin_height * num_bins > 1.0:\n"," raise ValueError(\"Minimal bin height too large for the number of bins\")\n","\n"," widths = F.softmax(unnormalized_widths, dim=-1)\n"," widths = min_bin_width + (1 - min_bin_width * num_bins) * widths\n"," cumwidths = torch.cumsum(widths, dim=-1)\n"," cumwidths = F.pad(cumwidths, pad=(1, 0), mode=\"constant\", value=0.0)\n"," cumwidths = (right - left) * cumwidths + left\n"," cumwidths[..., 0] = left\n"," cumwidths[..., -1] = right\n"," widths = cumwidths[..., 1:] - cumwidths[..., :-1]\n","\n"," derivatives = min_derivative + F.softplus(unnormalized_derivatives)\n","\n"," heights = F.softmax(unnormalized_heights, dim=-1)\n"," heights = min_bin_height + (1 - min_bin_height * num_bins) * heights\n"," cumheights = torch.cumsum(heights, dim=-1)\n"," cumheights = F.pad(cumheights, pad=(1, 0), mode=\"constant\", value=0.0)\n"," cumheights = (top - bottom) * cumheights + bottom\n"," cumheights[..., 0] = bottom\n"," cumheights[..., -1] = top\n"," heights = cumheights[..., 1:] - cumheights[..., :-1]\n","\n"," if inverse:\n"," bin_idx = searchsorted(cumheights, inputs)[..., None]\n"," else:\n"," bin_idx = searchsorted(cumwidths, inputs)[..., None]\n","\n"," input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]\n"," input_bin_widths = widths.gather(-1, bin_idx)[..., 0]\n","\n"," input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]\n"," delta = heights / widths\n"," input_delta = delta.gather(-1, bin_idx)[..., 0]\n","\n"," input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]\n"," input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]\n","\n"," input_heights = heights.gather(-1, bin_idx)[..., 0]\n","\n"," if inverse:\n"," a = (inputs - input_cumheights) * (\n"," input_derivatives + input_derivatives_plus_one - 2 * input_delta\n"," ) + input_heights * (input_delta - input_derivatives)\n"," b = input_heights * input_derivatives - (inputs - input_cumheights) * (\n"," input_derivatives + input_derivatives_plus_one - 2 * input_delta\n"," )\n"," c = -input_delta * (inputs - input_cumheights)\n","\n"," discriminant = b.pow(2) - 4 * a * c\n"," assert (discriminant >= 0).all()\n","\n"," root = (2 * c) / (-b - torch.sqrt(discriminant))\n"," outputs = root * input_bin_widths + input_cumwidths\n","\n"," theta_one_minus_theta = root * (1 - root)\n"," denominator = input_delta + (\n"," (input_derivatives + input_derivatives_plus_one - 2 * input_delta)\n"," * theta_one_minus_theta\n"," )\n"," derivative_numerator = input_delta.pow(2) * (\n"," input_derivatives_plus_one * root.pow(2)\n"," + 2 * input_delta * theta_one_minus_theta\n"," + input_derivatives * (1 - root).pow(2)\n"," )\n"," logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)\n","\n"," return outputs, -logabsdet\n"," else:\n"," theta = (inputs - input_cumwidths) / input_bin_widths\n"," theta_one_minus_theta = theta * (1 - theta)\n","\n"," numerator = input_heights * (\n"," input_delta * theta.pow(2) + 
input_derivatives * theta_one_minus_theta\n"," )\n"," denominator = input_delta + (\n"," (input_derivatives + input_derivatives_plus_one - 2 * input_delta)\n"," * theta_one_minus_theta\n"," )\n"," outputs = input_cumheights + numerator / denominator\n","\n"," derivative_numerator = input_delta.pow(2) * (\n"," input_derivatives_plus_one * theta.pow(2)\n"," + 2 * input_delta * theta_one_minus_theta\n"," + input_derivatives * (1 - theta).pow(2)\n"," )\n"," logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)\n","\n"," return outputs, logabsdet"],"metadata":{"id":"oBQQ3shXNMTb"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# MAIN"],"metadata":{"id":"Dcjd_DmWIs5-"}},{"cell_type":"code","source":["!mkdir /content/ERPISI\n","!mkdir /content/ERPISI/mdxnet_models\n","!mkdir /content/ERPISI/rvc_models\n","!mkdir /content/ERPISI/src\n","!mkdir /content/ERPISI/src/config\n","!wget https://raw.githubusercontent.com/SociallyIneptWeeb/AICoverGen/main/mdxnet_models/model_data.json -O /content/ERPISI/mdxnet_models/model_data.json\n","!wget https://raw.githubusercontent.com/SociallyIneptWeeb/AICoverGen/main/src/configs/32k.json -O /content/ERPISI/src/config/32k.json\n","!wget https://raw.githubusercontent.com/SociallyIneptWeeb/AICoverGen/main/src/configs/32k_v2.json -O /content/ERPISI/src/config/32k_v2.json\n","!wget https://raw.githubusercontent.com/SociallyIneptWeeb/AICoverGen/main/src/configs/40k.json -O /content/ERPISI/src/config/40k.json\n","!wget https://raw.githubusercontent.com/SociallyIneptWeeb/AICoverGen/main/src/configs/48k.json -O /content/ERPISI/src/config/48k.json\n","!wget https://raw.githubusercontent.com/SociallyIneptWeeb/AICoverGen/main/src/configs/48k_v2.json -O /content/ERPISI/src/config/48k_v2.json"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"jlgRRlKJPWQU","executionInfo":{"status":"ok","timestamp":1695448686485,"user_tz":-420,"elapsed":3672,"user":{"displayName":"ana taqa126","userId":"16118498281411814981"}},"outputId":"0907d7bc-8d18-4dae-946d-19f2c15498ce"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["--2023-09-23 05:58:03-- https://raw.githubusercontent.com/SociallyIneptWeeb/AICoverGen/main/mdxnet_models/model_data.json\n","Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n","Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n","HTTP request sent, awaiting response... 200 OK\n","Length: 10111 (9.9K) [text/plain]\n","Saving to: ‘/content/ERPISI/mdxnet_models/model_data.json’\n","\n","/content/ERPISI/mdx 100%[===================>] 9.87K --.-KB/s in 0s \n","\n","2023-09-23 05:58:03 (91.4 MB/s) - ‘/content/ERPISI/mdxnet_models/model_data.json’ saved [10111/10111]\n","\n","--2023-09-23 05:58:03-- https://raw.githubusercontent.com/SociallyIneptWeeb/AICoverGen/main/src/configs/32k.json\n","Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n","Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n","HTTP request sent, awaiting response... 
200 OK\n","Length: 1049 (1.0K) [text/plain]\n","Saving to: ‘/content/ERPISI/src/config/32k.json’\n","\n","/content/ERPISI/src 100%[===================>] 1.02K --.-KB/s in 0s \n","\n","2023-09-23 05:58:04 (78.3 MB/s) - ‘/content/ERPISI/src/config/32k.json’ saved [1049/1049]\n","\n","--2023-09-23 05:58:04-- https://raw.githubusercontent.com/SociallyIneptWeeb/AICoverGen/main/src/configs/32k_v2.json\n","Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n","Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n","HTTP request sent, awaiting response... 200 OK\n","Length: 1044 (1.0K) [text/plain]\n","Saving to: ‘/content/ERPISI/src/config/32k_v2.json’\n","\n","/content/ERPISI/src 100%[===================>] 1.02K --.-KB/s in 0s \n","\n","2023-09-23 05:58:04 (90.0 MB/s) - ‘/content/ERPISI/src/config/32k_v2.json’ saved [1044/1044]\n","\n","--2023-09-23 05:58:04-- https://raw.githubusercontent.com/SociallyIneptWeeb/AICoverGen/main/src/configs/40k.json\n","Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n","Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n","HTTP request sent, awaiting response... 200 OK\n","Length: 1047 (1.0K) [text/plain]\n","Saving to: ‘/content/ERPISI/src/config/40k.json’\n","\n","/content/ERPISI/src 100%[===================>] 1.02K --.-KB/s in 0s \n","\n","2023-09-23 05:58:04 (75.7 MB/s) - ‘/content/ERPISI/src/config/40k.json’ saved [1047/1047]\n","\n","--2023-09-23 05:58:05-- https://raw.githubusercontent.com/SociallyIneptWeeb/AICoverGen/main/src/configs/48k.json\n","Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n","Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n","HTTP request sent, awaiting response... 200 OK\n","Length: 1050 (1.0K) [text/plain]\n","Saving to: ‘/content/ERPISI/src/config/48k.json’\n","\n","/content/ERPISI/src 100%[===================>] 1.03K --.-KB/s in 0s \n","\n","2023-09-23 05:58:05 (79.5 MB/s) - ‘/content/ERPISI/src/config/48k.json’ saved [1050/1050]\n","\n","--2023-09-23 05:58:05-- https://raw.githubusercontent.com/SociallyIneptWeeb/AICoverGen/main/src/configs/48k_v2.json\n","Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n","Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n","HTTP request sent, awaiting response... 
200 OK\n","Length: 1046 (1.0K) [text/plain]\n","Saving to: ‘/content/ERPISI/src/config/48k_v2.json’\n","\n","/content/ERPISI/src 100%[===================>] 1.02K --.-KB/s in 0s \n","\n","2023-09-23 05:58:05 (65.3 MB/s) - ‘/content/ERPISI/src/config/48k_v2.json’ saved [1046/1046]\n","\n"]}]},{"cell_type":"code","source":["#@title Download MDXNet Vocal Separation and Hubert Base Models\n","\n","VCWU = (\"Voice\", \"Conversion\", \"Web\", \"UI\")\n","VCWU = \"\".join(VCWU)\n","\n","from pathlib import Path\n","import requests\n","\n","MDX_DOWNLOAD_LINK = 'https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/'\n","RVC_DOWNLOAD_LINK = 'https://huggingface.co/lj1995/'+VCWU+'/resolve/main/'\n","\n","BASE_DIR = Path(\"./ERPISI\").resolve()#.parent.parent\n","mdxnet_models_dir = BASE_DIR / 'mdxnet_models'\n","rvc_models_dir = BASE_DIR / 'rvc_models'\n","\n","\n","def dl_model(link, model_name, dir_name):\n"," with requests.get(f'{link}{model_name}') as r:\n"," r.raise_for_status()\n"," with open(dir_name / model_name, 'wb') as f:\n"," for chunk in r.iter_content(chunk_size=8192):\n"," f.write(chunk)\n","\n","\n","if __name__ == '__main__':\n"," mdx_model_names = ['UVR-MDX-NET-Voc_FT.onnx', 'UVR_MDXNET_KARA_2.onnx', 'Reverb_HQ_By_FoxJoy.onnx']\n"," for model in mdx_model_names:\n"," print(f'Downloading {model}...')\n"," dl_model(MDX_DOWNLOAD_LINK, model, mdxnet_models_dir)\n","\n"," rvc_model_names = ['hubert_base.pt', 'rmvpe.pt']\n"," for model in rvc_model_names:\n"," print(f'Downloading {model}...')\n"," dl_model(RVC_DOWNLOAD_LINK, model, rvc_models_dir)\n","\n"," print('All models downloaded!')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"cellView":"form","id":"8UDwl4GZNcUx","executionInfo":{"status":"ok","timestamp":1695448698067,"user_tz":-420,"elapsed":11632,"user":{"displayName":"ana taqa126","userId":"16118498281411814981"}},"outputId":"02e7a187-650e-4b3b-ef59-4c836deee78c"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Downloading UVR-MDX-NET-Voc_FT.onnx...\n","Downloading UVR_MDXNET_KARA_2.onnx...\n","Downloading Reverb_HQ_By_FoxJoy.onnx...\n","Downloading hubert_base.pt...\n","Downloading rmvpe.pt...\n","All models downloaded!\n"]}]},{"cell_type":"code","source":["#@title MDX\n","\n","import gc\n","import hashlib\n","import os\n","import queue\n","import threading\n","import warnings\n","\n","import librosa\n","import numpy as np\n","import onnxruntime as ort\n","import soundfile as sf\n","import torch\n","from tqdm import tqdm\n","\n","warnings.filterwarnings(\"ignore\")\n","stem_naming = {'Vocals': 'Instrumental', 'Other': 'Instruments', 'Instrumental': 'Vocals', 'Drums': 'Drumless', 'Bass': 'Bassless'}\n","\n","\n","class MDXModel:\n"," def __init__(self, device, dim_f, dim_t, n_fft, hop=1024, stem_name=None, compensation=1.000):\n"," self.dim_f = dim_f\n"," self.dim_t = dim_t\n"," self.dim_c = 4\n"," self.n_fft = n_fft\n"," self.hop = hop\n"," self.stem_name = stem_name\n"," self.compensation = compensation\n","\n"," self.n_bins = self.n_fft // 2 + 1\n"," self.chunk_size = hop * (self.dim_t - 1)\n"," self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to(device)\n","\n"," out_c = self.dim_c\n","\n"," self.freq_pad = torch.zeros([1, out_c, self.n_bins - self.dim_f, self.dim_t]).to(device)\n","\n"," def stft(self, x):\n"," x = x.reshape([-1, self.chunk_size])\n"," x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True, return_complex=True)\n"," x = 
torch.view_as_real(x)\n"," x = x.permute([0, 3, 1, 2])\n"," x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape([-1, 4, self.n_bins, self.dim_t])\n"," return x[:, :, :self.dim_f]\n","\n"," def istft(self, x, freq_pad=None):\n"," freq_pad = self.freq_pad.repeat([x.shape[0], 1, 1, 1]) if freq_pad is None else freq_pad\n"," x = torch.cat([x, freq_pad], -2)\n"," # c = 4*2 if self.target_name=='*' else 2\n"," x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape([-1, 2, self.n_bins, self.dim_t])\n"," x = x.permute([0, 2, 3, 1])\n"," x = x.contiguous()\n"," x = torch.view_as_complex(x)\n"," x = torch.istft(x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True)\n"," return x.reshape([-1, 2, self.chunk_size])\n","\n","\n","class MDX:\n"," DEFAULT_SR = 44100\n"," # Unit: seconds\n"," DEFAULT_CHUNK_SIZE = 0 * DEFAULT_SR\n"," DEFAULT_MARGIN_SIZE = 1 * DEFAULT_SR\n","\n"," DEFAULT_PROCESSOR = 0\n","\n"," def __init__(self, model_path: str, params: MDXModel, processor=DEFAULT_PROCESSOR):\n","\n"," # Set the device and the provider (CPU or CUDA)\n"," self.device = torch.device(f'cuda:{processor}') if processor >= 0 else torch.device('cpu')\n"," self.provider = ['CUDAExecutionProvider'] if processor >= 0 else ['CPUExecutionProvider']\n","\n"," self.model = params\n","\n"," # Load the ONNX model using ONNX Runtime\n"," self.ort = ort.InferenceSession(model_path, providers=self.provider)\n"," # Preload the model for faster performance\n"," self.ort.run(None, {'input': torch.rand(1, 4, params.dim_f, params.dim_t).numpy()})\n"," self.process = lambda spec: self.ort.run(None, {'input': spec.cpu().numpy()})[0]\n","\n"," self.prog = None\n","\n"," @staticmethod\n"," def get_hash(model_path):\n"," try:\n"," with open(model_path, 'rb') as f:\n"," f.seek(- 10000 * 1024, 2)\n"," model_hash = hashlib.md5(f.read()).hexdigest()\n"," except:\n"," model_hash = hashlib.md5(open(model_path, 'rb').read()).hexdigest()\n","\n"," return model_hash\n","\n"," @staticmethod\n"," def segment(wave, combine=True, chunk_size=DEFAULT_CHUNK_SIZE, margin_size=DEFAULT_MARGIN_SIZE):\n"," \"\"\"\n"," Segment or join segmented wave array\n","\n"," Args:\n"," wave: (np.array) Wave array to be segmented or joined\n"," combine: (bool) If True, combines segmented wave array. 
If False, segments wave array.\n"," chunk_size: (int) Size of each segment (in samples)\n"," margin_size: (int) Size of margin between segments (in samples)\n","\n"," Returns:\n"," numpy array: Segmented or joined wave array\n"," \"\"\"\n","\n"," if combine:\n"," processed_wave = None # Initializing as None instead of [] for later numpy array concatenation\n"," for segment_count, segment in enumerate(wave):\n"," start = 0 if segment_count == 0 else margin_size\n"," end = None if segment_count == len(wave) - 1 else -margin_size\n"," if margin_size == 0:\n"," end = None\n"," if processed_wave is None: # Create array for first segment\n"," processed_wave = segment[:, start:end]\n"," else: # Concatenate to existing array for subsequent segments\n"," processed_wave = np.concatenate((processed_wave, segment[:, start:end]), axis=-1)\n","\n"," else:\n"," processed_wave = []\n"," sample_count = wave.shape[-1]\n","\n"," if chunk_size <= 0 or chunk_size > sample_count:\n"," chunk_size = sample_count\n","\n"," if margin_size > chunk_size:\n"," margin_size = chunk_size\n","\n"," for segment_count, skip in enumerate(range(0, sample_count, chunk_size)):\n","\n"," margin = 0 if segment_count == 0 else margin_size\n"," end = min(skip + chunk_size + margin_size, sample_count)\n"," start = skip - margin\n","\n"," cut = wave[:, start:end].copy()\n"," processed_wave.append(cut)\n","\n"," if end == sample_count:\n"," break\n","\n"," return processed_wave\n","\n"," def pad_wave(self, wave):\n"," \"\"\"\n"," Pad the wave array to match the required chunk size\n","\n"," Args:\n"," wave: (np.array) Wave array to be padded\n","\n"," Returns:\n"," tuple: (padded_wave, pad, trim)\n"," - padded_wave: Padded wave array\n"," - pad: Number of samples that were padded\n"," - trim: Number of samples that were trimmed\n"," \"\"\"\n"," n_sample = wave.shape[1]\n"," trim = self.model.n_fft // 2\n"," gen_size = self.model.chunk_size - 2 * trim\n"," pad = gen_size - n_sample % gen_size\n","\n"," # Padded wave\n"," wave_p = np.concatenate((np.zeros((2, trim)), wave, np.zeros((2, pad)), np.zeros((2, trim))), 1)\n","\n"," mix_waves = []\n"," for i in range(0, n_sample + pad, gen_size):\n"," waves = np.array(wave_p[:, i:i + self.model.chunk_size])\n"," mix_waves.append(waves)\n","\n"," mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(self.device)\n","\n"," return mix_waves, pad, trim\n","\n"," def _process_wave(self, mix_waves, trim, pad, q: queue.Queue, _id: int):\n"," \"\"\"\n"," Process each wave segment in a multi-threaded environment\n","\n"," Args:\n"," mix_waves: (torch.Tensor) Wave segments to be processed\n"," trim: (int) Number of samples trimmed during padding\n"," pad: (int) Number of samples padded during padding\n"," q: (queue.Queue) Queue to hold the processed wave segments\n"," _id: (int) Identifier of the processed wave segment\n","\n"," Returns:\n"," numpy array: Processed wave segment\n"," \"\"\"\n"," mix_waves = mix_waves.split(1)\n"," with torch.no_grad():\n"," pw = []\n"," for mix_wave in mix_waves:\n"," self.prog.update()\n"," spec = self.model.stft(mix_wave)\n"," processed_spec = torch.tensor(self.process(spec))\n"," processed_wav = self.model.istft(processed_spec.to(self.device))\n"," processed_wav = processed_wav[:, :, trim:-trim].transpose(0, 1).reshape(2, -1).cpu().numpy()\n"," pw.append(processed_wav)\n"," processed_signal = np.concatenate(pw, axis=-1)[:, :-pad]\n"," q.put({_id: processed_signal})\n"," return processed_signal\n","\n"," def process_wave(self, wave: np.array, mt_threads=1):\n"," 
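# Each chunk below is handed to its own worker thread; results come back\n","        # through the queue keyed by chunk index and are re-joined in order by\n","        # segment(combine=True).\n","        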
\"\"\"\n"," Process the wave array in a multi-threaded environment\n","\n"," Args:\n"," wave: (np.array) Wave array to be processed\n"," mt_threads: (int) Number of threads to be used for processing\n","\n"," Returns:\n"," numpy array: Processed wave array\n"," \"\"\"\n"," self.prog = tqdm(total=0)\n"," chunk = wave.shape[-1] // mt_threads\n"," waves = self.segment(wave, False, chunk)\n","\n"," # Create a queue to hold the processed wave segments\n"," q = queue.Queue()\n"," threads = []\n"," for c, batch in enumerate(waves):\n"," mix_waves, pad, trim = self.pad_wave(batch)\n"," self.prog.total = len(mix_waves) * mt_threads\n"," thread = threading.Thread(target=self._process_wave, args=(mix_waves, trim, pad, q, c))\n"," thread.start()\n"," threads.append(thread)\n"," for thread in threads:\n"," thread.join()\n"," self.prog.close()\n","\n"," processed_batches = []\n"," while not q.empty():\n"," processed_batches.append(q.get())\n"," processed_batches = [list(wave.values())[0] for wave in\n"," sorted(processed_batches, key=lambda d: list(d.keys())[0])]\n"," assert len(processed_batches) == len(waves), 'Incomplete processed batches, please reduce batch size!'\n"," return self.segment(processed_batches, True, chunk)\n","\n","\n","def run_mdx(model_params, output_dir, model_path, filename, exclude_main=False, exclude_inversion=False, suffix=None, invert_suffix=None, denoise=False, keep_orig=True, m_threads=2):\n"," device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')\n","\n"," device_properties = torch.cuda.get_device_properties(device)\n"," vram_gb = device_properties.total_memory / 1024**3\n"," m_threads = 1 if vram_gb < 8 else 2\n","\n"," model_hash = MDX.get_hash(model_path)\n"," mp = model_params.get(model_hash)\n"," model = MDXModel(\n"," device,\n"," dim_f=mp[\"mdx_dim_f_set\"],\n"," dim_t=2 ** mp[\"mdx_dim_t_set\"],\n"," n_fft=mp[\"mdx_n_fft_scale_set\"],\n"," stem_name=mp[\"primary_stem\"],\n"," compensation=mp[\"compensate\"]\n"," )\n","\n"," mdx_sess = MDX(model_path, model)\n"," wave, sr = librosa.load(filename, mono=False, sr=44100)\n"," # normalizing input wave gives better output\n"," peak = max(np.max(wave), abs(np.min(wave)))\n"," wave /= peak\n"," if denoise:\n"," wave_processed = -(mdx_sess.process_wave(-wave, m_threads)) + (mdx_sess.process_wave(wave, m_threads))\n"," wave_processed *= 0.5\n"," else:\n"," wave_processed = mdx_sess.process_wave(wave, m_threads)\n"," # return to previous peak\n"," wave_processed *= peak\n"," stem_name = model.stem_name if suffix is None else suffix\n","\n"," main_filepath = None\n"," if not exclude_main:\n"," main_filepath = os.path.join(output_dir, f\"{os.path.basename(os.path.splitext(filename)[0])}_{stem_name}.wav\")\n"," sf.write(main_filepath, wave_processed.T, sr)\n","\n"," invert_filepath = None\n"," if not exclude_inversion:\n"," diff_stem_name = stem_naming.get(stem_name) if invert_suffix is None else invert_suffix\n"," stem_name = f\"{stem_name}_diff\" if diff_stem_name is None else diff_stem_name\n"," invert_filepath = os.path.join(output_dir, f\"{os.path.basename(os.path.splitext(filename)[0])}_{stem_name}.wav\")\n"," sf.write(invert_filepath, (-wave_processed.T * model.compensation) + wave.T, sr)\n","\n"," if not keep_orig:\n"," os.remove(filename)\n","\n"," del mdx_sess, wave_processed, wave\n"," gc.collect()\n"," return main_filepath, invert_filepath"],"metadata":{"id":"5hi6hYHvQd9h"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#@title MYUTILS\n","\n","import 
ffmpeg\n","import numpy as np\n","\n","\n","def load_audio(file, sr):\n"," try:\n"," # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26\n"," # This launches a subprocess to decode audio while down-mixing and resampling as necessary.\n"," # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.\n"," file = (\n"," file.strip(\" \").strip('\"').strip(\"\\n\").strip('\"').strip(\" \")\n"," ) # 防止小白拷路径头尾带了空格和\"和回车\n"," out, _ = (\n"," ffmpeg.input(file, threads=0)\n"," .output(\"-\", format=\"f32le\", acodec=\"pcm_f32le\", ac=1, ar=sr)\n"," .run(cmd=[\"ffmpeg\", \"-nostdin\"], capture_stdout=True, capture_stderr=True)\n"," )\n"," except Exception as e:\n"," raise RuntimeError(f\"Failed to load audio: {e}\")\n","\n"," return np.frombuffer(out, np.float32).flatten()"],"metadata":{"id":"GhycXVlkQsK_"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#@title RMVPE\n","import numpy as np\n","import torch\n","import torch.nn as nn\n","import torch.nn.functional as F\n","from librosa.filters import mel\n","\n","\n","class BiGRU(nn.Module):\n"," def __init__(self, input_features, hidden_features, num_layers):\n"," super(BiGRU, self).__init__()\n"," self.gru = nn.GRU(\n"," input_features,\n"," hidden_features,\n"," num_layers=num_layers,\n"," batch_first=True,\n"," bidirectional=True,\n"," )\n","\n"," def forward(self, x):\n"," return self.gru(x)[0]\n","\n","\n","class ConvBlockRes(nn.Module):\n"," def __init__(self, in_channels, out_channels, momentum=0.01):\n"," super(ConvBlockRes, self).__init__()\n"," self.conv = nn.Sequential(\n"," nn.Conv2d(\n"," in_channels=in_channels,\n"," out_channels=out_channels,\n"," kernel_size=(3, 3),\n"," stride=(1, 1),\n"," padding=(1, 1),\n"," bias=False,\n"," ),\n"," nn.BatchNorm2d(out_channels, momentum=momentum),\n"," nn.ReLU(),\n"," nn.Conv2d(\n"," in_channels=out_channels,\n"," out_channels=out_channels,\n"," kernel_size=(3, 3),\n"," stride=(1, 1),\n"," padding=(1, 1),\n"," bias=False,\n"," ),\n"," nn.BatchNorm2d(out_channels, momentum=momentum),\n"," nn.ReLU(),\n"," )\n"," if in_channels != out_channels:\n"," self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))\n"," self.is_shortcut = True\n"," else:\n"," self.is_shortcut = False\n","\n"," def forward(self, x):\n"," if self.is_shortcut:\n"," return self.conv(x) + self.shortcut(x)\n"," else:\n"," return self.conv(x) + x\n","\n","\n","class Encoder(nn.Module):\n"," def __init__(\n"," self,\n"," in_channels,\n"," in_size,\n"," n_encoders,\n"," kernel_size,\n"," n_blocks,\n"," out_channels=16,\n"," momentum=0.01,\n"," ):\n"," super(Encoder, self).__init__()\n"," self.n_encoders = n_encoders\n"," self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)\n"," self.layers = nn.ModuleList()\n"," self.latent_channels = []\n"," for i in range(self.n_encoders):\n"," self.layers.append(\n"," ResEncoderBlock(\n"," in_channels, out_channels, kernel_size, n_blocks, momentum=momentum\n"," )\n"," )\n"," self.latent_channels.append([out_channels, in_size])\n"," in_channels = out_channels\n"," out_channels *= 2\n"," in_size //= 2\n"," self.out_size = in_size\n"," self.out_channel = out_channels\n","\n"," def forward(self, x):\n"," concat_tensors = []\n"," x = self.bn(x)\n"," for i in range(self.n_encoders):\n"," _, x = self.layers[i](x)\n"," concat_tensors.append(_)\n"," return x, concat_tensors\n","\n","\n","class ResEncoderBlock(nn.Module):\n"," def __init__(\n"," self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01\n"," ):\n"," super(ResEncoderBlock, 
self).__init__()\n"," self.n_blocks = n_blocks\n"," self.conv = nn.ModuleList()\n"," self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))\n"," for i in range(n_blocks - 1):\n"," self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))\n"," self.kernel_size = kernel_size\n"," if self.kernel_size is not None:\n"," self.pool = nn.AvgPool2d(kernel_size=kernel_size)\n","\n"," def forward(self, x):\n"," for i in range(self.n_blocks):\n"," x = self.conv[i](x)\n"," if self.kernel_size is not None:\n"," return x, self.pool(x)\n"," else:\n"," return x\n","\n","\n","class Intermediate(nn.Module): #\n"," def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):\n"," super(Intermediate, self).__init__()\n"," self.n_inters = n_inters\n"," self.layers = nn.ModuleList()\n"," self.layers.append(\n"," ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)\n"," )\n"," for i in range(self.n_inters - 1):\n"," self.layers.append(\n"," ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)\n"," )\n","\n"," def forward(self, x):\n"," for i in range(self.n_inters):\n"," x = self.layers[i](x)\n"," return x\n","\n","\n","class ResDecoderBlock(nn.Module):\n"," def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):\n"," super(ResDecoderBlock, self).__init__()\n"," out_padding = (0, 1) if stride == (1, 2) else (1, 1)\n"," self.n_blocks = n_blocks\n"," self.conv1 = nn.Sequential(\n"," nn.ConvTranspose2d(\n"," in_channels=in_channels,\n"," out_channels=out_channels,\n"," kernel_size=(3, 3),\n"," stride=stride,\n"," padding=(1, 1),\n"," output_padding=out_padding,\n"," bias=False,\n"," ),\n"," nn.BatchNorm2d(out_channels, momentum=momentum),\n"," nn.ReLU(),\n"," )\n"," self.conv2 = nn.ModuleList()\n"," self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))\n"," for i in range(n_blocks - 1):\n"," self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))\n","\n"," def forward(self, x, concat_tensor):\n"," x = self.conv1(x)\n"," x = torch.cat((x, concat_tensor), dim=1)\n"," for i in range(self.n_blocks):\n"," x = self.conv2[i](x)\n"," return x\n","\n","\n","class Decoder(nn.Module):\n"," def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):\n"," super(Decoder, self).__init__()\n"," self.layers = nn.ModuleList()\n"," self.n_decoders = n_decoders\n"," for i in range(self.n_decoders):\n"," out_channels = in_channels // 2\n"," self.layers.append(\n"," ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum)\n"," )\n"," in_channels = out_channels\n","\n"," def forward(self, x, concat_tensors):\n"," for i in range(self.n_decoders):\n"," x = self.layers[i](x, concat_tensors[-1 - i])\n"," return x\n","\n","\n","class DeepUnet(nn.Module):\n"," def __init__(\n"," self,\n"," kernel_size,\n"," n_blocks,\n"," en_de_layers=5,\n"," inter_layers=4,\n"," in_channels=1,\n"," en_out_channels=16,\n"," ):\n"," super(DeepUnet, self).__init__()\n"," self.encoder = Encoder(\n"," in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels\n"," )\n"," self.intermediate = Intermediate(\n"," self.encoder.out_channel // 2,\n"," self.encoder.out_channel,\n"," inter_layers,\n"," n_blocks,\n"," )\n"," self.decoder = Decoder(\n"," self.encoder.out_channel, en_de_layers, kernel_size, n_blocks\n"," )\n","\n"," def forward(self, x):\n"," x, concat_tensors = self.encoder(x)\n"," x = self.intermediate(x)\n"," x = self.decoder(x, concat_tensors)\n"," return x\n","\n","\n","class 
E2E(nn.Module):\n"," def __init__(\n"," self,\n"," n_blocks,\n"," n_gru,\n"," kernel_size,\n"," en_de_layers=5,\n"," inter_layers=4,\n"," in_channels=1,\n"," en_out_channels=16,\n"," ):\n"," super(E2E, self).__init__()\n"," self.unet = DeepUnet(\n"," kernel_size,\n"," n_blocks,\n"," en_de_layers,\n"," inter_layers,\n"," in_channels,\n"," en_out_channels,\n"," )\n"," self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))\n"," if n_gru:\n"," self.fc = nn.Sequential(\n"," BiGRU(3 * 128, 256, n_gru),\n"," nn.Linear(512, 360),\n"," nn.Dropout(0.25),\n"," nn.Sigmoid(),\n"," )\n"," else:\n"," self.fc = nn.Sequential(\n"," nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()\n"," )\n","\n"," def forward(self, mel):\n"," mel = mel.transpose(-1, -2).unsqueeze(1)\n"," x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)\n"," x = self.fc(x)\n"," return x\n","\n","\n","class MelSpectrogram(torch.nn.Module):\n"," def __init__(\n"," self,\n"," is_half,\n"," n_mel_channels,\n"," sampling_rate,\n"," win_length,\n"," hop_length,\n"," n_fft=None,\n"," mel_fmin=0,\n"," mel_fmax=None,\n"," clamp=1e-5,\n"," ):\n"," super().__init__()\n"," n_fft = win_length if n_fft is None else n_fft\n"," self.hann_window = {}\n"," mel_basis = mel(\n"," sr=sampling_rate,\n"," n_fft=n_fft,\n"," n_mels=n_mel_channels,\n"," fmin=mel_fmin,\n"," fmax=mel_fmax,\n"," htk=True,\n"," )\n"," mel_basis = torch.from_numpy(mel_basis).float()\n"," self.register_buffer(\"mel_basis\", mel_basis)\n"," self.n_fft = win_length if n_fft is None else n_fft\n"," self.hop_length = hop_length\n"," self.win_length = win_length\n"," self.sampling_rate = sampling_rate\n"," self.n_mel_channels = n_mel_channels\n"," self.clamp = clamp\n"," self.is_half = is_half\n","\n"," def forward(self, audio, keyshift=0, speed=1, center=True):\n"," factor = 2 ** (keyshift / 12)\n"," n_fft_new = int(np.round(self.n_fft * factor))\n"," win_length_new = int(np.round(self.win_length * factor))\n"," hop_length_new = int(np.round(self.hop_length * speed))\n"," keyshift_key = str(keyshift) + \"_\" + str(audio.device)\n"," if keyshift_key not in self.hann_window:\n"," self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(\n"," audio.device\n"," )\n"," fft = torch.stft(\n"," audio,\n"," n_fft=n_fft_new,\n"," hop_length=hop_length_new,\n"," win_length=win_length_new,\n"," window=self.hann_window[keyshift_key],\n"," center=center,\n"," return_complex=True,\n"," )\n"," magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))\n"," if keyshift != 0:\n"," size = self.n_fft // 2 + 1\n"," resize = magnitude.size(1)\n"," if resize < size:\n"," magnitude = F.pad(magnitude, (0, 0, 0, size - resize))\n"," magnitude = magnitude[:, :size, :] * self.win_length / win_length_new\n"," mel_output = torch.matmul(self.mel_basis, magnitude)\n"," if self.is_half == True:\n"," mel_output = mel_output.half()\n"," log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))\n"," return log_mel_spec\n","\n","\n","class RMVPE:\n"," def __init__(self, model_path, is_half, device=None):\n"," self.resample_kernel = {}\n"," model = E2E(4, 1, (2, 2))\n"," ckpt = torch.load(model_path, map_location=\"cpu\")\n"," model.load_state_dict(ckpt)\n"," model.eval()\n"," if is_half == True:\n"," model = model.half()\n"," self.model = model\n"," self.resample_kernel = {}\n"," self.is_half = is_half\n"," if device is None:\n"," device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n"," self.device = device\n"," self.mel_extractor = MelSpectrogram(\n"," is_half, 128, 
16000, 1024, 160, None, 30, 8000\n","        ).to(device)\n","        self.model = self.model.to(device)\n","        cents_mapping = 20 * np.arange(360) + 1997.3794084376191\n","        self.cents_mapping = np.pad(cents_mapping, (4, 4))  # 360 bins + 4 padding on each side = 368\n","\n","    def mel2hidden(self, mel):\n","        with torch.no_grad():\n","            n_frames = mel.shape[-1]\n","            mel = F.pad(\n","                mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode=\"reflect\"\n","            )\n","            hidden = self.model(mel)\n","            return hidden[:, :n_frames]\n","\n","    def decode(self, hidden, thred=0.03):\n","        cents_pred = self.to_local_average_cents(hidden, thred=thred)\n","        f0 = 10 * (2 ** (cents_pred / 1200))\n","        f0[f0 == 10] = 0  # a cents value of 0 decodes to 10 Hz, which marks an unvoiced frame\n","        return f0\n","\n","    def infer_from_audio(self, audio, thred=0.03):\n","        audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)\n","        mel = self.mel_extractor(audio, center=True)\n","        hidden = self.mel2hidden(mel)\n","        hidden = hidden.squeeze(0).cpu().numpy()\n","        if self.is_half:\n","            hidden = hidden.astype(\"float32\")\n","        f0 = self.decode(hidden, thred=thred)\n","        return f0\n","\n","    def to_local_average_cents(self, salience, thred=0.05):\n","        center = np.argmax(salience, axis=1)  # argmax bin per frame, shape (n_frames,)\n","        salience = np.pad(salience, ((0, 0), (4, 4)))  # (n_frames, 368)\n","        center += 4\n","        todo_salience = []\n","        todo_cents_mapping = []\n","        starts = center - 4\n","        ends = center + 5\n","        for idx in range(salience.shape[0]):\n","            todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])\n","            todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])\n","        todo_salience = np.array(todo_salience)  # (n_frames, 9)\n","        todo_cents_mapping = np.array(todo_cents_mapping)  # (n_frames, 9)\n","        product_sum = np.sum(todo_salience * todo_cents_mapping, 1)\n","        weight_sum = np.sum(todo_salience, 1)  # (n_frames,)\n","        divided = product_sum / weight_sum  # weighted average of cents around each salience peak\n","        maxx = np.max(salience, axis=1)  # (n_frames,)\n","        divided[maxx <= thred] = 0  # zero out frames whose peak salience is below the threshold\n","        return divided"],"metadata":{"id":"wipljx-ZQ4RW"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#@title VC Infer Pipeline\n","from functools import lru_cache\n","from time import time as ttime\n","\n","import faiss\n","import librosa\n","import numpy as np\n","import os\n","import parselmouth\n","import pyworld\n","import sys\n","import torch\n","import torch.nn.functional as F\n","import torchcrepe\n","import traceback\n","from scipy import signal\n","from torch import Tensor\n","\n","BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(\"./ERPISI\")))\n","now_dir = os.path.join(BASE_DIR, 'src')\n","sys.path.append(now_dir)\n","\n","bh, ah = signal.butter(N=5, Wn=48, btype=\"high\", fs=16000)\n","\n","input_audio_path2wav = {}\n","\n","\n","@lru_cache\n","def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):\n","    audio = input_audio_path2wav[input_audio_path]\n","    f0, t = pyworld.harvest(\n","        audio,\n","        fs=fs,\n","        f0_ceil=f0max,\n","        f0_floor=f0min,\n","        frame_period=frame_period,\n","    )\n","    f0 = pyworld.stonemask(audio, f0, t, fs)\n","    
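# Results are memoized by lru_cache on the hashable arguments (path, fs,\n","    # f0 range, frame_period); the audio itself comes from the module-level\n","    # input_audio_path2wav dict, since numpy arrays cannot be cache keys.\n","    # E.g. frame_period=10 (ms) at fs=16000 gives one f0 value per 160 samples.\n","    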
return f0\n","\n","\n","def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比\n"," # print(data1.max(),data2.max())\n"," rms1 = librosa.feature.rms(\n"," y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2\n"," ) # 每半秒一个点\n"," rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)\n"," rms1 = torch.from_numpy(rms1)\n"," rms1 = F.interpolate(\n"," rms1.unsqueeze(0), size=data2.shape[0], mode=\"linear\"\n"," ).squeeze()\n"," rms2 = torch.from_numpy(rms2)\n"," rms2 = F.interpolate(\n"," rms2.unsqueeze(0), size=data2.shape[0], mode=\"linear\"\n"," ).squeeze()\n"," rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)\n"," data2 *= (\n"," torch.pow(rms1, torch.tensor(1 - rate))\n"," * torch.pow(rms2, torch.tensor(rate - 1))\n"," ).numpy()\n"," return data2\n","\n","\n","class VC(object):\n"," def __init__(self, tgt_sr, config):\n"," self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (\n"," config.x_pad,\n"," config.x_query,\n"," config.x_center,\n"," config.x_max,\n"," config.is_half,\n"," )\n"," self.sr = 16000 # hubert输入采样率\n"," self.window = 160 # 每帧点数\n"," self.t_pad = self.sr * self.x_pad # 每条前后pad时间\n"," self.t_pad_tgt = tgt_sr * self.x_pad\n"," self.t_pad2 = self.t_pad * 2\n"," self.t_query = self.sr * self.x_query # 查询切点前后查询时间\n"," self.t_center = self.sr * self.x_center # 查询切点位置\n"," self.t_max = self.sr * self.x_max # 免查询时长阈值\n"," self.device = config.device\n","\n"," # Fork Feature: Get the best torch device to use for f0 algorithms that require a torch device. Will return the type (torch.device)\n"," def get_optimal_torch_device(self, index: int = 0) -> torch.device:\n"," # Get cuda device\n"," if torch.cuda.is_available():\n"," return torch.device(\n"," f\"cuda:{index % torch.cuda.device_count()}\"\n"," ) # Very fast\n"," elif torch.backends.mps.is_available():\n"," return torch.device(\"mps\")\n"," # Insert an else here to grab \"xla\" devices if available. TO DO later. Requires the torch_xla.core.xla_model library\n"," # Else wise return the \"cpu\" as a torch device,\n"," return torch.device(\"cpu\")\n","\n"," # Fork Feature: Compute f0 with the crepe method\n"," def get_f0_crepe_computation(\n"," self,\n"," x,\n"," f0_min,\n"," f0_max,\n"," p_len,\n"," hop_length=160, # 512 before. Hop length changes the speed that the voice jumps to a different dramatic pitch. Lower hop lengths means more pitch accuracy but longer inference time.\n"," model=\"full\", # Either use crepe-tiny \"tiny\" or crepe \"full\". Default is full\n"," ):\n"," x = x.astype(\n"," np.float32\n"," ) # fixes the F.conv2D exception. 
We needed to convert double to float.\n"," x /= np.quantile(np.abs(x), 0.999)\n"," torch_device = self.get_optimal_torch_device()\n"," audio = torch.from_numpy(x).to(torch_device, copy=True)\n"," audio = torch.unsqueeze(audio, dim=0)\n"," if audio.ndim == 2 and audio.shape[0] > 1:\n"," audio = torch.mean(audio, dim=0, keepdim=True).detach()\n"," audio = audio.detach()\n"," print(\"Initiating prediction with a crepe_hop_length of: \" + str(hop_length))\n"," pitch: Tensor = torchcrepe.predict(\n"," audio,\n"," self.sr,\n"," hop_length,\n"," f0_min,\n"," f0_max,\n"," model,\n"," batch_size=hop_length * 2,\n"," device=torch_device,\n"," pad=True,\n"," )\n"," p_len = p_len or x.shape[0] // hop_length\n"," # Resize the pitch for final f0\n"," source = np.array(pitch.squeeze(0).cpu().float().numpy())\n"," source[source < 0.001] = np.nan\n"," target = np.interp(\n"," np.arange(0, len(source) * p_len, len(source)) / p_len,\n"," np.arange(0, len(source)),\n"," source,\n"," )\n"," f0 = np.nan_to_num(target)\n"," return f0 # Resized f0\n","\n"," def get_f0_official_crepe_computation(\n"," self,\n"," x,\n"," f0_min,\n"," f0_max,\n"," model=\"full\",\n"," ):\n"," # Pick a batch size that doesn't cause memory errors on your gpu\n"," batch_size = 512\n"," # Compute pitch using first gpu\n"," audio = torch.tensor(np.copy(x))[None].float()\n"," f0, pd = torchcrepe.predict(\n"," audio,\n"," self.sr,\n"," self.window,\n"," f0_min,\n"," f0_max,\n"," model,\n"," batch_size=batch_size,\n"," device=self.device,\n"," return_periodicity=True,\n"," )\n"," pd = torchcrepe.filter.median(pd, 3)\n"," f0 = torchcrepe.filter.mean(f0, 3)\n"," f0[pd < 0.1] = 0\n"," f0 = f0[0].cpu().numpy()\n"," return f0\n","\n"," # Fork Feature: Compute pYIN f0 method\n"," def get_f0_pyin_computation(self, x, f0_min, f0_max):\n"," y, sr = librosa.load(\"saudio/Sidney.wav\", self.sr, mono=True)\n"," f0, _, _ = librosa.pyin(y, sr=self.sr, fmin=f0_min, fmax=f0_max)\n"," f0 = f0[1:] # Get rid of extra first frame\n"," return f0\n","\n"," # Fork Feature: Acquire median hybrid f0 estimation calculation\n"," def get_f0_hybrid_computation(\n"," self,\n"," methods_str,\n"," input_audio_path,\n"," x,\n"," f0_min,\n"," f0_max,\n"," p_len,\n"," filter_radius,\n"," crepe_hop_length,\n"," time_step,\n"," ):\n"," # Get various f0 methods from input to use in the computation stack\n"," s = methods_str\n"," s = s.split(\"hybrid\")[1]\n"," s = s.replace(\"[\", \"\").replace(\"]\", \"\")\n"," methods = s.split(\"+\")\n"," f0_computation_stack = []\n","\n"," print(\"Calculating f0 pitch estimations for methods: %s\" % str(methods))\n"," x = x.astype(np.float32)\n"," x /= np.quantile(np.abs(x), 0.999)\n"," # Get f0 calculations for all methods specified\n"," for method in methods:\n"," f0 = None\n"," if method == \"pm\":\n"," f0 = (\n"," parselmouth.Sound(x, self.sr)\n"," .to_pitch_ac(\n"," time_step=time_step / 1000,\n"," voicing_threshold=0.6,\n"," pitch_floor=f0_min,\n"," pitch_ceiling=f0_max,\n"," )\n"," .selected_array[\"frequency\"]\n"," )\n"," pad_size = (p_len - len(f0) + 1) // 2\n"," if pad_size > 0 or p_len - len(f0) - pad_size > 0:\n"," f0 = np.pad(\n"," f0, [[pad_size, p_len - len(f0) - pad_size]], mode=\"constant\"\n"," )\n"," elif method == \"crepe\":\n"," f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max)\n"," f0 = f0[1:] # Get rid of extra first frame\n"," elif method == \"crepe-tiny\":\n"," f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, \"tiny\")\n"," f0 = f0[1:] # Get rid of extra first frame\n"," elif method == 
\"mangio-crepe\":\n"," f0 = self.get_f0_crepe_computation(\n"," x, f0_min, f0_max, p_len, crepe_hop_length\n"," )\n"," elif method == \"mangio-crepe-tiny\":\n"," f0 = self.get_f0_crepe_computation(\n"," x, f0_min, f0_max, p_len, crepe_hop_length, \"tiny\"\n"," )\n"," elif method == \"harvest\":\n"," f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)\n"," if filter_radius > 2:\n"," f0 = signal.medfilt(f0, 3)\n"," f0 = f0[1:] # Get rid of first frame.\n"," elif method == \"dio\": # Potentially buggy?\n"," f0, t = pyworld.dio(\n"," x.astype(np.double),\n"," fs=self.sr,\n"," f0_ceil=f0_max,\n"," f0_floor=f0_min,\n"," frame_period=10,\n"," )\n"," f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)\n"," f0 = signal.medfilt(f0, 3)\n"," f0 = f0[1:]\n"," # elif method == \"pyin\": Not Working just yet\n"," # f0 = self.get_f0_pyin_computation(x, f0_min, f0_max)\n"," # Push method to the stack\n"," f0_computation_stack.append(f0)\n","\n"," for fc in f0_computation_stack:\n"," print(len(fc))\n","\n"," print(\"Calculating hybrid median f0 from the stack of: %s\" % str(methods))\n"," f0_median_hybrid = None\n"," if len(f0_computation_stack) == 1:\n"," f0_median_hybrid = f0_computation_stack[0]\n"," else:\n"," f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0)\n"," return f0_median_hybrid\n","\n"," def get_f0(\n"," self,\n"," input_audio_path,\n"," x,\n"," p_len,\n"," f0_up_key,\n"," f0_method,\n"," filter_radius,\n"," crepe_hop_length,\n"," inp_f0=None,\n"," ):\n"," global input_audio_path2wav\n"," time_step = self.window / self.sr * 1000\n"," f0_min = 50\n"," f0_max = 1100\n"," f0_mel_min = 1127 * np.log(1 + f0_min / 700)\n"," f0_mel_max = 1127 * np.log(1 + f0_max / 700)\n"," if f0_method == \"pm\":\n"," f0 = (\n"," parselmouth.Sound(x, self.sr)\n"," .to_pitch_ac(\n"," time_step=time_step / 1000,\n"," voicing_threshold=0.6,\n"," pitch_floor=f0_min,\n"," pitch_ceiling=f0_max,\n"," )\n"," .selected_array[\"frequency\"]\n"," )\n"," pad_size = (p_len - len(f0) + 1) // 2\n"," if pad_size > 0 or p_len - len(f0) - pad_size > 0:\n"," f0 = np.pad(\n"," f0, [[pad_size, p_len - len(f0) - pad_size]], mode=\"constant\"\n"," )\n"," elif f0_method == \"harvest\":\n"," input_audio_path2wav[input_audio_path] = x.astype(np.double)\n"," f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)\n"," if filter_radius > 2:\n"," f0 = signal.medfilt(f0, 3)\n"," elif f0_method == \"dio\": # Potentially Buggy?\n"," f0, t = pyworld.dio(\n"," x.astype(np.double),\n"," fs=self.sr,\n"," f0_ceil=f0_max,\n"," f0_floor=f0_min,\n"," frame_period=10,\n"," )\n"," f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)\n"," f0 = signal.medfilt(f0, 3)\n"," elif f0_method == \"crepe\":\n"," f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max)\n"," elif f0_method == \"crepe-tiny\":\n"," f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, \"tiny\")\n"," elif f0_method == \"mangio-crepe\":\n"," f0 = self.get_f0_crepe_computation(\n"," x, f0_min, f0_max, p_len, crepe_hop_length\n"," )\n"," elif f0_method == \"mangio-crepe-tiny\":\n"," f0 = self.get_f0_crepe_computation(\n"," x, f0_min, f0_max, p_len, crepe_hop_length, \"tiny\"\n"," )\n"," elif f0_method == \"rmvpe\":\n"," if hasattr(self, \"model_rmvpe\") == False:\n"," #from rmvpe import RMVPE\n","\n"," self.model_rmvpe = RMVPE(\n"," os.path.join(BASE_DIR, 'rvc_models', 'rmvpe.pt'), is_half=self.is_half, device=self.device\n"," )\n"," f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)\n","\n"," elif \"hybrid\" in 
f0_method:\n"," # Perform hybrid median pitch estimation\n"," input_audio_path2wav[input_audio_path] = x.astype(np.double)\n"," f0 = self.get_f0_hybrid_computation(\n"," f0_method,\n"," input_audio_path,\n"," x,\n"," f0_min,\n"," f0_max,\n"," p_len,\n"," filter_radius,\n"," crepe_hop_length,\n"," time_step,\n"," )\n","\n"," f0 *= pow(2, f0_up_key / 12)\n"," # with open(\"test.txt\",\"w\")as f:f.write(\"\\n\".join([str(i)for i in f0.tolist()]))\n"," tf0 = self.sr // self.window # 每秒f0点数\n"," if inp_f0 is not None:\n"," delta_t = np.round(\n"," (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1\n"," ).astype(\"int16\")\n"," replace_f0 = np.interp(\n"," list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]\n"," )\n"," shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]\n"," f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[\n"," :shape\n"," ]\n"," # with open(\"test_opt.txt\",\"w\")as f:f.write(\"\\n\".join([str(i)for i in f0.tolist()]))\n"," f0bak = f0.copy()\n"," f0_mel = 1127 * np.log(1 + f0 / 700)\n"," f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (\n"," f0_mel_max - f0_mel_min\n"," ) + 1\n"," f0_mel[f0_mel <= 1] = 1\n"," f0_mel[f0_mel > 255] = 255\n"," f0_coarse = np.rint(f0_mel).astype(np.int)\n","\n"," return f0_coarse, f0bak # 1-0\n","\n"," def vc(\n"," self,\n"," model,\n"," net_g,\n"," sid,\n"," audio0,\n"," pitch,\n"," pitchf,\n"," times,\n"," index,\n"," big_npy,\n"," index_rate,\n"," version,\n"," protect,\n"," ): # ,file_index,file_big_npy\n"," feats = torch.from_numpy(audio0)\n"," if self.is_half:\n"," feats = feats.half()\n"," else:\n"," feats = feats.float()\n"," if feats.dim() == 2: # double channels\n"," feats = feats.mean(-1)\n"," assert feats.dim() == 1, feats.dim()\n"," feats = feats.view(1, -1)\n"," padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)\n","\n"," inputs = {\n"," \"source\": feats.to(self.device),\n"," \"padding_mask\": padding_mask,\n"," \"output_layer\": 9 if version == \"v1\" else 12,\n"," }\n"," t0 = ttime()\n"," with torch.no_grad():\n"," logits = model.extract_features(**inputs)\n"," feats = model.final_proj(logits[0]) if version == \"v1\" else logits[0]\n"," if protect < 0.5 and pitch != None and pitchf != None:\n"," feats0 = feats.clone()\n"," if (\n"," isinstance(index, type(None)) == False\n"," and isinstance(big_npy, type(None)) == False\n"," and index_rate != 0\n"," ):\n"," npy = feats[0].cpu().numpy()\n"," if self.is_half:\n"," npy = npy.astype(\"float32\")\n","\n"," # _, I = index.search(npy, 1)\n"," # npy = big_npy[I.squeeze()]\n","\n"," score, ix = index.search(npy, k=8)\n"," weight = np.square(1 / score)\n"," weight /= weight.sum(axis=1, keepdims=True)\n"," npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)\n","\n"," if self.is_half:\n"," npy = npy.astype(\"float16\")\n"," feats = (\n"," torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate\n"," + (1 - index_rate) * feats\n"," )\n","\n"," feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)\n"," if protect < 0.5 and pitch != None and pitchf != None:\n"," feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(\n"," 0, 2, 1\n"," )\n"," t1 = ttime()\n"," p_len = audio0.shape[0] // self.window\n"," if feats.shape[1] < p_len:\n"," p_len = feats.shape[1]\n"," if pitch != None and pitchf != None:\n"," pitch = pitch[:, :p_len]\n"," pitchf = pitchf[:, :p_len]\n","\n"," if protect < 0.5 and pitch != None and pitchf != None:\n"," pitchff = 
pitchf.clone()\n"," pitchff[pitchf > 0] = 1\n"," pitchff[pitchf < 1] = protect\n"," pitchff = pitchff.unsqueeze(-1)\n"," feats = feats * pitchff + feats0 * (1 - pitchff)\n"," feats = feats.to(feats0.dtype)\n"," p_len = torch.tensor([p_len], device=self.device).long()\n"," with torch.no_grad():\n"," if pitch != None and pitchf != None:\n"," audio1 = (\n"," (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])\n"," .data.cpu()\n"," .float()\n"," .numpy()\n"," )\n"," else:\n"," audio1 = (\n"," (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()\n"," )\n"," del feats, p_len, padding_mask\n"," if torch.cuda.is_available():\n"," torch.cuda.empty_cache()\n"," t2 = ttime()\n"," times[0] += t1 - t0\n"," times[2] += t2 - t1\n"," return audio1\n","\n"," def pipeline(\n"," self,\n"," model,\n"," net_g,\n"," sid,\n"," audio,\n"," input_audio_path,\n"," times,\n"," f0_up_key,\n"," f0_method,\n"," file_index,\n"," # file_big_npy,\n"," index_rate,\n"," if_f0,\n"," filter_radius,\n"," tgt_sr,\n"," resample_sr,\n"," rms_mix_rate,\n"," version,\n"," protect,\n"," crepe_hop_length,\n"," f0_file=None,\n"," ):\n"," if (\n"," file_index != \"\"\n"," # and file_big_npy != \"\"\n"," # and os.path.exists(file_big_npy) == True\n"," and os.path.exists(file_index) == True\n"," and index_rate != 0\n"," ):\n"," try:\n"," index = faiss.read_index(file_index)\n"," # big_npy = np.load(file_big_npy)\n"," big_npy = index.reconstruct_n(0, index.ntotal)\n"," except:\n"," traceback.print_exc()\n"," index = big_npy = None\n"," else:\n"," index = big_npy = None\n"," audio = signal.filtfilt(bh, ah, audio)\n"," audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode=\"reflect\")\n"," opt_ts = []\n"," if audio_pad.shape[0] > self.t_max:\n"," audio_sum = np.zeros_like(audio)\n"," for i in range(self.window):\n"," audio_sum += audio_pad[i : i - self.window]\n"," for t in range(self.t_center, audio.shape[0], self.t_center):\n"," opt_ts.append(\n"," t\n"," - self.t_query\n"," + np.where(\n"," np.abs(audio_sum[t - self.t_query : t + self.t_query])\n"," == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()\n"," )[0][0]\n"," )\n"," s = 0\n"," audio_opt = []\n"," t = None\n"," t1 = ttime()\n"," audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode=\"reflect\")\n"," p_len = audio_pad.shape[0] // self.window\n"," inp_f0 = None\n"," if hasattr(f0_file, \"name\") == True:\n"," try:\n"," with open(f0_file.name, \"r\") as f:\n"," lines = f.read().strip(\"\\n\").split(\"\\n\")\n"," inp_f0 = []\n"," for line in lines:\n"," inp_f0.append([float(i) for i in line.split(\",\")])\n"," inp_f0 = np.array(inp_f0, dtype=\"float32\")\n"," except:\n"," traceback.print_exc()\n"," sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()\n"," pitch, pitchf = None, None\n"," if if_f0 == 1:\n"," pitch, pitchf = self.get_f0(\n"," input_audio_path,\n"," audio_pad,\n"," p_len,\n"," f0_up_key,\n"," f0_method,\n"," filter_radius,\n"," crepe_hop_length,\n"," inp_f0,\n"," )\n"," pitch = pitch[:p_len]\n"," pitchf = pitchf[:p_len]\n"," if self.device == \"mps\":\n"," pitchf = pitchf.astype(np.float32)\n"," pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()\n"," pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()\n"," t2 = ttime()\n"," times[1] += t2 - t1\n"," for t in opt_ts:\n"," t = t // self.window * self.window\n"," if if_f0 == 1:\n"," audio_opt.append(\n"," self.vc(\n"," model,\n"," net_g,\n"," sid,\n"," audio_pad[s : t + self.t_pad2 + self.window],\n"," pitch[:, s // self.window 
: (t + self.t_pad2) // self.window],\n"," pitchf[:, s // self.window : (t + self.t_pad2) // self.window],\n"," times,\n"," index,\n"," big_npy,\n"," index_rate,\n"," version,\n"," protect,\n"," )[self.t_pad_tgt : -self.t_pad_tgt]\n"," )\n"," else:\n"," audio_opt.append(\n"," self.vc(\n"," model,\n"," net_g,\n"," sid,\n"," audio_pad[s : t + self.t_pad2 + self.window],\n"," None,\n"," None,\n"," times,\n"," index,\n"," big_npy,\n"," index_rate,\n"," version,\n"," protect,\n"," )[self.t_pad_tgt : -self.t_pad_tgt]\n"," )\n"," s = t\n"," if if_f0 == 1:\n"," audio_opt.append(\n"," self.vc(\n"," model,\n"," net_g,\n"," sid,\n"," audio_pad[t:],\n"," pitch[:, t // self.window :] if t is not None else pitch,\n"," pitchf[:, t // self.window :] if t is not None else pitchf,\n"," times,\n"," index,\n"," big_npy,\n"," index_rate,\n"," version,\n"," protect,\n"," )[self.t_pad_tgt : -self.t_pad_tgt]\n"," )\n"," else:\n"," audio_opt.append(\n"," self.vc(\n"," model,\n"," net_g,\n"," sid,\n"," audio_pad[t:],\n"," None,\n"," None,\n"," times,\n"," index,\n"," big_npy,\n"," index_rate,\n"," version,\n"," protect,\n"," )[self.t_pad_tgt : -self.t_pad_tgt]\n"," )\n"," audio_opt = np.concatenate(audio_opt)\n"," if rms_mix_rate != 1:\n"," audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)\n"," if resample_sr >= 16000 and tgt_sr != resample_sr:\n"," audio_opt = librosa.resample(\n"," audio_opt, orig_sr=tgt_sr, target_sr=resample_sr\n"," )\n"," audio_max = np.abs(audio_opt).max() / 0.99\n"," max_int16 = 32768\n"," if audio_max > 1:\n"," max_int16 /= audio_max\n"," audio_opt = (audio_opt * max_int16).astype(np.int16)\n"," del pitch, pitchf, sid\n"," if torch.cuda.is_available():\n"," torch.cuda.empty_cache()\n"," return audio_opt"],"metadata":{"id":"mFDUMezkRRlH"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#@title RVC\n","from multiprocessing import cpu_count\n","from pathlib import Path\n","\n","import torch\n","from fairseq import checkpoint_utils\n","from scipy.io import wavfile\n","\n","'''\n","from infer_pack.models import (\n"," SynthesizerTrnMs256NSFsid,\n"," SynthesizerTrnMs256NSFsid_nono,\n"," SynthesizerTrnMs768NSFsid,\n"," SynthesizerTrnMs768NSFsid_nono,\n",")\n","from my_utils import load_audio\n","'''\n","#from vc_infer_pipeline import VC\n","\n","BASE_DIR = Path(\"/content/ERPISI/\")#.resolve().parent.parent\n","\n","\n","class Config:\n"," def __init__(self, device, is_half):\n"," self.device = device\n"," self.is_half = is_half\n"," self.n_cpu = 0\n"," self.gpu_name = None\n"," self.gpu_mem = None\n"," self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()\n","\n"," def device_config(self) -> tuple:\n"," if torch.cuda.is_available():\n"," i_device = int(self.device.split(\":\")[-1])\n"," self.gpu_name = torch.cuda.get_device_name(i_device)\n"," if (\n"," (\"16\" in self.gpu_name and \"V100\" not in self.gpu_name.upper())\n"," or \"P40\" in self.gpu_name.upper()\n"," or \"1060\" in self.gpu_name\n"," or \"1070\" in self.gpu_name\n"," or \"1080\" in self.gpu_name\n"," ):\n"," print(\"16 series/10 series P40 forced single precision\")\n"," self.is_half = False\n"," for config_file in [\"32k.json\", \"40k.json\", \"48k.json\"]:\n"," with open(BASE_DIR / \"src\" / \"configs\" / config_file, \"r\") as f:\n"," strr = f.read().replace(\"true\", \"false\")\n"," with open(BASE_DIR / \"src\" / \"configs\" / config_file, \"w\") as f:\n"," f.write(strr)\n"," with open(BASE_DIR / \"src\" / \"trainset_preprocess_pipeline_print.py\", \"r\") as f:\n"," 
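# The forced-fp32 workaround above rewrites the bundled config JSONs so every\n","                    # \"true\" becomes \"false\"; the same treatment below patches a constant in the\n","                    # training preprocessing script, which this inference notebook never runs.\n","                    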
strr = f.read().replace(\"3.7\", \"3.0\")\n"," with open(BASE_DIR / \"src\" / \"trainset_preprocess_pipeline_print.py\", \"w\") as f:\n"," f.write(strr)\n"," else:\n"," self.gpu_name = None\n"," self.gpu_mem = int(\n"," torch.cuda.get_device_properties(i_device).total_memory\n"," / 1024\n"," / 1024\n"," / 1024\n"," + 0.4\n"," )\n"," if self.gpu_mem <= 4:\n"," with open(BASE_DIR / \"src\" / \"trainset_preprocess_pipeline_print.py\", \"r\") as f:\n"," strr = f.read().replace(\"3.7\", \"3.0\")\n"," with open(BASE_DIR / \"src\" / \"trainset_preprocess_pipeline_print.py\", \"w\") as f:\n"," f.write(strr)\n"," elif torch.backends.mps.is_available():\n"," print(\"No supported N-card found, use MPS for inference\")\n"," self.device = \"mps\"\n"," else:\n"," print(\"No supported N-card found, use CPU for inference\")\n"," self.device = \"cpu\"\n"," self.is_half = True\n","\n"," if self.n_cpu == 0:\n"," self.n_cpu = cpu_count()\n","\n"," if self.is_half:\n"," # 6G memory config\n"," x_pad = 3\n"," x_query = 10\n"," x_center = 60\n"," x_max = 65\n"," else:\n"," # 5G memory config\n"," x_pad = 1\n"," x_query = 6\n"," x_center = 38\n"," x_max = 41\n","\n"," if self.gpu_mem != None and self.gpu_mem <= 4:\n"," x_pad = 1\n"," x_query = 5\n"," x_center = 30\n"," x_max = 32\n","\n"," return x_pad, x_query, x_center, x_max\n","\n","\n","def load_hubert(device, is_half, model_path):\n"," models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task([model_path], suffix='', )\n"," hubert = models[0]\n"," hubert = hubert.to(device)\n","\n"," if is_half:\n"," hubert = hubert.half()\n"," else:\n"," hubert = hubert.float()\n","\n"," hubert.eval()\n"," return hubert\n","\n","\n","def get_vc(device, is_half, config, model_path):\n"," cpt = torch.load(model_path, map_location='cpu')\n"," if \"config\" not in cpt or \"weight\" not in cpt:\n"," raise ValueError(f'Incorrect format for {model_path}. 
Use a voice model trained using RVC v2 instead.')\n","\n","    tgt_sr = cpt[\"config\"][-1]\n","    cpt[\"config\"][-3] = cpt[\"weight\"][\"emb_g.weight\"].shape[0]\n","    if_f0 = cpt.get(\"f0\", 1)\n","    version = cpt.get(\"version\", \"v1\")\n","\n","    if version == \"v1\":\n","        if if_f0 == 1:\n","            net_g = SynthesizerTrnMs256NSFsid(*cpt[\"config\"], is_half=is_half)\n","        else:\n","            net_g = SynthesizerTrnMs256NSFsid_nono(*cpt[\"config\"])\n","    elif version == \"v2\":\n","        if if_f0 == 1:\n","            net_g = SynthesizerTrnMs768NSFsid(*cpt[\"config\"], is_half=is_half)\n","        else:\n","            net_g = SynthesizerTrnMs768NSFsid_nono(*cpt[\"config\"])\n","\n","    del net_g.enc_q\n","    print(net_g.load_state_dict(cpt[\"weight\"], strict=False))\n","    net_g.eval().to(device)\n","\n","    if is_half:\n","        net_g = net_g.half()\n","    else:\n","        net_g = net_g.float()\n","\n","    vc = VC(tgt_sr, config)\n","    return cpt, version, net_g, tgt_sr, vc\n","\n","\n","def rvc_infer(index_path, index_rate, input_path, output_path, pitch_change, f0_method, cpt, version, net_g, filter_radius, tgt_sr, rms_mix_rate, protect, crepe_hop_length, vc, hubert_model):\n","    audio = load_audio(input_path, 16000)\n","    times = [0, 0, 0]\n","    if_f0 = cpt.get('f0', 1)\n","    audio_opt = vc.pipeline(hubert_model, net_g, 0, audio, input_path, times, pitch_change, f0_method, index_path, index_rate, if_f0, filter_radius, tgt_sr, 0, rms_mix_rate, version, protect, crepe_hop_length)\n","    wavfile.write(output_path, tgt_sr, audio_opt)"],"metadata":{"id":"xrxg4rhcRFBY"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#@title Main Process\n","import argparse\n","import gc\n","import hashlib\n","import json\n","import os\n","import shlex\n","import subprocess\n","from contextlib import suppress\n","from urllib.parse import urlparse, parse_qs\n","\n","import gradio as gr\n","import librosa\n","import numpy as np\n","import soundfile as sf\n","import sox\n","import yt_dlp\n","from pedalboard import Pedalboard, Reverb, Compressor, HighpassFilter\n","from pedalboard.io import AudioFile\n","from pydub import AudioSegment\n","\n","BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(\"/content/ERPISI/\")))\n","\n","mdxnet_models_dir = os.path.join(\"/content/ERPISI/\", 'mdxnet_models')\n","rvc_models_dir = os.path.join(\"/content/ERPISI/\", 'rvc_models')\n","output_dir = os.path.join(\"/content/ERPISI/\", 'song_output')\n","\n","\n","def get_youtube_video_id(url, ignore_playlist=True):\n","    \"\"\"\n","    Examples:\n","    http://youtu.be/SA2iWivDJiE\n","    http://www.youtube.com/watch?v=_oPAwA_Udwc&feature=feedu\n","    http://www.youtube.com/embed/SA2iWivDJiE\n","    http://www.youtube.com/v/SA2iWivDJiE?version=3&hl=en_US\n","    \"\"\"\n","    query = urlparse(url)\n","    if query.hostname == 'youtu.be':\n","        if query.path[1:] == 'watch':\n","            return query.query[2:]\n","        return query.path[1:]\n","\n","    if query.hostname in {'www.youtube.com', 'youtube.com', 'music.youtube.com'}:\n","        if not ignore_playlist:\n","            # use case: get playlist id not current video in playlist\n","            with suppress(KeyError):\n","                return parse_qs(query.query)['list'][0]\n","        if query.path == '/watch':\n","            return parse_qs(query.query)['v'][0]\n","        if query.path[:7] == '/watch/':\n","            return query.path.split('/')[1]\n","        if query.path[:7] == '/embed/':\n","            return query.path.split('/')[2]\n","        if query.path[:3] == '/v/':\n","            return query.path.split('/')[2]\n","\n","    # returns None for invalid YouTube url\n","    return None\n","\n","\n","def yt_download(link):\n","    
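# 'bestaudio' is saved under the bare title and the FFmpegExtractAudio\n","    # post-processor then transcodes it to mp3, so the file that ends up on\n","    # disk is '<title>.mp3', which prepare_filename() below reconstructs.\n","    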
ydl_opts = {\n"," 'format': 'bestaudio',\n"," 'outtmpl': '%(title)s',\n"," 'nocheckcertificate': True,\n"," 'ignoreerrors': True,\n"," 'no_warnings': True,\n"," 'quiet': True,\n"," 'extractaudio': True,\n"," 'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3'}],\n"," }\n"," with yt_dlp.YoutubeDL(ydl_opts) as ydl:\n"," result = ydl.extract_info(link, download=True)\n"," download_path = ydl.prepare_filename(result, outtmpl='%(title)s.mp3')\n","\n"," return download_path\n","\n","\n","def raise_exception(error_msg, is_webui):\n"," if is_webui:\n"," raise gr.Error(error_msg)\n"," else:\n"," raise Exception(error_msg)\n","\n","\n","def get_rvc_model(voice_model, is_webui):\n"," rvc_model_filename, rvc_index_filename = None, None\n"," model_dir = os.path.join(rvc_models_dir, voice_model)\n"," for file in os.listdir(model_dir):\n"," ext = os.path.splitext(file)[1]\n"," if ext == '.pth':\n"," rvc_model_filename = file\n"," if ext == '.index':\n"," rvc_index_filename = file\n","\n"," if rvc_model_filename is None:\n"," error_msg = f'No model file exists in {model_dir}.'\n"," raise_exception(error_msg, is_webui)\n","\n"," return os.path.join(model_dir, rvc_model_filename), os.path.join(model_dir, rvc_index_filename) if rvc_index_filename else ''\n","\n","\n","def get_audio_paths(song_dir):\n"," orig_song_path = None\n"," instrumentals_path = None\n"," main_vocals_dereverb_path = None\n"," backup_vocals_path = None\n","\n"," for file in os.listdir(song_dir):\n"," if file.endswith('_Instrumental.wav'):\n"," instrumentals_path = os.path.join(song_dir, file)\n"," orig_song_path = instrumentals_path.replace('_Instrumental', '')\n","\n"," elif file.endswith('_Vocals_Main_DeReverb.wav'):\n"," main_vocals_dereverb_path = os.path.join(song_dir, file)\n","\n"," elif file.endswith('_Vocals_Backup.wav'):\n"," backup_vocals_path = os.path.join(song_dir, file)\n","\n"," return orig_song_path, instrumentals_path, main_vocals_dereverb_path, backup_vocals_path\n","\n","\n","def convert_to_stereo(audio_path):\n"," wave, sr = librosa.load(audio_path, mono=False, sr=44100)\n","\n"," # check if mono\n"," if type(wave[0]) != np.ndarray:\n"," stereo_path = f'{os.path.splitext(audio_path)[0]}_stereo.wav'\n"," command = shlex.split(f'ffmpeg -y -loglevel error -i \"{audio_path}\" -ac 2 -f wav \"{stereo_path}\"')\n"," subprocess.run(command)\n"," return stereo_path\n"," else:\n"," return audio_path\n","\n","\n","def pitch_shift(audio_path, pitch_change):\n"," output_path = f'{os.path.splitext(audio_path)[0]}_p{pitch_change}.wav'\n"," if not os.path.exists(output_path):\n"," y, sr = sf.read(audio_path)\n"," tfm = sox.Transformer()\n"," tfm.pitch(pitch_change)\n"," y_shifted = tfm.build_array(input_array=y, sample_rate_in=sr)\n"," sf.write(output_path, y_shifted, sr)\n","\n"," return output_path\n","\n","\n","def get_hash(filepath):\n"," with open(filepath, 'rb') as f:\n"," file_hash = hashlib.blake2b()\n"," while chunk := f.read(8192):\n"," file_hash.update(chunk)\n","\n"," return file_hash.hexdigest()[:11]\n","\n","\n","def display_progress(message, percent, is_webui, progress=None):\n"," if is_webui:\n"," progress(percent, desc=message)\n"," else:\n"," print(message)\n","\n","\n","def preprocess_song(song_input, mdx_model_params, song_id, is_webui, input_type, progress=None):\n"," keep_orig = False\n"," if input_type == 'yt':\n"," display_progress('[~] Downloading song...', 0, is_webui, progress)\n"," song_link = song_input.split('&')[0]\n"," orig_song_path = yt_download(song_link)\n"," elif input_type 
== 'local':\n","        orig_song_path = song_input\n","        keep_orig = True\n","    else:\n","        orig_song_path = None\n","\n","    song_output_dir = os.path.join(output_dir, song_id)\n","    orig_song_path = convert_to_stereo(orig_song_path)\n","\n","    display_progress('[~] Separating Vocals from Instrumental...', 0.1, is_webui, progress)\n","    vocals_path, instrumentals_path = run_mdx(mdx_model_params, song_output_dir, os.path.join(mdxnet_models_dir, 'UVR-MDX-NET-Voc_FT.onnx'), orig_song_path, denoise=True, keep_orig=keep_orig)\n","\n","    display_progress('[~] Separating Main Vocals from Backup Vocals...', 0.2, is_webui, progress)\n","    backup_vocals_path, main_vocals_path = run_mdx(mdx_model_params, song_output_dir, os.path.join(mdxnet_models_dir, 'UVR_MDXNET_KARA_2.onnx'), vocals_path, suffix='Backup', invert_suffix='Main', denoise=True)\n","\n","    display_progress('[~] Applying DeReverb to Vocals...', 0.3, is_webui, progress)\n","    _, main_vocals_dereverb_path = run_mdx(mdx_model_params, song_output_dir, os.path.join(mdxnet_models_dir, 'Reverb_HQ_By_FoxJoy.onnx'), main_vocals_path, invert_suffix='DeReverb', exclude_main=True, denoise=True)\n","\n","    return orig_song_path, vocals_path, instrumentals_path, main_vocals_path, backup_vocals_path, main_vocals_dereverb_path\n","\n","\n","def voice_change(voice_model, vocals_path, output_path, pitch_change, f0_method, index_rate, filter_radius, rms_mix_rate, protect, crepe_hop_length, is_webui):\n","    rvc_model_path, rvc_index_path = get_rvc_model(voice_model, is_webui)\n","    device = 'cuda:0'\n","    config = Config(device, True)\n","    hubert_model = load_hubert(device, config.is_half, os.path.join(rvc_models_dir, 'hubert_base.pt'))\n","    cpt, version, net_g, tgt_sr, vc = get_vc(device, config.is_half, config, rvc_model_path)\n","\n","    # convert main vocals\n","    rvc_infer(rvc_index_path, index_rate, vocals_path, output_path, pitch_change, f0_method, cpt, version, net_g, filter_radius, tgt_sr, rms_mix_rate, protect, crepe_hop_length, vc, hubert_model)\n","    del hubert_model, cpt\n","    gc.collect()\n","\n","\n","def add_audio_effects(audio_path, reverb_rm_size, reverb_wet, reverb_dry, reverb_damping):\n","    output_path = f'{os.path.splitext(audio_path)[0]}_mixed.wav'\n","\n","    # Initialize audio effects plugins\n","    board = Pedalboard(\n","        [\n","            HighpassFilter(),\n","            Compressor(ratio=4, threshold_db=-15),\n","            Reverb(room_size=reverb_rm_size, dry_level=reverb_dry, wet_level=reverb_wet, damping=reverb_damping)\n","        ]\n","    )\n","\n","    with AudioFile(audio_path) as f:\n","        with AudioFile(output_path, 'w', f.samplerate, f.num_channels) as o:\n","            # Read one second of audio at a time, until the file is empty:\n","            while f.tell() < f.frames:\n","                chunk = f.read(int(f.samplerate))\n","                effected = board(chunk, f.samplerate, reset=False)\n","                o.write(effected)\n","\n","    return output_path\n","\n","\n","def combine_audio(audio_paths, output_path, main_gain, backup_gain, inst_gain, output_format):\n","    main_vocal_audio = AudioSegment.from_wav(audio_paths[0]) - 4 + main_gain\n","    backup_vocal_audio = AudioSegment.from_wav(audio_paths[1]) - 6 + backup_gain\n","    instrumental_audio = AudioSegment.from_wav(audio_paths[2]) - 7 + inst_gain\n","    main_vocal_audio.overlay(backup_vocal_audio).overlay(instrumental_audio).export(output_path, format=output_format)\n","\n","\n","def song_cover_pipeline(song_input, voice_model, pitch_change, keep_files,\n","                        is_webui=0, main_gain=0, backup_gain=0, inst_gain=0, index_rate=0.5, filter_radius=3,\n","                        rms_mix_rate=0.25, 
f0_method='rmvpe', crepe_hop_length=128, protect=0.33, pitch_change_all=0,\n","                        reverb_rm_size=0.15, reverb_wet=0.2, reverb_dry=0.8, reverb_damping=0.7, output_format='mp3',\n","                        progress=gr.Progress()):\n","    try:\n","        if not song_input or not voice_model:\n","            raise_exception('Ensure that the song input and voice model fields are filled.', is_webui)\n","\n","        display_progress('[~] Starting AI Cover Generation Pipeline...', 0, is_webui, progress)\n","\n","        with open(os.path.join(mdxnet_models_dir, 'model_data.json')) as infile:\n","            mdx_model_params = json.load(infile)\n","\n","        # YouTube URL\n","        if urlparse(song_input).scheme == 'https':\n","            input_type = 'yt'\n","            song_id = get_youtube_video_id(song_input)\n","            if song_id is None:\n","                error_msg = 'Invalid YouTube url.'\n","                raise_exception(error_msg, is_webui)\n","\n","        # local audio file\n","        else:\n","            input_type = 'local'\n","            song_input = song_input.strip('\\\"')\n","            if os.path.exists(song_input):\n","                song_id = get_hash(song_input)\n","            else:\n","                error_msg = f'{song_input} does not exist.'\n","                song_id = None\n","                raise_exception(error_msg, is_webui)\n","\n","        song_dir = os.path.join(output_dir, song_id)\n","\n","        if not os.path.exists(song_dir):\n","            os.makedirs(song_dir)\n","            orig_song_path, vocals_path, instrumentals_path, main_vocals_path, backup_vocals_path, main_vocals_dereverb_path = preprocess_song(song_input, mdx_model_params, song_id, is_webui, input_type, progress)\n","\n","        else:\n","            vocals_path, main_vocals_path = None, None\n","            paths = get_audio_paths(song_dir)\n","\n","            # rerun preprocessing if any intermediate file is missing, or if intermediate files should be kept\n","            if any(path is None for path in paths) or keep_files:\n","                orig_song_path, vocals_path, instrumentals_path, main_vocals_path, backup_vocals_path, main_vocals_dereverb_path = preprocess_song(song_input, mdx_model_params, song_id, is_webui, input_type, progress)\n","            else:\n","                orig_song_path, instrumentals_path, main_vocals_dereverb_path, backup_vocals_path = paths\n","\n","        # the vocals pitch change is given in octaves; convert to semitones and add the global shift\n","        pitch_change = pitch_change * 12 + pitch_change_all\n","        ai_vocals_path = os.path.join(song_dir, f'{os.path.splitext(os.path.basename(orig_song_path))[0]}_{voice_model}_p{pitch_change}_i{index_rate}_fr{filter_radius}_rms{rms_mix_rate}_pro{protect}_{f0_method}{\"\" if f0_method != \"mangio-crepe\" else f\"_{crepe_hop_length}\"}.wav')\n","        ai_cover_path = os.path.join(song_dir, f'{os.path.splitext(os.path.basename(orig_song_path))[0]} ({voice_model} Ver).{output_format}')\n","\n","        if not os.path.exists(ai_vocals_path):\n","            display_progress('[~] Converting voice using RVC...', 0.5, is_webui, progress)\n","            voice_change(voice_model, main_vocals_dereverb_path, ai_vocals_path, pitch_change, f0_method, index_rate, filter_radius, rms_mix_rate, protect, crepe_hop_length, is_webui)\n","\n","        display_progress('[~] Applying audio effects to Vocals...', 0.8, is_webui, progress)\n","        ai_vocals_mixed_path = add_audio_effects(ai_vocals_path, reverb_rm_size, reverb_wet, reverb_dry, reverb_damping)\n","\n","        if pitch_change_all != 0:\n","            display_progress('[~] Applying overall pitch change', 0.85, is_webui, progress)\n","            instrumentals_path = pitch_shift(instrumentals_path, pitch_change_all)\n","            backup_vocals_path = pitch_shift(backup_vocals_path, pitch_change_all)\n","\n","        display_progress('[~] Combining AI Vocals and Instrumentals...', 0.9, is_webui, progress)\n","        combine_audio([ai_vocals_mixed_path, backup_vocals_path, instrumentals_path], ai_cover_path, main_gain, backup_gain, 
inst_gain, output_format)\n","\n"," if not keep_files:\n"," display_progress('[~] Removing intermediate audio files...', 0.95, is_webui, progress)\n"," intermediate_files = [vocals_path, main_vocals_path, ai_vocals_mixed_path]\n"," if pitch_change_all != 0:\n"," intermediate_files += [instrumentals_path, backup_vocals_path]\n"," for file in intermediate_files:\n"," if file and os.path.exists(file):\n"," os.remove(file)\n","\n"," return ai_cover_path\n","\n"," except Exception as e:\n"," raise_exception(str(e), is_webui)"],"metadata":{"id":"I0OqBVgHUsSe"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# @title Model Download Function\n","\n","import os\n","import zipfile\n","import shutil\n","import urllib.request\n","\n","BASE_DIR = \"/content/ERPISI\"\n","rvc_models_dir = os.path.join(BASE_DIR, 'rvc_models')\n","\n","def extract_zip(extraction_folder, zip_name):\n"," os.makedirs(extraction_folder)\n"," with zipfile.ZipFile(zip_name, 'r') as zip_ref:\n"," zip_ref.extractall(extraction_folder)\n"," os.remove(zip_name)\n","\n"," # locate the model files, skipping small auxiliary files (.index under 100 KB, .pth under 40 MB)\n"," index_filepath, model_filepath = None, None\n"," for root, dirs, files in os.walk(extraction_folder):\n"," for name in files:\n"," if name.endswith('.index') and os.stat(os.path.join(root, name)).st_size > 1024 * 100:\n"," index_filepath = os.path.join(root, name)\n","\n"," if name.endswith('.pth') and os.stat(os.path.join(root, name)).st_size > 1024 * 1024 * 40:\n"," model_filepath = os.path.join(root, name)\n","\n"," if not model_filepath:\n"," raise Exception(f'No .pth model file was found in the extracted zip. Please check {extraction_folder}.')\n","\n"," # move model and index file to extraction folder\n"," os.rename(model_filepath, os.path.join(extraction_folder, os.path.basename(model_filepath)))\n"," if index_filepath:\n"," os.rename(index_filepath, os.path.join(extraction_folder, os.path.basename(index_filepath)))\n","\n"," # remove any unnecessary nested folders\n"," for filepath in os.listdir(extraction_folder):\n"," if os.path.isdir(os.path.join(extraction_folder, filepath)):\n"," shutil.rmtree(os.path.join(extraction_folder, filepath))\n","\n","def download_online_model(url, dir_name):\n"," try:\n"," print(f'[~] Downloading voice model with name {dir_name}...')\n"," zip_name = url.split('/')[-1]\n"," extraction_folder = os.path.join(rvc_models_dir, dir_name)\n"," if os.path.exists(extraction_folder):\n"," raise Exception(f'Voice model directory {dir_name} already exists! 
Choose a different name for your voice model.')\n","\n"," if 'pixeldrain.com' in url:\n"," url = f'https://pixeldrain.com/api/file/{zip_name}'\n","\n"," urllib.request.urlretrieve(url, zip_name)\n","\n"," print('[~] Extracting zip...')\n"," extract_zip(extraction_folder, zip_name)\n"," print(f'[+] {dir_name} Model successfully downloaded!')\n","\n"," except Exception as e:\n"," raise Exception(str(e))\n","\n","url = \"https://pixeldrain.com/u/3tJmABXA\" # @param {type:\"string\"}\n","dir_name = \"Gura\" # @param {type:\"string\"}\n","\n","download_online_model(url, dir_name)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"cellView":"form","id":"Ds6T-Gb0aY5S","executionInfo":{"status":"ok","timestamp":1695448739850,"user_tz":-420,"elapsed":23670,"user":{"displayName":"ana taqa126","userId":"16118498281411814981"}},"outputId":"a921ccf7-72fc-4573-8165-ea5c5b663d8c"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["[~] Downloading voice model with name Gura...\n","[~] Extracting zip...\n","[+] Gura Model successfully downloaded!\n"]}]},{"cell_type":"code","source":["# @title Generate | Output is saved inside \"ERPISI/song_output/<song_id>\"\n","# @markdown Main Options | You can also enter a local audio file path in \"SONG_INPUT\"\n","\n","SONG_INPUT = \"/content/drive/MyDrive/audio/BangDream/vocals.wav\" # @param {type:\"string\"}\n","RVC_DIRNAME = \"Gura\" # @param {type:\"string\"}\n","PITCH_CHANGE = 0 # @param {type:\"integer\"}\n","PITCH_CHANGE_ALL = 0 # @param {type:\"integer\"}\n","# @markdown Voice Conversion Options\n","INDEX_RATE = 0.5 # @param {type:\"number\"}\n","FILTER_RADIUS = 3 # @param {type:\"integer\"}\n","# rmvpe is not supported in this notebook; use crepe or mangio-crepe\n","PITCH_DETECTION_ALGO = \"crepe\" # @param [\"rmvpe\", \"mangio-crepe\", \"crepe\"]\n","CREPE_HOP_LENGTH = 128 # @param {type:\"integer\"}\n","PROTECT = 0.33 # @param {type:\"number\"}\n","RMS_MIX_RATE = 0.25 # @param {type:\"number\"}\n","# @markdown Audio Mixing Options\n","MAIN_VOL = 0 # @param {type:\"integer\"}\n","BACKUP_VOL = 0 # @param {type:\"integer\"}\n","INST_VOL = 0 # @param {type:\"integer\"}\n","# @markdown Reverb Control\n","REVERB_SIZE = 0.15 # @param {type:\"number\"}\n","REVERB_WETNESS = 0.2 # @param {type:\"number\"}\n","REVERB_DRYNESS = 0.8 # @param {type:\"number\"}\n","REVERB_DAMPING = 0.7 # @param {type:\"number\"}\n","# @markdown Output Format\n","OUTPUT_FORMAT = \"mp3\" # @param [\"mp3\", \"wav\"]\n","\n"],"metadata":{"id":"WOmLmrMoVvF2"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["rvc_dirname = RVC_DIRNAME\n","if not os.path.exists(os.path.join(rvc_models_dir, rvc_dirname)):\n"," raise Exception(f'The folder {os.path.join(rvc_models_dir, rvc_dirname)} does not exist.')\n","\n","# keep_files=False removes the intermediate audio files once the cover is generated\n","cover_path = song_cover_pipeline(SONG_INPUT, rvc_dirname, PITCH_CHANGE, False,\n"," main_gain=MAIN_VOL, backup_gain=BACKUP_VOL, inst_gain=INST_VOL,\n"," index_rate=INDEX_RATE, filter_radius=FILTER_RADIUS,\n"," rms_mix_rate=RMS_MIX_RATE, f0_method=PITCH_DETECTION_ALGO,\n"," crepe_hop_length=CREPE_HOP_LENGTH, protect=PROTECT,\n"," pitch_change_all=PITCH_CHANGE_ALL,\n"," reverb_rm_size=REVERB_SIZE, reverb_wet=REVERB_WETNESS,\n"," reverb_dry=REVERB_DRYNESS, reverb_damping=REVERB_DAMPING,\n"," output_format=OUTPUT_FORMAT)\n","\n","print(f'[+] Cover generated at {cover_path}')\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"648jWLs5X4py","executionInfo":{"status":"ok","timestamp":1695450455601,"user_tz":-420,"elapsed":108548,"user":{"displayName":"ana 
taqa126","userId":"16118498281411814981"}},"outputId":"cdf5b19a-e52c-446a-f6ba-24d54f31e8db"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["[~] Starting AI Cover Generation Pipeline...\n","[~] Separating Vocals from Instrumental...\n"]},{"output_type":"stream","name":"stderr","text":["100%|██████████| 30/30 [00:08<00:00, 3.50it/s]\n","100%|██████████| 30/30 [00:10<00:00, 2.94it/s]\n"]},{"output_type":"stream","name":"stdout","text":["[~] Separating Main Vocals from Backup Vocals...\n"]},{"output_type":"stream","name":"stderr","text":["100%|██████████| 30/30 [00:08<00:00, 3.64it/s]\n","100%|██████████| 30/30 [00:06<00:00, 4.58it/s]\n"]},{"output_type":"stream","name":"stdout","text":["[~] Applying DeReverb to Vocals...\n"]},{"output_type":"stream","name":"stderr","text":["100%|██████████| 16/16 [00:09<00:00, 1.72it/s]\n","100%|██████████| 16/16 [00:10<00:00, 1.50it/s]\n"]},{"output_type":"stream","name":"stdout","text":["[~] Converting voice using RVC...\n","gin_channels: 256 self.spk_embed_dim: 109\n","\n","Error Disini ?\n","[~] Applying audio effects to Vocals...\n","[~] Combining AI Vocals and Instrumentals...\n","[~] Removing intermediate audio files...\n","[+] Cover generated at /content/ERPISI/song_output/c2714dcdc7c/vocals (Gura Ver).mp3\n"]}]}]} \ No newline at end of file diff --git a/README.md b/README.md index 089f96b..ce5d93c 100644 --- a/README.md +++ b/README.md @@ -29,5 +29,8 @@ All in One Repository: Youtube WAV Download, Separating Vocal, Splitting Audio, - [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ardha27/AICoverGen-NoUI-Colab/blob/main/CoverGen_No_UI.ipynb) (Without UI/Gradio, Prevent Banning) - [![Open In Kaggle](https://img.shields.io/badge/-Open%20in%20Kaggle-blue?style=flat&logo=kaggle&logoColor=white&labelColor=grey)](https://www.kaggle.com/code/ardhasemaranatha/aicovergen-kaggle) +### Inference AICoverGen Interactive Notebook +- [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ardha27/AICoverGen-NoUI-Colab/blob/main/ERPISI_EXPERIMENT.ipynb) (Without UI/Gradio, Prevent Banning, rmvpe not supported) + ### Training V2 and Youtube Audio Download & Splitting Audio all combined (Stored in GDrive) - [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/MinatoIsuki/AI-Song-Cover-RVC/blob/main/Training_V2_and_Youtube_Audio_Download_%26_Splitting_Audio_combined.ipynb)