feat(preprocessing): add 2 more preprocessing commands (#123)
34j authored Mar 26, 2023
1 parent 6a7e8ba commit 45eba0f
Showing 4 changed files with 279 additions and 3 deletions.
README.md (18 additions, 3 deletions)
@@ -59,10 +59,15 @@ A fork of [`so-vits-svc`](https://github.com/svc-develop-team/so-vits-svc) with
Install this via pip (or your favourite package manager that uses pip):

```shell
python -m pip install -U pip setuptools wheel
pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu117
pip install -U so-vits-svc-fork
```

- If you are using an AMD GPU on Linux, replace `--index-url https://download.pytorch.org/whl/cu117` with `--index-url https://download.pytorch.org/whl/rocm5.4.2` (see the assembled example after this list).
- If no GPU is available, simply omit the `pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu117` line.
- If `fairseq` raises an error that [`Microsoft C++ Build Tools`](https://visualstudio.microsoft.com/visual-cpp-build-tools/) is not installed or that some DLL is missing, please (re)install the Build Tools.
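
For instance, the AMD/ROCm variant from the first note, assembled into a full install sequence (a sketch; only the index URL differs from the CUDA instructions above):

```shell
python -m pip install -U pip setuptools wheel
pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/rocm5.4.2
pip install -U so-vits-svc-fork
```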

### Update

Please update this package regularly to get the latest features and bug fixes.
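
Updating uses the same one-liner as installation:

```shell
pip install -U so-vits-svc-fork
```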
@@ -108,6 +113,14 @@ svc --model-path <model-path> source.wav

### Training

#### Before training

- If your dataset has BGM, please remove the BGM using software such as [Ultimate Vocal Remover](https://ultimatevocalremover.com/). `3_HP-Vocal-UVR.pth` or `UVR-MDX-NET Main` is recommended. [^1]
- If your dataset is a long audio file with multiple speakers, use `svc pre-sd` to split it into one file per speaker segment (using `pyannote.audio`). Further manual classification may be necessary due to accuracy issues. If the speakers use a wide variety of speech styles, set `--min-speakers` larger than the actual number of speakers. Due to unresolved dependencies, please install `pyannote.audio` manually: `pip install pyannote-audio`.
- If your dataset is a long audio file with a single speaker, use `svc pre-split` to split it into multiple files (using `librosa`). A usage sketch for both commands follows the footnote below.

[^1]: https://ytpmv.info/how-to-use-uvr/
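
As referenced above, a minimal sketch of the two commands, assuming the default directory layout from this commit's CLI definitions (`dataset_raw_raw/` in, `dataset_raw/` out); adjust the paths to your data:

```shell
# Multi-speaker source: diarize into per-speaker segments (prompts for a HuggingFace token)
svc pre-sd -i dataset_raw_raw/ -o dataset_raw/

# Single-speaker source: split on silence
svc pre-split -i dataset_raw_raw/ -o dataset_raw/
```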

#### Google Colab

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/34j/so-vits-svc-fork/blob/main/notebooks/so-vits-svc-fork-4.0.ipynb)
Expand All @@ -119,14 +132,14 @@ Place your dataset like `dataset_raw/{speaker_id}/**/{wav_file}.{any_format}` (s
```shell
svc pre-resample
svc pre-config
svc pre-hubert -fm dio
svc pre-hubert
svc train
```

#### Notes

- Dataset audio duration per file should be at most around 10 s, or VRAM will run out.
- To change the f0 inference method to CREPE, replace `svc pre-hubert -fm dio` with `svc pre-hubert -fm crepe`. You may need to reduce `--n-jobs` due to performance issues.
- To change the f0 inference method to CREPE, replace `svc pre-hubert` with `svc pre-hubert -fm crepe`. You may need to reduce `--n-jobs` due to performance issues.
- It is recommended to change `batch_size` in `config.json` before running the `train` command to match your VRAM capacity. As tested, the default setting requires about 14 GB. A sketch of one way to do this follows this list.
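
One way to lower it from the command line (a sketch assuming GNU `sed`, the `configs/44k/config.json` path shown in the help text below, and the upstream `"batch_size"` key; the value 8 is illustrative):

```shell
# Cap batch_size at 8 in the generated config, then train
sed -i -E 's/"batch_size": [0-9]+/"batch_size": 8/' configs/44k/config.json
svc train
```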

### Further help
Expand All @@ -139,7 +152,7 @@ Usage: svc [OPTIONS] COMMAND [ARGS]...

so-vits-svc allows any folder structure for training data.
However, the following folder structure is recommended.
When training: dataset_raw/{speaker_name}/{wav_name}.wav
When training: dataset_raw/{speaker_name}/**/{wav_name}.{any_format}
When inference: configs/44k/config.json, logs/44k/G_XXXX.pth
If the folder structure is followed, you DO NOT NEED TO SPECIFY model path, config path, etc.
(The latest model will be automatically loaded.)
Expand All @@ -156,6 +169,8 @@ Commands:
pre-config Preprocessing part 2: config
pre-hubert Preprocessing part 3: hubert If the HuBERT model is not found, it will be...
pre-resample Preprocessing part 1: resample
pre-sd Speech diarization using pyannote.audio
pre-split Split audio files into multiple files
train Train model If D_0.pth or G_0.pth not found, automatically download from hub.
train-cluster Train k-means clustering
vc Realtime inference from microphone
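
As an illustration of the two new commands, with flags taken from the options defined in `__main__.py` below (the token value and paths are placeholders):

```shell
# pre-sd needs a HuggingFace token because the pretrained pyannote pipeline is gated;
# it can also be read from the HUGGINGFACE_TOKEN environment variable or entered at a prompt
export HUGGINGFACE_TOKEN=hf_xxxxxxxx
svc pre-sd -i dataset_raw_raw/ -o dataset_raw/ --min-speakers 2 --max-speakers 2

# pre-split's thresholds are passed through to librosa.effects.split
svc pre-split -i dataset_raw_raw/ -o dataset_raw/ --top-db 30 --frame-seconds 1 --hop-seconds 0.3
```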
src/so_vits_svc_fork/__main__.py (109 additions, 0 deletions)
@@ -530,6 +530,115 @@ def pre_hubert(
    )


@cli.command()
@click.option(
    "-i",
    "--input-dir",
    type=click.Path(exists=True),
    default=Path("./dataset_raw_raw/"),
    help="path to source dir",
)
@click.option(
    "-o",
    "--output-dir",
    type=click.Path(),
    default=Path("./dataset_raw/"),
    help="path to output dir",
)
@click.option(
    "-n",
    "--n-jobs",
    type=int,
    default=-1,
    help="number of jobs (optimal value may depend on your VRAM capacity and audio duration per file)",
)
@click.option("-min", "--min-speakers", type=int, default=2, help="min speakers")
@click.option("-max", "--max-speakers", type=int, default=2, help="max speakers")
@click.option(
    "-t", "--huggingface-token", type=str, default=None, help="huggingface token"
)
def pre_sd(
    input_dir: Path | str,
    output_dir: Path | str,
    min_speakers: int,
    max_speakers: int,
    huggingface_token: str | None,
    n_jobs: int,
):
    """Speech diarization using pyannote.audio"""
    # Resolve the token: CLI option, then environment variable, then interactive prompt.
    if huggingface_token is None:
        huggingface_token = os.environ.get("HUGGINGFACE_TOKEN", None)
    if huggingface_token is None:
        huggingface_token = click.prompt(
            "Please enter your HuggingFace token", hide_input=True
        )
        if os.environ.get("HUGGINGFACE_TOKEN", None) is None:
            LOG.info("You can also set the HUGGINGFACE_TOKEN environment variable.")
    assert huggingface_token is not None
    huggingface_token = huggingface_token.rstrip(" \n\r\t\0")
    if len(huggingface_token) <= 1:
        raise ValueError("HuggingFace token is empty: " + huggingface_token)

    if max_speakers == 1:
        LOG.warning("Consider using pre-split if max_speakers == 1")
    # Imported lazily so the CLI stays usable without pyannote.audio installed.
    from .preprocess_speaker_diarization import preprocess_speaker_diarization

    preprocess_speaker_diarization(
        input_dir=input_dir,
        output_dir=output_dir,
        min_speakers=min_speakers,
        max_speakers=max_speakers,
        huggingface_token=huggingface_token,
        n_jobs=n_jobs,
    )


@cli.command()
@click.option(
    "-i",
    "--input-dir",
    type=click.Path(exists=True),
    default=Path("./dataset_raw_raw/"),
    help="path to source dir",
)
@click.option(
    "-o",
    "--output-dir",
    type=click.Path(),
    default=Path("./dataset_raw/"),
    help="path to output dir",
)
@click.option(
    "-n",
    "--n-jobs",
    type=int,
    default=-1,
    help="number of jobs (optimal value may depend on your RAM capacity and audio duration per file)",
)
@click.option("-d", "--top-db", type=float, default=30, help="top db")
@click.option("-f", "--frame-seconds", type=float, default=1, help="frame seconds")
@click.option("-h", "--hop-seconds", type=float, default=0.3, help="hop seconds")
def pre_split(
    input_dir: Path | str,
    output_dir: Path | str,
    top_db: int,
    frame_seconds: float,
    hop_seconds: float,
    n_jobs: int,
):
    """Split audio files into multiple files"""
    from .preprocess_split import preprocess_split

    preprocess_split(
        input_dir=input_dir,
        output_dir=output_dir,
        top_db=top_db,
        frame_seconds=frame_seconds,
        hop_seconds=hop_seconds,
        n_jobs=n_jobs,
    )


@cli.command
def clean():
    """Clean up files, only useful if you are using the default file structure"""
src/so_vits_svc_fork/preprocess_speaker_diarization.py (87 additions, 0 deletions)
@@ -0,0 +1,87 @@
from __future__ import annotations

from collections import defaultdict
from logging import getLogger
from pathlib import Path

import soundfile as sf
import torch
from joblib import Parallel, delayed
from pyannote.audio import Pipeline
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib

LOG = getLogger(__name__)


def _process_one(
    input_path: Path,
    output_dir: Path,
    *,
    min_speakers: int = 1,
    max_speakers: int = 1,
    huggingface_token: str | None = None,
) -> None:
    try:
        audio, sr = sf.read(input_path)
    except Exception as e:
        LOG.warning(f"Failed to read {input_path}: {e}")
        return
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization", use_auth_token=huggingface_token
    )
    if pipeline is None:
        raise ValueError("Failed to load pipeline")

    LOG.info(f"Processing {input_path}. This may take a while...")
    diarization = pipeline(
        input_path, min_speakers=min_speakers, max_speakers=max_speakers
    )

    LOG.info(f"Found {len(diarization)} tracks, writing to {output_dir}")
    speaker_count = defaultdict(int)

    output_dir.mkdir(parents=True, exist_ok=True)
    for segment, track, speaker in tqdm(
        list(diarization.itertracks(yield_label=True)), desc=f"Writing {input_path}"
    ):
        if segment.end - segment.start < 1:
            # Skip segments shorter than one second.
            continue
        speaker_count[speaker] += 1
        audio_cut = audio[int(segment.start * sr) : int(segment.end * sr)]
        sf.write(
            (output_dir / f"{speaker}_{speaker_count[speaker]}.wav"),
            audio_cut,
            sr,
        )

    LOG.info(f"Speaker count: {speaker_count}")


def preprocess_speaker_diarization(
    input_dir: Path | str,
    output_dir: Path | str,
    *,
    min_speakers: int = 1,
    max_speakers: int = 1,
    huggingface_token: str | None = None,
    n_jobs: int = -1,
) -> None:
    if huggingface_token is not None and not huggingface_token.startswith("hf_"):
        LOG.warning("Huggingface token probably should start with hf_")
    if not torch.cuda.is_available():
        LOG.warning("CUDA is not available. This will be extremely slow.")
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    input_dir.mkdir(parents=True, exist_ok=True)
    output_dir.mkdir(parents=True, exist_ok=True)
    input_paths = list(input_dir.rglob("*.*"))
    with tqdm_joblib(desc="Preprocessing speaker diarization", total=len(input_paths)):
        Parallel(n_jobs=n_jobs)(
            delayed(_process_one)(
                input_path,
                output_dir / input_path.relative_to(input_dir).parent / input_path.stem,
                max_speakers=max_speakers,
                min_speakers=min_speakers,
                huggingface_token=huggingface_token,
            )
            for input_path in input_paths
        )
src/so_vits_svc_fork/preprocess_split.py (65 additions, 0 deletions)
@@ -0,0 +1,65 @@
from logging import getLogger
from pathlib import Path

import librosa
import soundfile as sf
from joblib import Parallel, delayed
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib

LOG = getLogger(__name__)


def _process_one(
input_path: Path,
output_dir: Path,
*,
top_db: int = 30,
frame_seconds: float = 0.5,
hop_seconds: float = 0.1,
):
try:
audio, sr = librosa.load(input_path)
except Exception as e:
LOG.warning(f"Failed to read {input_path}: {e}")
return
intervals = librosa.effects.split(
audio,
top_db=top_db,
frame_length=int(sr * frame_seconds),
hop_length=int(sr * hop_seconds),
)
output_dir.mkdir(parents=True, exist_ok=True)
for start, end in tqdm(intervals, desc=f"Writing {input_path}"):
audio_cut = audio[start:end]
sf.write(
(output_dir / f"{input_path.stem}_{start / sr:.3f}_{end / sr:.3f}.wav"),
audio_cut,
sr,
)


def preprocess_split(
input_dir: Path | str,
output_dir: Path | str,
*,
top_db: int = 30,
frame_seconds: float = 0.5,
hop_seconds: float = 0.1,
n_jobs: int = -1,
):
input_dir = Path(input_dir)
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
input_paths = list(input_dir.rglob("*.*"))
with tqdm_joblib(desc="Splitting", total=len(input_paths)):
Parallel(n_jobs=n_jobs)(
delayed(_process_one)(
input_path,
output_dir / input_path.relative_to(input_dir).parent,
top_db=top_db,
frame_seconds=frame_seconds,
hop_seconds=hop_seconds,
)
for input_path in input_paths
)
