feat(preprocessing): add 2 more preprocessing commands (#123)
34j authored Mar 26, 2023
1 parent 6a7e8ba commit 45eba0f
Showing 4 changed files with 279 additions and 3 deletions.
README.md (18 additions, 3 deletions)
@@ -59,10 +59,15 @@ A fork of [`so-vits-svc`](https://github.com/svc-develop-team/so-vits-svc) with
Install this via pip (or your favourite package manager that uses pip):

```shell
python -m pip install -U pip setuptools wheel
pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu117
pip install -U so-vits-svc-fork
```

- If you are using an AMD GPU on Linux, replace `--index-url https://download.pytorch.org/whl/cu117` with `--index-url https://download.pytorch.org/whl/rocm5.4.2` (see the assembled example after this list).
- If no GPU is available, simply omit the `pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu117` line.
- If `fairseq` raises an error that [`Microsoft C++ Build Tools`](https://visualstudio.microsoft.com/visual-cpp-build-tools/) is not installed or that some DLL is missing, please (re)install the Build Tools.
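
For instance, the AMD/ROCm variant from the first note, assembled into a full install sequence (a sketch; only the index URL differs from the CUDA instructions above):

```shell
python -m pip install -U pip setuptools wheel
pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/rocm5.4.2
pip install -U so-vits-svc-fork
```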

### Update

Please update this package regularly to get the latest features and bug fixes.
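
Updating uses the same one-liner as installation:

```shell
pip install -U so-vits-svc-fork
```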
@@ -108,6 +113,14 @@ svc --model-path <model-path> source.wav

### Training

#### Before training

- If your dataset has BGM, please remove the BGM using software such as [Ultimate Vocal Remover](https://ultimatevocalremover.com/). `3_HP-Vocal-UVR.pth` or `UVR-MDX-NET Main` is recommended. [^1]
- If your dataset is a long audio file with multiple speakers, use `svc pre-sd` to split it into one file per speaker segment (using `pyannote.audio`). Further manual classification may be necessary due to accuracy issues. If the speakers use a wide variety of speech styles, set `--min-speakers` larger than the actual number of speakers. Due to unresolved dependencies, please install `pyannote.audio` manually: `pip install pyannote-audio`.
- If your dataset is a long audio file with a single speaker, use `svc pre-split` to split it into multiple files (using `librosa`). A usage sketch for both commands follows the footnote below.

[^1]: https://ytpmv.info/how-to-use-uvr/
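
As referenced above, a minimal sketch of the two commands, assuming the default directory layout from this commit's CLI definitions (`dataset_raw_raw/` in, `dataset_raw/` out); adjust the paths to your data:

```shell
# Multi-speaker source: diarize into per-speaker segments (prompts for a HuggingFace token)
svc pre-sd -i dataset_raw_raw/ -o dataset_raw/

# Single-speaker source: split on silence
svc pre-split -i dataset_raw_raw/ -o dataset_raw/
```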

#### Google Colab

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/34j/so-vits-svc-fork/blob/main/notebooks/so-vits-svc-fork-4.0.ipynb)
Expand All @@ -119,14 +132,14 @@ Place your dataset like `dataset_raw/{speaker_id}/**/{wav_file}.{any_format}` (s
```shell
svc pre-resample
svc pre-config
svc pre-hubert -fm dio
svc pre-hubert
svc train
```

#### Notes

- Dataset audio duration per file should be at most around 10 s, or VRAM will run out.
- To change the f0 inference method to CREPE, replace `svc pre-hubert -fm dio` with `svc pre-hubert -fm crepe`. You may need to reduce `--n-jobs` due to performance issues.
- To change the f0 inference method to CREPE, replace `svc pre-hubert` with `svc pre-hubert -fm crepe`. You may need to reduce `--n-jobs` due to performance issues.
- It is recommended to change `batch_size` in `config.json` before running the `train` command to match your VRAM capacity. As tested, the default setting requires about 14 GB. A sketch of one way to do this follows this list.
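
One way to lower it from the command line (a sketch assuming GNU `sed`, the `configs/44k/config.json` path shown in the help text below, and the upstream `"batch_size"` key; the value 8 is illustrative):

```shell
# Cap batch_size at 8 in the generated config, then train
sed -i -E 's/"batch_size": [0-9]+/"batch_size": 8/' configs/44k/config.json
svc train
```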

### Further help
Expand All @@ -139,7 +152,7 @@ Usage: svc [OPTIONS] COMMAND [ARGS]...

so-vits-svc allows any folder structure for training data.
However, the following folder structure is recommended.
When training: dataset_raw/{speaker_name}/{wav_name}.wav
When training: dataset_raw/{speaker_name}/**/{wav_name}.{any_format}
When inference: configs/44k/config.json, logs/44k/G_XXXX.pth
If the folder structure is followed, you DO NOT NEED TO SPECIFY model path, config path, etc.
(The latest model will be automatically loaded.)
Expand All @@ -156,6 +169,8 @@ Commands:
pre-config Preprocessing part 2: config
pre-hubert Preprocessing part 3: hubert If the HuBERT model is not found, it will be...
pre-resample Preprocessing part 1: resample
pre-sd Speech diarization using pyannote.audio
pre-split Split audio files into multiple files
train Train model If D_0.pth or G_0.pth not found, automatically download from hub.
train-cluster Train k-means clustering
vc Realtime inference from microphone
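
As an illustration of the two new commands, with flags taken from the options defined in `__main__.py` below (the token value and paths are placeholders):

```shell
# pre-sd needs a HuggingFace token because the pretrained pyannote pipeline is gated;
# it can also be read from the HUGGINGFACE_TOKEN environment variable or entered at a prompt
export HUGGINGFACE_TOKEN=hf_xxxxxxxx
svc pre-sd -i dataset_raw_raw/ -o dataset_raw/ --min-speakers 2 --max-speakers 2

# pre-split's thresholds are passed through to librosa.effects.split
svc pre-split -i dataset_raw_raw/ -o dataset_raw/ --top-db 30 --frame-seconds 1 --hop-seconds 0.3
```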
src/so_vits_svc_fork/__main__.py (109 additions, 0 deletions)
@@ -530,6 +530,115 @@ def pre_hubert(
    )


@cli.command()
@click.option(
    "-i",
    "--input-dir",
    type=click.Path(exists=True),
    default=Path("./dataset_raw_raw/"),
    help="path to source dir",
)
@click.option(
    "-o",
    "--output-dir",
    type=click.Path(),
    default=Path("./dataset_raw/"),
    help="path to output dir",
)
@click.option(
    "-n",
    "--n-jobs",
    type=int,
    default=-1,
    help="number of jobs (optimal value may depend on your VRAM capacity and audio duration per file)",
)
@click.option("-min", "--min-speakers", type=int, default=2, help="min speakers")
@click.option("-max", "--max-speakers", type=int, default=2, help="max speakers")
@click.option(
    "-t", "--huggingface-token", type=str, default=None, help="huggingface token"
)
def pre_sd(
    input_dir: Path | str,
    output_dir: Path | str,
    min_speakers: int,
    max_speakers: int,
    huggingface_token: str | None,
    n_jobs: int,
):
    """Speech diarization using pyannote.audio"""
    # Resolve the token: CLI option, then environment variable, then interactive prompt.
    if huggingface_token is None:
        huggingface_token = os.environ.get("HUGGINGFACE_TOKEN", None)
    if huggingface_token is None:
        huggingface_token = click.prompt(
            "Please enter your HuggingFace token", hide_input=True
        )
        if os.environ.get("HUGGINGFACE_TOKEN", None) is None:
            LOG.info("You can also set the HUGGINGFACE_TOKEN environment variable.")
    assert huggingface_token is not None
    huggingface_token = huggingface_token.rstrip(" \n\r\t\0")
    if len(huggingface_token) <= 1:
        raise ValueError("HuggingFace token is empty: " + huggingface_token)

    if max_speakers == 1:
        LOG.warning("Consider using pre-split if max_speakers == 1")
    # Imported lazily so the CLI stays usable without pyannote.audio installed.
    from .preprocess_speaker_diarization import preprocess_speaker_diarization

    preprocess_speaker_diarization(
        input_dir=input_dir,
        output_dir=output_dir,
        min_speakers=min_speakers,
        max_speakers=max_speakers,
        huggingface_token=huggingface_token,
        n_jobs=n_jobs,
    )


@cli.command()
@click.option(
    "-i",
    "--input-dir",
    type=click.Path(exists=True),
    default=Path("./dataset_raw_raw/"),
    help="path to source dir",
)
@click.option(
    "-o",
    "--output-dir",
    type=click.Path(),
    default=Path("./dataset_raw/"),
    help="path to output dir",
)
@click.option(
    "-n",
    "--n-jobs",
    type=int,
    default=-1,
    help="number of jobs (optimal value may depend on your RAM capacity and audio duration per file)",
)
@click.option("-d", "--top-db", type=float, default=30, help="top db")
@click.option("-f", "--frame-seconds", type=float, default=1, help="frame seconds")
@click.option("-h", "--hop-seconds", type=float, default=0.3, help="hop seconds")
def pre_split(
    input_dir: Path | str,
    output_dir: Path | str,
    top_db: int,
    frame_seconds: float,
    hop_seconds: float,
    n_jobs: int,
):
    """Split audio files into multiple files"""
    from .preprocess_split import preprocess_split

    preprocess_split(
        input_dir=input_dir,
        output_dir=output_dir,
        top_db=top_db,
        frame_seconds=frame_seconds,
        hop_seconds=hop_seconds,
        n_jobs=n_jobs,
    )


@cli.command
def clean():
    """Clean up files, only useful if you are using the default file structure"""
src/so_vits_svc_fork/preprocess_speaker_diarization.py (87 additions, 0 deletions)
@@ -0,0 +1,87 @@
from __future__ import annotations

from collections import defaultdict
from logging import getLogger
from pathlib import Path

import soundfile as sf
import torch
from joblib import Parallel, delayed
from pyannote.audio import Pipeline
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib

LOG = getLogger(__name__)


def _process_one(
    input_path: Path,
    output_dir: Path,
    *,
    min_speakers: int = 1,
    max_speakers: int = 1,
    huggingface_token: str | None = None,
) -> None:
    try:
        audio, sr = sf.read(input_path)
    except Exception as e:
        LOG.warning(f"Failed to read {input_path}: {e}")
        return
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization", use_auth_token=huggingface_token
    )
    if pipeline is None:
        raise ValueError("Failed to load pipeline")

    LOG.info(f"Processing {input_path}. This may take a while...")
    diarization = pipeline(
        input_path, min_speakers=min_speakers, max_speakers=max_speakers
    )

    LOG.info(f"Found {len(diarization)} tracks, writing to {output_dir}")
    speaker_count = defaultdict(int)

    output_dir.mkdir(parents=True, exist_ok=True)
    for segment, track, speaker in tqdm(
        list(diarization.itertracks(yield_label=True)), desc=f"Writing {input_path}"
    ):
        if segment.end - segment.start < 1:
            # Skip segments shorter than one second.
            continue
        speaker_count[speaker] += 1
        audio_cut = audio[int(segment.start * sr) : int(segment.end * sr)]
        sf.write(
            (output_dir / f"{speaker}_{speaker_count[speaker]}.wav"),
            audio_cut,
            sr,
        )

    LOG.info(f"Speaker count: {speaker_count}")


def preprocess_speaker_diarization(
    input_dir: Path | str,
    output_dir: Path | str,
    *,
    min_speakers: int = 1,
    max_speakers: int = 1,
    huggingface_token: str | None = None,
    n_jobs: int = -1,
) -> None:
    if huggingface_token is not None and not huggingface_token.startswith("hf_"):
        LOG.warning("Huggingface token probably should start with hf_")
    if not torch.cuda.is_available():
        LOG.warning("CUDA is not available. This will be extremely slow.")
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    input_dir.mkdir(parents=True, exist_ok=True)
    output_dir.mkdir(parents=True, exist_ok=True)
    input_paths = list(input_dir.rglob("*.*"))
    with tqdm_joblib(desc="Preprocessing speaker diarization", total=len(input_paths)):
        Parallel(n_jobs=n_jobs)(
            delayed(_process_one)(
                input_path,
                output_dir / input_path.relative_to(input_dir).parent / input_path.stem,
                max_speakers=max_speakers,
                min_speakers=min_speakers,
                huggingface_token=huggingface_token,
            )
            for input_path in input_paths
        )
src/so_vits_svc_fork/preprocess_split.py (65 additions, 0 deletions)
@@ -0,0 +1,65 @@
from logging import getLogger
from pathlib import Path

import librosa
import soundfile as sf
from joblib import Parallel, delayed
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib

LOG = getLogger(__name__)


def _process_one(
input_path: Path,
output_dir: Path,
*,
top_db: int = 30,
frame_seconds: float = 0.5,
hop_seconds: float = 0.1,
):
try:
audio, sr = librosa.load(input_path)
except Exception as e:
LOG.warning(f"Failed to read {input_path}: {e}")
return
intervals = librosa.effects.split(
audio,
top_db=top_db,
frame_length=int(sr * frame_seconds),
hop_length=int(sr * hop_seconds),
)
output_dir.mkdir(parents=True, exist_ok=True)
for start, end in tqdm(intervals, desc=f"Writing {input_path}"):
audio_cut = audio[start:end]
sf.write(
(output_dir / f"{input_path.stem}_{start / sr:.3f}_{end / sr:.3f}.wav"),
audio_cut,
sr,
)


def preprocess_split(
input_dir: Path | str,
output_dir: Path | str,
*,
top_db: int = 30,
frame_seconds: float = 0.5,
hop_seconds: float = 0.1,
n_jobs: int = -1,
):
input_dir = Path(input_dir)
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
input_paths = list(input_dir.rglob("*.*"))
with tqdm_joblib(desc="Splitting", total=len(input_paths)):
Parallel(n_jobs=n_jobs)(
delayed(_process_one)(
input_path,
output_dir / input_path.relative_to(input_dir).parent,
top_db=top_db,
frame_seconds=frame_seconds,
hop_seconds=hop_seconds,
)
for input_path in input_paths
)
