-
Notifications
You must be signed in to change notification settings - Fork 110
/
Copy pathconfig.yaml
167 lines (146 loc) · 3.9 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
dataset:
  # NOTE(review): nesting below was reconstructed (train/eval under dataset,
  # one mapping per emb_typeN) -- confirm against the code that reads it.
  train:
    wav_scp: './train/wav.scp'
    mel_scp: './train/mel.scp'
    dur_scp: './train/dur.scp'
    emb_type1:
      _name: 'pinyin'
      scp: './train/py.scp'
      vocab: 'py.vocab'
    emb_type2:
      _name: 'graphic'
      scp: './train/gp.scp'
      vocab: 'gp.vocab'
    emb_type3:
      _name: 'speaker'
      scp: './train/spk.scp'
      vocab: null  # doesn't need a vocab
    # NOTE: you can add more embeddings here without changing the code.
  # NOTE: eval is not used for now, i.e., just training, no evaluation.
  # You can use synthesize.py to check that training is going well, for now.
  eval: null
training:
  # Batch and epoch settings.
  batch_size: 1024
  batch_split: 64
  epochs: 100000
  grad_clip_thresh: 1.0
  acc_step: 1
  # Output directories.
  checkpoint_path: './checkpoints/'
  log_path: './log/'
  # Step intervals (presumably counted in training steps -- verify).
  checkpoint_step: 5000
  synth_step: 5000
  log_step: 20
  num_workers: 8
  evaluation_step: 1000
optimizer:
  # NOTE: if you use SGD, `params` must change too, as it takes different
  # arguments.
  type: Adam
  n_warm_up_step: 2000
  lr_decrease_step: 10000
  lr_decrease_factor: null  # no value given in this config
  params:
    betas: [0.9, 0.98]
    # The explicit !!float tags force these scalars to load as floats.
    eps: !!float 1e-9
    weight_decay: !!float 0.0
    lr: !!float 1e-4
lr_scheduler:
  type: CyclicLR
  # Arguments for the scheduler named by `type`.
  params:
    # The explicit !!float tags force these scalars to load as floats.
    base_lr: !!float 1e-8
    max_lr: !!float 1e-6
    step_size_up: 5000
    step_size_down: 5000
    cycle_momentum: false
# Filter-bank settings; this section is used by wav2mel.py.
fbank:
  sample_rate: 22050
  n_fft: 1024
  hop_length: 256
  win_length: 1024
  max_wav_value: 32768.0
  n_mels: 80
  fmin: 0.0
  # Open question from the author: should this be 11025?
  fmax: 8000.0
  mel_mean: -6.0304103
encoder:
  encoder_type: 'FS2TransformerEncoder'
  # Settings for the encoder selected by encoder_type.
  conf:
    n_layers: 4
    n_heads: 2
    hidden_dim: 256
    dropout: 0.25
    d_inner: 1024
    max_len: 2048
decoder:
  # NOTE(review): unlike `encoder`, no `conf:` wrapper appears here, so the
  # settings sit directly under `decoder` -- confirm against the consumer.
  decoder_type: 'FS2TransformerDecoder'
  # Should be the same as the output of the encoder.
  input_dim: 256
  n_layers: 4
  n_heads: 2
  hidden_dim: 256
  d_inner: 1024
  dropout: 0.25
  # Max length of a sequence, for position-embedding pre-computation.
  max_len: 2048
  # Reference signature:
  #   (class) Decoder(input_dim: int = 256, n_layers: int = 4,
  #                   n_heads: int = 2, hidden_dim: int = 256,
  #                   d_inner: int = 1024, dropout: float = 0.5,
  #                   max_len: int = 2048)
postnet:
  # Options: 'PostUNet', 'PostNet1d'.
  postnet_type: 'PostUNet'
speaker_embedding:
  enable: true
  vocab: null  # None: no vocab file is given for speakers
  vocab_size: 218  # aishell3 has 218 speakers
  weight: 1.0  # you can play with the weight here
  dim: 256
# NOTE(review): the key is spelled "utterence" (sic); it is kept unchanged
# because the consumer presumably looks it up under this exact name.
utterence_embedding:
  enable: false  # not implemented
  type: 'lstm'  # alternative: resnet
  feature_config:
    type: 'mel'
    n_mels: 80
    sampling_rate: 22050
    n_fft: 1024
    hop_length: 256
    win_length: 1024
hanzi_embedding:
  enable: true
  type: embedding
  vocab: './gp.vocab'
  dim: 256
  # You can play with the weight here.
  weight: 0.5
  max_seq_len: 100
pinyin_embedding:
  enable: true
  type: embedding
  vocab: './py.vocab'
  dim: 256
  weight: 1.0
  max_seq_len: 100
duration_predictor:
  # Should be the same as the encoder hidden_dim.
  input_dim: 256
  filter_size: 256
  kernel_size: 3
  # It is important to set dropout here.
  dropout: 0.15
  # Value for aishell3.
  duration_mean: 21.517294924096635
f0_predictor:
  # Currently not supported.
  enable: false
  filter_size: 256
  kernel_size: 3
  dropout: 0.5
  n_bins: 256
vocoder:
  # Choose one of the vocoder sections defined below.
  type: VocGan
  MelGAN:
    checkpoint: '~/checkpoints/melgan/melgan_ljspeech.pth'
    config: '~/checkpoints/melgan/default.yaml'
    device: cpu
  VocGan:
    # Alternative checkpoint: ~/checkpoints/ljspeech_29de09d_4000.pt
    checkpoint: '~/checkpoints/vctk_pretrained_model_3180.pt'
    denoise: true
    device: cpu
  HiFiGAN:
    # You need to download the checkpoint and set the params here.
    checkpoint: '~/checkpoints/VCTK_V3/generator_v3'
    device: cpu
  Waveglow:
    checkpoint: '~/checkpoints/waveglow_256channels_universal_v5_state_dict.pt'
    sigma: 1.0
    denoiser_strength: 0.0  # try 0.1
    device: cpu  # try cpu if out of memory
synthesis:
  # Normalize the sound volume.
  normalize: true