-
Notifications
You must be signed in to change notification settings - Fork 1
/
ge2e_hparams.py
103 lines (90 loc) · 2.77 KB
/
ge2e_hparams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import tensorflow as tf
import numpy as np
# Default hyperparameters for the "GE2E" configuration.
# All settings live in one module-level HParams object; other modules are
# expected to import `hparams` and read the values as attributes.
# NOTE(review): tf.contrib exists only in TensorFlow 1.x, so this module
# cannot be imported under TensorFlow 2.x as written.
hparams = tf.contrib.training.HParams(
name="GE2E",
# Presets known to work well.
# NOTE: if `preset` is specified, the hyperparameters are meant to be
# overridden by the matching entry of `presets` (the override logic is not
# in this file).
preset="",
presets={},
# Input type:
input_type="raw",
# Train and test data locations.
# NOTE(review): train_path is an absolute, machine-specific path; it has to
# be overridden on any other machine.
train_path='/home/zeng/work/mywork/Speaker_Verification/train_tisv',
test_path='./data/test',
# Frame counts for the two modes (presumably TD-SV and TI-SV, cf. `mode`
# below) - TODO confirm units against the feature extraction code.
tdsv_frame=80,
tisv_frame=180,
# Audio:
sample_rate=22050,
# This is only valid when mulaw is True.
# NOTE(review): no mulaw setting is defined in this file - confirm that
# silence_threshold is still used.
silence_threshold=2,
num_mels=40,
fmin=125,
fmax=7600,
fft_size=512,
# Shift can be specified by either hop_size or frame_shift_ms.
# NOTE(review): the key defined here is `hop`, not `hop_size`.
hop=100,
frame_shift_ms=None,
min_level_db=-100,
ref_level_db=20,
window=0.025,
rescaling=True,
rescaling_max=0.999,
# The mel-spectrogram is normalized to [0, 1] for each utterance, and
# clipping may happen depending on min_level_db and ref_level_db, causing
# clipping noise.
# If False, an assertion is added to ensure no clipping happens.
allow_clipping_in_normalization=True,
# Mixture of logistic distributions:
log_scale_min=float(np.log(1e-14)),
# Model:
# N and M together determine the real batch size (N*M, see `batch_size`
# below); presumably N speakers x M utterances per speaker - TODO confirm.
N=10,
M=5,
mode='TI-SV',
hidden_size_tisv=768,
project_size_tisv=256,
# NOTE(review): "tidv" in the next two keys looks like a typo for "tdsv"
# (cf. tdsv_frame); the names are kept because callers read them by name.
hidden_size_tidv=128,
project_size_tidv=64,
# If True, apply weight normalization in the same way as DeepVoice3.
weight_normalization=True,
# This should only be enabled for multi-speaker datasets.
n_speakers=7, # 7 for CMU ARCTIC
# Data loader
pin_memory=True,
num_workers=2,
# train/test
# The test size can be specified as a portion or as a number of samples.
test_size=0.0441, # 50 for CMU ARCTIC single speaker
test_num_samples=None,
random_state=1234,
# Loss
loss_type='softmax',
# Training:
batch_size=1, # real batch_size = N*M
# NOTE(review): 3 * 1e6 evaluates to the float 3000000.0, not an int.
steps=3 * 1e6,
initial_learning_rate=1e-2,
# See lrschedule.py for the available lr_schedule options.
# NOTE(review): no lr_schedule key is defined in this file.
nepochs=2000,
learning_rate_decay=0.5,
clip_thresh=3,
# The maximum number of time steps can be specified either in seconds or in
# steps. This is needed for those who don't have huge GPU memory...
# If both are None, then full audio samples are used.
max_time_sec=None,
max_time_steps=8000,
# Hold moving-averaged parameters and use them for evaluation.
exponential_moving_average=True,
# averaged = decay * averaged + (1 - decay) * x
ema_decay=0.9999,
# Save
# per-step intervals
checkpoint_interval=1000,
train_eval_interval=1000,
# per-epoch interval
test_eval_epoch_interval=5,
save_optimizer_state=True,
# Eval: (no evaluation-specific settings are defined yet)
)
def hparams_debug_string():
    """Render every hyperparameter as a multi-line, human-readable string.

    Returns:
        A string that begins with the line 'Hyperparameters:' followed by
        one ' name: value' line per entry of ``hparams.values()``, ordered
        by hyperparameter name.
    """
    settings = hparams.values()
    rendered = []
    # Sort by name so the listing is stable from run to run.
    for key in sorted(settings):
        rendered.append(' %s: %s' % (key, settings[key]))
    return 'Hyperparameters:\n' + '\n'.join(rendered)