# aalbert_libri_fbank3L.yaml
# Forked from s3prl/s3prl.
# Transformer encoder architecture and masked acoustic model (MAM) settings.
transformer:
  # `int`: 39 for mfcc, 40 for fmllr, 80 for fbank, 160 for mel.
  input_dim: 80
  # Stack consecutive feature vectors to reduce the length of input
  # sequences by this factor.
  downsample_rate: 3
  # Size of the encoder layers and the pooler layer.
  hidden_size: 768
  # Number of hidden layers in the Transformer encoder.
  num_hidden_layers: 3
  # Number of attention heads for each attention layer in the Transformer
  # encoder.
  num_attention_heads: 12
  # Size of the "intermediate" (i.e., feed-forward) layer in the Transformer
  # encoder.
  intermediate_size: 3072
  # Non-linear activation function (function or string) in the encoder and
  # pooler. If string, "gelu", "relu" and "swish" are supported.
  hidden_act: 'gelu'
  # Dropout probability for all fully connected layers in the embeddings,
  # encoder, and pooler.
  hidden_dropout_prob: 0.1
  # Dropout ratio for the attention probabilities.
  attention_probs_dropout_prob: 0.1
  # Stddev of the truncated_normal_initializer used to initialize all weight
  # matrices.
  initializer_range: 0.02
  # Epsilon used by LayerNorm. Deliberately a quoted string, as in the
  # original file.
  # NOTE(review): presumably converted with float() by the consumer; confirm.
  layer_norm_eps: '1e-12'
  # Mask this percentage of all spectrogram frames in each sequence at random
  # during MAM training.
  mask_proportion: 0.15
  # Mask this amount of consecutive frames (minimum).
  mask_consecutive_min: 1
  # Mask this amount of consecutive frames (maximum).
  mask_consecutive_max: 1
  # Allow overlap masking.
  mask_allow_overlap: true
  # Only used when overlap is not allowed. Sample a mask from each bucket in
  # size of [sampled mask_consecutive * mask_bucket_ratio].
  mask_bucket_ratio: 1.2
  # Mask at most this amount of frequency bands; set to 0 for no frequency
  # mask.
  mask_frequency: 0
  # For this percentage of the time, Gaussian noise will be applied on all
  # frames during MAM training; set to 0 for no noise.
  noise_proportion: 0.0
  # Usage: 0,1,2,12-15 will prune headids [0,1,2,12,13,14].
  # headids = layerid * head_num + headid_in_layer
  # NOTE(review): a plain `None` is the *string* "None" in YAML, not null.
  # It is quoted here to make that explicit without changing the parsed
  # value; confirm how the consumer tests it before switching to `null`.
  prune_headids: 'None'
  # Share layer weights.
  share_layer: true
  # Maximum input length (0 for no restriction).
  max_input_length: 0
  # Apply the pre layer normalization technique introduced in:
  # https://arxiv.org/abs/2002.04745
  pre_layer_norm: false
# Optimizer and learning-rate schedule settings.
optimizer:
  # Modes: ['adam', 'adamW', 'lamb'].
  type: 'adam'
  # Learning rate for the optimizer: "4e-4" for
  # 'data/libri_mel160_subword5000', "2e-4" for 'data/libri_fmllr_cmvn'.
  # Deliberately a quoted string, as in the original file.
  # NOTE(review): presumably converted with float() by the consumer; confirm.
  learning_rate: '2e-4'
  # Loss scale to improve fp16 numeric stability. Only used when apex is set
  # to True. 0: dynamic loss scaling. Positive power of 2: static loss
  # scaling.
  loss_scale: 0
  # Proportion of training to perform linear rate warmup.
  warmup_proportion: 0.07
  # Number of update steps to accumulate before performing a
  # backward/update pass.
  gradient_accumulation_steps: 5
  # Maximum gradient norm.
  gradient_clipping: 5.0
# Data loading settings and dataset locations.
dataloader:
  # Subprocesses used for the torch Dataloader.
  n_jobs: 12
  # Training batch size.
  batch_size: 10
  # Max length for audio feature (0 for no restriction): 1500 for pre-train,
  # 3000 for downstream tasks.
  max_timestep: 1500

  # LIBRISPEECH SETTINGS
  # Source data path: 'data/libri_fmllr_cmvn', 'data/libri_mfcc_cmvn', or
  # 'data/libri_mel160_subword5000' for different preprocessing features.
  data_path: 'data/libri_fbank_cmvn'
  # Target data path, not used when `duo_feature: False`. For reconstruction
  # to a different feature type, for example set dataset to
  # 'libri_linear1025_subword5000'.
  target_path: ''
  # Phone boundary label data path for the phone classification task. Set to
  # 'data/libri_phone' or 'data/cpc_phone'.
  phone_path: 'data/cpc_phone'
  # Can be a subset of
  # ['train-clean-100', 'train-clean-360', 'train-other-500'].
  train_set: ['train-clean-100']
# Training loop, logging, and checkpointing settings.
runner:
  # Training options
  # Use APEX (see https://github.com/NVIDIA/apex for more details).
  apex: false
  # Total steps for training; a step is one batch of update.
  total_steps: 200000
  # Log training status every this amount of training steps.
  log_step: 2500
  # Save model every this amount of training steps.
  save_step: 10000
  # Use different input / output features during training.
  duo_feature: false
  # Maximum number of model checkpoints to keep during training.
  max_keep: 2