# ############################################################################
# Model: E2E ASR with Transducer
# Encoder: wav2vec 2.0
# Decoder: GRU + beamsearch
# losses: Transducer
# Training: TIMIT
# Authors: Abdel Heba, Titouan Parcollet, Loren Lugosch, Mirco Ravanelli, Sung-Lin Yeh 2021
# ############################################################################
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
__set_seed: !apply:speechbrain.utils.seed_everything [!ref <seed>]
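# A quick HyperPyYAML primer for readers new to SpeechBrain configs: !new:
# instantiates an object at load time, !name: stores a callable for later
# instantiation, !apply: calls a function immediately, and !ref <key>
# substitutes the value of another key in this file.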
output_folder: !ref results/augment_wav2vec/<seed>
test_wer_file: !ref <output_folder>/wer_test.txt
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt
# HuggingFace hub ID of the large English wav2vec2 model released by Fairseq.
wav2vec2_hub: "facebook/wav2vec2-large-lv60"
wav2vec2_output: 1024
freeze_wav2vec: False
# Data files
data_folder: !PLACEHOLDER # e.g. /path/to/TIMIT
train_annotation: !ref <save_folder>/train.json
valid_annotation: !ref <save_folder>/dev.json
test_annotation: !ref <save_folder>/test.json
skip_prep: False # Skip data preparation
uppercase: False # Set to True when using the upper-case version of the TIMIT dataset
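# The three annotation files above are generated by the recipe's TIMIT data
# preparation step when skip_prep is False. A sketch of a typical launch
# (the script name is illustrative; any key can be overridden on the CLI):
#   python train.py train_wav2vec.yaml --data_folder=/path/to/TIMIT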
####################### Training Parameters ####################################
number_of_epochs: 20
batch_size: 8
lr: 0.0003
lr_wav2vec: 0.0001
sorting: ascending # choose between ascending, descending and random
precision: fp16 # bf16, fp16 or fp32
# Feature parameters
sample_rate: 16000
# n_fft: 400
# n_mels: 40
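# n_fft/n_mels stay commented out because the wav2vec2 encoder below consumes
# raw 16 kHz waveforms directly; no Fbank front-end is constructed here.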
####################### Model Parameters #######################################
activation: !name:torch.nn.LeakyReLU
# dropout: 0.15
dnn_blocks: 1
dnn_neurons: 43
dec_neurons: 128
# Outputs
output_neurons: 43
joint_dim: 43
blank_index: 0
# Decoding parameters
beam_size: 4
nbest: 1
# By default, {state,expand}_beam = 2.3, as mentioned in the paper:
# https://arxiv.org/abs/1904.02619
state_beam: 1.0
expand_beam: 1.0
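# These margins prune the transducer beam search described in the paper above:
# roughly, a token is only expanded when its log-prob is within expand_beam of
# the best token, and state_beam bounds how far a partial hypothesis may trail
# the best finished one. 1.0 prunes harder than the paper's 2.3 default,
# trading a little accuracy for decoding speed.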
# Dataloader options
train_dataloader_opts:
    batch_size: !ref <batch_size>
valid_dataloader_opts:
    batch_size: !ref <batch_size>
test_dataloader_opts:
    batch_size: !ref <batch_size>
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>
############################## Augmentations ###################################
# Speed perturbation
speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
    orig_freq: !ref <sample_rate>
    speeds: [95, 100, 105]
# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: 0
    drop_freq_high: 1
    drop_freq_count_low: 1
    drop_freq_count_high: 3
    drop_freq_width: 0.05
# Time drop: randomly drops a number of temporal chunks.
drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: 1000
    drop_length_high: 2000
    drop_count_low: 1
    drop_count_high: 5
# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    min_augmentations: 3
    max_augmentations: 3
    augment_prob: 1.0
    augmentations: [
        !ref <speed_perturb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]
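# During training the Brain class typically applies this augmenter to each
# batch before the encoder; a minimal sketch (variable names illustrative):
#   if stage == sb.Stage.TRAIN:
#       wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens)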
############################## Models ##########################################
wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
    source: !ref <wav2vec2_hub>
    output_norm: True
    freeze: !ref <freeze_wav2vec>
    save_path: !ref <save_folder>/wav2vec2_checkpoint
enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN
    input_shape: [null, null, !ref <wav2vec2_output>]
    activation: !ref <activation>
    dnn_blocks: !ref <dnn_blocks>
    dnn_neurons: !ref <dnn_neurons>
enc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dnn_neurons>
    n_neurons: !ref <joint_dim>
    bias: False
emb: !new:speechbrain.nnet.embedding.Embedding
    num_embeddings: !ref <output_neurons>
    consider_as_one_hot: True
    blank_id: !ref <blank_index>
dec: !new:speechbrain.nnet.RNN.GRU
    input_shape: [null, null, !ref <output_neurons> - 1]
    hidden_size: !ref <dec_neurons>
    num_layers: 1
    dropout: 0.0
    bidirectional: False
dec_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dec_neurons>
    n_neurons: !ref <joint_dim>
    bias: False
Tjoint: !new:speechbrain.nnet.transducer.transducer_joint.Transducer_joint
    joint: sum # joint [sum | concat]
    nonlinearity: !ref <activation>
output: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <joint_dim>
    n_neurons: !ref <output_neurons> # 42 phonemes + 1 blank
    bias: False
compute_cost: !name:speechbrain.nnet.losses.transducer_loss
    use_torchaudio: True
    blank_index: !ref <blank_index>
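# How the pieces fit: enc_lin(enc(wav2vec2(x))) gives the acoustic stream and
# dec_lin(dec(emb(tokens))) the label stream; Tjoint broadcasts and sums them,
# applies the nonlinearity, and `output` projects to logits of (roughly) shape
# [batch, time, target_len + 1, output_neurons], which the torchaudio
# transducer loss consumes.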
model: !new:torch.nn.ModuleList [[
    !ref <enc>,
    !ref <enc_lin>,
    !ref <emb>,
    !ref <dec>,
    !ref <dec_lin>,
    !ref <output>
]]
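# Grouping every randomly initialized module (everything except wav2vec2) in
# one ModuleList lets them share the Adam optimizer defined below, while the
# pretrained wav2vec2 gets its own optimizer with a smaller learning rate.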
Greedysearcher: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
    decode_network_lst: [!ref <emb>, !ref <dec>, !ref <dec_lin>]
    tjoint: !ref <Tjoint>
    classifier_network: [!ref <output>]
    blank_id: !ref <blank_index>
    beam_size: 1
    nbest: 1
Beamsearcher: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
    decode_network_lst: [!ref <emb>, !ref <dec>, !ref <dec_lin>]
    tjoint: !ref <Tjoint>
    classifier_network: [!ref <output>]
    blank_id: !ref <blank_index>
    beam_size: !ref <beam_size>
    nbest: !ref <nbest>
    state_beam: !ref <state_beam>
    expand_beam: !ref <expand_beam>
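# Two searchers are defined on purpose: the greedy one (beam_size 1) keeps
# validation cheap, while the full beam search is typically reserved for the
# final test-set decoding.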
adam_opt_class: !name:torch.optim.Adam
    lr: !ref <lr>
wav2vec_opt_class: !name:torch.optim.Adam
    lr: !ref <lr_wav2vec>
lr_annealing_adam: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.8
    patient: 0
lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr_wav2vec>
    improvement_threshold: 0.0025
    annealing_factor: 0.9
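# NewBob annealing: after each epoch, if the relative improvement of the
# validation metric drops below improvement_threshold, the learning rate is
# multiplied by annealing_factor (patient allows a few epochs of grace).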
modules:
    wav2vec2: !ref <wav2vec2>
    enc: !ref <enc>
    enc_lin: !ref <enc_lin>
    emb: !ref <emb>
    dec: !ref <dec>
    dec_lin: !ref <dec_lin>
    Tjoint: !ref <Tjoint>
    output: !ref <output>
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        wav2vec: !ref <wav2vec2>
        model: !ref <model>
        lr_annealing_adam: !ref <lr_annealing_adam>
        lr_annealing_wav2vec: !ref <lr_annealing_wav2vec>
        counter: !ref <epoch_counter>
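# Everything under recoverables is checkpointed together so training can
# resume mid-run; the recipe script usually restores state with something like:
#   checkpointer.recover_if_possible()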
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>
transducer_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.transducer_loss
        use_torchaudio: True
        blank_index: !ref <blank_index>
        reduction: none
per_stats: !name:speechbrain.utils.metric_stats.ErrorRateStats
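# Both stats entries are !name: callables, so the Brain class can build a fresh
# tracker each epoch, e.g. self.per_metrics = self.hparams.per_stats(); the
# phoneme error rate is then computed by ErrorRateStats over decoded sequences.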