# robust_asr.yaml (forked from speechbrain/speechbrain)
# ################################
# Model: VGG2 + LiGRU with time pooling for efficiency
# Additions: TimeDomainSpecAugment
# Authors: Mirco Ravanelli & Peter Plantinga 2020
# ################################
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1428
# Calls speechbrain.utils.seed_everything with <seed> as its only argument.
__set_seed: !apply:speechbrain.utils.seed_everything [!ref <seed>]
# Every artifact path below is derived from this per-seed results directory.
output_folder: !ref results/robust_asr/<seed>
stats_file: !ref <output_folder>/stats.txt
# Reused by the pretrainer (collect_in), the checkpointer and noise_annotation.
save_folder: !ref <output_folder>/save
# Destination file of train_logger.
train_log: !ref <output_folder>/train_log.txt
# Data for augmentation
# Archive fetched by prepare_noise_data (passed as its URL argument).
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
# Data files
# Must be supplied by the user (placeholder has no default).
data_folder: !PLACEHOLDER # e.g. /path/to/Voicebank
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
train_annotation: !ref <data_folder>/train.json
valid_annotation: !ref <data_folder>/valid.json
test_annotation: !ref <data_folder>/test.json
# Shared by prepare_noise_data (csv_file) and add_noise (csv_file).
noise_annotation: !ref <save_folder>/noise.csv # The data manifest files are created by the data preparation script
skip_prep: False
####################### Training Parameters ####################################
# Passed to epoch_counter as its `limit`.
number_of_epochs: 30
ctc_epochs: 0
# Shared by the train/valid/test loader option mappings.
batch_size: 8
# Used both as the optimizer `lr` and as the scheduler `initial_value`.
lr: 0.0001
# token_type: unigram
target_type: words
checkpoint_avg: 5 # average this many checkpoints for eval
sorting: ascending
eval_max_key: null
eval_min_key: null
# Referenced by every loader option mapping and by add_noise.
num_workers: 4
# Options for the training data loader.
# The entries must be nested: at column 0 they would redefine the top-level
# `batch_size` / `num_workers` keys as references to themselves.
train_loader_options:
    batch_size: !ref <batch_size>
    num_workers: !ref <num_workers>
# Options for the validation data loader (shuffling disabled).
# The entries must be nested under this key rather than sit at top level,
# where they would collide with the global `batch_size` / `num_workers`.
valid_loader_options:
    batch_size: !ref <batch_size>
    num_workers: !ref <num_workers>
    shuffle: False
# Options for the test data loader (shuffling disabled).
# The entries must be nested under this key rather than sit at top level,
# where they would collide with the global `batch_size` / `num_workers`.
test_loader_options:
    batch_size: !ref <batch_size>
    num_workers: !ref <num_workers>
    shuffle: False
# Referenced by lr_annealing (decay_drop) and by unfreeze_epoch.
epochs_before_lr_drop: 3
# Loss weights
enhance_type: masking # null, mapping, masking, noisy, clean
enhance_weight: 0.0
mimic_weight: 0.0
ctc_weight: 1.0
ctc_type: joint # null, clean, joint
seq_weight: 1.0
seq_type: joint # null, clean, joint
# Forwarded to seq_loss as its label_smoothing argument.
label_smoothing: 0.1
# Here, the pretrained models are downloaded from the speechbrain
# HuggingFace repository. However, a local path pointing to a directory with
# your checkpoints may also be specified instead (see pretrainer below)
pretrained_asr_path: speechbrain/asr-crdnn-rnnlm-librispeech
pretrained_enhance_path: speechbrain/mtl-mimic-voicebank
# The names listed here match keys of the `modules` mapping in this file.
# NOTE(review): the freezing/unfreezing logic lives in the training script,
# which is not visible here — confirm how these lists are consumed.
frozen_models: [enhance_model, lm_model, src_embedding]
# Arithmetic reference: resolves to <epochs_before_lr_drop> + 1.
unfreeze_epoch: !ref <epochs_before_lr_drop> + 1
unfrozen_models: [src_embedding]
# Feature parameters
# Shared by compute_stft, fbank, add_noise and speed_perturb.
sample_rate: 16000
# Passed to compute_stft and, as an override, to the included enhance model.
n_fft: 512
# NOTE(review): win_length / hop_length are presumably in milliseconds —
# confirm against the STFT documentation.
win_length: 32
hop_length: 16
# Passed to fbank and, as an override, to the included ASR model.
n_mels: 40
# Outputs
# Passed to the included ASR model and used as coverage_scorer's vocab_size.
output_neurons: 1000
# Blank, BOS and EOS are all assigned index 0.
blank_index: 0
bos_index: 0
eos_index: 0
# Decoding params
# Except where noted, these values are forwarded to beam_searcher.
min_decode_ratio: 0.0
max_decode_ratio: 1.0
beam_size: 32
eos_threshold: 10.0
using_max_attn_shift: True
max_attn_shift: 240
# Weight of the `rnnlm` entry in scorer.weights.
lm_weight: 0.15
# Weight of the `coverage` entry in scorer.weights.
coverage_penalty: 1.5
temperature: 1.25
# Temperature handed to rnnlm_scorer (not to beam_searcher).
temperature_lm: 1.25
# Enhancement model definition, pulled in from a separate YAML file.
# `n_fft` is an override for the included file, so it must be nested here;
# at column 0 it would redefine the top-level `n_fft` as a self-reference.
enhance_model: !include:models/enhance_model.yaml
    n_fft: !ref <n_fft>
# Both objects are instantiated without arguments; the Pretrainer below lists
# them as loadables with checkpoints under <pretrained_asr_path>.
normalizer: !new:speechbrain.processing.features.InputNormalization
tokenizer: !new:sentencepiece.SentencePieceProcessor
# ASR model definition, pulled in from a separate YAML file.
# The nested keys are overrides for the included file; left at column 0 they
# would clash with the top-level `n_mels` / `output_neurons` definitions.
asr_model: !include:models/asr_model.yaml
    n_mels: !ref <n_mels>
    dropout_rate: 0.1
    output_neurons: !ref <output_neurons>
# Groups the trainable ASR sub-modules into one ModuleList so they can be
# loaded from a single checkpoint (see `asr_model` in the Pretrainer loadables).
# The outer sequence is the positional-argument list and holds exactly one
# argument: the inner list of modules. The inner items must therefore stay
# aligned with the first `- -` entry.
model: !new:torch.nn.ModuleList
    - - !ref <asr_model[src_embedding]>
      - !ref <asr_model[tgt_embedding]>
      - !ref <asr_model[recognizer]>
      - !ref <asr_model[ctc_output]>
      - !ref <asr_model[seq_output]>
# Change the path to use a local model instead of the remote one
# `loadables` maps a name to the object receiving parameters; `paths` maps the
# same name to the checkpoint file it is loaded from. The two mappings must
# use identical keys.
asr_pretrained: !new:speechbrain.utils.parameter_transfer.Pretrainer
    collect_in: !ref <save_folder>
    loadables:
        enhance_model: !ref <enhance_model[enhance_model]>
        normalizer: !ref <normalizer>
        tokenizer: !ref <tokenizer>
        asr_model: !ref <model>
        lm: !ref <asr_model[lm_model]>
    paths:
        enhance_model: !ref <pretrained_enhance_path>/enhance_model.ckpt
        normalizer: !ref <pretrained_asr_path>/normalizer.ckpt
        tokenizer: !ref <pretrained_asr_path>/tokenizer.ckpt
        asr_model: !ref <pretrained_asr_path>/asr.ckpt
        lm: !ref <pretrained_asr_path>/lm.ckpt
# Epoch counter bounded by <number_of_epochs>; also registered as a
# recoverable in the checkpointer.
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>
# STFT front-end built from the shared feature parameters.
# The arguments must be nested; at column 0 each one would redefine the
# corresponding top-level key as a reference to itself.
compute_stft: !new:speechbrain.processing.features.STFT
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    win_length: !ref <win_length>
    hop_length: !ref <hop_length>
# Partial of spectral_magnitude with `power` fixed to 0.5.
spectral_magnitude: !name:speechbrain.processing.features.spectral_magnitude
    power: 0.5
############################## Augmentations ###################################
# Download and prepare the dataset of noisy sequences for augmentation
# Defined with !name, so this is a callable with the arguments below bound;
# nothing is downloaded until it is invoked.
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <NOISE_DATASET_URL>
    dest_folder: !ref <data_folder_noise>
    ext: wav
    csv_file: !ref <noise_annotation>
# Add noise to input signal
# Noise clips are listed in <noise_annotation>, the manifest written by
# prepare_noise_data.
# NOTE(review): snr_low / snr_high are presumably in dB — confirm.
add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    snr_low: 0
    snr_high: 15
    noise_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>
# Speed perturbation
# NOTE(review): `speeds` values are presumably percentages of the original
# speed — confirm against the SpeedPerturb documentation.
speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
    orig_freq: !ref <sample_rate>
    speeds: [95, 100, 105]
# Frequency drop: randomly drops a number of frequency bands to zero.
# Between drop_freq_count_low and drop_freq_count_high bands are dropped.
drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: 0
    drop_freq_high: 1
    drop_freq_count_low: 1
    drop_freq_count_high: 3
    drop_freq_width: 0.05
# Time drop: randomly drops a number of temporal chunks.
# NOTE(review): drop_length_* are presumably expressed in samples — confirm.
drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: 1000
    drop_length_high: 2000
    drop_count_low: 1
    drop_count_high: 5
# Augmenter: Combines previously defined augmentations to perform data augmentation
# min_augmentations and max_augmentations are equal and match the number of
# augmentations listed below.
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    concat_original: True
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <add_noise>,
        !ref <speed_perturb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]
# Filterbank feature extractor using the shared feature parameters.
# The arguments must be nested so they do not redefine the top-level
# `n_mels` / `sample_rate` keys.
fbank: !new:speechbrain.lobes.features.Fbank
    n_mels: !ref <n_mels>
    sample_rate: !ref <sample_rate>
# Coverage scorer; combined with the LM scorer in `scorer`.
coverage_scorer: !new:speechbrain.decoders.scorer.CoverageScorer
    vocab_size: !ref <output_neurons>
# Language-model scorer wrapping the LM defined in the included ASR model
# file, with its own temperature (<temperature_lm>).
rnnlm_scorer: !new:speechbrain.decoders.scorer.RNNLMScorer
    language_model: !ref <asr_model[lm_model]>
    temperature: !ref <temperature_lm>
# Combines the LM and coverage scorers for beam search.
# `weights` is a sub-mapping of the builder's arguments: its `rnnlm` and
# `coverage` entries must be nested under it, not passed as separate arguments.
scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
    full_scorers: [!ref <rnnlm_scorer>,
                   !ref <coverage_scorer>]
    weights:
        rnnlm: !ref <lm_weight>
        coverage: !ref <coverage_penalty>
# Beam search decoder over the attention-based recognizer.
# Every argument must be nested under this key: several share their name with
# a top-level parameter (e.g. `beam_size`, `scorer`, `temperature`) and would
# otherwise overwrite it with a self-reference.
beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearcher
    embedding: !ref <asr_model[tgt_embedding]>
    decoder: !ref <asr_model[recognizer]>
    linear: !ref <asr_model[seq_output]>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <beam_size>
    eos_threshold: !ref <eos_threshold>
    using_max_attn_shift: !ref <using_max_attn_shift>
    max_attn_shift: !ref <max_attn_shift>
    temperature: !ref <temperature>
    scorer: !ref <scorer>
# Optimizer factory: AdamW with the learning rate and weight decay bound.
# Declared with !name, so the remaining arguments are supplied by the caller.
opt_class: !name:torch.optim.AdamW
    lr: !ref <lr>
    weight_decay: 0.00005
# Step learning-rate schedule starting from <lr>, configured with a decay
# factor of 0.7 and a drop interval of <epochs_before_lr_drop>.
lr_annealing: !new:speechbrain.nnet.schedulers.StepScheduler
    initial_value: !ref <lr>
    decay_factor: 0.7
    decay_drop: !ref <epochs_before_lr_drop>
# Named modules of the experiment. The keys are the names used in
# frozen_models / unfrozen_models.
# The entries must be nested: `enhance_model` at column 0 would redefine the
# top-level `enhance_model` include.
modules:
    enhance_model: !ref <enhance_model[enhance_model]>
    src_embedding: !ref <asr_model[src_embedding]>
    tgt_embedding: !ref <asr_model[tgt_embedding]>
    recognizer: !ref <asr_model[recognizer]>
    ctc_output: !ref <asr_model[ctc_output]>
    seq_output: !ref <asr_model[seq_output]>
    lm_model: !ref <asr_model[lm_model]>
# Checkpointing of training state into <save_folder>.
# Everything to be saved and restored is an entry of the `recoverables`
# sub-mapping. The language model is not listed there.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        enhance_model: !ref <enhance_model[enhance_model]>
        src_embedding: !ref <asr_model[src_embedding]>
        tgt_embedding: !ref <asr_model[tgt_embedding]>
        recognizer: !ref <asr_model[recognizer]>
        ctc_output: !ref <asr_model[ctc_output]>
        seq_output: !ref <asr_model[seq_output]>
        counter: !ref <epoch_counter>
        normalizer: !ref <normalizer>
# Loss callables. Enhancement and mimic losses both use MSE; the sequence
# loss is NLL with label smoothing bound as a keyword argument. That argument
# must be nested so it does not redefine the top-level `label_smoothing`.
enhance_loss: !name:speechbrain.nnet.losses.mse_loss
mimic_loss: !name:speechbrain.nnet.losses.mse_loss
seq_loss: !name:speechbrain.nnet.losses.nll_loss
    label_smoothing: !ref <label_smoothing>
# Factory for a metric tracker of the enhancement MSE.
# `reduction` is an argument of the mse_loss partial (nested under `metric`),
# not of MetricStats itself.
enhance_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.mse_loss
        reduction: batch
# Factory for a metric tracker of the mimic MSE.
# `reduction` is an argument of the mse_loss partial (nested under `metric`),
# not of MetricStats itself.
mimic_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.mse_loss
        reduction: batch
# Factory for a metric tracker using `estoi_eval` from the recipe's train
# module. `n_jobs` is an argument of MetricStats, so it sits at the same level
# as `metric`.
estoi_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:train.estoi_eval
    n_jobs: 30
# Factory for a metric tracker using `pesq_eval` from the recipe's train
# module. `n_jobs` is an argument of MetricStats, so it sits at the same level
# as `metric`.
pesq_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:train.pesq_eval
    n_jobs: 30
# Factory for the error-rate tracker, configured with token merging enabled
# and a single space as the space token.
err_rate_stats: !name:speechbrain.utils.metric_stats.ErrorRateStats
    merge_tokens: True
    space_token: " "
# File logger writing to <train_log>.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>