# ################################
# Model: VGG2 + LiGRU with time pooling for efficiency
# Additions: TimeDomainSpecAugment
# Authors: Mirco Ravanelli & Peter Plantinga 2020
# ################################
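
# Note on syntax: this file is parsed with HyperPyYAML. !ref resolves another
# key in this file, !new: instantiates a class at load time, !name: stores a
# callable without calling it, !apply: calls a function during loading, and
# !PLACEHOLDER marks a value that must be overridden when the file is loaded.
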
# Seed needs to be set at the top of the YAML, before objects with parameters are made
seed: 1236
__set_seed: !apply:speechbrain.utils.seed_everything [!ref <seed>]
input_type: clean_wav
output_folder: !ref results/<input_type>/phn/<seed>
per_file: !ref <output_folder>/per.txt
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data files
data_folder: !PLACEHOLDER # e.g. /path/to/Voicebank
train_annotation: !ref <output_folder>/train.json
valid_annotation: !ref <output_folder>/valid.json
test_annotation: !ref <output_folder>/test.json
skip_prep: False # Skip data preparation
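
# A minimal loading sketch (assuming the standard HyperPyYAML workflow; the
# recipe's train.py is the real entry point and also forwards command-line
# overrides):
#     from hyperpyyaml import load_hyperpyyaml
#     with open("train.yaml") as fin:
#         hparams = load_hyperpyyaml(fin, {"data_folder": "/path/to/Voicebank"})
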
####################### Training Parameters ####################################
number_of_epochs: 50
batch_size: 8
sorting: ascending
dataloader_options:
    batch_size: !ref <batch_size>
lr: 0.5
max_grad_norm: 5.0
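# max_grad_norm is typically consumed by the Brain class, which clips the
# gradient L2 norm to this value before each optimizer step.
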
# Set this to the path of a pretrained model to load before training
# pretrained: model_clean_ep3.ckpt

# Feature parameters
sample_rate: 16000
n_fft: 400
n_mels: 40
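# With sample_rate 16000 and n_fft 400, each analysis window spans
# 400 / 16000 = 25 ms of audio; Fbank maps each frame to n_mels mel bands.
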
####################### Model Parameters #######################################
activation: !name:torch.nn.LeakyReLU
dropout: 0.15
cnn_blocks: 2
cnn_channels: (128, 256)
cnn_kernelsize: (3, 3)
rnn_layers: 4
rnn_neurons: 512
rnn_bidirectional: True
dnn_blocks: 2
dnn_neurons: 512

# Outputs
output_neurons: 42
blank_index: !ref <output_neurons> - 1
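# With 42 output neurons, the CTC blank is the last index (42 - 1 = 41);
# the remaining indices cover the phoneme inventory.
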
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>

compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>

############################## Augmentations ###################################
# Speed perturbation
speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
    orig_freq: !ref <sample_rate>
    speeds: [95, 100, 105]
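# Speeds are percentages of the original rate, so [95, 100, 105] resamples
# each signal to 95%, 100%, or 105% of its original speed.
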
# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: 0
    drop_freq_high: 1
    drop_freq_count_low: 1
    drop_freq_count_high: 3
    drop_freq_width: 0.05

# Time drop: randomly drops a number of temporal chunks.
drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: 1000
    drop_length_high: 2000
    drop_count_low: 1
    drop_count_high: 5
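# Chunk lengths are given in samples: 1000-2000 samples is 62.5-125 ms at 16 kHz.
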
# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    concat_original: True
    min_augmentations: 3
    max_augmentations: 3
    augment_prob: 1.0
    augmentations: [
        !ref <speed_perturb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]
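# With min_augmentations = max_augmentations = 3, all three augmentations are
# applied on every call, and concat_original: True concatenates the clean
# signals with the augmented ones, enlarging the effective batch.
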
############################## Models ##########################################
model: !new:speechbrain.lobes.models.CRDNN.CRDNN
    input_shape: [null, null, !ref <n_mels>]
    activation: !ref <activation>
    dropout: !ref <dropout>
    cnn_blocks: !ref <cnn_blocks>
    cnn_channels: !ref <cnn_channels>
    cnn_kernelsize: !ref <cnn_kernelsize>
    time_pooling: True
    rnn_layers: !ref <rnn_layers>
    rnn_neurons: !ref <rnn_neurons>
    rnn_bidirectional: !ref <rnn_bidirectional>
    dnn_blocks: !ref <dnn_blocks>
    dnn_neurons: !ref <dnn_neurons>
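# input_shape is [batch, time, features], so [null, null, 40] leaves the batch
# and time dimensions free; time_pooling downsamples along time before the
# recurrent layers to cut computation.
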
output: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dnn_neurons>
    n_neurons: !ref <output_neurons>
    bias: True

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

opt_class: !name:torch.optim.Adadelta
    lr: !ref <lr>
    rho: 0.95
    eps: 1.e-8
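# Because !name: stores a partially applied callable, no optimizer is built
# here; the Brain class typically constructs it later by calling this class
# on the model parameters once the modules exist.
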
lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.8
    patient: 0
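# New-bob annealing: when the relative improvement of the validation metric
# drops below improvement_threshold, the learning rate is multiplied by
# annealing_factor (0.8); patient: 0 lets annealing trigger immediately.
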
normalize: !new:speechbrain.processing.features.InputNormalization
    norm_type: global

modules:
    model: !ref <model>
    output: !ref <output>
    normalize: !ref <normalize>

jit_module_keys: [model]

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        output: !ref <output>
        counter: !ref <epoch_counter>
        scheduler: !ref <lr_annealing>
        normalizer: !ref <normalize>
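# Everything under recoverables is saved and restored as a unit, so an
# interrupted run can resume with the model, scheduler, epoch counter, and
# normalization statistics all in sync.
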
compute_cost: !name:speechbrain.nnet.losses.ctc_loss
    blank_index: !ref <blank_index>

ctc_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.ctc_loss
        blank_index: !ref <blank_index>
        reduction: batch

per_stats: !name:speechbrain.utils.metric_stats.ErrorRateStats
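# Both stats entries are factories (!name:), so a fresh tracker can be built
# each epoch; ErrorRateStats accumulates the phone error rate written to
# per_file above.
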
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>