forked from speechbrain/speechbrain
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcnn14.yaml
165 lines (134 loc) · 4.54 KB
/
cnn14.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# #################################
# Basic training parameters for sound classification using the ESC50 dataset.
# This recipe uses the CNN14 backbone for classification.
#
# Authors:
#  * Cem Subakan 2022, 2023
#  * Francesco Paissan 2022, 2023, 2024
#  (based on the SpeechBrain UrbanSound8k recipe)
# #################################

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
__set_seed: !apply:speechbrain.utils.seed_everything [!ref <seed>]

# Set up folders for reading from and writing to
data_folder: !PLACEHOLDER  # e.g., /localscratch/ESC-50-master
audio_data_folder: !ref <data_folder>/audio

experiment_name: !ref cnn14-esc50
output_folder: !ref ./results/<experiment_name>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

add_wham_noise: False
test_only: False

wham_folder: null  # Set it if add_wham_noise is True.
wham_audio_folder: !ref <wham_folder>/tr

sample_rate: 16000
signal_length_s: 5

# Tensorboard logs
use_tensorboard: False
tensorboard_logs_folder: !ref <output_folder>/tb_logs/

# Path where data manifest files will be stored
train_annotation: !ref <data_folder>/manifest/train.json
valid_annotation: !ref <data_folder>/manifest/valid.json
test_annotation: !ref <data_folder>/manifest/test.json

# To standardize results, UrbanSound8k has pre-separated samples into
# 10 folds for multi-fold validation
train_fold_nums: [1, 2, 3]
valid_fold_nums: [4]
test_fold_nums: [5]
skip_manifest_creation: False

ckpt_interval_minutes: 15  # save checkpoint every N min

# Training parameters
number_of_epochs: 200
batch_size: 32
lr: 0.0002
base_lr: 0.00000001
max_lr: !ref <lr>
step_size: 65000

# Feature parameters
n_mels: 80
left_frames: 0
right_frames: 0
deltas: False
use_melspectra: True
use_log1p_mel: True

# Number of classes
out_n_neurons: 50

# Note that it's actually important to shuffle the data here
shuffle: True
dataloader_options:
  batch_size: !ref <batch_size>
  shuffle: !ref <shuffle>
  num_workers: 0

# Functions
compute_features: !new:speechbrain.lobes.features.Fbank
  n_mels: !ref <n_mels>
  left_frames: !ref <left_frames>
  right_frames: !ref <right_frames>
  deltas: !ref <deltas>
  sample_rate: !ref <sample_rate>
  n_fft: 1024
  win_length: 20
  hop_length: 10

embedding_model: !new:speechbrain.lobes.models.Cnn14.Cnn14
  mel_bins: !ref <n_mels>
  emb_dim: 2048

classifier: !new:torch.nn.Linear
  in_features: 2048
  out_features: !ref <out_n_neurons>

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: !ref <number_of_epochs>

# pre-processing
# NOTE(review): these top-level STFT settings (0.5 magnitude power,
# ~11.6 ms hop / ~23.2 ms window) feed compute_stft below and are
# independent of the Fbank settings inside compute_features.
n_fft: 1024
spec_mag_power: 0.5
hop_length: 11.6099
win_length: 23.2199

compute_stft: !new:speechbrain.processing.features.STFT
  n_fft: !ref <n_fft>
  hop_length: !ref <hop_length>
  win_length: !ref <win_length>
  sample_rate: !ref <sample_rate>

compute_fbank: !new:speechbrain.processing.features.Filterbank
  n_mels: 80
  n_fft: !ref <n_fft>
  sample_rate: !ref <sample_rate>
  log_mel: False

modules:
  compute_stft: !ref <compute_stft>
  compute_fbank: !ref <compute_fbank>
  compute_features: !ref <compute_features>
  embedding_model: !ref <embedding_model>
  classifier: !ref <classifier>

compute_cost: !new:speechbrain.nnet.losses.LogSoftmaxWrapper
  loss_fn: !new:speechbrain.nnet.losses.AdditiveAngularMargin
    margin: 0.2
    scale: 30

opt_class: !name:torch.optim.Adam
  lr: !ref <lr>
  weight_decay: 0.000002

lr_annealing: !new:speechbrain.nnet.schedulers.CyclicLRScheduler
  base_lr: !ref <base_lr>
  max_lr: !ref <max_lr>
  step_size: !ref <step_size>

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: !ref <train_log>

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
  metric: !name:speechbrain.nnet.losses.classification_error
    reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: !ref <save_folder>
  recoverables:
    embedding_model: !ref <embedding_model>
    classifier: !ref <classifier>
    counter: !ref <epoch_counter>

use_pretrained: True

# If you do not want to use the pretrained encoder
# you can simply delete pretrained_encoder field,
# or set use_pretrained=False
embedding_model_path: speechbrain/cnn14-esc50/embedding_model.ckpt
pretrained_encoder: !new:speechbrain.utils.parameter_transfer.Pretrainer
  collect_in: !ref <save_folder>
  loadables:
    embedding_model: !ref <embedding_model>
  paths:
    embedding_model: !ref <embedding_model_path>