# recipes/TIMIT/ASR/transducer/hparams/train.yaml

# ############################################################################
# Model: E2E ASR with a Transducer
# Encoder: CRDNN model
# Decoder: LiGRU + beamsearch
# losses: Transducer
# Training: TIMIT
# Authors:  Abdel Heba, Mirco Ravanelli, Sung-Lin Yeh 2020
# ############################################################################

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1986
# Passes the seed above to speechbrain.utils.seed_everything; keep this entry
# ahead of every object definition (see the ordering note above).
__set_seed: !apply:speechbrain.utils.seed_everything [!ref <seed>]
# Per-seed experiment folder; the three paths below are all derived from it.
output_folder: !ref results/augment_noise_CRDNN/<seed>
test_wer_file: !ref <output_folder>/wer_test.txt
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data files
# data_folder has no default and must be supplied when launching the recipe.
data_folder: !PLACEHOLDER  # e.g. /path/to/TIMIT
# The three JSON annotation files are located inside <save_folder>.
train_annotation: !ref <save_folder>/train.json
valid_annotation: !ref <save_folder>/dev.json
test_annotation: !ref <save_folder>/test.json
skip_prep: False # Skip data preparation
uppercase: False # Must be True when the TIMIT dataset is in the upper-case version

# Data for augmentation
# These three values are consumed by `prepare_noise_data` and `add_noise`.
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
noise_annotation: !ref <save_folder>/noise.csv # The data manifest files are created by the data preparation script

####################### Training Parameters ####################################
# number_of_epochs is the limit given to `epoch_counter`.
number_of_epochs: 50
# batch_size is shared by the train, valid and test dataloader options.
batch_size: 8
# lr is used both by the optimizer (`opt_class`) and as the initial value of
# the scheduler (`lr_annealing`).
lr: 1.0
sorting: ascending # choose between ascending, descending and random

# Feature parameters
# sample_rate is also reused by the noise and speed-perturbation augmentations.
sample_rate: 16000
n_fft: 400
# n_mels is the last dimension of the encoder's declared input shape.
n_mels: 40


####################### Model Parameters #######################################
# activation is shared by the encoder (`enc`) and the joint network (`Tjoint`).
activation: !name:torch.nn.LeakyReLU
# The values from dropout down to dnn_neurons are forwarded to the CRDNN encoder.
dropout: 0.15
cnn_blocks: 2
cnn_channels: (128, 256)
cnn_kernelsize: (3, 3)
rnn_layers: 4
rnn_neurons: 512
rnn_bidirectional: True
dnn_blocks: 2
# dnn_neurons is also the input size of `enc_lin`.
dnn_neurons: 512
# emb_size: 128
# dec_neurons is the hidden size of `dec` and the input size of `dec_lin`.
dec_neurons: 128

# Outputs
# Number of classes of the `output` layer and number of embeddings in `emb`.
# NOTE(review): confirm this equals the label-set size with the blank included.
output_neurons: 42
# Width that both `enc_lin` and `dec_lin` project to before the joint network.
joint_dim: 128
# Index reserved for the blank symbol; shared by `emb`, the loss and the searchers.
blank_index: 0

# Decoding parameters
# beam_size and nbest configure `Beamsearcher` only; `Greedysearcher` pins both to 1.
beam_size: 4
nbest: 1
# By default {state,expand}_beam = 2.3, as mentioned in the paper:
# https://arxiv.org/abs/1904.02619
state_beam: 1.0
expand_beam: 1.0

# Dataloader options
# A single worker count is shared by the three option sets below.
num_workers: 4

# Options for the training loader.
train_dataloader_opts:
    num_workers: !ref <num_workers>
    batch_size: !ref <batch_size>

# Options for the validation loader.
valid_dataloader_opts:
    num_workers: !ref <num_workers>
    batch_size: !ref <batch_size>

# Options for the test loader.
test_dataloader_opts:
    num_workers: !ref <num_workers>
    batch_size: !ref <batch_size>

# Epoch counter bounded by number_of_epochs; it is also registered as a
# recoverable in `checkpointer`.
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>

# Feature extractor (Fbank), configured from the "Feature parameters" section.
compute_features: !new:speechbrain.lobes.features.Fbank
    n_mels: !ref <n_mels>
    n_fft: !ref <n_fft>
    sample_rate: !ref <sample_rate>


# Input normalization object; it appears in `modules` and is saved by the
# checkpointer as `normalizer`.
normalize: !new:speechbrain.processing.features.InputNormalization
    norm_type: global

############################## Augmentations ###################################

# Noise dataset preparation: a partially-configured callable (note `!name:`)
# pointing at the download URL, the destination folder and the CSV manifest.
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    csv_file: !ref <noise_annotation>
    dest_folder: !ref <data_folder_noise>
    URL: !ref <NOISE_DATASET_URL>
    ext: wav

# Additive-noise augmentation driven by the noise CSV manifest.
add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    # Noise and clean signals are declared at the same sample rate.
    clean_sample_rate: !ref <sample_rate>
    noise_sample_rate: !ref <sample_rate>
    # SNR bounds passed to AddNoise.
    snr_high: 15
    snr_low: 0
    num_workers: !ref <num_workers>

# Speed perturbation applied to signals sampled at <sample_rate>.
speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
    speeds:
        - 95
        - 100
        - 105
    orig_freq: !ref <sample_rate>

# Frequency-band dropping: a random number of bands is set to zero.
drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    # How many bands to drop (lower / upper bound).
    drop_freq_count_low: 1
    drop_freq_count_high: 3
    # Range the dropped frequencies are taken from, and the width of each band.
    drop_freq_low: 0
    drop_freq_high: 1
    drop_freq_width: 0.05

# Temporal-chunk dropping: a random number of time chunks is removed.
drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    # How many chunks to drop (lower / upper bound).
    drop_count_low: 1
    drop_count_high: 5
    # Length bounds of each dropped chunk.
    drop_length_low: 1000
    drop_length_high: 2000

# Waveform augmenter: bundles the four augmentations defined above.
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    augmentations:
        - !ref <add_noise>
        - !ref <speed_perturb>
        - !ref <drop_freq>
        - !ref <drop_chunk>
    # min == max == 4, i.e. the same bound as the number of listed augmentations.
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    concat_original: True

############################## Models ##########################################


# Acoustic encoder (CRDNN), wired to the "Model Parameters" section.
enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
    # Only the last (feature) dimension of the input shape is fixed.
    input_shape: [null, null, !ref <n_mels>]
    # Convolutional front-end
    cnn_blocks: !ref <cnn_blocks>
    cnn_channels: !ref <cnn_channels>
    cnn_kernelsize: !ref <cnn_kernelsize>
    time_pooling: True
    # Recurrent stack
    rnn_layers: !ref <rnn_layers>
    rnn_neurons: !ref <rnn_neurons>
    rnn_bidirectional: !ref <rnn_bidirectional>
    # Fully-connected head
    dnn_blocks: !ref <dnn_blocks>
    dnn_neurons: !ref <dnn_neurons>
    # Shared non-linearity and dropout rate
    activation: !ref <activation>
    dropout: !ref <dropout>

# Keys of `modules` selected for JIT; only the encoder is listed.
jit_module_keys:
    - enc

# Bias-free linear projection from the encoder width (dnn_neurons) to joint_dim.
enc_lin: !new:speechbrain.nnet.linear.Linear
    n_neurons: !ref <joint_dim>
    input_size: !ref <dnn_neurons>
    bias: False

# Label embedding for the prediction network; configured as one-hot with the
# blank symbol at <blank_index>.
emb: !new:speechbrain.nnet.embedding.Embedding
    blank_id: !ref <blank_index>
    consider_as_one_hot: True
    num_embeddings: !ref <output_neurons>

# Prediction network: a single unidirectional LiGRU layer.
dec: !new:speechbrain.nnet.RNN.LiGRU
    # Feature dimension is output_neurons - 1.
    # NOTE(review): presumably the width `emb` emits in one-hot mode once the
    # blank is excluded; confirm against the Embedding implementation.
    input_shape: [null, null, !ref <output_neurons> - 1]
    hidden_size: !ref <dec_neurons>
    num_layers: 1
    bidirectional: False
    normalization: layernorm
    dropout: 0.0

# Bias-free linear projection from the decoder width (dec_neurons) to joint_dim.
dec_lin: !new:speechbrain.nnet.linear.Linear
    n_neurons: !ref <joint_dim>
    input_size: !ref <dec_neurons>
    bias: False

# Joint network combining the encoder and decoder projections.
Tjoint: !new:speechbrain.nnet.transducer.transducer_joint.Transducer_joint
    nonlinearity: !ref <activation>
    joint: sum # joint [sum | concat]

# Output layer: bias-free linear map from the joint representation to the
# output classes.
output: !new:speechbrain.nnet.linear.Linear
    # Tied to joint_dim instead of a hard-coded 128, so overriding joint_dim
    # keeps this layer consistent with enc_lin / dec_lin. This holds for
    # `joint: sum` in Tjoint; a `concat` joint would need a different width.
    input_size: !ref <joint_dim>
    n_neurons: !ref <output_neurons>  # one score per output class (the blank shares this index space)
    bias: False

# Training loss: transducer loss as a partially-configured callable (`!name:`).
compute_cost: !name:speechbrain.nnet.losses.transducer_loss
    blank_index: !ref <blank_index>
    use_torchaudio: True

# Sub-networks gathered in one ModuleList; this is the object `checkpointer`
# saves under its `model` key. Element order is unchanged.
model: !new:torch.nn.ModuleList
    - [!ref <enc>, !ref <enc_lin>, !ref <emb>, !ref <dec>, !ref <dec_lin>, !ref <output>]

# Greedy decoding: the same searcher class as `Beamsearcher`, with the beam
# size and n-best both pinned to 1.
Greedysearcher: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
    beam_size: 1
    nbest: 1
    blank_id: !ref <blank_index>
    # Networks used while decoding: prediction chain, joint, classifier.
    decode_network_lst: [!ref <emb>, !ref <dec>, !ref <dec_lin>]
    tjoint: !ref <Tjoint>
    classifier_network: [!ref <output>]

# Beam-search decoding, configured from the "Decoding parameters" section.
Beamsearcher: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
    beam_size: !ref <beam_size>
    nbest: !ref <nbest>
    state_beam: !ref <state_beam>
    expand_beam: !ref <expand_beam>
    blank_id: !ref <blank_index>
    # Networks used while decoding: prediction chain, joint, classifier.
    decode_network_lst: [!ref <emb>, !ref <dec>, !ref <dec_lin>]
    tjoint: !ref <Tjoint>
    classifier_network: [!ref <output>]

# Optimizer factory (`!name:`): Adadelta with the learning rate defined above.
opt_class: !name:torch.optim.Adadelta
    lr: !ref <lr>
    rho: 0.95
    eps: 1.e-8

# Learning-rate scheduler (NewBob); starts from the same lr as the optimizer.
lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    annealing_factor: 0.8
    improvement_threshold: 0.0025
    patient: 0

# Named modules exposed to the training script. Besides the networks listed in
# `model`, this mapping also carries the joint network and the normalizer.
# Entry order is left untouched.
modules:
    enc: !ref <enc>
    enc_lin: !ref <enc_lin>
    emb: !ref <emb>
    dec: !ref <dec>
    dec_lin: !ref <dec_lin>
    Tjoint: !ref <Tjoint>
    output: !ref <output>
    normalize: !ref <normalize>

# Checkpointer writing into <save_folder>. The recoverables are the model
# ModuleList, the LR scheduler, the input normalizer and the epoch counter.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        scheduler: !ref <lr_annealing>
        normalizer: !ref <normalize>
        counter: !ref <epoch_counter>

# File-based training logger; writes to <train_log> inside the output folder.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

# Metric-tracker factory for the transducer loss. The metric uses the same loss
# function as `compute_cost`, here with `reduction: none`.
transducer_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.transducer_loss
        reduction: none
        blank_index: !ref <blank_index>
        use_torchaudio: True

# Error-rate tracker factory (`!name:`), built from ErrorRateStats.
per_stats: !name:speechbrain.utils.metric_stats.ErrorRateStats