# recipes/LibriSpeech/ASR/transducer/hparams/conformer_transducer.yaml

# ############################################################################
# Model: E2E ASR with transformer and transducer
# Encoder: Conformer
# Decoder: LSTM + beamsearch + RNNLM
# Tokens: BPE with unigram
# losses: Transducer + CTC (optional) + CE (optional)
# Training: Librispeech 960h
# Authors:  Titouan Parcollet 2023, Abdel HEBA, Mirco Ravanelli, Sung-Lin Yeh 2020
# ############################################################################

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 3407
# Calls speechbrain.utils.seed_everything with <seed> as its only argument.
__set_seed: !apply:speechbrain.utils.seed_everything [!ref <seed>]
# Output locations. Every path below derives from <output_folder>, which
# itself embeds the seed, so each seed gets its own results directory.
output_folder: !ref results/conformer_transducer_large/<seed>
output_wer_folder: !ref <output_folder>/
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Language model (LM) pretraining
# NB: To avoid mismatch, the speech recognizer must be trained with the same
# tokenizer used for LM training. Here, we download everything from the
# speechbrain HuggingFace repository. However, a local path pointing to a
# directory containing the lm.ckpt and tokenizer.ckpt may also be specified
# instead, e.g., if you want to use your own LM / tokenizer.
# The `pretrainer` entry of this file appends /lm.ckpt and /tokenizer.ckpt
# to this value.
pretrained_lm_tokenizer_path: speechbrain/asr-crdnn-rnnlm-librispeech

# Dataset location, LibriSpeech splits and generated CSV manifests
data_folder: !PLACEHOLDER # e.g, /localscratch/LibriSpeech
# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES
# then data_folder_rirs should be /localscratch/xxx_corpus
# otherwise the dataset will automatically be downloaded
# NOTE(review): no data_folder_rirs key is defined in this file -- confirm
# whether the remark above still applies to this recipe.
train_splits:
   - train-clean-100
   - train-clean-360
   - train-other-500
dev_splits:
   - dev-clean
test_splits:
   - test-clean
   - test-other
# Manifests are all placed directly inside <output_folder>.
train_csv: !ref <output_folder>/train.csv
valid_csv: !ref <output_folder>/dev-clean.csv
test_csv:
   - !ref <output_folder>/test-clean.csv
   - !ref <output_folder>/test-other.csv
skip_prep: False
ckpt_interval_minutes: 5 # save checkpoint every N min

####################### Training Parameters ####################################

# To make Transformers converge, the global batch size should be large enough.
# The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor.
# Empirically, we found that this value should be >= 128.
# Please, set your parameters accordingly.
# --- Schedule ---
number_of_epochs: 100
# NOTE(review): presumably the number of epochs during which the CTC
# auxiliary loss is applied -- confirm against the training script.
number_of_ctc_epochs: 60
warmup_steps: 25000

# --- Optimisation ---
lr: 0.0008
weight_decay: 0.01
max_grad_norm: 5.0
precision: fp32 # bf16, fp16 or fp32

# --- Multitask losses ---
ctc_weight: 0.3 # Multitask with CTC for the encoder (0.0 = disabled)
ce_weight: 0.0 # Multitask with CE for the decoder (0.0 = disabled)
loss_reduction: batchmean

# --- Batching and data loading ---
# The batch size is used if and only if dynamic batching is set to False
# Validation and testing are done with fixed batches and not dynamic batching.
batch_size: 8
batch_size_valid: 4
grad_accumulation_factor: 4
num_workers: 4
sorting: random

# --- Evaluation ---
avg_checkpoints: 10 # Number of checkpoints to average for evaluation

# Feature parameters
# All four values are passed to `compute_features` (Fbank) further down;
# sample_rate is additionally used as `orig_freq` by `speed_perturb`.
sample_rate: 16000
n_fft: 512
n_mels: 80
# NOTE(review): the unit of win_length is not stated in this file
# (presumably milliseconds) -- confirm against the Fbank documentation.
win_length: 32

# Streaming & dynamic chunk training options
# At least for the current architecture on LibriSpeech, we found out that
# non-streaming accuracy is very similar between `streaming: True` and
# `streaming: False`.
# NOTE(review): no !ref in this file points at <streaming>; presumably the
# training script reads it directly -- confirm.
streaming: True  # controls all Dynamic Chunk Training & chunk size & left context mechanisms

# Configuration for Dynamic Chunk Training.
# In this model, a chunk is roughly equivalent to 40ms of audio.
# Chunk sizes are drawn from [chunk_size_min, chunk_size_max] and left
# context sizes from [left_context_chunks_min, left_context_chunks_max],
# each gated by the probability given next to it.
dynchunktrain_config_sampler: !new:speechbrain.utils.dynamic_chunk_training.DynChunkTrainConfigRandomSampler # yamllint disable-line rule:line-length
   chunkwise_prob: 0.6 # Probability during a batch to limit attention and sample a random chunk size in the following range
   chunk_size_min: 8 # Minimum chunk size (if in a DynChunkTrain batch)
   chunk_size_max: 32 # Maximum chunk size (if in a DynChunkTrain batch)
   limited_left_context_prob: 0.75 # If in a DynChunkTrain batch, the probability during a batch to restrict left context to a random number of chunks
   left_context_chunks_min: 2 # Minimum left context size (in # of chunks)
   left_context_chunks_max: 32 # Maximum left context size (in # of chunks)
   # If you specify a valid/test config, you can optionally have evaluation be
   # done with a specific DynChunkTrain configuration.
   # valid_config: !new:speechbrain.utils.dynamic_chunk_training.DynChunkTrainConfig
   #    chunk_size: 24
   #    left_context_size: 16
   # test_config: ...

# Dataloader options
# Training uses <batch_size> and <num_workers>; validation and test both
# use <batch_size_valid> and do not set num_workers here.
train_dataloader_opts:
   batch_size: !ref <batch_size>
   num_workers: !ref <num_workers>

valid_dataloader_opts:
   batch_size: !ref <batch_size_valid>

test_dataloader_opts:
   batch_size: !ref <batch_size_valid>

# This setup works well for 3090 24GB GPU, adapt it to your needs.
# Adjust grad_accumulation_factor depending on the DDP node count (here 3)
# Or turn it off (but training speed will decrease)
dynamic_batching: True
# NOTE(review): the unit of max_batch_len / max_batch_len_val is not stated
# in this file (presumably seconds of audio per batch) -- confirm.
max_batch_len: 150
max_batch_len_val: 50 # we reduce it as the beam is much wider (VRAM)
num_bucket: 200

# Plain mapping that gathers the values above together with the remaining
# sampler settings; note the key is `num_buckets` while the scalar it
# references is named `num_bucket`.
dynamic_batch_sampler:
   max_batch_len: !ref <max_batch_len>
   max_batch_len_val: !ref <max_batch_len_val>
   num_buckets: !ref <num_bucket>
   shuffle_ex: True # if true re-creates batches at each epoch shuffling examples.
   batch_ordering: random
   max_batch_ex: 256

####################### Model Parameters #######################################

# Transformer
# d_model is passed to `Transformer` and is the input size of `proj_enc`.
d_model: 512
# joint_dim is the output size of `proj_enc` / `proj_dec` and the input size
# of `proj_ctc` / `transducer_lin`.
joint_dim: 640
nhead: 8
num_encoder_layers: 12
# No Transformer decoder layers; the encoder is wrapped by `enc` below.
num_decoder_layers: 0
d_ffn: 2048
transformer_dropout: 0.1
# Referenced by both `Transformer` (activation) and `Tjoint` (nonlinearity).
activation: !name:torch.nn.GELU
# Shared by `Transformer`, `proj_ctc`, `emb`, `dec`, `transducer_lin`
# and `lm_model`.
output_neurons: 1000
# Hidden size of the LSTM `dec` and input size of `proj_dec`.
dec_dim: 512
# NOTE(review): dec_emb_dropout and dec_dropout are not referenced by any
# !ref in this file -- confirm whether the training script reads them.
dec_emb_dropout: 0.2
dec_dropout: 0.1

# Decoding parameters
# All four special-token indices are set to 0. Only blank_index is
# referenced via !ref in this file (losses, embedding, searchers).
blank_index: 0
bos_index: 0
eos_index: 0
pad_index: 0
# beam_size and nbest are consumed by `Beamsearcher`; `Greedysearcher`
# hard-codes both to 1.
beam_size: 10
nbest: 1
# by default {state,expand}_beam = 2.3 as mentioned in the paper
# https://arxiv.org/abs/1904.02619
state_beam: 2.3
expand_beam: 2.3
# Weight given to `lm_model` in `Beamsearcher`.
lm_weight: 0.50

# If True uses torchaudio loss. Otherwise, the numba one
use_torchaudio: False

# Epoch counter limited to <number_of_epochs>; saved by the checkpointer
# under the name `counter`.
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
   limit: !ref <number_of_epochs>

# Input normalization using global statistics; saved by the checkpointer
# under the name `normalizer`.
# NOTE(review): update_until_epoch presumably freezes the statistics after
# that epoch -- confirm against the InputNormalization documentation.
normalize: !new:speechbrain.processing.features.InputNormalization
   norm_type: global
   update_until_epoch: 4

# Fbank feature extractor, configured entirely from the
# "Feature parameters" scalars defined above.
compute_features: !new:speechbrain.lobes.features.Fbank
   sample_rate: !ref <sample_rate>
   n_fft: !ref <n_fft>
   n_mels: !ref <n_mels>
   win_length: !ref <win_length>

############################## Augmentations ###################################

# Speed perturbation of the waveform, relative to <sample_rate>
speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
   speeds: [95, 100, 105]
   orig_freq: !ref <sample_rate>

# Augmenter: Combines previously defined augmentations to perform data augmentation
# Only speed_perturb is registered, with min = max = 1 augmentation and
# augment_prob 1.0.
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
   augmentations:
      - !ref <speed_perturb>
   augment_prob: 1.0
   min_augmentations: 1
   max_augmentations: 1


# Time Drop
# No `dim` is given here, so the SpectrogramDrop default applies.
time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
   drop_count_low: 5
   drop_count_high: 5
   drop_length_low: 12
   drop_length_high: 20
   replace: zeros

# Frequency Drop
# Same class as time_drop, but applied along dim 2.
freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
   dim: 2
   drop_count_low: 2
   drop_count_high: 2
   drop_length_low: 20
   drop_length_high: 25
   replace: zeros

# Time warp (constructed with the class defaults)
time_warp: !new:speechbrain.augment.freq_domain.Warping

# Feature-level augmenter combining the three spectrogram augmentations
# above. min = max = 3 with augment_prob 1.0, and the list order is kept
# because shuffle_augmentations is False.
fea_augment: !new:speechbrain.augment.augmenter.Augmenter
   augmentations:
      - !ref <time_drop>
      - !ref <freq_drop>
      - !ref <time_warp>
   augment_prob: 1.0
   min_augmentations: 3
   max_augmentations: 3
   shuffle_augmentations: False
   repeat_augment: 1
   parallel_augment: False
   concat_original: False

############################## Models ##########################################

# Convolutional front-end: two blocks of one layer each, with 64 then 32
# output channels, 3-wide kernels and stride 2.
# The last entry of input_shape (80) equals n_mels.
# NOTE(review): Transformer.input_size below is 640; presumably 32 channels
# x 20 remaining feature bins after two stride-2 blocks -- confirm.
CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
   input_shape: (8, 10, 80)
   num_blocks: 2
   num_layers_per_block: 1
   out_channels: (64, 32)
   kernel_sizes: (3, 3)
   strides: (2, 2)
   residuals: (False, False)

# Conformer encoder built through TransformerASR; it is wrapped by `enc`
# below so that only the encoder part is used.
Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
   # Input / output sizes
   input_size: 640
   tgt_vocab: !ref <output_neurons>
   # Encoder flavour and attention
   encoder_module: conformer
   attention_type: RelPosMHAXL
   causal: False
   normalize_before: True
   # Layer geometry
   d_model: !ref <d_model>
   nhead: !ref <nhead>
   d_ffn: !ref <d_ffn>
   num_encoder_layers: !ref <num_encoder_layers>
   num_decoder_layers: !ref <num_decoder_layers>
   # Regularisation and non-linearity
   dropout: !ref <transformer_dropout>
   activation: !ref <activation>

# We must call an encoder wrapper so the decoder isn't run (we don't have any)
# (num_decoder_layers is 0 in this configuration.)
enc: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
   transformer: !ref <Transformer>

# For MTL CTC over the encoder
# Note that the input size is <joint_dim>, not <d_model>.
# NOTE(review): this implies the layer is fed the output of proj_enc --
# confirm against the training script.
proj_ctc: !new:speechbrain.nnet.linear.Linear
   input_size: !ref <joint_dim>
   n_neurons: !ref <output_neurons>

# Define some projection layers to make sure that enc and dec
# output dim are the same before joining
# Encoder side: <d_model> -> <joint_dim>, no bias.
proj_enc: !new:speechbrain.nnet.linear.Linear
   input_size: !ref <d_model>
   n_neurons: !ref <joint_dim>
   bias: False

# Decoder side: <dec_dim> -> <joint_dim>, no bias.
proj_dec: !new:speechbrain.nnet.linear.Linear
   input_size: !ref <dec_dim>
   n_neurons: !ref <joint_dim>
   bias: False

# CTC loss for MTL over the encoder. This entry is already active; the
# ctc_weight comment above states that 0.0 disables the CTC multitask term.
ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
   blank_index: !ref <blank_index>
   reduction: !ref <loss_reduction>

# Label embedding that opens the prediction-network chain
# (emb -> dec -> proj_dec, as listed in the searchers' decode_network_lst).
emb: !new:speechbrain.nnet.embedding.Embedding
   num_embeddings: !ref <output_neurons>
   consider_as_one_hot: True
   blank_id: !ref <blank_index>

# Single-layer LSTM over the embedded labels.
# NOTE(review): the input feature size is <output_neurons> - 1, presumably
# the one-hot width produced by `emb` once the blank is excluded -- confirm.
dec: !new:speechbrain.nnet.RNN.LSTM
   input_shape: [null, null, !ref <output_neurons> - 1]
   hidden_size: !ref <dec_dim>
   num_layers: 1
   re_init: True

# For MTL with LM over the decoder (need to uncomment to activate)
# If enabled, also uncomment the matching dec_lin line in `modules`.
# dec_lin: !new:speechbrain.nnet.linear.Linear
#   input_size: !ref <joint_dim>
#   n_neurons: !ref <output_neurons>
#   bias: False

# For MTL
# NLL loss with label smoothing; the ce_weight comment above states that
# 0.0 disables the CE multitask term for the decoder.
ce_cost: !name:speechbrain.nnet.losses.nll_loss
   label_smoothing: 0.1

# Joint network: combines encoder and decoder projections by summation and
# applies the shared <activation>.
Tjoint: !new:speechbrain.nnet.transducer.transducer_joint.Transducer_joint
   joint: sum # joint [sum | concat]
   nonlinearity: !ref <activation>

# Output layer after the joint network: <joint_dim> -> <output_neurons>,
# no bias. Used as classifier_network by both searchers.
transducer_lin: !new:speechbrain.nnet.linear.Linear
   input_size: !ref <joint_dim>
   n_neurons: !ref <output_neurons>
   bias: False

# Softmax configured to return log-probabilities.
log_softmax: !new:speechbrain.nnet.activations.Softmax
   apply_log: True

# Transducer loss; the implementation is selected by <use_torchaudio>.
transducer_cost: !name:speechbrain.nnet.losses.transducer_loss
   blank_index: !ref <blank_index>
   use_torchaudio: !ref <use_torchaudio>

# This is the RNNLM that is used according to the Huggingface repository
# NB: It has to match the pre-trained RNNLM!!
# Its weights are loaded by `pretrainer` from
# <pretrained_lm_tokenizer_path>/lm.ckpt.
lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM
   output_neurons: !ref <output_neurons>
   embedding_dim: 128
   dropout: 0.0
   activation: !name:torch.nn.LeakyReLU
   # Recurrent stack
   rnn_layers: 2
   rnn_neurons: 2048
   # Feed-forward stack
   dnn_blocks: 1
   dnn_neurons: 512
   return_hidden: True  # For inference

# for MTL
# update model if any HEAD module is added
# Name -> object mapping of the modules defined above. Unlike `model`
# below, it also lists Tjoint, normalize and lm_model.
modules:
   CNN: !ref <CNN>
   enc: !ref <enc>
   emb: !ref <emb>
   dec: !ref <dec>
   Tjoint: !ref <Tjoint>
   transducer_lin: !ref <transducer_lin>
   normalize: !ref <normalize>
   lm_model: !ref <lm_model>
   proj_ctc: !ref <proj_ctc>
   proj_dec: !ref <proj_dec>
   proj_enc: !ref <proj_enc>
#   dec_lin: !ref <dec_lin>

# for MTL
# update model if any HEAD module is added
# ModuleList receives a single positional argument: the ordered list of
# modules below. This is the object saved by the checkpointer as `model`,
# so the order must not change.
model: !new:torch.nn.ModuleList
   - - !ref <CNN>
     - !ref <enc>
     - !ref <emb>
     - !ref <dec>
     - !ref <proj_enc>
     - !ref <proj_dec>
     - !ref <proj_ctc>
     - !ref <transducer_lin>

############################## Decoding & optimiser ############################

# Tokenizer initialization
# Created empty here; `pretrainer` loads it from
# <pretrained_lm_tokenizer_path>/tokenizer.ckpt.
tokenizer: !new:sentencepiece.SentencePieceProcessor

# Greedy decoding: the same searcher class as below, restricted to a beam of
# one hypothesis and configured without a language model.
Greedysearcher: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
   blank_id: !ref <blank_index>
   beam_size: 1
   nbest: 1
   decode_network_lst:
      - !ref <emb>
      - !ref <dec>
      - !ref <proj_dec>
   tjoint: !ref <Tjoint>
   classifier_network:
      - !ref <transducer_lin>

# Beam search with the RNNLM (`lm_model`) weighted by <lm_weight>; pruning
# is controlled by <state_beam> and <expand_beam>.
Beamsearcher: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
   blank_id: !ref <blank_index>
   beam_size: !ref <beam_size>
   nbest: !ref <nbest>
   state_beam: !ref <state_beam>
   expand_beam: !ref <expand_beam>
   decode_network_lst:
      - !ref <emb>
      - !ref <dec>
      - !ref <proj_dec>
   tjoint: !ref <Tjoint>
   classifier_network:
      - !ref <transducer_lin>
   lm_module: !ref <lm_model>
   lm_weight: !ref <lm_weight>

# AdamW optimizer, declared with !name so it is not instantiated here.
# NOTE(review): presumably the training code supplies the model parameters
# when it creates the optimizer -- confirm.
opt_class: !name:torch.optim.AdamW
   lr: !ref <lr>
   betas: (0.9, 0.98)
   eps: 1.e-8
   weight_decay: !ref <weight_decay>

# Noam learning-rate scheduler starting from <lr> with <warmup_steps>
# warm-up steps; saved by the checkpointer as `scheduler`.
noam_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler
   lr_initial: !ref <lr>
   n_warmup_steps: !ref <warmup_steps>

############################## Logging and Pretrainer ##########################

# Checkpointer writing to <save_folder>. It tracks the trainable model
# list, the LR scheduler, the input normalizer and the epoch counter.
# lm_model and tokenizer are not listed here; `pretrainer` handles them.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
   checkpoints_dir: !ref <save_folder>
   recoverables:
      model: !ref <model>
      scheduler: !ref <noam_annealing>
      normalizer: !ref <normalize>
      counter: !ref <epoch_counter>

# Pretrainer for the language model and the tokenizer. Files are collected
# in <save_folder>, and each key under `loadables` has a matching key
# under `paths`.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
   collect_in: !ref <save_folder>
   loadables:
      lm: !ref <lm_model>
      tokenizer: !ref <tokenizer>
   paths:
      lm: !ref <pretrained_lm_tokenizer_path>/lm.ckpt
      tokenizer: !ref <pretrained_lm_tokenizer_path>/tokenizer.ckpt


# File logger writing to <train_log>.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
   save_file: !ref <train_log>

# Error-rate statistics with default settings, declared with !name so it is
# not instantiated here.
error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

# Same statistics class with split_tokens enabled.
# NOTE(review): presumably this yields a character-level error rate, as the
# key name suggests -- confirm.
cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
   split_tokens: True

# for the inference hparams, you will need to include and uncomment something like this:

# make_tokenizer_streaming_context: !name:speechbrain.tokenizers.SentencePiece.SentencePieceDecoderStreamingContext
# tokenizer_decode_streaming: !name:speechbrain.tokenizers.SentencePiece.spm_decode_preserve_leading_space

# make_decoder_streaming_context: !name:speechbrain.decoders.transducer.TransducerGreedySearcherStreamingContext # default constructor
# decoding_function: !name:speechbrain.decoders.transducer.TransducerBeamSearcher.transducer_greedy_decode_streaming
#    - !ref <Greedysearcher>  # self

# fea_streaming_extractor: !new:speechbrain.lobes.features.StreamingFeatureWrapper
#    module: !new:speechbrain.nnet.containers.LengthsCapableSequential
#       - !ref <compute_features>
#       - !ref <normalize>
#       - !ref <CNN>
#    # don't consider normalization as part of the input filter chain.
#    # normalization will operate at chunk level, which mismatches training
#    # somewhat, but does not appear to result in noticeable degradation.
#    properties: !apply:speechbrain.utils.filter_analysis.stack_filter_properties
#       - [!ref <compute_features>, !ref <CNN>]