# recipes/Voicebank/MTL/ASR_enhance/hparams/robust_asr.yaml

# ################################
# Model: VGG2 + LiGRU with time pooling for efficiency
# Additions: time-domain augmentation (added noise, speed perturbation, frequency drop, chunk drop)
# Authors: Mirco Ravanelli & Peter Plantinga 2020
# ################################

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1428
# Calls speechbrain.utils.seed_everything with the seed above while this file is loaded.
__set_seed: !apply:speechbrain.utils.seed_everything [!ref <seed>]
# All run artifacts live under a seed-specific folder, so runs with different
# seeds write to different locations.
output_folder: !ref results/robust_asr/<seed>
# Not referenced elsewhere in this file; presumably written by the training script.
stats_file: !ref <output_folder>/stats.txt
# Used as collect_in of asr_pretrained, checkpoints_dir of checkpointer and as
# the parent folder of the noise manifest (noise_annotation).
save_folder: !ref <output_folder>/save
# Destination file of train_logger.
train_log: !ref <output_folder>/train_log.txt

# Data for augmentation
# Archive handed to prepare_noise_data as its URL argument.
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1

# Data files
# data_folder has no default and must be supplied by the user.
data_folder: !PLACEHOLDER  # e.g. /path/to/Voicebank
data_folder_noise: !ref <data_folder>/noise # Destination folder (dest_folder) that prepare_noise_data downloads the noise archive into.
train_annotation: !ref <data_folder>/train.json
valid_annotation: !ref <data_folder>/valid.json
test_annotation: !ref <data_folder>/test.json
noise_annotation: !ref <save_folder>/noise.csv # Noise manifest: passed as csv_file to both prepare_noise_data and add_noise.
# Not referenced elsewhere in this file; presumably lets the training script
# skip data preparation when True -- TODO confirm.
skip_prep: False

####################### Training Parameters ####################################
# Upper limit handed to epoch_counter.
number_of_epochs: 30
# Not referenced elsewhere in this file; presumably read by the training script.
ctc_epochs: 0
# Shared by the train, valid and test loader options.
batch_size: 8
# Learning rate used by opt_class and as initial_value of lr_annealing.
lr: 0.0001
# token_type: unigram
# target_type, checkpoint_avg, sorting and the eval_*_key entries are not
# referenced elsewhere in this file; presumably read by the training script.
target_type: words
checkpoint_avg: 5  # average this many checkpoints for eval
sorting: ascending
eval_max_key: null
eval_min_key: null

# Worker count shared by the three loader option sets below and by add_noise.
num_workers: 4
# The training loader options set only batch size and worker count.
train_loader_options:
    batch_size: !ref <batch_size>
    num_workers: !ref <num_workers>
# Validation and test loader options additionally turn shuffling off.
valid_loader_options:
    batch_size: !ref <batch_size>
    num_workers: !ref <num_workers>
    shuffle: False
test_loader_options:
    batch_size: !ref <batch_size>
    num_workers: !ref <num_workers>
    shuffle: False
# Used as decay_drop of lr_annealing and to derive unfreeze_epoch.
epochs_before_lr_drop: 3

# Loss weights
# The *_type and *_weight keys below are not referenced elsewhere in this
# file; presumably the training script reads them to select and combine the
# individual losses -- TODO confirm.
enhance_type: masking  # null, mapping, masking, noisy, clean
enhance_weight: 0.0
mimic_weight: 0.0
ctc_weight: 1.0
ctc_type: joint  # null, clean, joint
seq_weight: 1.0
seq_type: joint  # null, clean, joint
# Passed to seq_loss as its label_smoothing argument.
label_smoothing: 0.1

# Here, the pretrained models are downloaded from the speechbrain
# HuggingFace repository. However, a local path pointing to a directory with
# your checkpoints may also be specified instead (see pretrainer below)
# Source of normalizer.ckpt, tokenizer.ckpt, asr.ckpt and lm.ckpt.
pretrained_asr_path: speechbrain/asr-crdnn-rnnlm-librispeech
# Source of enhance_model.ckpt.
pretrained_enhance_path: speechbrain/mtl-mimic-voicebank

# The names in both lists match keys of the modules mapping in this file.
# src_embedding appears in both lists; presumably it starts frozen and is
# released at unfreeze_epoch, while enhance_model and lm_model stay frozen
# -- TODO confirm against the training script.
frozen_models: [enhance_model, lm_model, src_embedding]
# Derived from epochs_before_lr_drop: with the current value of 3 this is 4.
unfreeze_epoch: !ref <epochs_before_lr_drop> + 1
unfrozen_models: [src_embedding]

# Feature parameters
# sample_rate feeds compute_stft, add_noise, speed_perturb and fbank.
sample_rate: 16000
# n_fft overrides the included enhance_model definition and configures compute_stft.
n_fft: 512
# win_length and hop_length are passed to compute_stft
# (presumably expressed in milliseconds -- confirm against the STFT docs).
win_length: 32
hop_length: 16
# n_mels overrides the included asr_model definition and configures fbank.
n_mels: 40

# Outputs
# output_neurons overrides the included asr_model definition and is the
# vocab_size of coverage_scorer.
output_neurons: 1000
# blank, bos and eos all use index 0. bos_index and eos_index are passed to
# beam_searcher; blank_index is not referenced elsewhere in this file.
blank_index: 0
bos_index: 0
eos_index: 0

# Decoding params
# Unless noted otherwise, each value is forwarded to beam_searcher under the
# same name.
min_decode_ratio: 0.0
max_decode_ratio: 1.0
beam_size: 32
eos_threshold: 10.0
using_max_attn_shift: True
max_attn_shift: 240
# Weight of the rnnlm entry in scorer.
lm_weight: 0.15
# Weight of the coverage entry in scorer.
coverage_penalty: 1.5
temperature: 1.25
# Temperature of rnnlm_scorer (kept separate from the beam searcher's temperature).
temperature_lm: 1.25

# Enhancement model definition, included from a separate file with n_fft
# overridden. This file uses its enhance_model entry.
enhance_model: !include:models/enhance_model.yaml
    n_fft: !ref <n_fft>

# Both objects are created without arguments here; asr_pretrained lists them
# as loadables, so their contents come from the pretrained files.
normalizer: !new:speechbrain.processing.features.InputNormalization
tokenizer: !new:sentencepiece.SentencePieceProcessor

# ASR model definition, included from a separate file with the overrides
# below. This file uses its src_embedding, tgt_embedding, recognizer,
# ctc_output, seq_output and lm_model entries.
asr_model: !include:models/asr_model.yaml
    n_mels: !ref <n_mels>
    dropout_rate: 0.1
    output_neurons: !ref <output_neurons>

# Groups the trainable ASR sub-modules into one ModuleList so asr_pretrained
# can load them from a single file (asr.ckpt). The outer list holds a single
# item: the list of modules.
# NOTE(review): the entry order presumably has to match the layout of the
# pretrained asr.ckpt -- do not reorder without confirming.
model: !new:torch.nn.ModuleList
    - - !ref <asr_model[src_embedding]>
      - !ref <asr_model[tgt_embedding]>
      - !ref <asr_model[recognizer]>
      - !ref <asr_model[ctc_output]>
      - !ref <asr_model[seq_output]>

# Change the path to use a local model instead of the remote one
# loadables and paths use the same keys: each loadable object is paired with
# the file listed under the same name. The enhancement model comes from
# pretrained_enhance_path; everything else comes from pretrained_asr_path.
asr_pretrained: !new:speechbrain.utils.parameter_transfer.Pretrainer
    # Folder given to the Pretrainer for the collected files.
    collect_in: !ref <save_folder>
    loadables:
        enhance_model: !ref <enhance_model[enhance_model]>
        normalizer: !ref <normalizer>
        tokenizer: !ref <tokenizer>
        # The ModuleList defined under the model key, loaded as one unit.
        asr_model: !ref <model>
        lm: !ref <asr_model[lm_model]>
    paths:
        enhance_model: !ref <pretrained_enhance_path>/enhance_model.ckpt
        normalizer: !ref <pretrained_asr_path>/normalizer.ckpt
        tokenizer: !ref <pretrained_asr_path>/tokenizer.ckpt
        asr_model: !ref <pretrained_asr_path>/asr.ckpt
        lm: !ref <pretrained_asr_path>/lm.ckpt

# Epoch counter limited to number_of_epochs; also saved by the checkpointer.
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>

# STFT configured from the feature parameters of this file.
compute_stft: !new:speechbrain.processing.features.STFT
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    win_length: !ref <win_length>
    hop_length: !ref <hop_length>

# Function reference (not called at load time) with power preset to 0.5.
# NOTE(review): a power of 0.5 presumably yields the magnitude rather than the
# power spectrum -- confirm against the spectral_magnitude docs.
spectral_magnitude: !name:speechbrain.processing.features.spectral_magnitude
    power: 0.5

############################## Augmentations ###################################

# Download and prepare the dataset of noisy sequences for augmentation
# This is a function reference with preset arguments; it is not executed when
# this file is loaded and is presumably invoked by the training script.
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <NOISE_DATASET_URL>
    dest_folder: !ref <data_folder_noise>
    ext: wav
    # Manifest path; the same path is given to add_noise below.
    csv_file: !ref <noise_annotation>

# Add noise to input signal
add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    # Range the signal-to-noise ratio is drawn from (presumably in dB -- confirm).
    snr_low: 0
    snr_high: 15
    # Noise and clean signals are both declared at the file-wide sample_rate.
    noise_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>

# Speed perturbation
speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
    orig_freq: !ref <sample_rate>
    # Candidate speeds (presumably percentages of the original speed, so 100
    # leaves the signal unchanged -- confirm against the SpeedPerturb docs).
    speeds: [95, 100, 105]

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    # Range the dropped frequencies are picked from
    # (presumably normalized frequencies -- confirm).
    drop_freq_low: 0
    drop_freq_high: 1
    # Between 1 and 3 bands are dropped.
    drop_freq_count_low: 1
    drop_freq_count_high: 3
    drop_freq_width: 0.05

# Time drop: randomly drops a number of temporal chunks.
drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    # Length range of each dropped chunk (presumably in samples -- confirm).
    drop_length_low: 1000
    drop_length_high: 2000
    # Between 1 and 5 chunks are dropped.
    drop_count_low: 1
    drop_count_high: 5

# Augmenter: chains the four augmentations defined earlier in this file.
# min_augmentations and max_augmentations both equal the length of the list,
# and augment_prob is 1.0. concat_original is enabled as well (see the
# Augmenter docs for how the original signal is combined with the output).
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    augmentations:
        - !ref <add_noise>
        - !ref <speed_perturb>
        - !ref <drop_freq>
        - !ref <drop_chunk>
    augment_prob: 1.0
    min_augmentations: 4
    max_augmentations: 4
    concat_original: True

# Filterbank feature extractor configured with the file-wide n_mels and sample_rate.
fbank: !new:speechbrain.lobes.features.Fbank
    n_mels: !ref <n_mels>
    sample_rate: !ref <sample_rate>

# Coverage scorer sized to the output vocabulary (output_neurons).
coverage_scorer: !new:speechbrain.decoders.scorer.CoverageScorer
    vocab_size: !ref <output_neurons>

# Language-model scorer wrapping the lm_model entry of the included ASR
# definition, with its own temperature (temperature_lm).
rnnlm_scorer: !new:speechbrain.decoders.scorer.RNNLMScorer
    language_model: !ref <asr_model[lm_model]>
    temperature: !ref <temperature_lm>

# Bundles the two scorers defined above for beam_searcher. The rnnlm entry is
# weighted by lm_weight and the coverage entry by coverage_penalty.
scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
    weights:
        rnnlm: !ref <lm_weight>
        coverage: !ref <coverage_penalty>
    full_scorers:
        - !ref <rnnlm_scorer>
        - !ref <coverage_scorer>

# Beam search decoder built from modules of the included ASR definition
# (tgt_embedding, recognizer, seq_output), the decoding parameters of this
# file and the combined scorer.
beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearcher
    embedding: !ref <asr_model[tgt_embedding]>
    decoder: !ref <asr_model[recognizer]>
    linear: !ref <asr_model[seq_output]>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <beam_size>
    eos_threshold: !ref <eos_threshold>
    using_max_attn_shift: !ref <using_max_attn_shift>
    max_attn_shift: !ref <max_attn_shift>
    temperature: !ref <temperature>
    scorer: !ref <scorer>

# Optimizer constructor with preset arguments (a reference, not an instance).
opt_class: !name:torch.optim.AdamW
    lr: !ref <lr>
    weight_decay: 0.00005

# Step scheduler that starts from the same lr as the optimizer and is
# configured with decay_factor 0.7 and decay_drop epochs_before_lr_drop.
lr_annealing: !new:speechbrain.nnet.schedulers.StepScheduler
    initial_value: !ref <lr>
    decay_factor: 0.7
    decay_drop: !ref <epochs_before_lr_drop>

# Named modules exposed to the training script. The names used in
# frozen_models and unfrozen_models (enhance_model, lm_model, src_embedding)
# are keys of this mapping.
modules:
    enhance_model: !ref <enhance_model[enhance_model]>
    src_embedding: !ref <asr_model[src_embedding]>
    tgt_embedding: !ref <asr_model[tgt_embedding]>
    recognizer: !ref <asr_model[recognizer]>
    ctc_output: !ref <asr_model[ctc_output]>
    seq_output: !ref <asr_model[seq_output]>
    lm_model: !ref <asr_model[lm_model]>

# Checkpointer writing to save_folder. It covers every entry of the modules
# mapping except lm_model (which is listed in frozen_models and not in
# unfrozen_models), plus the epoch counter and the normalizer.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        enhance_model: !ref <enhance_model[enhance_model]>
        src_embedding: !ref <asr_model[src_embedding]>
        tgt_embedding: !ref <asr_model[tgt_embedding]>
        recognizer: !ref <asr_model[recognizer]>
        ctc_output: !ref <asr_model[ctc_output]>
        seq_output: !ref <asr_model[seq_output]>
        counter: !ref <epoch_counter>
        normalizer: !ref <normalizer>

# Loss function references. The enhancement and mimic losses both point at
# mse_loss; the sequence loss is nll_loss with label_smoothing preset.
enhance_loss: !name:speechbrain.nnet.losses.mse_loss
mimic_loss: !name:speechbrain.nnet.losses.mse_loss
seq_loss: !name:speechbrain.nnet.losses.nll_loss
    label_smoothing: !ref <label_smoothing>

# Metric tracker factories. Each entry is a constructor reference with preset
# arguments, so the training script can create a fresh tracker when needed.
# The enhancement and mimic trackers use mse_loss with reduction set to batch.
enhance_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.mse_loss
        reduction: batch
mimic_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.mse_loss
        reduction: batch
# The ESTOI and PESQ trackers use evaluation functions defined in the recipe's
# train module.
# NOTE(review): n_jobs is fixed at 30 regardless of the machine; it presumably
# only takes effect when MetricStats scores utterances one at a time
# (batch_eval disabled) -- confirm against the MetricStats docs.
estoi_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:train.estoi_eval
    n_jobs: 30
pesq_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:train.pesq_eval
    n_jobs: 30
# Error-rate tracker; merge_tokens with a single-space space_token presumably
# rebuilds words from tokens before scoring -- confirm against the
# ErrorRateStats docs.
err_rate_stats: !name:speechbrain.utils.metric_stats.ErrorRateStats
    merge_tokens: True
    space_token: " "

# File logger writing to the train_log path under the seed-specific output folder.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>