recipes/timers-and-such/direct/hparams/train.yaml

# ############################################################################
# Model: Direct SLU
# Encoder: Pre-trained ASR encoder -> LSTM
# Decoder: GRU + beamsearch
# Tokens: BPE with unigram
# losses: NLL
# Training: Timers and Such
# Authors:  Loren Lugosch, Mirco Ravanelli 2020
# ############################################################################

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1986
__set_seed: !apply:speechbrain.utils.seed_everything [!ref <seed>]
experiment: train-real-and-synth
output_folder: !ref results/<experiment>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt
all_real_wer_file: !ref <output_folder>/all_real_wer.txt
test_real_wer_file: !ref <output_folder>/test_real_wer.txt
test_synth_wer_file: !ref <output_folder>/test_synth_wer.txt

# Data files
data_folder: !PLACEHOLDER # e.g, /localscratch/timers-and-such
train_splits: ["train-synth", "train-real"]
csv_train: !ref <save_folder>/train-type=direct.csv
csv_dev_real: !ref <save_folder>/dev-real-type=direct.csv
csv_dev_synth: !ref <save_folder>/dev-synth-type=direct.csv
csv_test_real: !ref <save_folder>/test-real-type=direct.csv
csv_test_synth: !ref <save_folder>/test-synth-type=direct.csv
csv_all_real: !ref <save_folder>/all-real-type=direct.csv
tokenizer_file: https://huggingface.co/speechbrain/slu-timers-and-such-direct-librispeech-asr/resolve/main/tokenizer.ckpt
skip_prep: False
ckpt_interval_minutes: 15 # save checkpoint every N min
test_on_all_real: False

# Data for augmentation
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
noise_annotation: !ref <save_folder>/noise.csv #The data manifest files are created by the data preparation script

####################### Training Parameters ####################################
number_of_epochs: 1
batch_size: 16
lr: 0.0003
# token_type: unigram # ["unigram", "bpe", "char"]
sorting: random

####################### Model Parameters #######################################
sample_rate: 16000
emb_size: 128
dec_neurons: 512
output_neurons: 51 # index(eos/bos) = 0
ASR_encoder_dim: 512
encoder_dim: 256

# Decoding parameters
bos_index: 0
eos_index: 0
min_decode_ratio: 0.0
max_decode_ratio: 10.0
slu_beam_size: 80
eos_threshold: 1.5
temperature: 1.25

num_workers: 4
dataloader_opts:
    num_workers: !ref <num_workers>
    batch_size: !ref <batch_size>
    shuffle: True

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>

############################## Augmentations ###################################

# Download and prepare the dataset of noisy sequences for augmentation
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <NOISE_DATASET_URL>
    dest_folder: !ref <data_folder_noise>
    ext: wav
    csv_file: !ref <noise_annotation>

# Add noise to input signal
add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    snr_low: 0
    snr_high: 15
    noise_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>

# Speed perturbation
speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
    orig_freq: !ref <sample_rate>
    speeds: [95, 100, 105]

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: 0
    drop_freq_high: 1
    drop_freq_count_low: 1
    drop_freq_count_high: 3
    drop_freq_width: 0.05

# Time drop: randomly drops a number of temporal chunks.
drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: 1000
    drop_length_high: 2000
    drop_count_low: 1
    drop_count_high: 5

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    concat_original: True
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <add_noise>,
        !ref <speed_perturb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]

############################## Models ##########################################

asr_model_source: speechbrain/asr-crdnn-rnnlm-librispeech

slu_enc: !new:speechbrain.nnet.containers.Sequential
    input_shape: [null, null, !ref <ASR_encoder_dim>]
    lstm: !new:speechbrain.nnet.RNN.LSTM
        input_size: !ref <ASR_encoder_dim>
        bidirectional: True
        hidden_size: !ref <encoder_dim>
        num_layers: 2
    linear: !new:speechbrain.nnet.linear.Linear
        input_size: !ref <encoder_dim> * 2
        n_neurons: !ref <encoder_dim>

output_emb: !new:speechbrain.nnet.embedding.Embedding
    num_embeddings: !ref <output_neurons>
    embedding_dim: !ref <emb_size>

dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
    enc_dim: !ref <encoder_dim>
    input_size: !ref <emb_size>
    rnn_type: gru
    attn_type: keyvalue
    hidden_size: !ref <dec_neurons>
    attn_dim: 512
    num_layers: 3
    scaling: 1.0
    dropout: 0.0

seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dec_neurons>
    n_neurons: !ref <output_neurons>


modules:
    slu_enc: !ref <slu_enc>
    output_emb: !ref <output_emb>
    dec: !ref <dec>
    seq_lin: !ref <seq_lin>

model: !new:torch.nn.ModuleList
    - [!ref <slu_enc>, !ref <output_emb>,
       !ref <dec>, !ref <seq_lin>]

tokenizer: !new:sentencepiece.SentencePieceProcessor

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    collect_in: !ref <save_folder>/TAS_tokenizer
    loadables:
        tokenizer: !ref <tokenizer>
    paths:
        tokenizer: !ref <tokenizer_file>

beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearcher
    embedding: !ref <output_emb>
    decoder: !ref <dec>
    linear: !ref <seq_lin>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <slu_beam_size>
    eos_threshold: !ref <eos_threshold>
    temperature: !ref <temperature>
    using_max_attn_shift: False

opt_class: !name:torch.optim.Adam
    lr: !ref <lr>

lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.8
    patient: 0

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        scheduler: !ref <lr_annealing>
        counter: !ref <epoch_counter>


log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

seq_cost: !name:speechbrain.nnet.losses.nll_loss
    label_smoothing: 0.1

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
    split_tokens: True