|
| 1 | +# general training parameters |
| 2 | +train.wandb_params = { |
| 3 | + "project": "mtg-ssl", |
| 4 | + "name": "mask_conformer_large_mv_au_to_all_shuffle_mask", |
| 5 | + "offline": True, |
| 6 | + # NOTE: path to logs in the BSC cluster. Change it for local experiments |
| 7 | + "save_dir": "/gpfs/projects/upf97/logs/", |
| 8 | + "entity": "mtg-upf", |
| 9 | + "group": "masking_conformer", |
| 10 | +} |
| 11 | + |
| 12 | +# modules to use |
| 13 | +build_module.representation = [@nets.cqt.CQT, @nets.encodec.EnCodec, @nets.melspectrogram.MelSpectrogram, @nets.waveform.Waveform] |
| 14 | +build_module.module = @modules.maskingmodel.MaskingModel |
| 15 | +build_module.net = @nets.conformer.Conformer |
| 16 | + |
| 17 | +# Choose the development dataloader |
| 18 | +build_dev_datamodule.datamodule = @discotube |
| 19 | + |
| 20 | +# Lightning trainer parameters |
| 21 | +train.params = { |
| 22 | + "accelerator": "gpu", |
| 23 | + "devices": 4, |
| 24 | + "num_nodes": 2, |
| 25 | + "max_steps": 400000, |
| 26 | + "log_every_n_steps": 50, |
| 27 | + "precision": "bf16-mixed", |
| 28 | + "strategy": "ddp_find_unused_parameters_true", |
| 29 | + "num_sanity_val_steps": 0 |
| 30 | +} |
| 31 | + |
| 32 | +new_freq = 24000 |
| 33 | + |
| 34 | +# Dataloader |
| 35 | +AudioDataset.num_frames = 480000 # 30s |
| 36 | +AudioDataset.orig_freq = 16000 |
| 37 | +AudioDataset.new_freq = %new_freq |
| 38 | +AudioDataset.mono = True |
| 39 | +AudioDataset.half_precision = True |
| 40 | +AudioDataModule.num_workers = 20 |
| 41 | + |
| 42 | +# Discotube datamodule parameters |
| 43 | +DiscotubeAudioDataModule.batch_size = 32 |
| 44 | +DiscotubeAudioDataModule.data_dir = "/gpfs/scratch/upf97/mmap/" |
| 45 | +DiscotubeAudioDataModule.filelist_train = "/gpfs/projects/upf97/data/train_mmap.txt" |
| 46 | +DiscotubeAudioDataModule.filelist_val = "/gpfs/projects/upf97/data/test_mmap.txt" |
| 47 | + |
| 48 | +# CosineAnnealing scheduler |
| 49 | +CosineAnnealingCallback.warmup_steps = 30000 |
| 50 | +CosineAnnealingCallback.eta_min = 1e-7 |
| 51 | + |
| 52 | +# MelSpectrogram parameters |
| 53 | +nets.melspectrogram.MelSpectrogram.sr = %new_freq |
| 54 | +nets.melspectrogram.MelSpectrogram.win_len = 512 |
| 55 | +nets.melspectrogram.MelSpectrogram.hop_len = 320 |
| 56 | +nets.melspectrogram.MelSpectrogram.power = 2 |
| 57 | +nets.melspectrogram.MelSpectrogram.n_mel = 96 |
| 58 | +nets.melspectrogram.MelSpectrogram.norm = "slaney" |
| 59 | +nets.melspectrogram.MelSpectrogram.mel_scale = "slaney" |
| 60 | +nets.melspectrogram.MelSpectrogram.norm_std = 1.268292820667291 |
| 61 | +nets.melspectrogram.MelSpectrogram.norm_mean = 2.06755686098554 |
| 62 | +nets.melspectrogram.MelSpectrogram.patch_size = (96, 4) |
| 63 | + |
| 64 | +# CQT parameters |
| 65 | +nets.cqt.CQT.sr = %new_freq |
| 66 | +nets.cqt.CQT.hop_len = 320 |
| 67 | +nets.cqt.CQT.power = 2 |
| 68 | +nets.cqt.CQT.bins_per_octave = 24 |
| 69 | +nets.cqt.CQT.n_bins = 188 # 188 bins at 24 bins per octave (~7.8 octaves) |
| 70 | +nets.cqt.CQT.f_min = 32.703 # C1 in scientific pitch notation (A4 = 440 Hz) |
| 71 | +nets.cqt.CQT.magnitude = True |
| 72 | +nets.cqt.CQT.logC = True |
| 73 | +nets.cqt.CQT.norm_std = 1.9055732535255916 |
| 74 | +nets.cqt.CQT.norm_mean = 4.754879065310596 |
| 75 | +nets.cqt.CQT.patch_size = (188, 4) |
| 76 | + |
| 77 | +# Waveform parameters |
| 78 | +nets.waveform.Waveform.sr = %new_freq |
| 79 | +nets.waveform.Waveform.norm_std = None |
| 80 | +nets.waveform.Waveform.norm_mean = None |
| 81 | +nets.waveform.Waveform.patch_size = (1, 1280) # 1280 samples = 4 hops of 320 (~53.3 ms at 24 kHz) |
| 82 | + |
| 83 | +# data augmentation |
| 84 | +nets.melspectrogram.MelSpectrogram.stretch_factor = 1 |
| 85 | +nets.melspectrogram.MelSpectrogram.freq_mask_param = 0 |
| 86 | +nets.melspectrogram.MelSpectrogram.time_mask_param = 0 |
| 87 | + |
| 88 | +# Encodec parameters |
| 89 | +nets.encodec.EnCodec.weights_path = "/gpfs/scratch/upf97/model_weights/encodec_24khz/" |
| 90 | +nets.encodec.EnCodec.norm_type = "global" |
| 91 | +nets.encodec.EnCodec.stats_path = "/gpfs/scratch/upf97/dataset_stats/discotube23/input_stats_1K_steps.json" |
| 92 | +nets.encodec.EnCodec.orig_sr = %new_freq |
| 93 | +nets.encodec.EnCodec.patch_size = (128, 4) |
| 94 | + |
| 95 | +# MaskingModel parameters |
| 96 | +modules.maskingmodel.MaskingModel.num_codebooks = 1 |
| 97 | +modules.maskingmodel.MaskingModel.lr = 1e-4 |
| 98 | +modules.maskingmodel.MaskingModel.weight_decay = 1e-2 |
| 99 | +modules.maskingmodel.MaskingModel.codebook_size = 8196 |
| 100 | +modules.maskingmodel.MaskingModel.codebook_dim = 16 |
| 101 | +modules.maskingmodel.MaskingModel.mask_seconds = 0.4 |
| 102 | +modules.maskingmodel.MaskingModel.mask_prob = 0.6 |
| 103 | +modules.maskingmodel.MaskingModel.seed = 0 |
| 104 | +modules.maskingmodel.MaskingModel.plot_tokens = False |
| 105 | +modules.maskingmodel.MaskingModel.diff_input = False |
| 106 | +modules.maskingmodel.MaskingModel.input_representation = @nets.waveform.Waveform |
| 107 | +modules.maskingmodel.MaskingModel.masking_noise_type = "shuffled_input" |
| 108 | + |
| 109 | +# Conformer parameters |
| 110 | +nets.conformer.Conformer.embed_dim = 1024 |
| 111 | +nets.conformer.Conformer.depth = 24 |
| 112 | +nets.conformer.Conformer.conv_kernel_size = 5 |
| 113 | +nets.conformer.Conformer.num_heads = 8 |
| 114 | +nets.conformer.Conformer.mlp_ratio = 4.0 |
| 115 | +nets.conformer.Conformer.mlp_residual_factor = 4.0 |
| 116 | +nets.conformer.Conformer.dropout = 0.2 |
| 117 | +nets.conformer.Conformer.input_dropout = 0.0 |
| 118 | +nets.conformer.Conformer.use_deepnorm = True |
| 119 | +nets.conformer.Conformer.alpha_deepnorm = 2.6321480259049848 # we can tune this number |
| 120 | +nets.conformer.Conformer.beta_deepnorm = 0.022386873579657126 # we can tune this number |
| 121 | +nets.conformer.Conformer.use_rope = True |
| 122 | +nets.conformer.Conformer.num_patches = None |
0 commit comments