forked from Sllambias/asparagus
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdefault_pretrain.yaml
More file actions
82 lines (72 loc) · 2.55 KB
/
default_pretrain.yaml
File metadata and controls
82 lines (72 loc) · 2.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# Config composition list (Hydra defaults-list convention). `_self_` is listed
# last, so the values in this file are merged after the two entries above it.
defaults:
  - core/base@
  - hardware/2gpus@hardware
  - _self_
# `task` is used elsewhere in this file as a path segment via `${task}`; no
# value is set here.
# NOTE(review): presumably supplied by an override at launch — confirm.
task: null
root: base
# `stem` is likewise left unset (null) in this file.
stem: null
# `net` is nested under `model` so that the `${model.*}` interpolations used in
# this file resolve against a mapping instead of a null `model` key.
model:
  # Network selection, taken from `model.pretrain_net`. That key is not defined
  # in this file (presumably provided by a composed config — confirm).
  net: ${model.pretrain_net}
# Checkpoint interval (going by the key name, counted in epochs — confirm
# against the code that consumes it).
# NOTE(review): the nesting level of this key cannot be recovered from this
# view; confirm whether it is a top-level key or belongs under `model`.
ckpt_every_n_epoch: 5
# Names of the data module and training module used for pretraining.
# NOTE(review): presumably resolved to classes by the training entry point —
# confirm. Both keys are nested so that `lightning` is a mapping, not null.
lightning:
  data_module: PretrainDataModule
  lightning_module: SelfSupervisedModule
# Dataset location and split selection. Keys are nested so that `data` is a
# mapping rather than a null key followed by unrelated top-level entries.
data:
  # Resolves to "$ASPARAGUS_DATA/<task>". `oc.env` is given no default here,
  # so the ASPARAGUS_DATA environment variable must be set.
  data_path: ${oc.env:ASPARAGUS_DATA}/${task}
  train_split: split_99_01_00
  fold: 0
# Training hyper-parameters and derived schedule values.
# The keys are nested under `training` because the `${training.*}`
# interpolations below can only resolve when `training` is a mapping.
# `random` and `eval` are not OmegaConf built-in resolvers (presumably
# registered by the project before the config is resolved — confirm).
training:
  batch_size: 16
  accumulate_grad_batches: 1
  patch_size: [160, 160, 160]
  seed: ${random:0,1000000}
  mask_patch_size: ${model.patch_embed_size}
  mask_ratio: 0.6
  max_samples: 6_000_000
  warmup_ratio: 0.02
  # We follow DINOv2/DINOv3 and keep the number of *steps* per epoch constant.
  # The epoch length is defined using FOMO60k as the reference dataset (they
  # use ImageNet-1k), which helps ensure stable scaling across GPUs and
  # simplifies logging intervals.
  # Calculation: 60_529 / (16 * 2) ≈ 1891, rounded to 1890, i.e. the number of
  # steps for one epoch on FOMO60k with the base setup of two devices and a
  # per-device batch size of 16. The key point is that this number stays
  # constant when the number of devices is increased _or_ a bigger dataset is
  # used.
  # Original author's caveat: if gradient accumulation is used, then
  # steps_per_epoch > number of backward passes.
  # NOTE(review): the calculation above counts one step per per-device batch,
  # while `steps` below divides by a global batch size that includes
  # `accumulate_grad_batches`; confirm which unit `steps_per_epoch` uses when
  # accumulation is enabled.
  steps_per_epoch: 1890
  # One validation step per 100 training steps (integer division).
  val_steps_per_epoch: ${eval:"${training.steps_per_epoch} // 100"}
  # Samples consumed per step across all devices, nodes and accumulated batches.
  global_batch_size: ${eval:"${training.batch_size} * ${hardware.num_devices} * ${hardware.num_nodes} * ${training.accumulate_grad_batches}"}
  # Total number of steps needed to consume `max_samples`.
  steps: ${eval:"${training.max_samples} // (${training.global_batch_size})"}
  # `warmup_ratio` of the total epoch count, truncated, but at least one epoch.
  # NOTE(review): this key and the three after it follow the `training` keys
  # without a new section header and are assumed to belong to `training` —
  # confirm.
  warmup_epochs: ${eval:"max(1, int((${training.steps} // ${training.steps_per_epoch}) * ${training.warmup_ratio}))"}
  decoder_warmup_epochs: 0
  rec_loss_masked_only: false
  check_val_every_n_epoch: 3
# Worked example with a different hardware setup (no gradient accumulation):
#   num_devices = 4
#   num_nodes = 2
#   batch_size = 8
#   max_samples = 6_000_000
#   steps_per_epoch = 1890
#   samples_per_epoch = steps_per_epoch * num_devices * num_nodes * batch_size
#                     = 1890 * 4 * 2 * 8 = 120_960
#   epochs = 6_000_000 / samples_per_epoch ≈ 49.6
#   steps = 6_000_000 / (num_devices * num_nodes * batch_size) = 93_750
# Names of the CPU- and GPU-side transform presets for training and validation.
# NOTE(review): presumably looked up by name in the project's transform
# registry — confirm. Keys are nested so that `transforms` is a mapping.
transforms:
  cpu_tr_transforms: pretrain_CPU_train_transforms
  cpu_val_transforms: pretrain_CPU_val_transforms
  gpu_tr_transforms: pretrain_GPU_train_transforms
  gpu_val_transforms: pretrain_GPU_val_transforms
# Boolean switches named `masking` and `normalize`, both enabled.
# NOTE(review): the nesting level of these two keys cannot be recovered from
# this view; confirm whether they belong under `transforms` or are top-level.
masking: True
normalize: True
# Logging configuration. Keys are nested so that `logger` is a mapping, and
# booleans use the canonical lowercase `true` / `false` spelling.
logger:
  progress_bar: true
  profile: false
  wandb_log_model: false
  wandb_logging: true
  # Read from the WANDB_ENTITY environment variable; falls back to
  # "team-asparagus" when the variable is not set.
  wandb_entity: ${oc.env:WANDB_ENTITY, "team-asparagus"}
  wandb_project: Pretrain
  mlflow_logging: false
  log_every_n_steps: 250
  log_images_every_n_epoch: 5
  log_to_stdout: false
# Profiler settings; profiling is disabled in this config. Keys are nested so
# that `enabled` and the step settings are scoped to `profiler` rather than
# being top-level keys.
profiler:
  enabled: false
  profile_every_n_steps: 500
  warmup_steps: 10