# bard_vl_kd_diffusion_b8_mask_2b.yaml
# Training configuration for the experiment named in `experiment.project`.
# Sections that carry a `_target_` key name a dotted Python path; the
# sibling keys are presumably passed to that callable as keyword arguments
# (confirm against the recipe that loads this file).
#
# NOTE(review): section nesting is expressed with 2-space indentation.
# Without it every section collapses into one flat mapping with duplicate
# keys (`_target_`, `pretrained_model_name_or_path`, `mask_token_id`, ...),
# which most loaders resolve silently as last-value-wins. Places where the
# intended parent is an inference are marked NOTE(review) below.

experiment:
  project: exps/kd_diffusion_b8_mask_2b
  log_with: tensorboard

training:
  smoke: false
  cpu: false
  seed: 1234
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 8
  gradient_checkpointing_enable: true
  mixed_precision: bf16
  max_grad_norm: 1.0
  num_epochs: 3
  # Unit (steps vs. epochs) is not stated here; presumably steps -- confirm.
  save_every: 100

optimizer:
  lr: 6.0e-6
  betas: [0.9, 0.95]
  weight_decay: 0.0

lr_scheduler:
  warmup_steps: 100
  min_lr_ratio: 0.5

# Presumably the student: the B8 / 2B checkpoint, matching
# `diffusion_kd.student_block_size: 8` -- confirm.
model:
  _target_: nemo_automodel.NeMoAutoModelForImageTextToText.from_pretrained
  pretrained_model_name_or_path: pretrained_models/Bard-VL-B8-Mask-2B-Instruct
  attn_implementation: sdpa
  torch_dtype: "torch.bfloat16"

# Teacher checkpoint: B4 / 8B, matching `diffusion_kd.teacher_block_size: 4`.
teacher_model:
  _target_: nemo_automodel.NeMoAutoModelForImageTextToText.from_pretrained
  pretrained_model_name_or_path: pretrained_models/Bard-VL-B4-Mask-8B-Instruct
  attn_implementation: sdpa
  torch_dtype: "torch.bfloat16"

# Processor is loaded from the same checkpoint path as `model`.
processor:
  _target_: transformers.AutoProcessor.from_pretrained
  pretrained_model_name_or_path: pretrained_models/Bard-VL-B8-Mask-2B-Instruct
  trust_remote_code: true
  min_pixels: 65536    # 256 * 256
  max_pixels: 1048576  # 1024 * 1024

dataset:
  _target_: nemo_automodel.components.datasets.vlm.datasets.bard_vl_dataset
  path_or_dataset: datasets/mixed-distil-17w
  max_len: 8192
  drop_last: true
  root_dir: datasets
  prior_dist: "Mask"

dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  shuffle: true
  num_workers: 8
  pin_memory: true
  # NOTE(review): `collate_fn` is nested under `dataloader` by inference.
  collate_fn:
    _target_: nemo_automodel.components.datasets.vlm.collate_fns.bard_vl_block_collate_fn
    # NOTE(review): `noise_scheduler` and the six keys after it are placed
    # under `collate_fn` because they read like collate-time settings
    # (block size, mask/pad tokens). They may instead be top-level keys --
    # confirm against the recipe before relying on this nesting.
    noise_scheduler:
      _target_: nemo_automodel.utils.scheduler.CondOTScheduler
    block_size: 8
    model_type: bard-vl
    mask_token_id: 151671
    vocab_size: 151936
    pad_token: "<|im_end|>"
    im_end_token: "<|im_end|>"

diffusion_kd:
  max_response_len: 4096
  # Presumably the id of the `<|im_end|>` pad token configured for the
  # collate function -- confirm against the tokenizer.
  pad_token_id: 151645
  # Same value as the collate function's `mask_token_id`; keep in sync.
  mask_token_id: 151671
  student_block_size: 8
  teacher_block_size: 4
  teacher_topk: 16
  corruption_strategy: mask_per_sample
  min_mask_rate: 0.10
  max_mask_rate: 0.50
  ensure_at_least_one_mask: true

# NOTE(review): `distillation`, `loss` and `debug` are kept top-level; they
# may belong under `diffusion_kd` instead -- confirm against the recipe.
distillation:
  kl_direction: forward-kl
  temperature: 1.0
  fp32_upcast: true

loss:
  ce_weight: 1.0
  kd_weight: 0.5

debug:
  enabled: false
  log_generated_text: false
  log_prompt_text: false
  log_topk_overlap: false
  topk_overlap_every_steps: 1
  topk_overlap_k: 4
  topk_overlap_num_samples: 8