forked from alibaba/ROLL
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path distill_vl_megatron.yaml
More file actions
83 lines (72 loc) · 1.92 KB
/
distill_vl_megatron.yaml
File metadata and controls
83 lines (72 loc) · 1.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
---
# Distillation config: Qwen2.5-VL-32B teacher -> Qwen2.5-VL-7B student, Megatron backend.
# NOTE(review): indentation was lost in transit; nesting reconstructed from key
# semantics (Hydra run settings, ROLL-style worker sections) — verify against the
# upstream example config.

# Hydra launcher: run in the current directory, do not create a .hydra/ subdir.
hydra:
  run:
    dir: .
  output_subdir: null

exp_name: "distill_vl_megatron"
seed: 42
logging_dir: ./output/logs
output_dir: ./output

checkpoint_config:
  type: file_system
  output_dir: /data/cpfs_0/rl_examples/models/${exp_name}

save_steps: 100
logging_steps: 1
resume_from_checkpoint: false

# Pretrained checkpoints: 7B student distilled from 32B teacher.
student_pretrain: Qwen/Qwen2.5-VL-7B-Instruct
teacher_pretrain: Qwen/Qwen2.5-VL-32B-Instruct

# distill config
logits_topk: 64
distill_loss_weight: 0.85
kd_objective: forward_kl
distill_on_prompt: true
# NOTE(review): original value was "nccl-only", which matches none of the
# supported backends listed below; corrected to "nccl_only".
logits_transfer_backend: "nccl_only"  # support "ipc+nccl", "nccl_only" and "ray"

sequence_length: 1024
max_grad_norm: 1.0

student:
  model_args:
    attn_implementation: fa2
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: null
  training_args:
    learning_rate: 2.0e-5
    lr_scheduler_type: constant
    per_device_train_batch_size: 4
    gradient_accumulation_steps: 1
    warmup_steps: 0
    num_train_epochs: 1
    max_steps: 1000
  data_args:
    template: qwen2-vl
    # use leonardPKU/GEOQA_R1V_Train_8K as dataset
    # download to ./data/geoqa_data from https://huggingface.co/datasets/leonardPKU/GEOQA_R1V_Train_8K
    file_name: data/geoqa_data/
    dataset_dir: ./
    preprocessing_num_workers: 16
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 2
      pipeline_model_parallel_size: 2
      use_distributed_optimizer: true
      recompute_granularity: full
  # presumably a worker-level key evaluated by the framework — confirm placement
  device_mapping: list(range(0,8))

teacher:
  model_args:
    attn_implementation: fa2
    # teacher runs forward-only; checkpointing disabled to save recompute
    disable_gradient_checkpointing: true
    dtype: bf16
  data_args:
    template: qwen2-vl
  training_args:
    # teacher forward micro_batch_size
    per_device_train_batch_size: 1
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 2
      pipeline_model_parallel_size: 2
      bf16: true
  device_mapping: list(range(0,8))

system_envs:
  RAY_PROFILING: "0"