# ppo_trainer.yaml
data:
tokenizer: null
use_shm: False
train_files: ~/data/rlhf/gsm8k/train.parquet
val_files: ~/data/rlhf/gsm8k/test.parquet
prompt_key: prompt
reward_fn_key: data_source
max_prompt_length: 512
max_response_length: 512
train_batch_size: 1024
val_batch_size: null
  return_raw_input_ids: False # set to True when the policy and reward model use different tokenizers
return_raw_chat: False
return_full_prompt: False
shuffle: True
  filter_overlong_prompts: False # for large-scale datasets, filtering overlong prompts can be time-consuming; set filter_overlong_prompts_workers to speed it up with multiprocessing
filter_overlong_prompts_workers: 1
truncation: error
image_key: images
video_key: videos
  trust_remote_code: False # main_ppo checks this config to decide whether to use remote code for the tokenizer
custom_cls:
path: null
name: null
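    # Illustrative only (hypothetical module and class name): a custom dataset
    # class could be plugged in like so, assuming my_pkg/my_dataset.py defines
    # MyRLHFDataset with the same interface as the default dataset:
    #   path: my_pkg/my_dataset.py
    #   name: MyRLHFDataset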
actor_rollout_ref:
hybrid_engine: True
model:
path: ~/models/deepseek-llm-7b-chat
use_shm: False
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
enable_activation_offload: False
use_remove_padding: False
lora_rank: 0 # Set to positive value to enable LoRA (e.g., 32)
lora_alpha: 16 # LoRA scaling factor
target_modules: all-linear # all-linear or [q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj]
use_liger: False
use_fused_kernels: False
trust_remote_code: False
actor:
    strategy: fsdp # [fsdp, fsdp2]; kept for backward compatibility
ppo_mini_batch_size: 256
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
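    # Worked example with the defaults above: one concatenated sample is at most
    # max_prompt_length + max_response_length = 512 + 512 = 1024 tokens, so a
    # 16384-token budget packs roughly 16 samples per GPU when use_dynamic_bsz is True.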
grad_clip: 1.0
# pg_losses2 = -advantages * torch.clamp(ratio, 1 - cliprange_low, 1 + cliprange_high)
clip_ratio: 0.2 # default value if clip_ratio_low and clip_ratio_high are not specified
clip_ratio_low: 0.2
clip_ratio_high: 0.2
    clip_ratio_c: 3.0 # lower bound of the objective value for dual-clip PPO, from https://arxiv.org/pdf/1912.09729
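    # Rough sketch of how these clip knobs combine (dual-clip PPO; illustrative
    # pseudocode, not the exact implementation):
    #   pg_losses1 = -advantages * ratio
    #   pg_losses2 = -advantages * torch.clamp(ratio, 1 - clip_ratio_low, 1 + clip_ratio_high)
    #   clipped = torch.maximum(pg_losses1, pg_losses2)
    #   # when advantages < 0, additionally cap the loss at -advantages * clip_ratio_c
    #   pg_losses = torch.where(advantages < 0, torch.minimum(clipped, -advantages * clip_ratio_c), clipped)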
    loss_agg_mode: "token-mean" # options: token-mean / seq-mean-token-sum / seq-mean-token-mean
entropy_coeff: 0.001
use_invalid_action_penalty: True
invalid_action_penalty_coef: 0.1
use_kl_loss: False # True for GRPO
use_torch_compile: True # False to disable torch compile
kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
ulysses_sequence_parallel_size: 1 # sp size
checkpoint:
      contents: ['model', 'optimizer', 'extra'] # add 'hf_model' to also save the whole model in HF format; by default only the sharded model checkpoint is saved to save space
optim:
lr: 1e-6
      lr_warmup_steps: -1 # takes precedence; negative values delegate to lr_warmup_steps_ratio
      lr_warmup_steps_ratio: 0. # the total number of steps is injected at runtime
min_lr_ratio: 0.0 # only used with cosine lr scheduler, default to 0.0
num_cycles: 0.5 # only used with cosine lr scheduler, default to 0.5
warmup_style: constant # select from constant/cosine
      total_training_steps: -1 # must be overridden by the program
weight_decay: 0.01
fsdp_config:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
param_offload: False
optimizer_offload: False
      offload_policy: False # fsdp2 only: offload params/grads/optimizer states during training
reshard_after_forward: True # only for fsdp2, [True, False, int between 1 and fsdp_size]
fsdp_size: -1
ref:
strategy: fsdp
fsdp_config:
param_offload: False
reshard_after_forward: True # only for fsdp2, [True, False, int between 1 and fsdp_size]
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
use_torch_compile: ${actor_rollout_ref.actor.use_torch_compile}
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: null
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
rollout:
name: vllm
mode: sync # sync: LLM, async: AsyncLLM
    chat_scheduler: null # async chat scheduler, e.g. examples.ppo_trainer.naive_chat_scheduler.NaiveChatCompletionScheduler
temperature: 1.0
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
top_p: 1
use_fire_sampling: False # https://arxiv.org/abs/2410.21236
    prompt_length: ${data.max_prompt_length} # not used in the open-source implementation
response_length: ${data.max_response_length}
# for vllm rollout
dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.5
ignore_eos: False
enforce_eager: True
free_cache_engine: True
    load_format: dummy_dtensor # safetensors (for huge models; also set use_shm=True); dummy_dtensor: randomly init model weights
    layered_summon: False # for huge models, layered summon saves memory (prevents OOM) but is slower
tensor_model_parallel_size: 2
max_num_batched_tokens: 8192
max_model_len: null
max_num_seqs: 1024
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: null
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
disable_log_stats: True
    enable_chunked_prefill: True # may give higher throughput when set to True; when enabled, increase max_num_batched_tokens or decrease max_model_len
# for hf rollout
do_sample: True
    # number of responses sampled per prompt
    n: 1 # > 1 for GRPO
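    # e.g., with n: 8 and data.train_batch_size: 1024, one rollout step
    # generates 1024 * 8 = 8192 responses for training.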
engine_kwargs: # inference engine parameters
vllm:
        swap_space: null # null means use the engine default (usually 4 GB); e.g., 32 means 32 GB
sglang:
attention_backend: null # null means use the engine default value, available options: flashinfer, triton, flashmla
val_kwargs:
# sampling parameters for validation
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
top_p: 1.0
temperature: 0
n: 1
      do_sample: False # greedy decoding by default for validation
multi_turn:
      enable: False # set to True for multi-turn tool-interaction tasks; also set rollout.name to sglang
max_turns: null # null for no limit (default max_length // 3)
tool_config_path: null # null for no tool
format: chatml # chatml, more formats will be supported in the future
critic:
rollout_n: ${actor_rollout_ref.rollout.n}
strategy: fsdp # [fsdp, fsdp2]
optim:
lr: 1e-5
    lr_warmup_steps_ratio: 0. # the total number of steps is injected at runtime
    min_lr_ratio: null # only used with the cosine lr scheduler
    warmup_style: constant # select from constant/cosine
    total_training_steps: -1 # must be overridden by the program
weight_decay: 0.01
model:
path: ~/models/deepseek-llm-7b-chat
use_shm: False
tokenizer_path: ${actor_rollout_ref.model.path}
override_config: { }
external_lib: ${actor_rollout_ref.model.external_lib}
enable_gradient_checkpointing: True
enable_activation_offload: False
use_remove_padding: False
trust_remote_code: ${actor_rollout_ref.model.trust_remote_code}
fsdp_config:
param_offload: False
optimizer_offload: False
      offload_policy: False # fsdp2 only: offload params/grads/optimizer states during training
reshard_after_forward: True # only for fsdp2, [True, False, int between 1 and fsdp_size]
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
fsdp_size: -1
lora_rank: 0 # Set to positive value to enable LoRA (e.g., 32)
lora_alpha: 16 # LoRA scaling factor
target_modules: all-linear # all-linear or [q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj]
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
forward_micro_batch_size: ${critic.ppo_micro_batch_size}
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: 1 # sp size
ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
shuffle: ${actor_rollout_ref.actor.shuffle}
grad_clip: 1.0
cliprange_value: 0.5
loss_agg_mode: ${actor_rollout_ref.actor.loss_agg_mode}
checkpoint:
    contents: ['model', 'optimizer', 'extra'] # add 'hf_model' to also save the whole model in HF format; by default only the sharded model checkpoint is saved to save space
reward_model:
enable: False
strategy: fsdp
model:
    input_tokenizer: ${actor_rollout_ref.model.path} # set to null if the policy and RM chat templates are identical
path: ~/models/FsfairX-LLaMA3-RM-v0.1
use_shm: False
external_lib: ${actor_rollout_ref.model.external_lib}
use_remove_padding: False
use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
trust_remote_code: False
fsdp_config:
wrap_policy:
min_num_params: 0
param_offload: False
reshard_after_forward: True # only for fsdp2, [True, False, int between 1 and fsdp_size]
fsdp_size: -1
micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
  micro_batch_size_per_gpu: null # set to a number
max_length: null
ulysses_sequence_parallel_size: 1 # sp size
use_dynamic_bsz: ${critic.use_dynamic_bsz}
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
reward_manager: episode
  launch_reward_fn_async: False # run the custom reward function asynchronously on CPU during log_prob computation
sandbox_fusion:
    url: null # FaaS URL for running code in a cloud sandbox
max_concurrent: 64 # max concurrent requests to sandbox
custom_reward_function:
path: null
name: compute_score
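  # Illustrative only (hypothetical module): a minimal scoring function, assuming
  # the reward manager calls it once per sample with these arguments:
  #   # my_pkg/my_reward.py
  #   def compute_score(data_source, solution_str, ground_truth, extra_info=None):
  #       return 1.0 if ground_truth in solution_str else 0.0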
algorithm:
gamma: 1.0
lam: 1.0
adv_estimator: gae
norm_adv_by_std_in_grpo: True
use_kl_in_reward: False
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
kl_coef: 0.001
horizon: 10000
target_kl: 0.1
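    # Sketch of the adaptive alternative (a Ziegler-et-al.-style controller,
    # assuming `type: adaptive` is supported): kl_coef is adjusted each step so
    # the measured KL approaches target_kl over roughly `horizon` steps, e.g.
    #   kl_coef *= 1 + clip(current_kl / target_kl - 1, -0.2, 0.2) * n_steps / horizon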
use_pf_ppo: False
pf_ppo:
reweight_method: pow # ["pow", "max_min", "max_random"]
weight_pow: 2.0
gigpo:
step_advantage_w: 1.0
mode: "mean_norm" # "mean_norm" or "mean_std_norm"
filter_groups: # DAPO from https://arxiv.org/abs/2503.14476
enable: False
max_num_gen_batches: 10
trainer:
balance_batch: True
total_epochs: 30
total_training_steps: null
project_name: verl_examples
experiment_name: gsm8k
logger: [ 'console', 'wandb' ]
log_val_generations: 0
rollout_data_dir: null # directory for logging the rollout data, no dump if null
validation_data_dir: null # directory for logging the validation data, no dump if null
nnodes: 1
n_gpus_per_node: 8
save_freq: -1
  # auto: resume from the last checkpoint if one is found; otherwise start from scratch
  resume_mode: auto # one of auto/disable/resume_path; resume_path requires resume_from_path to be set
resume_from_path: null
val_before_train: True
test_freq: -1
critic_warmup: 0
default_hdfs_dir: null
del_local_ckpt_after_load: False
default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
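  # with the defaults above this interpolates to checkpoints/verl_examples/gsm8k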
max_actor_ckpt_to_keep: null
max_critic_ckpt_to_keep: null
  # timeout (seconds) for the Ray worker group to wait for the register center to be ready
ray_wait_register_center_timeout: 300
device: cuda
ray_init:
  num_cpus: null # null means use all CPUs, which may cause hangs when CPUs are limited (e.g., under SLURM); set to an allowed number in that case
env:
env_name: alfworld/AlfredTWEnv
seed: 0
max_steps: 50
history_length: 2
rollout:
    n: -1 # number of env groups (for GRPO and GiGPO); -1 disables env grouping
sokoban:
dim_room: [6, 6]
num_boxes: 1
search_depth: 30
mode: tiny_rgb_array
webshop:
use_small: True
human_goals: False