@@ -11,7 +11,7 @@ reward_server: "http://127.0.0.1:8731" # The address of the reward model server
 logging_dir: grpo-logs # Directory for logging
 logging_steps: 1 # Number of steps between logging
 output_dir: "qwen2.5-7b-kk-dataset-grpo/checkpoints" # Directory for output ckpts
-report_to: "wandb" # Supported reporting options: "all", "wandb", "tensorboard", "visualdl"(default), "none"
+report_to: "visualdl" # Supported reporting options: "all", "wandb", "tensorboard", "visualdl"(default), "none"
 wandb_http_proxy: "http://127.0.0.1:8962" # HTTP proxy for wandb
 run_name: "qwen2.5-7b-kk-dataset-grpo" # Name of the run
 
@@ -22,12 +22,13 @@ prompt_key: "src" # Key for the prompt in the dataset
 response_key: "tgt" # Key for the response in the dataset
 dataloader_drop_last: true # Whether to drop the last incomplete batch in the DataLoader
 balance_batch: true # Whether to balance batch size across dataset_world_size
+use_remove_padding: true # Whether to remove padding tokens in the input
 
 # distributed training args
 tensor_parallel_degree: 2 # Degree of tensor parallelism
 sequence_parallel: true # Whether to enable sequence parallelism
-sharding_parallel_degree: 1 # Degree of sharding parallelism
-sharding: "stage2" # Sharding strategy, e.g., "stage1" or "stage2"
+sharding_parallel_degree: -1 # Degree of sharding parallelism
+sharding: "stage1" # Sharding strategy, e.g., "stage1" or "stage2"
 sharding_parallel_config: "enable_release_grads" # Configuration for sharding parallelism
 pipeline_parallel_degree: 1 # Degree of pipeline parallelism
 virtual_pp_degree: 1 # Degree of virtual pipeline parallelism
@@ -39,24 +40,23 @@ min_dec_len: 32 # Minimum length of the response
 top_p: 1.0 # Top-p sampling parameter
 temperature: 0.7 # Temperature parameter for sampling
 repetition_penalty: 1.0 # Repetition penalty parameter
-# rollout_use_dynamic_insert: 1 # Whether to use dynamic insert for rollout
-# rollout_continue_batching_batch_size: 32 # Base batch size for dynamic insert
-quant_type: "" # Quantization type, e.g., "weight_only_int8"
+rollout_max_num_seqs: 32 # The maximum number of sequences that can be processed in a single inference
+rollout_quant_type: "" # Quantization type, e.g., "weight_only_int8"
 
 # training args
 do_train: true # Whether to perform training
 seed: 42 # Random seed for reproducibility
-global_batch_size: 2 # Global batch size for training
-mini_batch_size: 2 # Mini-batch size for training
+global_batch_size: 4 # Global batch size for training
+global_gen_batch_size: -1 # Global generation batch size for dynamic sampling
+global_mini_batch_size: -1 # Mini-batch size for training
 rollout_n: 8 # Number of rollouts
 update_iters: 1 # Number of training iterations for rollout samples
-per_device_rollout_batch_size: 1 # Rollout batch size per device
 per_device_logprob_batch_size: 8 # Log probability batch size per device
 per_device_reward_batch_size: 8 # Reward batch size per device
 per_device_value_batch_size: 8 # Value batch size per device
 per_device_train_batch_size: 8 # Training batch size per device
 # gradient_accumulation_steps: 1 # Gradient accumulation steps (auto-calculated)
-num_train_epochs: 3 # Number of training epochs
+num_train_epochs: 6 # Number of training epochs
 max_length: 4608 # Maximum length for training, should be larger than max_prompt_len + max_dec_len
 learning_rate: 5e-7 # Learning rate for training
 lr_scheduler_type: "constant" # Learning rate scheduler type
@@ -65,15 +65,15 @@ adam_beta1: 0.9 # AdamW optimizer beta1
 adam_beta2: 0.999 # AdamW optimizer beta2
 adam_epsilon: 1e-8 # AdamW optimizer epsilon
 max_grad_norm: 1.0 # Maximum gradient norm for clipping
-max_steps: 3600 # Maximum number of training steps
+max_steps: -1 # Maximum number of training steps
 save_steps: 300 # Number of steps between model saves
 save_strategy: "steps" # Strategy for saving models
 ignore_save_lr_and_optim: true # Whether to ignore saving learning rate and optimizer state (leave empty if not specified)
 disable_tqdm: true # Whether to disable tqdm progress bar
 
 # RL args
 kl_coeff: 0.0 # KL coefficient
-kl_loss_coeff: 0.0 # KL loss coefficient
+kl_loss_coeff: 0.001 # KL loss coefficient
 pg_loss_coeff: 1.0 # Policy gradient loss coefficient
 entropy_coeff: 0.0 # Entropy coefficient
 clip_range_ratio: 0.2 # The clipping range for the ratio between the old and new policy (PPO algorithm)
@@ -84,12 +84,11 @@ enable_overlong_reward_buffer: false # Whether to enable overlong reward buffer
 overlong_reward_buffer: 256 # The length of the overlong reward buffer
 overlong_penalty_factor: 1.0 # The penalty factor for overlong reward buffer
 clip_range_value: 5.0 # The clipping range for the output of the value model. The value is clipped into [-clip_range_value, clip_range_value].
-normalize_reward: true # Whether to normalize reward
-normalize_advantage: true # Whether to normalize advantage
+normalize_reward: false # Whether to normalize reward
+normalize_advantage: false # Whether to normalize advantage
 dynamic_sampling: false # Whether to use dynamic sampling, which is introduced in the DAPO algorithm https://arxiv.org/abs/2503.14476
-per_device_sample_batch_size: 1 # Sample batch size per device for dynamic sampling
 max_gen_batches: 2 # Maximum number of generation batches for dynamic sampling
-use_fp32_compute: false # Whether to use fp32 to compute xx_log_prob, rewards, advantages and loss
+use_fp32_compute: true # Whether to use fp32 to compute xx_log_prob, rewards, advantages and loss
 
 # eval args
 do_eval: true # Whether to perform evaluation
@@ -99,11 +98,10 @@ eval_steps: 20 # Number of steps between evaluations
 
 # device memory optimization args
 use_flash_attention: true # Whether to use fused attention operations
-use_fused_rms_norm: true # Whether to use fused RMS norm operations
-use_fused_rope: true # Whether to use fused rope operations
-use_fused_head_and_loss_fn: false # Whether to use fused head and loss function
-use_fused_linear: false # Whether to use fused linear operations, which needs to install fused_ln in slm/model_zoo/gpt-3/external_ops
-fused_linear: false # Whether to use fused_gemm_epilogue
+use_fused_rms_norm: true # Whether to use fused RMS norm operations, which needs to install fused_ln in slm/model_zoo/gpt-3/external_ops
+use_fused_rope: false # Whether to use fused rope operations
+use_fused_head_and_loss_fn: true # Whether to use fused head and loss function
+use_fused_linear: true # Whether to use fused linear operations
 recompute: true # Whether to enable gradient checkpointing for memory optimization
 recompute_use_reentrant: true # Whether to use reentrant recompute
 recompute_granularity: "full" # Granularity of recompute
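The RL args in this diff (`rollout_n`, `clip_range_ratio`, `pg_loss_coeff`, `kl_loss_coeff`, `normalize_advantage`) map onto a GRPO-style objective: group-relative advantages over the `rollout_n` samples of each prompt, a PPO-style clipped policy-gradient term, and a KL penalty toward the reference model. The sketch below is illustrative only and is not the repository's implementation; the function names, the per-sequence (rather than per-token) treatment, and the simplified KL term are assumptions made for clarity.

```python
# Minimal sketch of how the RL args above typically combine in a GRPO-style loss.
# Hypothetical helper names; values mirror the config for illustration.
import math

rollout_n = 8            # rollouts per prompt (rollout_n)
clip_range_ratio = 0.2   # PPO-style ratio clip (clip_range_ratio)
pg_loss_coeff = 1.0      # pg_loss_coeff
kl_loss_coeff = 0.001    # weight of the KL-to-reference penalty (kl_loss_coeff)

def group_advantages(rewards, normalize=False):
    """Group-relative advantages over the rollout_n samples of one prompt."""
    mean = sum(rewards) / len(rewards)
    adv = [r - mean for r in rewards]
    if normalize:  # roughly corresponds to normalize_advantage
        std = math.sqrt(sum(a * a for a in adv) / len(adv)) + 1e-8
        adv = [a / std for a in adv]
    return adv

def grpo_loss(new_logp, old_logp, ref_logp, advantage):
    """Clipped policy-gradient term plus a KL penalty toward the reference model."""
    ratio = math.exp(new_logp - old_logp)
    clipped = max(min(ratio, 1.0 + clip_range_ratio), 1.0 - clip_range_ratio)
    pg = -min(ratio * advantage, clipped * advantage)
    kl = new_logp - ref_logp  # crude stand-in for the actual KL estimator
    return pg_loss_coeff * pg + kl_loss_coeff * kl

# Example: rule-based 0/1 rewards for the 8 rollouts of one prompt.
rewards = [1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]
advs = group_advantages(rewards, normalize=False)
print(grpo_loss(new_logp=-12.3, old_logp=-12.5, ref_logp=-12.0, advantage=advs[0]))
```

With `normalize_advantage: false` as set in this diff, the group mean is subtracted but the advantages are not divided by the group standard deviation; flipping the flag in the sketch shows the alternative behavior.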