forked from PaddlePaddle/PaddleNLP
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgrpo_argument.yaml
More file actions
118 lines (108 loc) · 7.01 KB
/
grpo_argument.yaml
File metadata and controls
118 lines (108 loc) · 7.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
---
# RL algorithms
rl_algorithm: "grpo" # The reinforcement learning algorithm used, supported: "ppo", "grpo", "reinforce_plus_plus"
# models
actor_model_name_or_path: "meta-llama/Meta-Llama-3.1-8B" # The name or path of the actor model
reward_model_name_or_path: "" # The name or path of the reward model
use_rm_server: true # Whether to use the reward model server
reward_server: "http://127.0.0.1:8731" # The address of the reward model server
# logging
logging_dir: grpo-logs # Directory for logging
logging_steps: 1 # Number of steps between logging
output_dir: "qwen2.5-7b-kk-dataset-grpo/checkpoints" # Directory for output ckpts
report_to: "visualdl" # Supported reporting options: "all", "wandb", "tensorboard", "visualdl"(default), "none"
wandb_http_proxy: "http://127.0.0.1:8962" # HTTP proxy for wandb
run_name: "qwen2.5-7b-kk-dataset-grpo" # Name of the run
# data
train_datasets: "ppo-kk/34567ppl/train.jsonl" # Path to the training dataset
eval_datasets: "ppo-kk/5ppl/test.jsonl" # Path to the evaluation dataset
prompt_key: "src" # Key for the prompt in the dataset
response_key: "tgt" # Key for the response in the dataset
dataloader_drop_last: true # Whether to drop the last incomplete batch in the DataLoader
balance_batch: true # Whether to balance batch size across dataset_world_size
use_remove_padding: true # Whether to remove padding tokens in the input
# distributed training args
tensor_parallel_degree: 2 # Degree of tensor parallelism
sequence_parallel: true # Whether to enable sequence parallelism
sharding_parallel_degree: -1 # Degree of sharding parallelism
sharding: "stage1" # Sharding strategy, e.g., "stage1" or "stage2"
sharding_parallel_config: "enable_release_grads" # Configuration for sharding parallelism
pipeline_parallel_degree: 1 # Degree of pipeline parallelism
virtual_pp_degree: 1 # Degree of virtual pipeline parallelism
# rollout args
max_prompt_len: 512 # Maximum length of the prompt, exceeding which will be automatically truncated
max_dec_len: 4096 # Maximum length of the response
min_dec_len: 32 # Minimum length of the response
top_p: 1.0 # Top-p sampling parameter
temperature: 0.7 # Temperature parameter for sampling
repetition_penalty: 1.0 # Repetition penalty parameter
rollout_max_num_seqs: 32 # The maximum number of sequences that can be processed in a single inference
rollout_quant_type: "" # Quantization type, e.g., "weight_only_int8"
# training args
do_train: true # Whether to perform training
seed: 42 # Random seed for reproducibility
global_batch_size: 4 # Global batch size for training
global_gen_batch_size: -1 # Global generation batch size for dynamic sampling
global_mini_batch_size: -1 # Mini-batch size for training
rollout_n: 8 # Number of rollouts
update_iters: 1 # Number of training iterations for rollout samples
per_device_logprob_batch_size: 8 # Log probability batch size per device
per_device_reward_batch_size: 8 # Reward batch size per device
per_device_value_batch_size: 8 # Value batch size per device
per_device_train_batch_size: 8 # Training batch size per device
# gradient_accumulation_steps: 1 # Gradient accumulation steps (auto-calculated)
num_train_epochs: 6 # Number of training epochs
max_length: 4608 # Maximum length for training, should be larger than max_prompt_len + max_dec_len
# NOTE: exponent floats need a decimal point ("5.0e-7"); a bare "5e-7" is read
# as a *string* by YAML 1.1 loaders such as PyYAML.
learning_rate: 5.0e-7 # Learning rate for training
lr_scheduler_type: "constant" # Learning rate scheduler type
weight_decay: 1.0e-2 # Weight decay for the AdamW optimizer
adam_beta1: 0.9 # AdamW optimizer beta1
adam_beta2: 0.999 # AdamW optimizer beta2
adam_epsilon: 1.0e-8 # AdamW optimizer epsilon
max_grad_norm: 1.0 # Maximum gradient norm for clipping
max_steps: -1 # Maximum number of training steps
save_steps: 300 # Number of steps between model saves
save_strategy: "steps" # Strategy for saving models
ignore_save_lr_and_optim: true # Whether to ignore saving learning rate and optimizer state (leave empty if not specified)
disable_tqdm: true # Whether to disable tqdm progress bar
# RL args
kl_coeff: 0.0 # KL coefficient
kl_loss_coeff: 0.001 # KL loss coefficient
pg_loss_coeff: 1.0 # Policy gradient loss coefficient
entropy_coeff: 0.0 # Entropy coefficient
clip_range_ratio: 0.2 # The clipping range for ratio between the old and new policy. (PPO algorithm)
clip_range_ratio_low: 0.2 # The clipping range for ratio between the old and new policy. (PPO algorithm)
clip_range_ratio_high: 0.2 # The clipping range for ratio between the old and new policy. (PPO algorithm)
clip_range_score: 10.0 # The clipping range for the output of the score model. The reward is clipped into [-clip_range_score, clip_range_score].
enable_overlong_reward_buffer: false # Whether to enable overlong reward buffer
overlong_reward_buffer: 256 # The length of the overlong reward buffer
overlong_penalty_factor: 1.0 # The penalty factor for overlong reward buffer
clip_range_value: 5.0 # The clipping range for the output of the value model. The value is clipped into [-clip_range_value, clip_range_value].
normalize_reward: false # Whether to normalize reward
normalize_advantage: false # Whether to normalize advantage
dynamic_sampling: false # Whether to use dynamic sampling, which is introduced in DAPO algorithm https://arxiv.org/abs/2503.14476
max_gen_batches: 2 # Maximum number of generation batches for dynamic sampling
use_fp32_compute: true # Whether to use fp32 to compute xx_log_prob, rewards, advantages and loss
# eval args
do_eval: true # Whether to perform evaluation
per_device_eval_batch_size: 32 # Evaluation batch size per device
evaluation_strategy: "steps" # Evaluation strategy, e.g., "steps"
eval_steps: 20 # Number of steps between evaluations
# device memory optimization args
use_flash_attention: true # Whether to use fused attention operations
use_fused_rms_norm: true # Whether to use fused RMS norm operations, which needs to install fused_ln in slm/model_zoo/gpt-3/external_ops
use_fused_rope: false # Whether to use fused rope operations
use_fused_head_and_loss_fn: true # Whether to use fused head and loss function
use_fused_linear: true # Whether to use fused linear operations
recompute: true # Whether to enable gradient checkpointing for memory optimization
recompute_use_reentrant: true # Whether to use reentrant recompute
recompute_granularity: "full" # Granularity of recompute
bf16: true # Whether to use mixed precision with bfloat16
fp16_opt_level: "O2" # Optimization level for fp16 and bf16 training
amp_master_grad: false # Whether to use float32 weight gradients for master weights in amp opt level='O2'
amp_custom_black_list: ["reduce_sum", "softmax_with_cross_entropy", "c_softmax_with_cross_entropy", "elementwise_div", "sin", "cos"] # Custom black list for amp
amp_custom_white_list: ["lookup_table", "lookup_table_v2", "flash_attn", "matmul", "matmul_v2", "fused_gemm_epilogue"] # Custom white list for amp
offload_level: "freeze_model" # Level of model offloading to pinned memory, supported values: freeze_model, train_model, optimizer
release_grads: true # Whether to release gradients
offload_optim: false # Whether to offload optimizer to pinned memory
# benchmark args
skip_profile_timer: false # Whether to skip profiling timer