Skip to content

Commit e30d99e

Browse files
smoke config: add ray_kwargs (was missed in last update)
1 parent 4fc9609 commit e30d99e

1 file changed

Lines changed: 136 additions & 0 deletions

File tree

configs/grpo_qwen1_5b_smoke.yaml

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
# GRPO smoke-test config for Qwen-2.5-Coder-1.5B (verl).
2+
#
3+
# Invoke via:
4+
# python -m verl.trainer.main_ppo --config-path=configs --config-name=grpo_qwen1_5b_smoke
5+
#
6+
# Override at runtime (CHTC shell will do this) to point the model path
7+
# at a merged SFT checkpoint:
8+
# actor_rollout_ref.model.path=/path/to/merged_sft
9+
#
10+
# Reference: DeepSeek-R1 §3 hyperparameters; llm-starter GSM8K example.
11+
12+
# ---------------------------------------------------------------------------
13+
# Data
14+
# ---------------------------------------------------------------------------
15+
data:
16+
tokenizer: null
17+
# Default: dataset built by `scripts/build_grpo_dataset.py`. Override per-run.
18+
train_files: results/grpo_dataset/v1/train.parquet
19+
val_files: null
20+
prompt_key: prompt
21+
max_prompt_length: 1024
22+
max_response_length: 1024
23+
# Effective rollouts per training step = train_batch_size * actor_rollout_ref.rollout.n.
24+
# 32 prompts * 8 rollouts = 256 candidates per step.
25+
train_batch_size: 32
26+
return_raw_chat: false
27+
28+
# ---------------------------------------------------------------------------
29+
# Actor / rollout / reference policy
30+
# ---------------------------------------------------------------------------
31+
actor_rollout_ref:
32+
hybrid_engine: true
33+
34+
model:
35+
# Default to the upstream Instruct checkpoint; CHTC shell overrides to the merged SFT checkpoint.
36+
path: Qwen/Qwen2.5-Coder-1.5B-Instruct
37+
enable_gradient_checkpointing: true
38+
use_remove_padding: true
39+
40+
actor:
41+
strategy: fsdp
42+
# Mini-batch is the batch used per optimizer step inside one PPO update; the micro-batch below is the gradient-accumulation chunk.
43+
# Effective grad batch = ppo_mini_batch_size = 16. Adjust if OOM.
44+
ppo_mini_batch_size: 16
45+
ppo_micro_batch_size_per_gpu: 1
46+
use_dynamic_bsz: false
47+
ppo_max_token_len_per_gpu: 16384
48+
grad_clip: 1.0
49+
clip_ratio: 0.2 # PPO-style clip; standard 0.2.
50+
entropy_coeff: 0.0 # No entropy bonus — we want a focused policy.
51+
# KL is added to the LOSS (not the reward) — DeepSeek-R1 style.
52+
use_kl_loss: true
53+
kl_loss_coef: 0.04
54+
kl_loss_type: low_var_kl
55+
optim:
56+
lr: 1.0e-6 # Conservative; can bump to 5e-6 if learning is too slow.
57+
lr_warmup_steps_ratio: 0.0
58+
min_lr_ratio: null
59+
warmup_style: constant
60+
total_training_steps: -1 # filled at runtime from trainer.total_training_steps
61+
fsdp_config:
62+
wrap_policy:
63+
min_num_params: 0
64+
param_offload: false
65+
optimizer_offload: false
66+
fsdp_size: -1
67+
68+
rollout:
69+
name: vllm
70+
temperature: 1.0 # high entropy during rollout — diversity for GRPO group
71+
top_k: -1
72+
top_p: 1.0
73+
prompt_length: 1024
74+
response_length: 1024
75+
dtype: bfloat16
76+
# Leave 40% headroom for training activations.
77+
gpu_memory_utilization: 0.6
78+
ignore_eos: false
79+
enforce_eager: false
80+
free_cache_engine: true
81+
load_format: dummy_dtensor
82+
tensor_model_parallel_size: 1
83+
max_num_batched_tokens: 8192
84+
max_num_seqs: 1024
85+
log_prob_micro_batch_size_per_gpu: 1
86+
# GROUP SIZE — number of completions per prompt in a GRPO step.
87+
# 8 is DeepSeek-R1's default; larger gives lower-variance advantage estimate
88+
# but costs proportionally more GPU time per step.
89+
n: 8
90+
91+
ref:
92+
fsdp_config:
93+
param_offload: false
94+
log_prob_micro_batch_size_per_gpu: 1
95+
log_prob_use_dynamic_bsz: false
96+
log_prob_max_token_len_per_gpu: 16384
97+
98+
# ---------------------------------------------------------------------------
99+
# Algorithm
100+
# ---------------------------------------------------------------------------
101+
algorithm:
102+
gamma: 1.0
103+
lam: 1.0
104+
adv_estimator: grpo # ← group-relative advantage; this is what makes it GRPO
105+
kl_penalty: kl
106+
kl_ctrl:
107+
type: fixed
108+
kl_coef: 0.001 # KL penalty in reward (separate from kl_loss_coef above)
109+
110+
# ---------------------------------------------------------------------------
111+
# Reward — our sandbox-verified composite reward
112+
# ---------------------------------------------------------------------------
113+
custom_reward_function:
114+
path: src/verifiable_rl_coder/training/grpo_reward.py
115+
name: compute_reward
116+
117+
# ---------------------------------------------------------------------------
118+
# Trainer
119+
# ---------------------------------------------------------------------------
120+
trainer:
121+
total_epochs: 5
122+
total_training_steps: 50 # smoke default is 50 steps; full runs override to 500-1000
123+
project_name: verifiable-rl-coder
124+
experiment_name: grpo-qwen-1.5b-v1
125+
logger: ['console', 'wandb']
126+
val_before_train: false
127+
n_gpus_per_node: 1
128+
nnodes: 1
129+
save_freq: 25 # checkpoint every 25 steps
130+
test_freq: -1 # disable verl's internal eval — we eval externally
131+
default_hdfs_dir: null
132+
default_local_dir: results/grpo_checkpoints/qwen-1.5b-v1
133+
134+
# Ray runtime — required by verl ≥ 0.7 (struct-mode rejects missing keys).
135+
ray_kwargs:
136+
ray_init: {}

0 commit comments

Comments
 (0)