configs: grpo_qwen1_5b.yaml + grpo_qwen7b.yaml for verl GRPO

Devesh-Maheshwari · Devesh-Maheshwari · commit cdfc1bb375a9 · 2026-04-24T23:40:32.000-05:00
diff --git a/configs/grpo_qwen1_5b.yaml b/configs/grpo_qwen1_5b.yaml
@@ -0,0 +1,132 @@
+# GRPO config for Qwen-2.5-Coder-1.5B (verl).
+#
+# Invoke via:
+#   python -m verl.trainer.main_ppo --config-path=configs --config-name=grpo_qwen1_5b
+#
+# Override at runtime (CHTC shell will do this) to point the model path
+# at a merged SFT checkpoint:
+#   actor_rollout_ref.model.path=/path/to/merged_sft
+#
+# Reference: DeepSeek-R1 §3 hyperparameters; llm-starter GSM8K example.
+
+# ---------------------------------------------------------------------------
+# Data
+# ---------------------------------------------------------------------------
+data:
+  tokenizer: null
+  # Default: dataset built by `scripts/build_grpo_dataset.py`. Override per-run.
+  train_files: results/grpo_dataset/v1/train.parquet
+  val_files: null
+  prompt_key: prompt
+  max_prompt_length: 1024
+  max_response_length: 1024
+  # Rollouts per training step = train_batch_size * actor_rollout_ref.rollout.n.
+  # 32 prompts * 8 rollouts = 256 candidates per step.
+  train_batch_size: 32
+  return_raw_chat: false
+
+# ---------------------------------------------------------------------------
+# Actor / rollout / reference policy
+# ---------------------------------------------------------------------------
+actor_rollout_ref:
+  hybrid_engine: true
+
+  model:
+    # Default to the stock Instruct checkpoint; the CHTC shell overrides this
+    # to the merged SFT checkpoint.
+    path: Qwen/Qwen2.5-Coder-1.5B-Instruct
+    enable_gradient_checkpointing: true
+    use_remove_padding: true
+
+  actor:
+    strategy: fsdp
+    # Mini-batch is the batch consumed per optimizer step inside one PPO
+    # update; gradient accumulation happens over the micro-batches below.
+    # Effective grad batch = ppo_mini_batch_size = 16. Adjust if OOM.
+    ppo_mini_batch_size: 16
+    ppo_micro_batch_size_per_gpu: 1
+    use_dynamic_bsz: false
+    ppo_max_token_len_per_gpu: 16384
+    grad_clip: 1.0
+    clip_ratio: 0.2          # PPO-style clip; standard 0.2.
+    entropy_coeff: 0.0       # No entropy bonus — we want a focused policy.
+    # KL is added to the LOSS (not the reward) — DeepSeek-R1 style.
+    use_kl_loss: true
+    kl_loss_coef: 0.04
+    kl_loss_type: low_var_kl
+    optim:
+      lr: 1.0e-6              # Conservative; can bump to 5e-6 if learning is too slow.
+      lr_warmup_steps_ratio: 0.0
+      min_lr_ratio: null
+      warmup_style: constant
+      total_training_steps: -1  # filled at runtime from trainer.total_training_steps
+    fsdp_config:
+      wrap_policy:
+        min_num_params: 0
+      param_offload: false
+      optimizer_offload: false
+      fsdp_size: -1
+
+  rollout:
+    name: vllm
+    temperature: 1.0           # high entropy during rollout — diversity for GRPO group
+    top_k: -1
+    top_p: 1.0
+    prompt_length: 1024
+    response_length: 1024
+    dtype: bfloat16
+    # Leave 40% headroom for training activations.
+    gpu_memory_utilization: 0.6
+    ignore_eos: false
+    enforce_eager: false
+    free_cache_engine: true
+    load_format: dummy_dtensor
+    tensor_model_parallel_size: 1
+    max_num_batched_tokens: 8192
+    max_num_seqs: 1024
+    log_prob_micro_batch_size_per_gpu: 1
+    # GROUP SIZE — number of completions per prompt in a GRPO step.
+    # 8 is DeepSeek-R1's default; larger gives lower-variance advantage estimate
+    # but costs proportionally more GPU time per step.
+    n: 8
+
+  ref:
+    fsdp_config:
+      param_offload: false
+    log_prob_micro_batch_size_per_gpu: 1
+    log_prob_use_dynamic_bsz: false
+    log_prob_max_token_len_per_gpu: 16384
+
+# ---------------------------------------------------------------------------
+# Algorithm
+# ---------------------------------------------------------------------------
+algorithm:
+  gamma: 1.0
+  lam: 1.0
+  adv_estimator: grpo          # ← group-relative advantage; this is what makes it GRPO
+  kl_penalty: kl
+  kl_ctrl:
+    type: fixed
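+    # KL penalty in reward (separate from kl_loss_coef above).
+    # NOTE(review): verl may skip the reward-side KL when actor.use_kl_loss is
+    # true — confirm this value has any effect in the pinned verl version.
+    kl_coef: 0.001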
+
+# ---------------------------------------------------------------------------
+# Reward — our sandbox-verified composite reward
+# ---------------------------------------------------------------------------
+custom_reward_function:
+  path: src/verifiable_rl_coder/training/grpo_reward.py
+  name: compute_reward
+
+# ---------------------------------------------------------------------------
+# Trainer
+# ---------------------------------------------------------------------------
+trainer:
+  total_epochs: 5
+  total_training_steps: 500   # smoke runs override to 50; full runs to 500-1000
+  project_name: verifiable-rl-coder
+  experiment_name: grpo-qwen-1.5b-v1
+  logger: ['console', 'wandb']
+  val_before_train: false
+  n_gpus_per_node: 1
+  nnodes: 1
+  save_freq: 50               # checkpoint every 50 steps
+  test_freq: -1               # disable verl's internal eval — we eval externally
+  default_hdfs_dir: null
+  default_local_dir: results/grpo_checkpoints/qwen-1.5b-v1
diff --git a/configs/grpo_qwen7b.yaml b/configs/grpo_qwen7b.yaml
@@ -0,0 +1,111 @@
+# GRPO config for Qwen-2.5-Coder-7B (verl). Multi-GPU.
+#
+# Differences vs grpo_qwen1_5b.yaml:
+#   - Larger model: 7B Instruct by default / merged SFT-7B via override
+#   - Multi-GPU: 4 GPUs (FSDP for training, TP=2 for vLLM rollout)
+#   - Smaller per-GPU batch sizes to fit in VRAM
+#   - Same gradient checkpointing, plus CPU offload of optimizer state and
+#     reference-model parameters
+#
+# Invoke via:
+#   python -m verl.trainer.main_ppo --config-path=configs --config-name=grpo_qwen7b
+#
+# Override model path at runtime to use a merged SFT-7B checkpoint:
+#   actor_rollout_ref.model.path=/path/to/merged_sft_7b
+
+data:
+  tokenizer: null
+  train_files: results/grpo_dataset/v1/train.parquet
+  val_files: null
+  prompt_key: prompt
+  max_prompt_length: 1024
+  max_response_length: 1024
+  # 16 prompts * 8 rollouts = 128 candidates per step (half the 1.5B step
+  # to keep memory per GPU comparable).
+  train_batch_size: 16
+  return_raw_chat: false
+
+actor_rollout_ref:
+  hybrid_engine: true
+
+  model:
+    path: Qwen/Qwen2.5-Coder-7B-Instruct
+    enable_gradient_checkpointing: true
+    use_remove_padding: true
+
+  actor:
+    strategy: fsdp
+    ppo_mini_batch_size: 8
+    ppo_micro_batch_size_per_gpu: 1
+    use_dynamic_bsz: false
+    ppo_max_token_len_per_gpu: 12288
+    grad_clip: 1.0
+    clip_ratio: 0.2
+    entropy_coeff: 0.0
+    use_kl_loss: true
+    kl_loss_coef: 0.04
+    kl_loss_type: low_var_kl
+    optim:
+      lr: 5.0e-7              # Lower than 1.5B — bigger model needs gentler updates
+      lr_warmup_steps_ratio: 0.0
+      min_lr_ratio: null
+      warmup_style: constant
+      total_training_steps: -1
+    fsdp_config:
+      wrap_policy:
+        min_num_params: 0
+      param_offload: false
+      optimizer_offload: true   # offload optimizer state to CPU at 7B
+      fsdp_size: -1
+
+  rollout:
+    name: vllm
+    temperature: 1.0
+    top_k: -1
+    top_p: 1.0
+    prompt_length: 1024
+    response_length: 1024
+    dtype: bfloat16
+    gpu_memory_utilization: 0.5  # tighter at 7B
+    ignore_eos: false
+    enforce_eager: false
+    free_cache_engine: true
+    load_format: dummy_dtensor
+    tensor_model_parallel_size: 2  # split rollout across 2 GPUs
+    max_num_batched_tokens: 8192
+    max_num_seqs: 512
+    log_prob_micro_batch_size_per_gpu: 1
+    n: 8
+
+  ref:
+    fsdp_config:
+      param_offload: true
+    log_prob_micro_batch_size_per_gpu: 1
+    log_prob_use_dynamic_bsz: false
+    log_prob_max_token_len_per_gpu: 12288
+
+algorithm:
+  gamma: 1.0
+  lam: 1.0
+  adv_estimator: grpo
+  kl_penalty: kl
+  kl_ctrl:
+    type: fixed
+    kl_coef: 0.001
+
+custom_reward_function:
+  path: src/verifiable_rl_coder/training/grpo_reward.py
+  name: compute_reward
+
+trainer:
+  total_epochs: 5
+  total_training_steps: 500
+  project_name: verifiable-rl-coder
+  experiment_name: grpo-qwen-7b-v1
+  logger: ['console', 'wandb']
+  val_before_train: false
+  n_gpus_per_node: 4          # 4-GPU node — FSDP across 4, vLLM TP=2
+  nnodes: 1
+  save_freq: 50
+  test_freq: -1
+  default_hdfs_dir: null
+  default_local_dir: results/grpo_checkpoints/qwen-7b-v1