smoke config: add ray_kwargs (was missed in last update)

Devesh-Maheshwari · Devesh-Maheshwari · commit e30d99ea56cd · 2026-04-25T03:02:20.000-05:00
diff --git a/configs/grpo_qwen1_5b_smoke.yaml b/configs/grpo_qwen1_5b_smoke.yaml
@@ -0,0 +1,136 @@
+# GRPO smoke-run config for Qwen-2.5-Coder-1.5B (verl).
+#
+# Invoke via:
+#   python -m verl.trainer.main_ppo --config-path=configs --config-name=grpo_qwen1_5b_smoke
+#
+# Override at runtime (CHTC shell will do this) to point the model path
+# at a merged SFT checkpoint:
+#   actor_rollout_ref.model.path=/path/to/merged_sft
+#
+# Reference: DeepSeek-R1 §3 hyperparameters; llm-starter GSM8K example.
+
+# ---------------------------------------------------------------------------
+# Data
+# ---------------------------------------------------------------------------
+data:
+  tokenizer: null
+  # Default: dataset built by `scripts/build_grpo_dataset.py`. Override per-run.
+  train_files: results/grpo_dataset/v1/train.parquet
+  val_files: null
+  prompt_key: prompt
+  max_prompt_length: 1024  # same value as actor_rollout_ref.rollout.prompt_length
+  max_response_length: 1024  # same value as actor_rollout_ref.rollout.response_length
+  # Rollouts per training step = train_batch_size * actor_rollout_ref.rollout.n:
+  # 32 prompts * 8 rollouts = 256 candidates per step.
+  train_batch_size: 32
+  return_raw_chat: false
+
+# ---------------------------------------------------------------------------
+# Actor / rollout / reference policy
+# ---------------------------------------------------------------------------
+actor_rollout_ref:
+  hybrid_engine: true
+
+  model:
+    # Defaults to the Hub Instruct model; CHTC shell overrides to the merged SFT checkpoint.
+    path: Qwen/Qwen2.5-Coder-1.5B-Instruct
+    enable_gradient_checkpointing: true
+    use_remove_padding: true
+
+  actor:
+    strategy: fsdp
+    # Mini-batch is the gradient-accumulation chunk inside one PPO update.
+    # Effective grad batch = ppo_mini_batch_size = 16. Adjust if OOM.
+    ppo_mini_batch_size: 16
+    ppo_micro_batch_size_per_gpu: 1
+    use_dynamic_bsz: false
+    ppo_max_token_len_per_gpu: 16384
+    grad_clip: 1.0
+    clip_ratio: 0.2          # PPO-style clip; standard 0.2.
+    entropy_coeff: 0.0       # No entropy bonus — we want a focused policy.
+    # KL is added to the LOSS (not the reward) — DeepSeek-R1 style.
+    use_kl_loss: true
+    kl_loss_coef: 0.04
+    kl_loss_type: low_var_kl
+    optim:
+      lr: 1.0e-6              # Conservative; can bump to 5e-6 if learning is too slow.
+      lr_warmup_steps_ratio: 0.0
+      min_lr_ratio: null
+      warmup_style: constant
+      total_training_steps: -1  # placeholder; filled at runtime from trainer.total_training_steps
+    fsdp_config:
+      wrap_policy:
+        min_num_params: 0
+      param_offload: false
+      optimizer_offload: false
+      fsdp_size: -1
+
+  rollout:
+    name: vllm
+    temperature: 1.0           # high entropy during rollout — diversity for GRPO group
+    top_k: -1
+    top_p: 1.0
+    prompt_length: 1024  # same value as data.max_prompt_length
+    response_length: 1024  # same value as data.max_response_length
+    dtype: bfloat16
+    # Leave 40% headroom for training activations.
+    gpu_memory_utilization: 0.6
+    ignore_eos: false
+    enforce_eager: false
+    free_cache_engine: true
+    load_format: dummy_dtensor
+    tensor_model_parallel_size: 1
+    max_num_batched_tokens: 8192
+    max_num_seqs: 1024
+    log_prob_micro_batch_size_per_gpu: 1
+    # GROUP SIZE — number of completions per prompt in a GRPO step.
+    # 8 is DeepSeek-R1's default; larger gives lower-variance advantage estimate
+    # but costs proportionally more GPU time per step.
+    n: 8
+
+  ref:
+    fsdp_config:
+      param_offload: false
+    log_prob_micro_batch_size_per_gpu: 1
+    log_prob_use_dynamic_bsz: false
+    log_prob_max_token_len_per_gpu: 16384
+
+# ---------------------------------------------------------------------------
+# Algorithm
+# ---------------------------------------------------------------------------
+algorithm:
+  gamma: 1.0
+  lam: 1.0
+  adv_estimator: grpo          # ← group-relative advantage; this is what makes it GRPO
+  kl_penalty: kl
+  kl_ctrl:
+    type: fixed
+    kl_coef: 0.001              # KL penalty in reward; separate from actor.kl_loss_coef (0.04, in the loss)
+
+# ---------------------------------------------------------------------------
+# Reward — our sandbox-verified composite reward
+# ---------------------------------------------------------------------------
+custom_reward_function:
+  path: src/verifiable_rl_coder/training/grpo_reward.py  # relative path — presumably resolved from the launch dir; confirm
+  name: compute_reward  # presumably the function looked up in the file above — confirm
+
+# ---------------------------------------------------------------------------
+# Trainer
+# ---------------------------------------------------------------------------
+trainer:
+  total_epochs: 5
+  total_training_steps: 50   # smoke-run length; full runs use 500-1000
+  project_name: verifiable-rl-coder
+  experiment_name: grpo-qwen-1.5b-v1
+  logger: ['console', 'wandb']
+  val_before_train: false
+  n_gpus_per_node: 1
+  nnodes: 1
+  save_freq: 25               # checkpoint every 25 steps
+  test_freq: -1               # disable verl's internal eval — we eval externally
+  default_hdfs_dir: null
+  default_local_dir: results/grpo_checkpoints/qwen-1.5b-v1  # NOTE(review): no "smoke" marker in path — confirm intended
+
+# Ray runtime — required by verl ≥ 0.7 (struct-mode rejects missing keys).
+ray_kwargs:
+  ray_init: {}  # empty mapping: no Ray init options are set in this file