diag: sample 1/32 completions to job.out so we can see what verl rollout produces

Devesh-Maheshwari · Devesh-Maheshwari · commit edf8b1982a1d · 2026-04-25T16:42:28.000-05:00
diff --git a/src/verifiable_rl_coder/training/grpo_reward.py b/src/verifiable_rl_coder/training/grpo_reward.py
@@ -92,6 +92,18 @@ def compute_reward(
 
     try:
         result = _get_verifier().verify(solution_str, task, benchmark)
+        # DIAGNOSTIC — print a random ~1/32 of completions to stdout (job.out) to see
+        # verl's rollout output. Inside the try: a failure here yields reward 0.0.
+        # Remove once GRPO rewards are meaningful (rollout-shape question answered).
+        import random
+        if random.random() < 1.0 / 32:
+            print(
+                f"[compute_reward] task={task_id} "
+                f"reward={result.reward:.3f} passed={result.passed} "
+                f"len={len(solution_str)} "
+                f"first200={solution_str[:200]!r}",
+                flush=True,
+            )
         return float(result.reward)
     except Exception:
         return 0.0