We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 4378b8a · commit edf8b19 (Copy full SHA for edf8b19)
1 file changed
src/verifiable_rl_coder/training/grpo_reward.py
@@ -92,6 +92,18 @@ def compute_reward(
92
93
try:
94
result = _get_verifier().verify(solution_str, task, benchmark)
95
+ # DIAGNOSTIC — sample 1/32 completions to job.out so we can see what
96
+ # verl's rollout is actually generating. Remove once GRPO produces
97
+ # meaningful rewards (i.e. the rollout-shape question is answered).
98
+ import random
99
+ if random.random() < 1.0 / 32:
100
+ print(
101
+ f"[compute_reward] task={task_id} "
102
+ f"reward={result.reward:.3f} passed={result.passed} "
103
+ f"len={len(solution_str)} "
104
+ f"first200={solution_str[:200]!r}",
105
+ flush=True,
106
+ )
107
return float(result.reward)
108
except Exception:
109
return 0.0
0 commit comments