[levanter] Fix top-p cutoff boundary

taivu1998 · taivu1998 · commit 50fdbe2ff681 · 2026-04-24T18:45:11.000-07:00
diff --git a/lib/levanter/src/levanter/layers/sampler.py b/lib/levanter/src/levanter/layers/sampler.py
@@ -114,9 +114,12 @@ def _apply_top_p(
             top_p_array = top_ps
         threshold = jnp.clip(jnp.asarray(top_p_array, dtype=jnp.float32), min=0.0, max=1.0)[..., None]
 
-        keep_sorted = cumulative_probs <= threshold
+        # Keep the smallest prefix whose cumulative mass reaches the threshold.
+        # The cutoff-crossing token should remain eligible, but we should not
+        # include the next token when the threshold is met exactly.
+        keep_sorted = cumulative_probs < threshold
         keep_sorted = jnp.concatenate(
-            [jnp.ones_like(keep_sorted[..., :1], dtype=bool), keep_sorted[..., 1:]],
+            [jnp.ones_like(keep_sorted[..., :1], dtype=bool), keep_sorted[..., :-1]],
             axis=-1,
         )
         filtered_sorted_logits = jnp.where(keep_sorted, sorted_logits, -jnp.inf)
diff --git a/lib/levanter/tests/test_sampler.py b/lib/levanter/tests/test_sampler.py
@@ -23,3 +23,25 @@ def test_sampler_top_p_keeps_only_the_nucleus_head():
 
     assert int(token.array) == 0
     assert float(log_prob.array) == pytest.approx(0.0)
+
+
+def test_sampler_top_p_keeps_cutoff_crossing_token():
+    vocab = hax.Axis("vocab", 3)
+    sampler = Sampler(vocab)
+    logits = hax.named(jnp.log(jnp.array([0.4, 0.35, 0.25], dtype=jnp.float32)), (vocab,))
+    # top_p=0.6 falls strictly between the running masses 0.4 and 0.4 + 0.35 = 0.75.
+    masked_logits = sampler._apply_top_p(logits, jnp.array(0.6, dtype=jnp.float32))
+    # Token 1 is the one that crosses the cutoff, so it must survive; only token 2 is masked.
+    assert jnp.isfinite(masked_logits.array[:2]).all()
+    assert jnp.isneginf(masked_logits.array[2])
+
+
+def test_sampler_top_p_does_not_overshoot_exact_threshold():
+    vocab = hax.Axis("vocab", 3)
+    sampler = Sampler(vocab)
+    logits = hax.named(jnp.log(jnp.array([0.4, 0.35, 0.25], dtype=jnp.float32)), (vocab,))
+    # top_p=0.4 equals the top token's mass, so the nucleus should be that token alone.
+    masked_logits = sampler._apply_top_p(logits, jnp.array(0.4, dtype=jnp.float32))
+    # NOTE(review): assumes token 0's float32 mass does not round below 0.4 -- confirm.
+    assert jnp.isfinite(masked_logits.array[0])
+    assert jnp.isneginf(masked_logits.array[1:]).all()