fix: advance sampler RNG each decode step (#940)

Prayer3th · web-flow · commit bc770647877e · 2026-04-23T16:37:39.000+08:00
* fix: advance sampler RNG each decode step to enable true random sampling
diff --git a/python/sgl_jax/srt/layers/sampler.py b/python/sgl_jax/srt/layers/sampler.py
@@ -163,6 +163,7 @@ def __call__(
         logits_output: LogitsProcessorOutput,
         sampling_metadata: SamplingMetadata,
         use_sort_for_toppk_minp: bool,
+        rng_override: jax.Array | None = None,
     ):
         """Run a sampler & compute logprobs and update logits_output accordingly.
 
@@ -188,7 +189,7 @@ def __call__(
             (logits, sampling_metadata.vocab_mask),
         )
 
-        _, rng = jax.random.split(self.rngs.params())
+        _, rng = jax.random.split(rng_override if rng_override is not None else self.rngs.params())
         operands = (logits, sampling_metadata, rng)
         regular_fn = lambda op: self._regular_sampling((*op, use_sort_for_toppk_minp))
         batch_next_token_ids, logprobs = lax.cond(
diff --git a/python/sgl_jax/srt/model_executor/model_runner.py b/python/sgl_jax/srt/model_executor/model_runner.py
@@ -151,6 +151,8 @@ def initialize(self):
             self.init_lora_manager()
 
         if not self.is_draft_worker:
+            self._sampler_base_rng = jax.random.PRNGKey(server_args.random_seed)
+            self._sampler_step = 0
             self.initialize_jit()
 
         # Init memory pool and attention backends
@@ -220,18 +222,27 @@ def jitted_run_model(
             with LoraBatchContext.set_batch(forward_batch):
                 return model(forward_batch, token_to_kv_pool, logits_metadata)
 
+        # Capture the base RNG key as a constant in the JIT closure.
+        # jax.random.fold_in(base_rng_key, rng_step) then runs inside the jitted sampler,
+        # avoiding an eager jax.random.split that would serialize the host-device pipeline.
+        base_rng_key = self._sampler_base_rng
+
         @partial(jax.jit, static_argnames=["sampler_state_def", "use_sort_for_toppk_minp"])
         def jitted_sampler(
             sampler_def,
             sampler_state_def,
             sampler_state_leaves,
             use_sort_for_toppk_minp,
+            rng_step,
             *args,
         ):
 
             model_state = jax.tree_util.tree_unflatten(sampler_state_def, sampler_state_leaves)
             sampler = nnx.merge(sampler_def, model_state)
-            return sampler(*args, use_sort_for_toppk_minp=use_sort_for_toppk_minp)
+            rng_key = jax.random.fold_in(base_rng_key, rng_step)
+            return sampler(
+                *args, use_sort_for_toppk_minp=use_sort_for_toppk_minp, rng_override=rng_key
+            )
 
         @partial(jax.jit, static_argnames=["mesh"])
         def jitted_compute_logprobs(mesh, logits, next_tokens):
@@ -728,8 +739,12 @@ def sample(
         Returns:
             A list of next_token_ids
         """
+        # Advance the step counter (a plain Python int, so zero device overhead here).
+        # fold_in(base_rng_key, step) inside the jitted sampler yields a different RNG key per step.
+        self._sampler_step += 1
         # Penalty application has been moved to the Sampler for better JIT performance
         return self.jitted_sampler(
+            self._sampler_step,
             logits_output,
             sampling_metadata,
         )
diff --git a/test/srt/test_logprobs.py b/test/srt/test_logprobs.py
@@ -138,12 +138,6 @@ def test_logprobs(self):
 
         sampling_params = {"n": 1, "temperature": 0.6, "top_p": 0.95, "max_new_tokens": 3}
 
-        expected_output_logprobs = [
-            [-0.8984375, 71486, "Alright"],  ## todo use output compute is -0.79296875
-            [0.0, 11, ","],
-            [-0.06787109375, 279, " the"],
-        ]
-
         output = self.engine.generate(
             input_ids=input_ids,
             sampling_params=sampling_params,
@@ -153,22 +147,28 @@ def test_logprobs(self):
             token_ids_logprob=token_ids_logprob,
         )
         output_meta = output["meta_info"]
-        self.check_output(output_meta, "output_token_logprobs", expected_output_logprobs)
+        # With temperature>0 sampling, exact tokens depend on RNG state.
+        # Only verify structural correctness here.
+        self.assertEqual(
+            len(output_meta["output_token_logprobs"]),
+            3,
+            "output_token_logprobs length mismatch",
+        )
+        for i, logprob in enumerate(output_meta["output_token_logprobs"]):
+            self.assertLessEqual(logprob[0], 0.0, f"logprob at {i} should be non-positive")
 
-        # use another expected, because jax compiler fused ops will introduce numerical precision issue
-        expected_output_logprobs = [
-            [-0.78125, 32313, "Okay"],  # todo use output compute is -0.79296875
-            [0.0, 11, ","],
-            [-0.1650390625, 773, " so"],
-        ]
         output = self.engine.generate(
             input_ids=input_ids,
             sampling_params=sampling_params,
             return_logprob=True,
         )
         output_meta = output["meta_info"]
         self.assertEqual(output_meta["cache_miss_count"], 0, "occur cache_miss")
-        self.check_output(output_meta, "output_token_logprobs", expected_output_logprobs)
+        self.assertEqual(
+            len(output_meta["output_token_logprobs"]),
+            3,
+            "output_token_logprobs length mismatch",
+        )
 
     def check_output(self, actual, key, expected):
         for i, logprob in enumerate(actual[key]):