fix: add device_get in _gather_next_token_ids to return Python list

JamesBrianD · claude · JamesBrianD · commit e56c621ebc83 · 2026-03-04T16:34:09.000+08:00
Commit eec2620 introduced _gather_next_token_ids, which gathers sharded JAX arrays to a replicated sharding but did not transfer the result from the device to the host. This left next_token_ids as an on-device JAX array, causing downstream "unhashable type" errors when token ids were used in set lookups (check_finished). Add jax.device_get + tolist() at the source so callers receive a plain Python list of ints. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/python/sgl_jax/srt/managers/scheduler.py b/python/sgl_jax/srt/managers/scheduler.py
@@ -573,16 +573,18 @@ def _gather_logits_output(self, logits_output: LogitsProcessorOutput) -> LogitsP
 
         return logits_output
 
-    def _gather_next_token_ids(self, next_token_ids: jax.Array) -> jax.Array:
-        """Gather sharded next_token_ids to replicated sharding."""
+    def _gather_next_token_ids(self, next_token_ids: jax.Array) -> list[int]:
+        """Gather sharded next_token_ids to replicated sharding and convert to Python list."""
         from jax.sharding import NamedSharding, PartitionSpec
 
         if next_token_ids is None:
             return None
 
         replicated_sharding = NamedSharding(self.mesh, PartitionSpec())
         gather_fn = jax.jit(lambda x: x, out_shardings=replicated_sharding)
-        return gather_fn(next_token_ids)
+        gathered = gather_fn(next_token_ids)
+        # Convert to Python list of ints for downstream compatibility
+        return jax.device_get(gathered).tolist()
 
     def _select_round_robin_dp(self) -> int:
         dp_rank = self.dp_round_robin_counter % self.dp_size