Skip to content

Commit e13c5d6

Browse files
Fix server stall for prompts exceeding max_num_batched_tokens (#199) (#200)
**Problem**:When chunked prefill is disabled on the Metal platform, the vLLM v1 scheduler breaks out of the waiting-queue loop whenever a prompt's token count exceeds the token budget (default 2048). Any request with a large context was permanently stuck in the waiting queue — the server sent a few initial SSE chunks then went silent and never sent [DONE]. **Fix**: after setting enable_chunked_prefill=False in check_and_update_config, raise max_num_batched_tokens (and max_num_scheduled_tokens when set) to at least max_model_len so the scheduler can always fit a full prompt in one scheduling step. Add two new test cases covering: (1) max_num_scheduled_tokens is also raised, and (2) a budget already larger than max_model_len is preserved. --------- Signed-off-by: ivanfioravanti <ivan.fioravanti@gmail.com>
1 parent eb49908 commit e13c5d6

File tree

2 files changed

+162
-2
lines changed

2 files changed

+162
-2
lines changed

tests/test_platform.py

Lines changed: 141 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,12 @@ def test_verify_quantization_supported(self) -> None:
132132
def test_check_and_update_config_disables_chunked_prefill(
133133
self, monkeypatch: pytest.MonkeyPatch
134134
) -> None:
135-
"""Metal should disable chunked prefill until the runner supports it."""
135+
"""Metal should disable chunked prefill until the runner supports it.
136+
137+
When chunked prefill is disabled, max_num_batched_tokens must be at
138+
least max_model_len so the scheduler can schedule the entire prompt
139+
in a single step.
140+
"""
136141
import vllm_metal.stt.config as stt_config
137142
import vllm_metal.utils as metal_utils
138143

@@ -150,22 +155,157 @@ def test_check_and_update_config_disables_chunked_prefill(
150155
model="test-model",
151156
disable_cascade_attn=False,
152157
tokenizer=None,
158+
max_model_len=32768,
153159
),
154160
scheduler_config=SimpleNamespace(
155161
async_scheduling=True,
156162
enable_chunked_prefill=True,
163+
max_num_batched_tokens=2048,
164+
max_num_scheduled_tokens=None,
157165
),
158166
)
159167

160168
MetalPlatform.check_and_update_config(vllm_config)
161169

162170
assert vllm_config.scheduler_config.enable_chunked_prefill is False
171+
assert vllm_config.scheduler_config.max_num_batched_tokens == 32768
163172
assert (
164173
vllm_config.parallel_config.worker_cls == "vllm_metal.v1.worker.MetalWorker"
165174
)
166175
assert vllm_config.parallel_config.distributed_executor_backend == "uni"
167176
assert vllm_config.parallel_config.disable_custom_all_reduce is True
168177

178+
def test_check_and_update_config_increases_max_num_scheduled_tokens_below_max_model_len(
    self, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Raise an explicit max_num_scheduled_tokens up to max_model_len.

    With chunked prefill disabled, a scheduled-token budget below
    max_model_len would keep the scheduler from fitting a whole prompt
    into one step, so the platform must bump it to max_model_len.
    """
    import vllm_metal.stt.config as stt_config
    import vllm_metal.utils as metal_utils

    # Stub out model-path resolution and STT detection so the config
    # check runs without touching the filesystem or network.
    monkeypatch.setattr(metal_utils, "get_model_download_path", lambda model: model)
    monkeypatch.setattr(stt_config, "is_stt_model", lambda _model: False)

    parallel = SimpleNamespace(
        worker_cls="auto",
        distributed_executor_backend="auto",
        disable_custom_all_reduce=False,
    )
    model = SimpleNamespace(
        model="test-model",
        disable_cascade_attn=False,
        tokenizer=None,
        max_model_len=32768,
    )
    # Both token budgets start below max_model_len (32768).
    scheduler = SimpleNamespace(
        async_scheduling=True,
        enable_chunked_prefill=True,
        max_num_batched_tokens=2048,
        max_num_scheduled_tokens=2048,
    )
    config = SimpleNamespace(
        parallel_config=parallel,
        cache_config=SimpleNamespace(block_size=None),
        model_config=model,
        scheduler_config=scheduler,
    )

    MetalPlatform.check_and_update_config(config)

    assert scheduler.enable_chunked_prefill is False
    assert scheduler.max_num_batched_tokens == 32768
    assert scheduler.max_num_scheduled_tokens == 32768
220+
def test_check_and_update_config_does_not_reduce_large_max_num_batched_tokens(
    self, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Preserve a user-set max_num_batched_tokens that already exceeds max_model_len.

    The platform only raises the token budget when it is too small; a
    budget explicitly configured above max_model_len must pass through
    untouched.
    """
    import vllm_metal.stt.config as stt_config
    import vllm_metal.utils as metal_utils

    # Stub out model-path resolution and STT detection so the config
    # check runs without touching the filesystem or network.
    monkeypatch.setattr(metal_utils, "get_model_download_path", lambda model: model)
    monkeypatch.setattr(stt_config, "is_stt_model", lambda _model: False)

    parallel = SimpleNamespace(
        worker_cls="auto",
        distributed_executor_backend="auto",
        disable_custom_all_reduce=False,
    )
    model = SimpleNamespace(
        model="test-model",
        disable_cascade_attn=False,
        tokenizer=None,
        max_model_len=32768,
    )
    # Budget deliberately larger than max_model_len (65536 > 32768).
    scheduler = SimpleNamespace(
        async_scheduling=True,
        enable_chunked_prefill=True,
        max_num_batched_tokens=65536,
        max_num_scheduled_tokens=None,
    )
    config = SimpleNamespace(
        parallel_config=parallel,
        cache_config=SimpleNamespace(block_size=None),
        model_config=model,
        scheduler_config=scheduler,
    )

    MetalPlatform.check_and_update_config(config)

    assert scheduler.enable_chunked_prefill is False
    # 65536 > 32768, so the value must stay at 65536
    assert scheduler.max_num_batched_tokens == 65536
260+
261+
@pytest.mark.parametrize("max_num_scheduled_tokens", [32768, 65536])
def test_check_and_update_config_does_not_reduce_max_num_scheduled_tokens_when_at_least_max_model_len(
    self,
    monkeypatch: pytest.MonkeyPatch,
    max_num_scheduled_tokens: int,
) -> None:
    """Preserve max_num_scheduled_tokens values already at or above max_model_len.

    Only values strictly below max_model_len are bumped up; a budget
    equal to (32768) or above (65536) max_model_len must be left as the
    user configured it.
    """
    import vllm_metal.stt.config as stt_config
    import vllm_metal.utils as metal_utils

    # Stub out model-path resolution and STT detection so the config
    # check runs without touching the filesystem or network.
    monkeypatch.setattr(metal_utils, "get_model_download_path", lambda model: model)
    monkeypatch.setattr(stt_config, "is_stt_model", lambda _model: False)

    parallel = SimpleNamespace(
        worker_cls="auto",
        distributed_executor_backend="auto",
        disable_custom_all_reduce=False,
    )
    model = SimpleNamespace(
        model="test-model",
        disable_cascade_attn=False,
        tokenizer=None,
        max_model_len=32768,
    )
    scheduler = SimpleNamespace(
        async_scheduling=True,
        enable_chunked_prefill=True,
        max_num_batched_tokens=65536,
        max_num_scheduled_tokens=max_num_scheduled_tokens,
    )
    config = SimpleNamespace(
        parallel_config=parallel,
        cache_config=SimpleNamespace(block_size=None),
        model_config=model,
        scheduler_config=scheduler,
    )

    MetalPlatform.check_and_update_config(config)

    assert scheduler.enable_chunked_prefill is False
    assert scheduler.max_num_batched_tokens == 65536
    assert scheduler.max_num_scheduled_tokens == max_num_scheduled_tokens
308+
169309
def test_check_and_update_config_applies_stt_scheduler_policy(
170310
self, monkeypatch: pytest.MonkeyPatch
171311
) -> None:

vllm_metal/platform.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,27 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
231231
# scheduler only requests full prefills, which matches the current
232232
# model-runner contract.
233233
scheduler_config.enable_chunked_prefill = False
234-
logger.info("Metal: disabled chunked prefill")
234+
235+
# Without chunked prefill, the scheduler must fit the entire
236+
# prompt in a single step. Ensure max_num_batched_tokens (and
237+
# max_num_scheduled_tokens) are at least max_model_len;
238+
# otherwise the scheduler silently refuses to schedule any
239+
# prompt that exceeds the budget (see Scheduler.schedule —
240+
# the "chunked_prefill is disabled" break).
241+
if model_config is not None:
242+
model_max = model_config.max_model_len
243+
if scheduler_config.max_num_batched_tokens < model_max:
244+
scheduler_config.max_num_batched_tokens = model_max
245+
if (
246+
scheduler_config.max_num_scheduled_tokens is not None
247+
and scheduler_config.max_num_scheduled_tokens < model_max
248+
):
249+
scheduler_config.max_num_scheduled_tokens = model_max
250+
251+
logger.info(
252+
"Metal: disabled chunked prefill, max_num_batched_tokens=%d",
253+
scheduler_config.max_num_batched_tokens,
254+
)
235255

236256
if config.use_paged_attention and getattr(
237257
cache_config, "enable_prefix_caching", False

0 commit comments

Comments
 (0)