
Commit fc34b87

janhilgard and claude committed
fix: replace hard prefill token limit with warning
The ValueError on exceeding prefill_step_size caused infinite retry loops in _process_loop when long prompts were stuck in the queue. Replace it with a warning log so long prompts are processed normally.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 8b5cfe7 commit fc34b87

File tree

1 file changed: +6 −6 lines

vllm_mlx/mllm_batch_generator.py

Lines changed: 6 additions & 6 deletions
@@ -659,14 +659,14 @@ def _process_prompts(self, requests: List[MLLMBatchRequest]) -> MLLMBatch:
         )
         self._stats.prompt_tokens += total_prompt_tokens
 
-        # Guard against excessive memory usage during cache merge.
-        # Each token in the batch requires KV entries across all layers.
+        # Log large prompts for monitoring (was previously a hard check that
+        # caused infinite retry loops when requests exceeded the limit).
         max_batch_tokens = self.prefill_step_size * len(requests)
         if total_prompt_tokens > max_batch_tokens:
-            raise ValueError(
-                f"Total prompt tokens ({total_prompt_tokens}) exceeds safe limit "
-                f"({max_batch_tokens}) for {len(requests)} requests. "
-                f"Reduce prompt length or batch size."
+            logger.warning(
+                f"Large batch prefill: {total_prompt_tokens} tokens "
+                f"(step_size={self.prefill_step_size}, requests={len(requests)}). "
+                f"Processing may be slow."
             )
 
         # Run vision encoding for each request with its own KVCache.
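The failure mode this commit fixes can be illustrated with a minimal sketch. The queue-and-retry structure below is an assumption for illustration, not the actual `_process_loop` implementation: any loop that re-queues a batch on exception will spin forever when the exception is deterministic (the same oversized batch fails the same check every pass), whereas a warning lets the batch complete and leave the queue.

```python
import logging
from collections import deque

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("prefill")

PREFILL_STEP_SIZE = 512  # hypothetical value standing in for prefill_step_size


def process_batch(prompt_tokens: int, num_requests: int) -> str:
    # Post-fix behavior: log a warning instead of raising, so the
    # batch is still processed to completion.
    max_batch_tokens = PREFILL_STEP_SIZE * num_requests
    if prompt_tokens > max_batch_tokens:
        logger.warning(
            "Large batch prefill: %d tokens (step_size=%d, requests=%d). "
            "Processing may be slow.",
            prompt_tokens, PREFILL_STEP_SIZE, num_requests,
        )
    return "done"


def process_loop(queue: deque, max_iters: int = 10) -> int:
    # Simplified stand-in for a processing loop that retries a batch
    # on failure. With the old deterministic ValueError, the same batch
    # would be retried until max_iters; with a warning, the queue drains
    # in a single pass.
    iters = 0
    while queue and iters < max_iters:
        iters += 1
        tokens, n = queue[0]
        try:
            process_batch(tokens, n)
            queue.popleft()   # success: batch leaves the queue
        except ValueError:
            pass              # old behavior: same batch stays at the head
    return iters


q = deque([(10_000, 2)])      # 10,000 tokens far exceeds 512 * 2
iterations = process_loop(q)  # drains in one iteration after the fix
```

With the pre-fix `raise ValueError`, `process_loop` would hit `max_iters` without ever emptying the queue, which matches the stuck long-prompt symptom described in the commit message.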
