Skip to content

Commit 310246d

Browse files
janhilgard and claude committed
fix: MLLM image processing — exclude_none for Jinja template, error handling
- Use model_dump(exclude_none=True) for MLLM messages: the Qwen3VL Jinja template checks `'image_url' in item` — null keys from Pydantic model_dump() falsely triggered extra <|image_pad|> tokens, causing an "index out of bounds" crash in the processor
- Add per-request error handling in MLLM batch preprocessing: failed requests now get an immediate finish_reason="error" instead of an infinite retry loop (previously retried 5756 times over 300 s before timing out)
- Handle error responses in the MLLM scheduler to properly clean up and return an error status to the client

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent bf78650 commit 310246d

File tree

3 files changed

+68
-6
lines changed

3 files changed

+68
-6
lines changed

vllm_mlx/mllm_batch_generator.py

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,9 @@ def __init__(
348348
# Statistics
349349
self._stats = MLLMBatchStats()
350350

351+
# Error responses for requests that failed during preprocessing
352+
self._pending_error_responses: List[MLLMBatchResponse] = []
353+
351354
# Vision embedding cache for repeated images
352355
self.vision_cache = VisionEmbeddingCache(
353356
max_pixel_entries=vision_cache_size,
@@ -621,9 +624,35 @@ def _process_prompts(self, requests: List[MLLMBatchRequest]) -> MLLMBatch:
621624

622625
tic = time.perf_counter()
623626

624-
# Preprocess all requests
627+
# Preprocess all requests (per-request error handling)
628+
failed_requests = []
625629
for req in requests:
626-
self._preprocess_request(req)
630+
try:
631+
self._preprocess_request(req)
632+
except Exception as e:
633+
logger.error(
634+
f"Failed to preprocess request {req.request_id}: "
635+
f"{type(e).__name__}: {e}"
636+
)
637+
failed_requests.append(req)
638+
639+
# Remove failed requests from batch and create error responses
640+
if failed_requests:
641+
for req in failed_requests:
642+
requests.remove(req)
643+
self._pending_error_responses.append(
644+
MLLMBatchResponse(
645+
uid=req.uid,
646+
request_id=req.request_id,
647+
token=0,
648+
logprobs=mx.zeros(1),
649+
finish_reason="error",
650+
)
651+
)
652+
653+
if not requests:
654+
# All requests failed
655+
return None
627656

628657
total_prompt_tokens = sum(
629658
req.input_ids.size if req.input_ids is not None else 1 for req in requests
@@ -768,10 +797,16 @@ def _next(self) -> List[MLLMBatchResponse]:
768797
self.active_batch = new_batch
769798
prompt_processing = True
770799

800+
# Collect any pending error responses (from failed preprocessing)
801+
error_responses = []
802+
if self._pending_error_responses:
803+
error_responses = list(self._pending_error_responses)
804+
self._pending_error_responses.clear()
805+
771806
# Generate next token for active batch
772807
batch = self.active_batch
773808
if batch is None:
774-
return []
809+
return error_responses
775810

776811
y, logprobs = batch.y, batch.logprobs
777812
batch.y, batch.logprobs = self._step(y[:, None], batch.cache)
@@ -840,7 +875,7 @@ def _next(self) -> List[MLLMBatchResponse]:
840875
self.active_batch = None
841876

842877
self._stats.generation_tokens += len(responses)
843-
return responses
878+
return error_responses + responses
844879

845880
def next(self) -> List[MLLMBatchResponse]:
846881
"""

vllm_mlx/mllm_scheduler.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -446,6 +446,27 @@ def _process_batch_responses(
446446
if request is None:
447447
continue
448448

449+
# Handle error responses from failed preprocessing
450+
if response.finish_reason == "error":
451+
output = RequestOutput(
452+
request_id=request_id,
453+
new_token_ids=[],
454+
new_text="",
455+
output_token_ids=[],
456+
prompt_tokens=0,
457+
completion_tokens=0,
458+
finished=True,
459+
finish_reason="error",
460+
)
461+
request.status = RequestStatus.FINISHED_ABORTED
462+
request.output_text = ""
463+
request.finish_reason = "error"
464+
finished_ids.add(request_id)
465+
self.num_requests_processed += 1
466+
logger.warning(f"Request {request_id} failed during preprocessing")
467+
outputs.append(output)
468+
continue
469+
449470
# Append token to request
450471
request.output_tokens.append(response.token)
451472
request.num_output_tokens = len(request.output_tokens)

vllm_mlx/server.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1385,10 +1385,16 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
13851385
# For MLLM models, keep original messages with embedded images
13861386
# (MLLM.chat() extracts images from message content internally)
13871387
if engine.is_mllm:
1388-
# Convert Pydantic messages to dicts preserving full content
1388+
# Convert Pydantic messages to dicts preserving full content.
1389+
# exclude_none=True is critical: Qwen3VL Jinja template checks
1390+
# 'image_url' in item — null keys would falsely trigger image tokens.
13891391
messages = []
13901392
for msg in request.messages:
1391-
msg_dict = msg.model_dump() if hasattr(msg, "model_dump") else dict(msg)
1393+
msg_dict = (
1394+
msg.model_dump(exclude_none=True)
1395+
if hasattr(msg, "model_dump")
1396+
else {k: v for k, v in dict(msg).items() if v is not None}
1397+
)
13921398
messages.append(msg_dict)
13931399
images, videos = [], [] # MLLM extracts these from messages
13941400
logger.debug(f"MLLM: Processing {len(messages)} messages")

0 commit comments

Comments (0)