feat: Qwen3.5 VLM loading, streaming detokenizer, tool markup stripping

janhilgard · claude · janhilgard · commit 476f717cc64b · 2026-03-03T17:14:08.000+01:00
- Add _needs_strict_false() to detect VLM models and skip wasteful strict=True load
- Add per-request streaming detokenizer pool for UTF-8 safe incremental decode
- Strip leaked <tool_call> markup tags from content output

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/vllm_mlx/api/utils.py b/vllm_mlx/api/utils.py
@@ -18,7 +18,8 @@
     r"<\|end\|>|<\|eot_id\|>|<\|start_header_id\|>|<\|end_header_id\|>|"
     r"<\|channel\|>|<\|message\|>|<\|start\|>|<\|return\|>|<\|call\|>|<\|constrain\|>|"
     r"</s>|<s>|<pad>|\[PAD\]|\[SEP\]|\[CLS\]|"
-    r"\[e~\[|\]~b\][a-z]*|\]~!b\["
+    r"\[e~\[|\]~b\][a-z]*|\]~!b\[|"
+    r"</?tool_call>|</?tool_call_reasoning>"
 )
 
 
@@ -133,6 +134,9 @@ def clean_output_text(text: str) -> str:
     "InternVL",  # InternVL
     "deepseek-vl",
     "DeepSeek-VL",  # DeepSeek-VL
+    # NOTE: Qwen3.5 is natively multimodal but MoE produces ArraysCache
+    # which is incompatible with MLLM continuous batching (requires KVCache).
+    # Runs as text-only via strict=False fallback until upstream fixes this.
 ]
 
 
diff --git a/vllm_mlx/mllm_scheduler.py b/vllm_mlx/mllm_scheduler.py
@@ -28,6 +28,7 @@
 from dataclasses import dataclass, field
 from typing import Any, AsyncIterator, Dict, List, Optional, Set, Tuple
 
+from mlx_lm.tokenizer_utils import NaiveStreamingDetokenizer
 
 from .mllm_batch_generator import (
     MLLMBatchGenerator,
@@ -198,6 +199,9 @@ def __init__(
         self.request_id_to_uid: Dict[str, int] = {}
         self.uid_to_request_id: Dict[int, str] = {}
 
+        # Per-request streaming detokenizers for UTF-8-safe incremental decode
+        self._detokenizer_pool: Dict[str, Any] = {}
+
         # Output queues for async streaming
         self.output_queues: Dict[str, asyncio.Queue] = {}
 
@@ -446,8 +450,17 @@ def _process_batch_responses(
             request.output_tokens.append(response.token)
             request.num_output_tokens = len(request.output_tokens)
 
-            # Decode the new token
-            new_text = tokenizer.decode([response.token])
+            # Decode the new token using streaming detokenizer (UTF-8 safe)
+            if request_id not in self._detokenizer_pool:
+                if hasattr(tokenizer, "detokenizer"):
+                    detok = tokenizer.detokenizer
+                else:
+                    detok = NaiveStreamingDetokenizer(tokenizer)
+                detok.reset()
+                self._detokenizer_pool[request_id] = detok
+            detok = self._detokenizer_pool[request_id]
+            detok.add_token(response.token)
+            new_text = detok.last_segment
 
             # Create output
             output = RequestOutput(
@@ -470,10 +483,16 @@ def _process_batch_responses(
                 output.finish_reason = response.finish_reason
                 finished_ids.add(request_id)
 
-                # Decode full output
-                output.output_text = tokenizer.decode(request.output_tokens)
+                # Finalize streaming detokenizer and get full output
+                detok = self._detokenizer_pool.get(request_id)
+                if detok is not None:
+                    detok.finalize()
+                    output.output_text = detok.text
+                else:
+                    output.output_text = tokenizer.decode(request.output_tokens)
                 request.output_text = output.output_text
                 request.finish_reason = response.finish_reason
+                self._detokenizer_pool.pop(request_id, None)
 
                 self.total_completion_tokens += request.num_output_tokens
                 self.num_requests_processed += 1
diff --git a/vllm_mlx/scheduler.py b/vllm_mlx/scheduler.py
@@ -20,6 +20,7 @@
 import mlx.core as mx
 from mlx_lm.generate import BatchGenerator
 from mlx_lm.sample_utils import make_logits_processors, make_sampler
+from mlx_lm.tokenizer_utils import NaiveStreamingDetokenizer
 
 from .memory_cache import MemoryAwarePrefixCache, MemoryCacheConfig
 from .paged_cache import PagedCacheManager
@@ -403,7 +404,9 @@ def _chunked_next(self=batch_gen):  # noqa: C901
 
                     if not is_cached:
                         padded = _left_pad_prompts(inputs_raw, max_length=max_length)
-                        prompt_cache = _make_cache(self.model, padding, self.max_kv_size)
+                        prompt_cache = _make_cache(
+                            self.model, padding, self.max_kv_size
+                        )
                     else:
                         last_inputs = mx.array([p[-1:] for p in inputs_raw])
                         padded = _right_pad_prompts(inputs_raw, max_length=max_length)
@@ -980,6 +983,9 @@ def __init__(
         # Detect if tokenizer is a processor (MLLM) and get the actual tokenizer
         self._actual_tokenizer = self._get_actual_tokenizer(tokenizer)
 
+        # Per-request streaming detokenizers for UTF-8-safe incremental decode
+        self._detokenizer_pool: Dict[str, Any] = {}
+
         # Request management - following vLLM's design
         self.waiting: deque[Request] = deque()  # Waiting queue (FCFS)
         self.running: Dict[str, Request] = {}  # Running requests by ID
@@ -1080,6 +1086,21 @@ def _decode_tokens(self, token_ids: List[int]) -> str:
         """
         return self._actual_tokenizer.decode(token_ids)
 
+    def _get_detokenizer(self, request_id: str) -> Any:
+        """Get or create a streaming detokenizer for a request."""
+        if request_id not in self._detokenizer_pool:
+            if hasattr(self.tokenizer, "detokenizer"):
+                detok = self.tokenizer.detokenizer
+            else:
+                detok = NaiveStreamingDetokenizer(self._actual_tokenizer)
+            detok.reset()
+            self._detokenizer_pool[request_id] = detok
+        return self._detokenizer_pool[request_id]
+
+    def _cleanup_detokenizer(self, request_id: str) -> None:
+        """Remove the streaming detokenizer for a finished request."""
+        self._detokenizer_pool.pop(request_id, None)
+
     def _get_stop_tokens(self) -> Set[int]:
         """Get stop token IDs from tokenizer or processor."""
         stop_tokens = set()
@@ -1872,11 +1893,13 @@ def _process_batch_responses(
 
                 request.first_token_time = _time.time()
 
-            # Decode the new token (skip stop tokens — they are not content)
+            # Decode the new token using streaming detokenizer (UTF-8 safe)
             if response.finish_reason == "stop":
                 new_text = ""
             else:
-                new_text = self._decode_tokens([response.token])
+                detok = self._get_detokenizer(request_id)
+                detok.add_token(response.token)
+                new_text = detok.last_segment
 
             # Create output
             output = RequestOutput(
@@ -1899,9 +1922,15 @@ def _process_batch_responses(
                 output.finish_reason = response.finish_reason
                 finished_ids.add(request_id)
 
-                # Decode full output
-                output.output_text = self._decode_tokens(request.output_token_ids)
+                # Finalize streaming detokenizer and get full output
+                detok = self._detokenizer_pool.get(request_id)
+                if detok is not None:
+                    detok.finalize()
+                    output.output_text = detok.text
+                else:
+                    output.output_text = self._decode_tokens(request.output_token_ids)
                 request.output_text = output.output_text
+                self._cleanup_detokenizer(request_id)
 
                 # Extract cache for future reuse (critical for agentic multi-turn)
                 if hasattr(response, "prompt_cache"):
diff --git a/vllm_mlx/server.py b/vllm_mlx/server.py
@@ -42,6 +42,7 @@
 import json
 import logging
 import os
+import re
 import secrets
 import tempfile
 import threading
@@ -158,6 +159,11 @@ def _resolve_top_p(request_value: float | None) -> float:
 _tool_call_parser: str | None = None  # Parser name: auto, mistral, qwen, llama, hermes
 _tool_parser_instance = None  # Instantiated parser
 
+# Pattern to strip leaked tool call markup from content output.
+# Safety net: the tool parser should consume these, but if it doesn't
+# (e.g. malformed JSON, stray closing tags), strip them before emitting.
+_TOOL_MARKUP_PATTERN = re.compile(r"</?tool_call>|</?tool_call_reasoning>")
+
 
 def _load_prefix_cache_from_disk() -> None:
     """Load prefix cache from disk during startup."""
@@ -2097,6 +2103,9 @@ async def stream_chat_completion(
 
                     # Normal content from tool parser
                     content = tool_result.get("content", "")
+                    # Strip any leaked tool markup tags
+                    if content:
+                        content = _TOOL_MARKUP_PATTERN.sub("", content)
 
             chunk = ChatCompletionChunk(
                 id=response_id,
@@ -2187,6 +2196,9 @@ async def stream_chat_completion(
 
                     # Normal content from tool parser
                     content = tool_result.get("content", "")
+                    # Strip any leaked tool markup tags
+                    if content:
+                        content = _TOOL_MARKUP_PATTERN.sub("", content)
 
             chunk = ChatCompletionChunk(
                 id=response_id,
diff --git a/vllm_mlx/utils/tokenizer.py b/vllm_mlx/utils/tokenizer.py
@@ -28,6 +28,27 @@ def _needs_tokenizer_fallback(model_name: str) -> bool:
     return any(pattern.lower() in model_lower for pattern in FALLBACK_MODELS)
 
 
+def _needs_strict_false(model_name: str) -> bool:
+    """Check if model needs strict=False loading (VLM models with extra weights).
+
+    VLM models (e.g., Qwen3.5) have vision_tower weights that don't match
+    the text-only model class.  Loading with strict=True fails and wastes
+    memory by loading all weights (~100 GB) before raising ValueError.
+    Detect these models up-front to avoid the double-load penalty.
+    """
+    from mlx_lm.utils import _download, load_config
+
+    try:
+        model_path = _download(model_name)
+        config = load_config(model_path)
+    except Exception:
+        return False
+    # Treat as VLM only when the config has both vision_config and text_config
+    if "vision_config" in config and "text_config" in config:
+        return True
+    return False
+
+
 def load_model_with_fallback(model_name: str, tokenizer_config: dict = None):
     """
     Load model and tokenizer with fallback for non-standard tokenizers.
@@ -50,32 +71,36 @@ def load_model_with_fallback(model_name: str, tokenizer_config: dict = None):
         )
         return _load_with_tokenizer_fallback(model_name)
 
+    # VLM models (e.g., Qwen3.5) have extra vision weights that cause
+    # strict=True to fail.  Skip the first load attempt to avoid loading
+    # ~100 GB of weights twice (which can cause OOM on 256 GB systems).
+    if _needs_strict_false(model_name):
+        logger.info(
+            f"Model {model_name} detected as VLM, loading directly with strict=False"
+        )
+        return _load_strict_false(model_name, tokenizer_config)
+
     try:
         model, tokenizer = load(model_name, tokenizer_config=tokenizer_config)
     except ValueError as e:
         # Fallback for models with non-standard tokenizers
         if "TokenizersBackend" in str(e) or "Tokenizer class" in str(e):
             logger.warning(f"Standard tokenizer loading failed, using fallback: {e}")
             return _load_with_tokenizer_fallback(model_name)
-        # Fallback for models with extra weights (e.g., MTP layers)
+        # Fallback for models with extra weights (e.g., MTP layers, vision tower)
         elif "parameters not in model" in str(e):
-            logger.warning(f"Extra parameters found (e.g., MTP weights), retrying with strict=False")
+            logger.warning(
+                f"Extra parameters found (e.g., MTP/vision weights), retrying with strict=False"
+            )
             # Clear traceback references to free memory from the failed first load.
             # Without this, large models (200GB+) cause OOM during retry because
             # the traceback holds references to the first load's weight tensors.
             e.__traceback__ = None
             del e
             import gc
+
             gc.collect()
-            from mlx_lm.utils import _download, load_model, load_tokenizer
-            model_path = _download(model_name)
-            model, config = load_model(model_path, strict=False)
-            tokenizer = load_tokenizer(
-                model_path, tokenizer_config,
-                eos_token_ids=config.get("eos_token_id", None),
-            )
-            _try_inject_mtp(model, model_path, config)
-            return model, tokenizer
+            return _load_strict_false(model_name, tokenizer_config)
         else:
             raise
 
@@ -84,10 +109,53 @@ def load_model_with_fallback(model_name: str, tokenizer_config: dict = None):
     return model, tokenizer
 
 
+def _load_strict_false(model_name: str, tokenizer_config: dict = None):
+    """Load model with strict=False to discard extra weights.
+
+    Handles models with extra parameters that the text-only model class
+    doesn't define (e.g., vision tower weights in VLM models like Qwen3.5,
+    or MTP layers).  The model's own sanitize() handles key remapping
+    (e.g., language_model.* prefix), and strict=False silently drops
+    unmatched keys.
+    """
+    import mlx.core as mx
+    from mlx_lm.utils import _download, load_model, load_tokenizer
+
+    model_path = _download(model_name)
+    model, config = load_model(model_path, strict=False)
+
+    # Verify weights loaded correctly
+    from mlx.utils import tree_flatten
+
+    params = tree_flatten(model.parameters())
+    total_params = len(params)
+    zero_params = sum(1 for _, v in params if mx.all(v == 0).item())
+    logger.info(
+        f"[strict=False] Loaded {total_params} parameters, "
+        f"{zero_params} all-zero tensors"
+    )
+    # Spot-check embedding weights
+    if hasattr(model, "language_model"):
+        emb = model.language_model.model.embed_tokens.weight
+        logger.info(
+            f"[strict=False] embed_tokens: shape={emb.shape}, "
+            f"dtype={emb.dtype}, mean={mx.mean(emb.astype(mx.float32)).item():.4f}"
+        )
+
+    tokenizer = load_tokenizer(
+        model_path,
+        tokenizer_config or {},
+        eos_token_ids=config.get("eos_token_id", None),
+    )
+    _try_inject_mtp(model, model_path, config)
+    return model, tokenizer
+
+
 def _try_inject_mtp(model, model_path, config):
     """Inject MTP support if model has MTP config + weights."""
     if config.get("num_nextn_predict_layers", 0) > 0:
         from ..patches.qwen3_next_mtp import inject_mtp_support
+
         inject_mtp_support(model, model_path, config)