Commit 1f7bb8f

Your Name and claude committed
fix: streaming crash with --no-thinking (enable_thinking kwarg leak)
stream_chat() passed enable_thinking through **kwargs to stream_generate() → MLXLanguageModel.stream_generate(), which doesn't accept it, causing a TypeError on every streaming request.

The key is now popped from kwargs before passing downstream, matching the non-streaming path, which already did this correctly. The MLLM stream_chat path is also fixed for the same issue.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d6d0c50 commit 1f7bb8f

File tree

1 file changed (+3, -1 lines)


vllm_mlx/engine/simple.py

Lines changed: 3 additions & 1 deletion
@@ -505,6 +505,8 @@ async def stream_chat(
         token_count = 0

         # Run the synchronous generator in a thread
+        # Pop enable_thinking — MLLM models don't support it
+        kwargs.pop("enable_thinking", None)
         sync_gen = self._model.stream_chat(
             messages=messages,
             max_tokens=max_tokens,
@@ -540,7 +542,7 @@ async def stream_chat(
             return

         # For LLM, apply chat template and stream
-        enable_thinking = kwargs.get("enable_thinking")
+        enable_thinking = kwargs.pop("enable_thinking", None)
         prompt = shared_apply_chat_template(
             self._model.tokenizer,
             messages,
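The failure mode is easy to reproduce in isolation. The sketch below uses hypothetical function names, not the actual vllm_mlx code: a caller forwards its **kwargs verbatim into a backend whose signature lacks one of the keys, so Python's argument binding raises TypeError at call time. Popping the key first, as this commit does, keeps the downstream call clean.

```python
def backend_stream_generate(prompt, max_tokens=256):
    # Stand-in for MLXLanguageModel.stream_generate: note there is
    # no enable_thinking parameter in its signature.
    yield from (prompt, "...")

def stream_chat_buggy(prompt, **kwargs):
    # Bug: enable_thinking leaks through **kwargs to the backend,
    # raising TypeError("unexpected keyword argument ...") on every call.
    return backend_stream_generate(prompt, **kwargs)

def stream_chat_fixed(prompt, **kwargs):
    # Fix: consume the key before forwarding kwargs downstream.
    # (In the real code the value is used when applying the chat template.)
    enable_thinking = kwargs.pop("enable_thinking", None)
    return backend_stream_generate(prompt, **kwargs)

try:
    list(stream_chat_buggy("hi", enable_thinking=False))
except TypeError as exc:
    print("buggy path:", exc)

print(list(stream_chat_fixed("hi", enable_thinking=False)))  # ['hi', '...']
```

Because argument binding happens when the generator function is called, the TypeError fires even before the first token is yielded, which is why every streaming request crashed rather than only requests that iterated the stream.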

0 commit comments
