2828import uvicorn
2929import zmq
3030import zmq .asyncio
31- from fastapi .responses import ORJSONResponse , StreamingResponse
31+ from fastapi .responses import JSONResponse , StreamingResponse
3232from mlx_lm .tokenizer_utils import StreamingDetokenizer
3333from mlx_lm .utils import load_config
3434from pydantic import BaseModel
@@ -101,6 +101,7 @@ class HTTPRequestInfo:
101101 # tool calling support
102102 tool_state : Optional [ToolCallState ] = None
103103 tool_calls : List [Dict [str , Any ]] = field (default_factory = list )
104+ enable_thinking : bool = True
104105
105106
106107class HTTPHandler :
@@ -137,12 +138,29 @@ def __init__(
137138 self .tokenizer = load_tokenizer (model_path , eos_token_ids = config .get ("eos_token_id" , None ))
138139 self .detokenizer_class , self .tokenmap = load_detokenizer (model_path , self .tokenizer )
139140
@staticmethod
def _is_thinking_enabled(request: Dict) -> bool:
    """Return whether the request leaves model "thinking" enabled.

    ``chat_template_kwargs`` is read from the top level of ``request`` and
    from ``request["extra_body"]``. When both are present, the values from
    ``extra_body`` take precedence.

    Args:
        request: Parsed request payload.

    Returns:
        ``False`` only when the merged kwargs set ``enable_thinking`` to
        exactly ``False``; ``True`` otherwise, including when the key is
        absent.
    """
    merged_kwargs = {}

    # ``.get(key, {})`` only covers a missing key. A client may still send an
    # explicit null (or another non-mapping value), so only merge real dicts
    # instead of letting ``dict(None)`` raise TypeError.
    top_level_kwargs = request.get("chat_template_kwargs")
    if isinstance(top_level_kwargs, dict):
        merged_kwargs.update(top_level_kwargs)

    extra_body = request.get("extra_body")
    if isinstance(extra_body, dict):
        nested_kwargs = extra_body.get("chat_template_kwargs")
        # Same guard for the nested copy: ``update(None)`` would raise.
        if isinstance(nested_kwargs, dict):
            merged_kwargs.update(nested_kwargs)

    # Identity check on purpose: only a literal False disables thinking.
    return merged_kwargs.get("enable_thinking") is not False
148+
def _get_initial_assistant_content(self, request_info: "HTTPRequestInfo") -> str:
    """Return the text that opens the assistant message for a request.

    The decision is based on a case-insensitive substring match against
    ``self.model_path_str``:

    * paths containing ``minimax-m2`` always open with ``<think>``;
    * paths containing ``qwen3.6`` open with ``<think>`` only when
      ``request_info.enable_thinking`` is truthy;
    * every other case yields an empty string.

    Args:
        request_info: Per-request state; only ``enable_thinking`` is read.

    Returns:
        ``"<think>"`` or ``""``.
    """
    lowered_path = self.model_path_str.lower()
    opens_with_think = "minimax-m2" in lowered_path or (
        "qwen3.6" in lowered_path and request_info.enable_thinking
    )
    return "<think>" if opens_with_think else ""
156+
140157 def create_request (self , request : Dict ):
141158 """Creates a new request information"""
142159 rid = request ["rid" ]
143160 stream = request .get ("stream" , False )
144161 model = request .get ("model" , "default" )
145162 return_probs = request .get ("return_probs" , False ) # Check if probs requested
163+ enable_thinking = self ._is_thinking_enabled (request )
146164 chat_object = "chat.completion.chunk" if stream else "chat.completion"
147165 detokenizer = self .detokenizer_class (self .tokenizer , self .tokenmap )
148166 create_time = time .time ()
@@ -156,6 +174,7 @@ def create_request(self, request: Dict):
156174 update_time = update_time ,
157175 detokenizer = detokenizer ,
158176 return_probs = return_probs ,
177+ enable_thinking = enable_thinking ,
159178 )
160179 request_info .tool_state = ToolCallState .from_tokenizer (
161180 self .tokenizer , request .get ("tools" ), stream
@@ -206,9 +225,7 @@ def _generate_stream_chunk(self, rid, token, is_first=False, is_last=False):
206225
207226 if is_first :
208227 role = "assistant"
209- content = ""
210- if "minimax-m2" in self .model_path_str .lower ():
211- content = "<think>"
228+ content = self ._get_initial_assistant_content (request_info )
212229 tool_calls = None
213230 elif is_last :
214231 role = None
@@ -318,7 +335,7 @@ def generate_non_stream_response(self, rid):
318335 choice = response ["choices" ][0 ]
319336 choice ["message" ] = {
320337 "role" : "assistant" ,
321- "content" : request_info .text ,
338+ "content" : self . _get_initial_assistant_content ( request_info ) + request_info .text ,
322339 "reasoning_content" : None ,
323340 "tool_calls" : request_info .tool_calls or None ,
324341 }
@@ -464,7 +481,7 @@ def create_error_response(
464481):
465482 """Creates a json error response for the frontend."""
466483 error = ErrorResponse (message = message , type = err_type , code = status_code .value )
467- return ORJSONResponse (content = error .model_dump (), status_code = error .code )
484+ return JSONResponse (content = error .model_dump (), status_code = error .code )
468485
469486
470487# Fast API
@@ -548,7 +565,7 @@ async def v1_chat_completions(raw_request: fastapi.Request):
548565
549566 response = app .state .http_handler .generate_non_stream_response (request_id )
550567 app .state .http_handler .release_request (request_id )
551- return ORJSONResponse (status_code = 200 , content = response )
568+ return JSONResponse (status_code = 200 , content = response )
552569 except Exception as e :
553570 # Handle any unexpected errors during processing
554571 logger .error (f"Error processing non-streaming request { request_id } : { e } " )
0 commit comments