@@ -198,9 +198,17 @@ def parse_args():
198198 "the server forwards 'tools' to the chat template and parses "
199199 "tool calls out of the model output using --tool-call-parser." )
200200 p .add_argument ("--tool-call-parser" , default = "hermes" ,
201- choices = ["hermes" ],
202- help = "Format used by the model to emit tool calls. "
203- "'hermes' = <tool_call>{json}</tool_call> (Qwen2.5/3, Hermes 2/3)." )
201+ choices = ["hermes" , "json_codeblock" , "qwen3_coder" ,
202+ "llama" , "mistral" , "auto" ],
203+ help = "Format used by the model to emit tool calls.\n "
204+ " hermes = <tool_call>{json}</tool_call> (Qwen2.5/3, Hermes 2/3)\n "
205+ " json_codeblock = ```json\\ n{...}\\ n``` (Qwen2.5-Coder agent, mixed-format models)\n "
206+ " qwen3_coder = <function=X><parameter=K>V</parameter> XML attr style\n "
207+ " llama = <|python_tag|>{json}<|eom_id|> (Llama 3.1/3.2/3.3)\n "
208+ " mistral = [TOOL_CALLS][...] prefix + JSON array\n "
209+ " auto = try each parser in order; first match wins.\n "
210+ "Recommended: 'auto' when serving mixed/unknown formats; pin to a specific one\n "
211+ "for the model you're serving." )
204212
205213 # Pre-warm
206214 p .add_argument ("--warm" , default = None ,
@@ -455,8 +463,9 @@ def main():
455463 max_retries = args .oom_max_retries ,
456464 )
457465 log .info (
458- "OOM recovery enabled: max_retries=%d, streaming=%s" ,
459- args .oom_max_retries , args .awq_streaming ,
466+ "OOM recovery enabled: max_retries=%s, streaming=%s" ,
467+ args .oom_max_retries if args .oom_max_retries is not None else "default" ,
468+ args .awq_streaming ,
460469 )
461470
462471 worker = EngineWorker (
0 commit comments