tool-call auto

pythongiant · pythongiant · commit 47c7027d34b6 · 2026-05-25T10:11:51.000+05:30
diff --git a/src/kvboost/server/__main__.py b/src/kvboost/server/__main__.py
@@ -198,9 +198,17 @@ def parse_args():
                         "the server forwards 'tools' to the chat template and parses "
                         "tool calls out of the model output using --tool-call-parser.")
     p.add_argument("--tool-call-parser", default="hermes",
-                   choices=["hermes"],
-                   help="Format used by the model to emit tool calls. "
-                        "'hermes' = <tool_call>{json}</tool_call> (Qwen2.5/3, Hermes 2/3).")
+                   choices=["hermes", "json_codeblock", "qwen3_coder",
+                            "llama", "mistral", "auto"],
+                   help="Format used by the model to emit tool calls.\n"
+                        "  hermes         = <tool_call>{json}</tool_call> (Qwen2.5/3, Hermes 2/3)\n"
+                        "  json_codeblock = ```json\\n{...}\\n``` (Qwen2.5-Coder agent, mixed-format models)\n"
+                        "  qwen3_coder    = <function=X><parameter=K>V</parameter> XML attr style\n"
+                        "  llama          = <|python_tag|>{json}<|eom_id|> (Llama 3.1/3.2/3.3)\n"
+                        "  mistral        = [TOOL_CALLS][...] prefix + JSON array\n"
+                        "  auto           = try each parser in order; first match wins.\n"
+                        "Recommended: 'auto' when serving mixed/unknown formats; pin to a specific one\n"
+                        "for the model you're serving.")
 
     # Pre-warm
     p.add_argument("--warm", default=None,
@@ -455,8 +463,9 @@ def main():
             max_retries=args.oom_max_retries,
         )
         log.info(
-            "OOM recovery enabled: max_retries=%d, streaming=%s",
-            args.oom_max_retries, args.awq_streaming,
+            "OOM recovery enabled: max_retries=%s, streaming=%s",
+            args.oom_max_retries if args.oom_max_retries is not None else "default",
+            args.awq_streaming,
         )
 
     worker = EngineWorker(
diff --git a/src/kvboost/server/app.py b/src/kvboost/server/app.py
@@ -292,8 +292,15 @@ async def chat_completions(req: ChatCompletionRequest, response: Response):
             and req.tool_choice != "none"
         )
         if tools_active:
+            # Build an allowlist of declared tool names — the parser uses it
+            # to drop calls naming hallucinated / unrelated functions, which
+            # also prevents misclassifying incidental ```json blocks as calls.
+            tool_names = {
+                t.function.name for t in (req.tools or [])
+                if t.function and t.function.name
+            }
             cleaned_text, parsed_calls = tool_parsers.parse(
-                result.output_text, _parser_name,
+                result.output_text, _parser_name, tool_names=tool_names,
             )
             parsed_calls = _filter_tool_choice(parsed_calls, req.tool_choice)
             if parsed_calls:
@@ -489,8 +496,16 @@ async def _stream_chat(
     tools_active = (
         auto_tools and bool(req.tools) and req.tool_choice != "none"
     )
+    stream_tool_names = (
+        {
+            t.function.name for t in (req.tools or [])
+            if t.function and t.function.name
+        }
+        if tools_active else None
+    )
     stream_parser = (
-        tool_parsers.make_streaming_parser(parser_name) if tools_active else None
+        tool_parsers.make_streaming_parser(parser_name, tool_names=stream_tool_names)
+        if tools_active else None
     )
     emitted_tool_calls = []  # list of ToolCall, used to set finish_reason
     tool_call_index = 0       # OpenAI delta indexing within this completion
diff --git a/src/kvboost/server/tool_parsers.py b/src/kvboost/server/tool_parsers.py