[tools][wip] Add support for Llama 3.2 tool-call injection: batch tool calls, user message integration, and enhanced response parsing.

orionpapadakis · orionpapadakis · commit a35e87ba4dd9 · 2026-05-14T15:10:54.000+03:00
diff --git a/src/main/java/org/beehive/gpullama3/model/format/ChatFormat.java b/src/main/java/org/beehive/gpullama3/model/format/ChatFormat.java
@@ -39,16 +39,54 @@ default ChatTokens chatTokens() {
 
     /**
      * Returns plain text to append to the system message content when tools are available.
-     * The returned string is concatenated to the system message before encoding, so the
-     * normal {@link #encodeMessage} path handles tokenization.
+     * Used by formats that inject tool definitions into the <em>system</em> message.
      *
-     * @param toolsJson JSON array of tool definitions, e.g.
-     *                  {@code [{"type":"function","function":{...}}]}
+     * <p>Formats that inject tools into the <em>user</em> message instead should override
+     * {@link #injectsToolsInUserMessage()}, {@link #toolSystemMessagePrefix()}, and
+     * {@link #toolFirstUserMessagePrefix(String)} rather than this method.
+     *
+     * @param toolsJson JSON array of tool definitions
      */
     default String toolSystemPromptSuffix(String toolsJson) {
         throw new UnsupportedOperationException("Tool calling not supported for: " + getClass().getSimpleName());
     }
 
+    /**
+     * Returns {@code true} when this format injects tool definitions into the
+     * <em>first user message</em> instead of the system message.
+     *
+     * <p>When this returns {@code true}, callers should:
+     * <ol>
+     *   <li>Prepend {@link #toolSystemMessagePrefix()} to the system message content.</li>
+     *   <li>Prepend {@link #toolFirstUserMessagePrefix(String)} to the first user message.</li>
+     * </ol>
+     * When {@code false} (default), callers should append {@link #toolSystemPromptSuffix} to
+     * the system message as before.
+     */
+    default boolean injectsToolsInUserMessage() {
+        return false;
+    }
+
+    /**
+     * Returns text to <em>prepend</em> to the system message content when tools are active
+     * and {@link #injectsToolsInUserMessage()} is {@code true}.
+     * Default: empty string (no prefix).
+     */
+    default String toolSystemMessagePrefix() {
+        return "";
+    }
+
+    /**
+     * Returns the preamble to <em>prepend</em> to the first user message when
+     * {@link #injectsToolsInUserMessage()} is {@code true}.
+     * The preamble should include the tool definitions and usage instructions.
+     *
+     * @param toolsJson JSON array of tool definitions
+     */
+    default String toolFirstUserMessagePrefix(String toolsJson) {
+        return "";
+    }
+
     /**
      * Re-encodes a prior assistant tool-call turn into the conversation token stream.
      * Used when replaying multi-turn history that contains a previous tool call.
@@ -80,6 +118,18 @@ default Optional<ToolCallExtract> extractToolCall(String responseText) {
         return Optional.empty();
     }
 
+    /**
+     * Extracts ALL tool calls from a response. Models may emit multiple
+     * {@code <tool_call>} blocks in a single turn (batch tool calls).
+     * The default delegates to {@link #extractToolCall} for formats that
+     * do not support batch calls.
+     *
+     * @param responseText the fully decoded response from the model
+     */
+    default List<ToolCallExtract> extractAllToolCalls(String responseText) {
+        return extractToolCall(responseText).map(List::of).orElse(List.of());
+    }
+
     /**
      * Stop tokens to use when tool calling is enabled.
      * Some models (LLaMA 3.1+) use a different end-of-turn token ({@code <|eom_id|>})
diff --git a/src/main/java/org/beehive/gpullama3/model/format/LlamaChatFormat.java b/src/main/java/org/beehive/gpullama3/model/format/LlamaChatFormat.java
@@ -78,32 +78,58 @@ public List<Integer> encodeDialogPrompt(boolean appendAssistantTurn, List<Messag
     // ── Tool calling ──────────────────────────────────────────────────────────
 
     /**
-     * LLaMA 3.1 tool calling system prompt suffix.
-     * Instructs the model to respond with JSON using the {"name":…,"parameters":{…}} format.
+     * Llama 3.2 Instruct injects tool definitions into the <em>first user message</em>
+     * (the GGUF-embedded chat template has {@code tools_in_user_message = true} by default).
+     * The system message receives only an environment prefix; the tools and usage instructions
+     * go in the user turn.
      */
     @Override
-    public String toolSystemPromptSuffix(String toolsJson) {
-        return "\n\n# Tools\n\n"
-                + "You may call one or more functions to assist with the user query.\n\n"
-                + "You are provided with function signatures within <tools></tools> XML tags:\n\n"
-                + "<tools>\n" + toolsJson + "\n</tools>\n\n"
-                + "IMPORTANT: the \"name\" field in your tool call MUST be exactly one of the function names "
-                + "listed inside <tools> above — not a path, not a word from the user's message.\n\n"
-                + "For each function call, return a json object with function name and arguments "
-                + "within <tool_call></tool_call> XML tags:\n\n"
-                + "<tool_call>\n"
-                + "{\"name\": <function-name>, \"arguments\": <args-json-object>}\n"
-                + "</tool_call>";
+    public boolean injectsToolsInUserMessage() {
+        return true;
     }
 
     /**
-     * Re-encodes a prior assistant tool-call turn for multi-turn history.
-     * Format: {@code <|start_header_id|>assistant<|end_header_id|>\n<|python_tag|>JSON<|eom_id|>}
+     * System-message prefix that signals tool availability to Llama 3.2.
+     * Emits the template's {@code "Environment: ipython"} line followed by a blank line.
+     */
+    @Override
+    public String toolSystemMessagePrefix() {
+        return "Environment: ipython\n\n";
+    }
+
+    /**
+     * Prepends tool definitions and usage instructions to the first user message,
+     * matching the Llama 3.2 GGUF chat template ({@code tools_in_user_message = true}).
+     *
+     * <p>Format mirrors:
+     * <pre>
+     * Given the following functions, please respond with a JSON for a function call
+     * with its proper arguments that best answers the given prompt.
+     *
+     * Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.
+     * Do not use variables.
+     *
+     * {toolsJson}
+     *
+     * </pre>
+     */
+    @Override
+    public String toolFirstUserMessagePrefix(String toolsJson) {
+        return "Given the following functions, please respond with a JSON for a function call "
+                + "with its proper arguments that best answers the given prompt.\n\n"
+                + "Respond in the format {\"name\": function name, \"parameters\": dictionary of "
+                + "argument name and its value}. Do not use variables.\n\n"
+                + toolsJson + "\n\n";
+    }
+
+    /**
+     * Re-encodes a prior assistant tool-call turn for multi-turn history using the
+     * Llama 3.2 native JSON format: {@code {"name":"…","parameters":{…}}<|eot_id|>}.
      */
     @Override
     public List<Integer> encodeToolCallAssistantTurn(ToolCallExtract toolCall) {
         List<Integer> tokens = new ArrayList<>(encodeHeader(new Message(Role.ASSISTANT, "")));
-        String json = "<tool_call>\n{\"name\":\"" + toolCall.name() + "\",\"arguments\":" + toolCall.argumentsJson() + "}\n</tool_call>";
+        String json = "{\"name\": \"" + toolCall.name() + "\", \"parameters\": " + toolCall.argumentsJson() + "}";
         tokens.addAll(tokenizer.encodeAsList(json));
         tokens.add(endOfTurn);
         return tokens;
@@ -136,6 +162,11 @@ public Optional<ToolCallExtract> extractToolCall(String responseText) {
         return ToolCallParserUtils.parseLlamaResponse(responseText);
     }
 
+    @Override
+    public List<ToolCallExtract> extractAllToolCalls(String responseText) {
+        return ToolCallParserUtils.parseAllToolCalls(responseText);
+    }
+
     /**
      * Adds {@code <|eom_id|>} to the stop tokens when tools are enabled.
      * LLaMA 3.1 ends tool-call turns with {@code <|eom_id|>} instead of {@code <|eot_id|>}.
diff --git a/src/main/java/org/beehive/gpullama3/model/format/Qwen3ChatFormat.java b/src/main/java/org/beehive/gpullama3/model/format/Qwen3ChatFormat.java
@@ -193,4 +193,9 @@ public List<Integer> encodeToolResultTurn(String toolCallId, String toolName, St
     public Optional<ToolCallExtract> extractToolCall(String responseText) {
         return ToolCallParserUtils.parseQwen3Response(responseText);
     }
+
+    @Override
+    public List<ToolCallExtract> extractAllToolCalls(String responseText) {
+        return ToolCallParserUtils.parseAllToolCalls(responseText);
+    }
 }
diff --git a/src/main/java/org/beehive/gpullama3/model/format/ToolCallParserUtils.java b/src/main/java/org/beehive/gpullama3/model/format/ToolCallParserUtils.java
@@ -1,5 +1,7 @@
 package org.beehive.gpullama3.model.format;
 
+import java.util.ArrayList;
+import java.util.List;
 import java.util.Optional;
 
 /**
@@ -41,6 +43,11 @@ public static Optional<ToolCallExtract> parseLlamaResponse(String responseText)
             String json = responseText.substring(tcStart + "<tool_call>".length(), tcEnd).strip();
             return parseLlamaJson(json);
         }
+        // 2b. Unclosed <tool_call> — model stopped (eot_id / eom_id) before writing the closing tag
+        if (tcStart != -1 && tcEnd == -1) {
+            String json = responseText.substring(tcStart + "<tool_call>".length()).strip();
+            return parseLlamaJson(json);
+        }
 
         // 3. Fallback: raw JSON, possibly inside markdown code fences
         String stripped = stripMarkdownFences(responseText.strip());
@@ -72,16 +79,66 @@ private static Optional<ToolCallExtract> parseLlamaJson(String json) {
 
     // ── Qwen3 ─────────────────────────────────────────────────────────────────
 
+    /**
+     * Extracts ALL tool calls from a response that may contain multiple
+     * {@code <tool_call>…</tool_call>} blocks (Llama 3.2 and Qwen3 batch calls).
+     *
+     * Falls back to the raw-JSON single-call path if no call was recovered from tags.
+     * Returns an empty, never {@code null}, list when the response contains no tool calls.
+     */
+    public static List<ToolCallExtract> parseAllToolCalls(String responseText) {
+        List<ToolCallExtract> calls = new ArrayList<>();
+
+        // <|python_tag|> (Llama 3.1) — single call by definition
+        int pythonIdx = responseText.indexOf("<|python_tag|>");
+        if (pythonIdx != -1) {
+            parseLlamaJson(responseText.substring(pythonIdx + "<|python_tag|>".length()).strip())
+                    .ifPresent(calls::add);
+            return calls;
+        }
+
+        // Scan for all <tool_call>…</tool_call> blocks, left to right
+        int searchFrom = 0;
+        while (true) {
+            int start = responseText.indexOf("<tool_call>", searchFrom);
+            if (start == -1) break;
+            int bodyStart = start + "<tool_call>".length();
+            int end = responseText.indexOf("</tool_call>", bodyStart);
+            if (end == -1) {
+                // Unclosed tag — model stopped before writing the closing tag;
+                // everything after the opening tag is the payload and the scan ends here.
+                parseLlamaJson(responseText.substring(bodyStart).strip()).ifPresent(calls::add);
+                break;
+            }
+            parseLlamaJson(responseText.substring(bodyStart, end).strip()).ifPresent(calls::add);
+            searchFrom = end + "</tool_call>".length();
+        }
+
+        // Raw JSON fallback: taken whenever no call was recovered above
+        // (no tags at all, or tags whose payload did not parse). The text may be
+        // wrapped in markdown code fences, which are stripped first.
+        if (calls.isEmpty()) {
+            String stripped = stripMarkdownFences(responseText.strip());
+            if (stripped.startsWith("{")) {
+                parseLlamaJson(stripped).ifPresent(calls::add);
+            }
+        }
+
+        return calls;
+    }
+
     /**
      * Extracts a tool call enclosed in {@code <tool_call>…</tool_call>} tags
      * as produced by Qwen3 models.
      */
     public static Optional<ToolCallExtract> parseQwen3Response(String responseText) {
         int start = responseText.indexOf("<tool_call>");
         int end   = responseText.lastIndexOf("</tool_call>");
-        if (start == -1 || end == -1 || end <= start) return Optional.empty();
+        if (start == -1) return Optional.empty();
 
-        String json = responseText.substring(start + "<tool_call>".length(), end).strip();
+        String json = (end != -1 && end > start)
+                ? responseText.substring(start + "<tool_call>".length(), end).strip()
+                : responseText.substring(start + "<tool_call>".length()).strip();
 
         String name = extractStringValue(json, "name");
         if (name == null) return Optional.empty();
@@ -104,7 +161,11 @@ public static String stripMarkdownFences(String text) {
         return body.strip();
     }
 
-    /** Extracts the string value for {@code "key": "<value>"} from a JSON object. Tolerates whitespace around {@code :}. */
+    /**
+     * Extracts the string value for {@code "key": "<value>"} from a JSON object.
+     * Tolerates whitespace around {@code :} and skips backslash escape sequences ({@code \"}, {@code \\})
+     * inside the value, so embedded quotes do not truncate it; escapes are returned verbatim, not decoded.
+     */
     public static String extractStringValue(String json, String key) {
         String marker = "\"" + key + "\"";
         int markerIdx = json.indexOf(marker);
@@ -113,9 +174,20 @@ public static String extractStringValue(String json, String key) {
         if (colonIdx == -1) return null;
         int quoteStart = json.indexOf('"', colonIdx + 1);
         if (quoteStart == -1) return null;
-        int quoteEnd = json.indexOf('"', quoteStart + 1);
-        if (quoteEnd == -1) return null;
-        return json.substring(quoteStart + 1, quoteEnd);
+        // Scan for the closing quote, honouring backslash escapes
+        int i = quoteStart + 1;
+        while (i < json.length()) {
+            char c = json.charAt(i);
+            if (c == '\\') {
+                i += 2; // skip escape sequence (e.g. \", \\, \n)
+            } else if (c == '"') {
+                break;
+            } else {
+                i++;
+            }
+        }
+        if (i >= json.length()) return null;
+        return json.substring(quoteStart + 1, i);
     }
 
     /**
diff --git a/src/main/java/org/beehive/gpullama3/tools/ToolCallingSession.java b/src/main/java/org/beehive/gpullama3/tools/ToolCallingSession.java

Original file line number	Diff line number	Diff line change
`@@ -193,4 +193,9 @@ public List<Integer> encodeToolResultTurn(String toolCallId, String toolName, St`
`193`	`193`	`public Optional<ToolCallExtract> extractToolCall(String responseText) {`
`194`	`194`	`return ToolCallParserUtils.parseQwen3Response(responseText);`
`195`	`195`	`}`
	`196`	`+`
	`197`	`+ @Override`
	`198`	`+ public List<ToolCallExtract> extractAllToolCalls(String responseText) {`
	`199`	`+ return ToolCallParserUtils.parseAllToolCalls(responseText);`
	`200`	`+ }`
`196`	`201`	`}`