fix: handle Harmony <|call|> EOG token for GPT-OSS tool calling (#1822)

dev-nid · web-flow · commit b3157ebc4a14 · 2026-04-30T12:14:04.000+02:00
* fix: handle Harmony <|call|> EOG token for GPT-OSS tool calling

GPT-OSS models use <|call|> as a frame delimiter in the Harmony tool-call
protocol. This token is in the EOG (end-of-generation) set, causing
generation to stop silently before tool calls reach the SDK.

Add Harmony model detection and <|call|>-specific handling in the
generation loop: render the token as visible text (special=true) so the
SDK can parse frame boundaries, then stop generation for the turn-based
tool execution protocol.

Add a multi-turn tool calling example demonstrating sequential tool
execution across turns and confirming that parallel tool calls are not
supported.

* bump package version to 0.17.2 and update CHANGELOG to match
diff --git a/packages/qvac-lib-infer-llamacpp-llm/CHANGELOG.md b/packages/qvac-lib-infer-llamacpp-llm/CHANGELOG.md
@@ -1,5 +1,15 @@
 # Changelog
 
+## [0.17.2] - 2026-04-30
+
+### Fixed
+
+#### GPT-OSS Harmony tool calling: `<|call|>` frame delimiter now surfaces to the SDK
+
+The `<|call|>` token (Harmony frame terminator) is in the model's EOG set. When sampled, it rendered as 0 bytes and silently stopped generation — tool call output was truncated with no visible frame boundary, resulting in the SDK parsing 0 tool calls.
+
+The generation loop now detects Harmony models and intercepts `<|call|>` before the generic EOG break: it renders the token as visible text (`special=true`) so the SDK can identify frame boundaries, then stops generation cleanly. GPT-OSS uses a turn-based tool protocol — one tool call per generation pass — and the SDK is expected to execute the tool, append results, and re-prompt for subsequent calls.
+
 ## [0.17.1] - 2026-04-28
 
 This patch release adds per-request structured-output support to the LLM addon: callers can now constrain a single completion to either a JSON Schema or a raw GBNF grammar without reloading the model. Back-port of the same change being prepared for `main` in [#1787](https://github.com/tetherto/qvac/pull/1787); shipped here on top of `0.17.0` so it can be consumed by SDK lines that have not yet migrated to `0.18.x`.
diff --git a/packages/qvac-lib-infer-llamacpp-llm/addon/src/model-interface/MtmdLlmContext.cpp b/packages/qvac-lib-infer-llamacpp-llm/addon/src/model-interface/MtmdLlmContext.cpp
@@ -100,6 +100,22 @@ MtmdLlmContext::MtmdLlmContext(
     antipromptTokens_.insert(
         antipromptTokens_.end(), tempTokens.begin(), tempTokens.end());
   }
+
+  isHarmonyModel_ =
+      qvac_lib_inference_addon_llama::utils::isHarmonyModel(model_);
+  if (isHarmonyModel_) {
+    harmonyCallToken_ =
+        qvac_lib_inference_addon_llama::utils::getHarmonyCallToken(lctx_);
+    if (harmonyCallToken_ == LLAMA_TOKEN_NULL) {
+      isHarmonyModel_ = false;
+    }
+  }
+  QLOG_IF(
+      Priority::DEBUG,
+      string_format(
+          "[MtmdLlm] Harmony detection: isHarmony=%d callToken=%d\n",
+          isHarmonyModel_,
+          harmonyCallToken_));
 }
 
 void MtmdLlmContext::initVisionContext() {
@@ -453,7 +469,26 @@ bool MtmdLlmContext::generateResponse(
       }
     }
 
-    if (llama_vocab_is_eog(vocab_, tokenId) || checkAntiprompt()) {
+    bool isEos = llama_vocab_is_eog(vocab_, tokenId);
+
+    if (isEos && isHarmonyModel_ && params_.use_jinja &&
+        tokenId == harmonyCallToken_) {
+      QLOG_IF(
+          Priority::DEBUG,
+          string_format(
+              "[MtmdLlm] Harmony <|call|> stop: tokenId=%d\n", tokenId));
+      if (outputCallback) {
+        std::string callMarker =
+            common_token_to_piece(lctx_, tokenId, true);
+        if (!callMarker.empty()) {
+          outputCallback(callMarker);
+        }
+      }
+      flushPendingUtf8ToCallback(outputCallback);
+      break;
+    }
+
+    if (isEos || checkAntiprompt()) {
       flushPendingUtf8ToCallback(outputCallback);
       break;
     }
diff --git a/packages/qvac-lib-infer-llamacpp-llm/addon/src/model-interface/MtmdLlmContext.hpp b/packages/qvac-lib-infer-llamacpp-llm/addon/src/model-interface/MtmdLlmContext.hpp
@@ -220,5 +220,10 @@ class MtmdLlmContext : public LlmContext {
 
   // UTF-8 token buffer for handling incomplete emoji sequences
   qvac_lib_inference_addon_llama::UTF8TokenBuffer utf8Buffer_;
+
+  // GPT-OSS Harmony: <|call|> closes a tool-call frame; emitted, then we stop
+  bool isHarmonyModel_ = false;
+  llama_token harmonyCallToken_ = LLAMA_TOKEN_NULL;
+
   std::atomic<bool> stopGeneration_ = false;
 };
diff --git a/packages/qvac-lib-infer-llamacpp-llm/addon/src/model-interface/TextLlmContext.cpp b/packages/qvac-lib-infer-llamacpp-llm/addon/src/model-interface/TextLlmContext.cpp
@@ -53,6 +53,23 @@ TextLlmContext::TextLlmContext(
           lctx_, reasoningState_);
     }
 
+    isHarmonyModel_ =
+        qvac_lib_inference_addon_llama::utils::isHarmonyModel(model_);
+    if (isHarmonyModel_) {
+      harmonyCallToken_ =
+          qvac_lib_inference_addon_llama::utils::getHarmonyCallToken(lctx_);
+      if (harmonyCallToken_ == LLAMA_TOKEN_NULL) {
+        isHarmonyModel_ = false;
+      }
+    }
+    QLOG_IF(
+        Priority::DEBUG,
+        string_format(
+            "[TextLlm] Harmony detection: isHarmony=%d callToken=%d useJinja=%d\n",
+            isHarmonyModel_,
+            harmonyCallToken_,
+            params_.use_jinja));
+
     std::string chatTemplate =
         getChatTemplate(model_, params_, tools_.enabled());
     tmpls_ = common_chat_templates_init(model_, chatTemplate);
@@ -510,6 +527,23 @@ bool TextLlmContext::generateResponse(
       }
     }
 
+    if (isEos && isHarmonyModel_ && params_.use_jinja &&
+        tokenId == harmonyCallToken_) {
+      QLOG_IF(
+          Priority::DEBUG,
+          string_format(
+              "[TextLlm] Harmony <|call|> stop: tokenId=%d\n", tokenId));
+      if (outputCallback) {
+        std::string callMarker =
+            common_token_to_piece(lctx_, tokenId, true);
+        if (!callMarker.empty()) {
+          outputCallback(callMarker);
+        }
+      }
+      flushPendingUtf8ToCallback(outputCallback);
+      break;
+    }
+
     if (isEos || checkAntiprompt()) {
       flushPendingUtf8ToCallback(outputCallback);
       break;
diff --git a/packages/qvac-lib-infer-llamacpp-llm/addon/src/model-interface/TextLlmContext.hpp b/packages/qvac-lib-infer-llamacpp-llm/addon/src/model-interface/TextLlmContext.hpp
@@ -198,5 +198,9 @@ class TextLlmContext : public LlmContext {
   // Cache whether this is a Qwen3 model (checked once at load time)
   bool isQwen3Model_ = false;
 
+  // GPT-OSS Harmony: <|call|> closes a tool-call frame; emitted, then we stop
+  bool isHarmonyModel_ = false;
+  llama_token harmonyCallToken_ = LLAMA_TOKEN_NULL;
+
   std::atomic<bool> stopGeneration_ = false;
 };
diff --git a/packages/qvac-lib-infer-llamacpp-llm/addon/src/utils/ChatTemplateUtils.cpp b/packages/qvac-lib-infer-llamacpp-llm/addon/src/utils/ChatTemplateUtils.cpp
@@ -31,6 +31,11 @@ bool isQwen3Architecture(const std::string& architecture) {
   return archStr == "qwen3";
 }
 
+bool isHarmonyArchitecture(const std::string& architecture) {
+  const std::string archStr = normalizeArchitecture(architecture);
+  return archStr == "gpt-oss";
+}
+
 bool modelNameLooksLikeQwen3(const std::string& modelName) {
   std::string normalizedName = modelName;
   std::transform(
@@ -85,6 +90,23 @@ bool isQwen3Model(const ::llama_model* model) {
       getModelArchitecture(model), getModelName(model));
 }
 
+bool isHarmonyModel(const ::llama_model* model) {
+  if (model == nullptr) {
+    return false;
+  }
+  std::optional<std::string> arch = getModelArchitecture(model);
+  return arch.has_value() && isHarmonyArchitecture(arch.value());
+}
+
+llama_token getHarmonyCallToken(::llama_context* lctx) {
+  std::vector<llama_token> tokens =
+      common_tokenize(lctx, "<|call|>", false, true);
+  if (tokens.size() == 1) {
+    return tokens[0];
+  }
+  return LLAMA_TOKEN_NULL;
+}
+
 bool supportsToolsCompactForModelMetadata(
     const std::optional<std::string>& architecture,
     const std::optional<std::string>& modelName) {
diff --git a/packages/qvac-lib-infer-llamacpp-llm/addon/src/utils/ChatTemplateUtils.hpp b/packages/qvac-lib-infer-llamacpp-llm/addon/src/utils/ChatTemplateUtils.hpp
@@ -14,6 +14,8 @@ namespace qvac_lib_inference_addon_llama {
 namespace utils {
 
 bool isQwen3Model(const ::llama_model* model);
+bool isHarmonyModel(const ::llama_model* model);
+llama_token getHarmonyCallToken(::llama_context* lctx);
 std::optional<std::string> getModelArchitecture(const ::llama_model* model);
 bool supportsToolsCompactForModelMetadata(
     const std::optional<std::string>& architecture,
diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/harmonyMultiTurnTools.js b/packages/qvac-lib-infer-llamacpp-llm/examples/harmonyMultiTurnTools.js
diff --git a/packages/qvac-lib-infer-llamacpp-llm/package.json b/packages/qvac-lib-infer-llamacpp-llm/package.json