Skip to content

Commit b3157eb

Browse files
authored
fix: handle Harmony <|call|> EOG token for GPT-OSS tool calling (#1822)
* fix: handle Harmony <|call|> EOG token for GPT-OSS tool calling

  GPT-OSS models use <|call|> as a frame delimiter in the Harmony tool-call protocol. This token is in the EOG set, which caused generation to stop silently before tool calls reached the SDK.

  Add Harmony model detection and <|call|>-specific handling in the generation loop: render the token as visible text (special=true) so the SDK can parse frame boundaries, then stop generation, as required by the turn-based tool execution protocol.

  Add a multi-turn tool calling example that demonstrates sequential tool execution across turns and confirms that parallel tool calls are not supported.

* Bump package version to 0.17.2 and update the CHANGELOG accordingly.
1 parent 354a517 commit b3157eb

9 files changed

Lines changed: 337 additions & 2 deletions

File tree

packages/qvac-lib-infer-llamacpp-llm/CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,15 @@
11
# Changelog
22

3+
## [0.17.2] - 2026-04-30
4+
5+
### Fixed
6+
7+
#### GPT-OSS Harmony tool calling: `<|call|>` frame delimiter now surfaces to the SDK
8+
9+
The `<|call|>` token (Harmony frame terminator) is in the model's EOG set. When sampled, it rendered as 0 bytes and silently stopped generation — tool call output was truncated with no visible frame boundary, resulting in the SDK parsing 0 tool calls.
10+
11+
The generation loop now detects Harmony models and intercepts `<|call|>` before the generic EOG break: it renders the token as visible text (`special=true`) so the SDK can identify frame boundaries, then stops generation cleanly. GPT-OSS uses a turn-based tool protocol — one tool call per generation pass — and the SDK is expected to execute the tool, append results, and re-prompt for subsequent calls.
12+
313
## [0.17.1] - 2026-04-28
414

515
This patch release adds per-request structured-output support to the LLM addon: callers can now constrain a single completion to either a JSON Schema or a raw GBNF grammar without reloading the model. Back-port of the same change being prepared for `main` in [#1787](https://github.com/tetherto/qvac/pull/1787); shipped here on top of `0.17.0` so it can be consumed by SDK lines that have not yet migrated to `0.18.x`.

packages/qvac-lib-infer-llamacpp-llm/addon/src/model-interface/MtmdLlmContext.cpp

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,22 @@ MtmdLlmContext::MtmdLlmContext(
100100
antipromptTokens_.insert(
101101
antipromptTokens_.end(), tempTokens.begin(), tempTokens.end());
102102
}
103+
104+
isHarmonyModel_ =
105+
qvac_lib_inference_addon_llama::utils::isHarmonyModel(model_);
106+
if (isHarmonyModel_) {
107+
harmonyCallToken_ =
108+
qvac_lib_inference_addon_llama::utils::getHarmonyCallToken(lctx_);
109+
if (harmonyCallToken_ == LLAMA_TOKEN_NULL) {
110+
isHarmonyModel_ = false;
111+
}
112+
}
113+
QLOG_IF(
114+
Priority::DEBUG,
115+
string_format(
116+
"[MtmdLlm] Harmony detection: isHarmony=%d callToken=%d\n",
117+
isHarmonyModel_,
118+
harmonyCallToken_));
103119
}
104120

105121
void MtmdLlmContext::initVisionContext() {
@@ -453,7 +469,26 @@ bool MtmdLlmContext::generateResponse(
453469
}
454470
}
455471

456-
if (llama_vocab_is_eog(vocab_, tokenId) || checkAntiprompt()) {
472+
bool isEos = llama_vocab_is_eog(vocab_, tokenId);
473+
474+
if (isEos && isHarmonyModel_ && params_.use_jinja &&
475+
tokenId == harmonyCallToken_) {
476+
QLOG_IF(
477+
Priority::DEBUG,
478+
string_format(
479+
"[MtmdLlm] Harmony <|call|> stop: tokenId=%d\n", tokenId));
480+
if (outputCallback) {
481+
std::string callMarker =
482+
common_token_to_piece(lctx_, tokenId, true);
483+
if (!callMarker.empty()) {
484+
outputCallback(callMarker);
485+
}
486+
}
487+
flushPendingUtf8ToCallback(outputCallback);
488+
break;
489+
}
490+
491+
if (isEos || checkAntiprompt()) {
457492
flushPendingUtf8ToCallback(outputCallback);
458493
break;
459494
}

packages/qvac-lib-infer-llamacpp-llm/addon/src/model-interface/MtmdLlmContext.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,5 +220,10 @@ class MtmdLlmContext : public LlmContext {
220220

221221
// UTF-8 token buffer for handling incomplete emoji sequences
222222
qvac_lib_inference_addon_llama::UTF8TokenBuffer utf8Buffer_;
223+
224+
// GPT-OSS Harmony: <|call|> is a frame delimiter that is also an EOG token;
// it is emitted to the output callback as text before generation stops.
225+
bool isHarmonyModel_ = false;
226+
llama_token harmonyCallToken_ = LLAMA_TOKEN_NULL;
227+
223228
std::atomic<bool> stopGeneration_ = false;
224229
};

packages/qvac-lib-infer-llamacpp-llm/addon/src/model-interface/TextLlmContext.cpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,23 @@ TextLlmContext::TextLlmContext(
5353
lctx_, reasoningState_);
5454
}
5555

56+
isHarmonyModel_ =
57+
qvac_lib_inference_addon_llama::utils::isHarmonyModel(model_);
58+
if (isHarmonyModel_) {
59+
harmonyCallToken_ =
60+
qvac_lib_inference_addon_llama::utils::getHarmonyCallToken(lctx_);
61+
if (harmonyCallToken_ == LLAMA_TOKEN_NULL) {
62+
isHarmonyModel_ = false;
63+
}
64+
}
65+
QLOG_IF(
66+
Priority::DEBUG,
67+
string_format(
68+
"[TextLlm] Harmony detection: isHarmony=%d callToken=%d useJinja=%d\n",
69+
isHarmonyModel_,
70+
harmonyCallToken_,
71+
params_.use_jinja));
72+
5673
std::string chatTemplate =
5774
getChatTemplate(model_, params_, tools_.enabled());
5875
tmpls_ = common_chat_templates_init(model_, chatTemplate);
@@ -510,6 +527,23 @@ bool TextLlmContext::generateResponse(
510527
}
511528
}
512529

530+
if (isEos && isHarmonyModel_ && params_.use_jinja &&
531+
tokenId == harmonyCallToken_) {
532+
QLOG_IF(
533+
Priority::DEBUG,
534+
string_format(
535+
"[TextLlm] Harmony <|call|> stop: tokenId=%d\n", tokenId));
536+
if (outputCallback) {
537+
std::string callMarker =
538+
common_token_to_piece(lctx_, tokenId, true);
539+
if (!callMarker.empty()) {
540+
outputCallback(callMarker);
541+
}
542+
}
543+
flushPendingUtf8ToCallback(outputCallback);
544+
break;
545+
}
546+
513547
if (isEos || checkAntiprompt()) {
514548
flushPendingUtf8ToCallback(outputCallback);
515549
break;

packages/qvac-lib-infer-llamacpp-llm/addon/src/model-interface/TextLlmContext.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,5 +198,9 @@ class TextLlmContext : public LlmContext {
198198
// Cache whether this is a Qwen3 model (checked once at load time)
199199
bool isQwen3Model_ = false;
200200

201+
// GPT-OSS Harmony: <|call|> is a frame delimiter that is also an EOG token;
// it is emitted to the output callback as text before generation stops.
202+
bool isHarmonyModel_ = false;
203+
llama_token harmonyCallToken_ = LLAMA_TOKEN_NULL;
204+
201205
std::atomic<bool> stopGeneration_ = false;
202206
};

packages/qvac-lib-infer-llamacpp-llm/addon/src/utils/ChatTemplateUtils.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,11 @@ bool isQwen3Architecture(const std::string& architecture) {
3131
return archStr == "qwen3";
3232
}
3333

34+
bool isHarmonyArchitecture(const std::string& architecture) {
35+
const std::string archStr = normalizeArchitecture(architecture);
36+
return archStr == "gpt-oss";
37+
}
38+
3439
bool modelNameLooksLikeQwen3(const std::string& modelName) {
3540
std::string normalizedName = modelName;
3641
std::transform(
@@ -85,6 +90,23 @@ bool isQwen3Model(const ::llama_model* model) {
8590
getModelArchitecture(model), getModelName(model));
8691
}
8792

93+
bool isHarmonyModel(const ::llama_model* model) {
94+
if (model == nullptr) {
95+
return false;
96+
}
97+
std::optional<std::string> arch = getModelArchitecture(model);
98+
return arch.has_value() && isHarmonyArchitecture(arch.value());
99+
}
100+
101+
/// Resolves the token id of the Harmony "<|call|>" marker for a context.
///
/// The literal text "<|call|>" is tokenized via common_tokenize() with the
/// two boolean arguments false and true (add_special / parse_special in
/// llama.cpp's common helpers), so the marker is parsed as a special token
/// and no extra special tokens are added around it.
///
/// @param lctx llama context used for tokenization; may be null.
/// @return The marker's token id when tokenization yields exactly one token;
///         LLAMA_TOKEN_NULL when `lctx` is null or the text maps to zero or
///         several tokens.
llama_token getHarmonyCallToken(::llama_context* lctx) {
  // Mirror the null guard in isHarmonyModel(): callers already treat
  // LLAMA_TOKEN_NULL as "not a Harmony model", so a missing context degrades
  // to that path instead of being passed to the tokenizer.
  if (lctx == nullptr) {
    return LLAMA_TOKEN_NULL;
  }

  const std::vector<llama_token> pieces =
      common_tokenize(lctx, "<|call|>", false, true);
  return pieces.size() == 1 ? pieces.front() : LLAMA_TOKEN_NULL;
}
109+
88110
bool supportsToolsCompactForModelMetadata(
89111
const std::optional<std::string>& architecture,
90112
const std::optional<std::string>& modelName) {

packages/qvac-lib-infer-llamacpp-llm/addon/src/utils/ChatTemplateUtils.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ namespace qvac_lib_inference_addon_llama {
1414
namespace utils {
1515

1616
bool isQwen3Model(const ::llama_model* model);
17+
bool isHarmonyModel(const ::llama_model* model);
18+
llama_token getHarmonyCallToken(::llama_context* lctx);
1719
std::optional<std::string> getModelArchitecture(const ::llama_model* model);
1820
bool supportsToolsCompactForModelMetadata(
1921
const std::optional<std::string>& architecture,

0 commit comments

Comments
 (0)