tetherto
diff --git a/‎packages/qvac-lib-infer-llamacpp-llm/CHANGELOG.md‎
Lines changed: 41 additions & 0 deletions b/‎packages/qvac-lib-infer-llamacpp-llm/CHANGELOG.md‎
Lines changed: 41 additions & 0 deletions
diff --git a/‎packages/qvac-lib-infer-llamacpp-llm/CMakeLists.txt‎
Lines changed: 10 additions & 0 deletions b/‎packages/qvac-lib-infer-llamacpp-llm/CMakeLists.txt‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎packages/qvac-lib-infer-llamacpp-llm/addon/src/addon/AddonJs.hpp‎
Lines changed: 21 additions & 0 deletions b/‎packages/qvac-lib-infer-llamacpp-llm/addon/src/addon/AddonJs.hpp‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎packages/qvac-lib-infer-llamacpp-llm/addon/src/model-interface/GenerationParamsApply.cpp‎
Lines changed: 121 additions & 0 deletions b/‎packages/qvac-lib-infer-llamacpp-llm/addon/src/model-interface/GenerationParamsApply.cpp‎
Lines changed: 121 additions & 0 deletions
diff --git a/‎packages/qvac-lib-infer-llamacpp-llm/addon/src/model-interface/GenerationParamsApply.hpp‎
Lines changed: 56 additions & 0 deletions b/‎packages/qvac-lib-infer-llamacpp-llm/addon/src/model-interface/GenerationParamsApply.hpp‎
Lines changed: 56 additions & 0 deletions
diff --git a/‎packages/qvac-lib-infer-llamacpp-llm/addon/src/model-interface/LlmContext.hpp‎
Lines changed: 12 additions & 1 deletion b/‎packages/qvac-lib-infer-llamacpp-llm/addon/src/model-interface/LlmContext.hpp‎
Lines changed: 12 additions & 1 deletion
diff --git a/‎packages/qvac-lib-infer-llamacpp-llm/addon/src/model-interface/MtmdLlmContext.cpp‎
Lines changed: 2 additions & 32 deletions b/‎packages/qvac-lib-infer-llamacpp-llm/addon/src/model-interface/MtmdLlmContext.cpp‎
Lines changed: 2 additions & 32 deletions
@@ -1,5 +1,46 @@
 # Changelog
 
+## [0.17.1] - 2026-04-28
+
+This patch release adds per-request structured-output support to the LLM addon: callers can now constrain a single completion to either a JSON Schema or a raw GBNF grammar without reloading the model. It is a back-port of the same change being prepared for `main` in [#1787](https://github.com/tetherto/qvac/pull/1787), shipped here on top of `0.17.0` so that it can be consumed by SDK lines that have not yet migrated to `0.18.x`.
+
+### Added
+
+#### Per-request `json_schema` and `grammar` in `generationParams`
+
+`RunOptions.generationParams` accepts two new optional fields:
+
+- **`json_schema`** — JSON Schema applied to a single `run()` call. Accepts either a JSON Schema object literal or a pre-stringified JSON Schema. Internally converted to GBNF via llama.cpp's `json_schema_to_grammar()`, the same converter used by the load-time `--json-schema` config key.
+- **`grammar`** — raw GBNF string applied to a single `run()` call. Useful for non-JSON outputs (regex-like DSLs, CSV, custom syntaxes). Mirrors the load-time `--grammar` config key.
+
+The two are mutually exclusive — passing both throws a `TypeError` at the JS boundary. When either is set, the sampler is re-initialized for that request and the prior (typically load-time) grammar is restored automatically afterwards. Both `TextLlmContext` and `MtmdLlmContext` are wired through, so multimodal models get the same per-request hook as text-only ones.
+
+```js
+// JSON Schema (recommended for structured output)
+await model.run(prompt, {
+  generationParams: {
+    json_schema: {
+      type: 'object',
+      properties: { name: { type: 'string' }, age: { type: 'integer' } },
+      required: ['name', 'age']
+    }
+  }
+})
+
+// GBNF (non-JSON outputs)
+await model.run(prompt, {
+  generationParams: {
+    grammar: 'root ::= ("yes" | "no")'
+  }
+})
+```
+
+A new `nlohmann-json` vcpkg dependency is pulled in (header-only) so the addon can call `json_schema_to_grammar()` directly without shipping a JSON-Schema-to-GBNF converter on the JS side. Invalid GBNF or an unparseable JSON Schema surfaces as `InvalidArgument` before the live sampler or sampling parameters are modified, so a malformed per-request schema cannot leave the model in a broken state.
+
+### Pull Requests
+
+- [#1787](https://github.com/tetherto/qvac/pull/1787) - feat[api]: per-request grammar / json_schema in llm-llamacpp generationParams (forward-port to `main`)
+
 ## [0.17.0] - 2026-04-21
 
 ### Changed
 
@@ -33,6 +33,12 @@ configure_file(${VCPKG_INSTALLED_PATH}/share/qvac-lint-cpp/.clang-tidy
 find_path(PICOJSON_INCLUDE_DIRS "picojson/picojson.h")
 find_path(QVAC_LIB_INFERENCE_ADDON_CPP_INCLUDE_DIRS "qvac-lib-inference-addon-cpp/JsInterface.hpp")
 find_package(llama CONFIG REQUIRED)
+# Required to call llama.cpp's `json_schema_to_grammar()` for per-request
+# JSON-Schema → GBNF conversion. The function signature lives in libcommon
+# (linked via `llama::common`) but takes a `nlohmann::ordered_json`, so we
+# need the full nlohmann headers, not just the forward decl shipped with
+# `llama/common/json-schema-to-grammar.h`.
+find_package(nlohmann_json CONFIG REQUIRED)
 
 if(WIN32)
   add_definitions( -DNOMINMAX -DWIN32_MEAN_AND_LEAN -DNOGDI )
@@ -65,6 +71,7 @@ endif()
     ${PROJECT_SOURCE_DIR}/addon/src/model-interface/AsyncWeightsLoader.cpp
     ${PROJECT_SOURCE_DIR}/addon/src/model-interface/CacheManager.cpp
     ${PROJECT_SOURCE_DIR}/addon/src/model-interface/ContextSlider.cpp
+    ${PROJECT_SOURCE_DIR}/addon/src/model-interface/GenerationParamsApply.cpp
     ${PROJECT_SOURCE_DIR}/addon/src/model-interface/LlamaLazyInitializeBackend.cpp
     ${PROJECT_SOURCE_DIR}/addon/src/model-interface/LlamaModel.cpp
     ${PROJECT_SOURCE_DIR}/addon/src/model-interface/LlamaFinetuningHelpers.cpp
@@ -98,6 +105,7 @@ endif()
       llama::llama
       llama::common
       llama::mtmd
+      nlohmann_json::nlohmann_json
   )
 
 
@@ -113,6 +121,7 @@ if(BUILD_CLI)
     ${PROJECT_SOURCE_DIR}/addon/src/model-interface/AsyncWeightsLoader.cpp
     ${PROJECT_SOURCE_DIR}/addon/src/model-interface/CacheManager.cpp
     ${PROJECT_SOURCE_DIR}/addon/src/model-interface/ContextSlider.cpp
+    ${PROJECT_SOURCE_DIR}/addon/src/model-interface/GenerationParamsApply.cpp
     ${PROJECT_SOURCE_DIR}/addon/src/model-interface/LlamaLazyInitializeBackend.cpp
     ${PROJECT_SOURCE_DIR}/addon/src/model-interface/LlamaModel.cpp
     ${PROJECT_SOURCE_DIR}/addon/src/model-interface/MtmdLlmContext.cpp
@@ -142,6 +151,7 @@ if(BUILD_CLI)
           llama::llama
           llama::common
           llama::mtmd
+          nlohmann_json::nlohmann_json
   )
   find_path(QVAC_LIB_INFERENCE_ADDON_CPP_INCLUDE_DIRS "qvac-lib-inference-addon-cpp/JsInterface.hpp")
   target_include_directories(cli_tool PRIVATE ${QVAC_LIB_INFERENCE_ADDON_CPP_INCLUDE_DIRS})
 
@@ -339,6 +339,27 @@ inline js_value_t* runJob(js_env_t* env, js_callback_info_t* info) try {
       readNum("frequency_penalty", ov.frequency_penalty);
       readNum("presence_penalty", ov.presence_penalty);
       readNum("repeat_penalty", ov.repeat_penalty);
+
+      auto grammarStr =
+          configObj->getOptionalPropertyAs<js::String, std::string>(
+              env, "grammar");
+      if (grammarStr.has_value() && !grammarStr->empty()) {
+        ov.grammar = std::move(*grammarStr);
+      }
+
+      auto jsonSchemaStr =
+          configObj->getOptionalPropertyAs<js::String, std::string>(
+              env, "json_schema");
+      if (jsonSchemaStr.has_value() && !jsonSchemaStr->empty()) {
+        ov.json_schema = std::move(*jsonSchemaStr);
+      }
+
+      if (ov.grammar && ov.json_schema) {
+        throw StatusError(
+            general_error::InvalidArgument,
+            "generationParams.grammar and generationParams.json_schema are "
+            "mutually exclusive");
+      }
     }
 
     prompt.cacheKey =
 
@@ -0,0 +1,121 @@
+#include "GenerationParamsApply.hpp"
+
+#include <exception>
+#include <string>
+#include <utility>
+
+#include <nlohmann/json.hpp>
+#include <qvac-lib-inference-addon-cpp/Errors.hpp>
+
+#include "addon/LlmErrors.hpp"
+#include "common/json-schema-to-grammar.h"
+#include "common/log.h"
+
+void applyGenerationOverridesToSampling(
+    common_params_sampling& sampling, int& nPredict,
+    const GenerationParams& overrides) {
+  auto setIf = [](const auto& src, auto& dst) {
+    if (src) {
+      dst = *src;
+    }
+  };
+
+  setIf(overrides.temp, sampling.temp);
+  setIf(overrides.top_p, sampling.top_p);
+  setIf(overrides.top_k, sampling.top_k);
+  setIf(overrides.n_predict, nPredict);
+  setIf(overrides.seed, sampling.seed);
+  setIf(overrides.frequency_penalty, sampling.penalty_freq);
+  setIf(overrides.presence_penalty, sampling.penalty_present);
+  setIf(overrides.repeat_penalty, sampling.penalty_repeat);
+
+  // Conflict handling: `json_schema` and `grammar` are rejected together
+  // at the JS boundary and in `AddonJs::runJob::parseText`, so both being
+  // set here means a caller bypassed those checks (most likely the C++
+  // unit tests or `cli_tool` driving the helper directly). Warn so the
+  // misuse is visible in the log output, then let `json_schema` win via
+  // the if/else-if below because it is the higher-level surface.
+  if (overrides.json_schema && overrides.grammar) {
+    LOG_WRN(
+        "%s: both generationParams.grammar and generationParams.json_schema "
+        "were provided; ignoring `grammar` and applying `json_schema` "
+        "(the JS and AddonJs paths reject this combination — this branch "
+        "exists only for direct C++ callers).\n",
+        __func__);
+  }
+
+  if (overrides.json_schema) {
+    try {
+      auto parsed = nlohmann::ordered_json::parse(*overrides.json_schema);
+      sampling.grammar = json_schema_to_grammar(parsed);
+    } catch (const std::exception& ex) {
+      throw qvac_errors::StatusError(
+          ADDON_ID,
+          qvac_errors::general_error::toString(
+              qvac_errors::general_error::InvalidArgument),
+          std::string("invalid generationParams.json_schema: ") + ex.what());
+    }
+  } else if (overrides.grammar) {
+    sampling.grammar = *overrides.grammar;
+  }
+}
+
+std::function<void()> applyGenerationParamsToContext(
+    common_params& params, CommonSamplerPtr& smpl, llama_model* model,
+    const GenerationParams& overrides) {
+  if (!overrides.hasOverrides()) {
+    return []() {};
+  }
+
+  // Atomic commit: stage the overrides on *local copies* and publish them
+  // to the live `params` / `smpl` only after both the json_schema
+  // parse/convert and `common_sampler_init` have succeeded. A partially
+  // applied override (e.g. temp/seed already written, then json_schema
+  // throws) would otherwise leak into subsequent requests, because no
+  // restore lambda would reach the caller's `ScopeGuard`.
+  common_params_sampling nextSampling = params.sampling;
+  int nextPredict = params.n_predict;
+
+  // Throws `InvalidArgument` on a malformed `json_schema`; only the staged
+  // copies have been written at that point, so live state is untouched.
+  applyGenerationOverridesToSampling(nextSampling, nextPredict, overrides);
+
+  // `common_sampler_init` returns nullptr on bad inputs (most commonly an
+  // invalid GBNF grammar — either passed verbatim or produced from
+  // `json_schema` above, which can in principle yield GBNF the sampler
+  // rejects). Build the new sampler before touching live state so a
+  // failure here also leaves `params` / `smpl` intact.
+  CommonSamplerPtr nextSmpl(common_sampler_init(model, nextSampling));
+  if (!nextSmpl) {
+    throw qvac_errors::StatusError(
+        ADDON_ID,
+        qvac_errors::general_error::toString(
+            qvac_errors::general_error::InvalidArgument),
+        "failed to initialise sampler with per-request generationParams "
+        "(invalid grammar or json_schema?)");
+  }
+
+  // Snapshot live values for rollback. The lambda below keeps references
+  // to `params` / `smpl`; `restored` turns repeat calls into no-ops.
+  common_params_sampling savedSampling = params.sampling;
+  int savedPredict = params.n_predict;
+
+  params.sampling = std::move(nextSampling);
+  params.n_predict = nextPredict;
+  smpl = std::move(nextSmpl);
+
+  bool restored = false;
+  return [&params,
+          &smpl,
+          model,
+          savedSampling = std::move(savedSampling),
+          savedPredict,
+          restored]() mutable {
+    if (restored)
+      return;
+    restored = true;
+    params.sampling = savedSampling;
+    params.n_predict = savedPredict;
+    smpl.reset(common_sampler_init(model, params.sampling));
+  };
+}
@@ -0,0 +1,56 @@
+#pragma once
+
+#include <functional>
+
+#include "LlmContext.hpp"
+#include "common/common.h"
+
+// Apply per-request `generationParams` overrides onto a sampling block
+// + `n_predict` value in place. Operates on the two mutable fields the
+// helper actually needs so callers can pass *copies* and only commit
+// them to live state once the whole call (including json_schema parse
+// and `common_sampler_init`) has succeeded — avoiding partial mutation
+// of the live `common_params` if this throws.
+//
+// If `overrides.json_schema` is set, parses the JSON Schema and converts
+// it to GBNF via llama.cpp's `json_schema_to_grammar()`, mirroring what
+// the `--json-schema` load-time flag does. If `overrides.grammar` is set,
+// the GBNF is used verbatim. The two are mutually exclusive (validated at
+// the JS boundary and again in `AddonJs::runJob::parseText`); if both are
+// present here a `LOG_WRN` is emitted and `json_schema` wins — the JS and
+// AddonJs paths reject this combination, so reaching it means a direct
+// C++ caller (unit tests / `cli_tool`) bypassed those checks.
+//
+// Throws `qvac_errors::StatusError(InvalidArgument)` when `json_schema`
+// fails to parse or convert. Caller is responsible for re-initialising
+// the sampler after this call so the new sampling block takes effect.
+void applyGenerationOverridesToSampling(
+    common_params_sampling& sampling, int& nPredict,
+    const GenerationParams& overrides);
+
+// Apply per-request `generationParams` overrides onto a context's live
+// `params` + `smpl` and return a restore lambda the caller can install
+// into a `ScopeGuard` to roll the mutation back at end-of-request.
+//
+// Implements the atomic-commit pattern: overrides are applied to local
+// copies of `params.sampling` and `params.n_predict`, the new sampler is
+// built against those copies, and only after both the json_schema parse
+// and `common_sampler_init` have succeeded are the live `params` / `smpl`
+// updated. Any throw or null-sampler failure leaves live state untouched.
+//
+// Returns a no-op lambda when `overrides.hasOverrides()` is false. The
+// returned lambda re-initialises `smpl` from the saved sampling block, so
+// it MUST be invoked before the owning context is destroyed (i.e. via a
+// guard scoped to the request).
+//
+// Throws `qvac_errors::StatusError(InvalidArgument)` for malformed
+// `json_schema` or when the resulting GBNF is rejected by the sampler.
+//
+// `params` and `smpl` are captured by reference (and the `model` pointer
+// by value) inside the returned lambda; callers must guarantee all three
+// outlive the lambda. Both `TextLlmContext::applyGenerationParams` and
+// `MtmdLlmContext::applyGenerationParams` satisfy this — the context
+// owns the fields and outlives any single request.
+std::function<void()> applyGenerationParamsToContext(
+    common_params& params, CommonSamplerPtr& smpl, llama_model* model,
+    const GenerationParams& overrides);
@@ -3,6 +3,7 @@
 #include <algorithm>
 #include <functional>
 #include <optional>
+#include <string>
 
 #include "addon/LlmErrors.hpp"
 #include "common/chat.h"
@@ -20,10 +21,20 @@ struct GenerationParams {
   std::optional<float> presence_penalty;
   std::optional<float> repeat_penalty;
   std::optional<uint32_t> seed;
+  // GBNF grammar applied per request to constrain sampling. When set, the
+  // sampler is re-initialized with this grammar for the duration of the
+  // request and the prior grammar is restored afterwards. Mirrors the
+  // load-time `--grammar` flag but scoped to a single completion call.
+  std::optional<std::string> grammar;
+  // JSON-Schema applied per request. Converted to GBNF via llama.cpp's
+  // `json_schema_to_grammar()` and applied identically to `grammar`.
+  // Mutually exclusive with `grammar` — the JS wrapper rejects requests
+  // that set both. Mirrors the load-time `--json-schema` flag.
+  std::optional<std::string> json_schema;
 
   [[nodiscard]] bool hasOverrides() const {
     return n_predict || temp || top_p || top_k || frequency_penalty ||
-           presence_penalty || repeat_penalty || seed;
+           presence_penalty || repeat_penalty || seed || grammar || json_schema;
   }
 };
 
 
@@ -9,6 +9,7 @@
 #include <qvac-lib-inference-addon-cpp/Errors.hpp>
 
 #include "ContextSlider.hpp"
+#include "GenerationParamsApply.hpp"
 #include "addon/LlmErrors.hpp"
 #include "qvac-lib-inference-addon-cpp/Logger.hpp"
 #include "utils/ChatTemplateUtils.hpp"
@@ -480,38 +481,7 @@ bool MtmdLlmContext::generateResponse(
 
 std::function<void()>
 MtmdLlmContext::applyGenerationParams(const GenerationParams& overrides) {
-  if (!overrides.hasOverrides()) {
-    return []() {};
-  }
-
-  common_params_sampling savedSampling = params_.sampling;
-  int savedPredict = params_.n_predict;
-
-  auto setIf = [](const auto& src, auto& dst) {
-    if (src) {
-      dst = *src;
-    }
-  };
-  setIf(overrides.temp, params_.sampling.temp);
-  setIf(overrides.top_p, params_.sampling.top_p);
-  setIf(overrides.top_k, params_.sampling.top_k);
-  setIf(overrides.n_predict, params_.n_predict);
-  setIf(overrides.seed, params_.sampling.seed);
-  setIf(overrides.frequency_penalty, params_.sampling.penalty_freq);
-  setIf(overrides.presence_penalty, params_.sampling.penalty_present);
-  setIf(overrides.repeat_penalty, params_.sampling.penalty_repeat);
-
-  smpl_.reset(common_sampler_init(model_, params_.sampling));
-
-  bool restored = false;
-  return [this, savedSampling, savedPredict, restored]() mutable {
-    if (restored)
-      return;
-    restored = true;
-    params_.sampling = savedSampling;
-    params_.n_predict = savedPredict;
-    smpl_.reset(common_sampler_init(model_, params_.sampling));
-  };
+  return applyGenerationParamsToContext(params_, smpl_, model_, overrides);
 }
 
 void MtmdLlmContext::stop() { stopGeneration_.store(true); }