tetherto
diff --git a/‎.cursor/rules/sdk/docs/request-lifecycle-system.mdc‎
Lines changed: 63 additions & 12 deletions b/‎.cursor/rules/sdk/docs/request-lifecycle-system.mdc‎
Lines changed: 63 additions & 12 deletions
diff --git a/‎.cursor/rules/sdk/error-handling.mdc‎
Lines changed: 1 addition & 0 deletions b/‎.cursor/rules/sdk/error-handling.mdc‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.cursor/rules/sdk/request-lifecycle-primitives.mdc‎
Lines changed: 183 additions & 9 deletions b/‎.cursor/rules/sdk/request-lifecycle-primitives.mdc‎
Lines changed: 183 additions & 9 deletions
diff --git a/‎packages/sdk/index.ts‎
Lines changed: 6 additions & 1 deletion b/‎packages/sdk/index.ts‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎packages/sdk/schemas/plugin.ts‎
Lines changed: 35 additions & 0 deletions b/‎packages/sdk/schemas/plugin.ts‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎packages/sdk/schemas/sdk-errors-server.ts‎
Lines changed: 11 additions & 0 deletions b/‎packages/sdk/schemas/sdk-errors-server.ts‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎packages/sdk/server/bare/plugins/llamacpp-completion/ops/completion-stream.ts‎
Lines changed: 14 additions & 3 deletions b/‎packages/sdk/server/bare/plugins/llamacpp-completion/ops/completion-stream.ts‎
Lines changed: 14 additions & 3 deletions
diff --git a/‎packages/sdk/server/bare/plugins/llamacpp-completion/ops/kv-cache-session.ts‎
Lines changed: 25 additions & 8 deletions b/‎packages/sdk/server/bare/plugins/llamacpp-completion/ops/kv-cache-session.ts‎
Lines changed: 25 additions & 8 deletions
diff --git a/‎packages/sdk/server/bare/plugins/llamacpp-completion/plugin.ts‎
Lines changed: 11 additions & 2 deletions b/‎packages/sdk/server/bare/plugins/llamacpp-completion/plugin.ts‎
Lines changed: 11 additions & 2 deletions
diff --git a/‎packages/sdk/server/bare/plugins/llamacpp-embedding/plugin.ts‎
Lines changed: 3 additions & 0 deletions b/‎packages/sdk/server/bare/plugins/llamacpp-embedding/plugin.ts‎
Lines changed: 3 additions & 0 deletions
@@ -180,6 +180,7 @@ Located in `@/utils/errors-server`
 - `RequestIdConflictError` (52417) - `registry.begin(...)` called with a `requestId` already present
 - `RequestNotFoundError` (52418) - registry lookup miss (no in-flight request for the given id)
 - `InferenceCancelledError` (52419) - cancelled inference run; carries `requestId` + `partial: { text?, toolCalls?, stats? }`. Constructed client-side on `stopReason: "cancelled"` (event stream ends normally; promise-aggregates reject with this). Re-exported from `@qvac/sdk` for `instanceof` checks.
+- `RequestRejectedByPolicyError` (52420) - registry concurrency-policy admission failure (e.g. `oneAtATimePerModel`); carries `requestId`, `kind`, `modelId`, and a `reason` string. Re-exported from `@qvac/sdk` for `instanceof` checks. See `.cursor/rules/sdk/request-lifecycle-primitives.mdc` for the policy contract.
 
 #### RAG Operations (52,800-52,999)
 - `RAGSaveFailedError` - Save failed
 
@@ -153,9 +153,14 @@ export { SUPPORTED_AUDIO_FORMATS } from "./constants/audio";
 // promises. `InferenceCancelledError` rides the standard `QvacError`
 // envelope, but consumers reach for it through `instanceof` on
 // `await run.final` / `run.text` / `run.toolCalls` / `run.stats`
-// rejections.
+// rejections. `RequestRejectedByPolicyError` is thrown by
+// `RequestRegistry.begin(...)` when a registered concurrency policy
+// (e.g. `oneAtATimePerModel` on `completion`) rejects a new request;
+// it propagates out through the worker so the client can distinguish
+// "the request collided with another one" from "the request failed".
 export { InferenceCancelledError } from "./utils/errors-server";
 export type { InferenceCancelledPartial } from "./utils/errors-server";
+export { RequestRejectedByPolicyError } from "./utils/errors-server";
 
 // Logging exports
 export { getLogger, SDK_LOG_ID } from "./logging";
 
@@ -1,6 +1,24 @@
 import { z } from "zod";
 import type { ModelSrcInput } from "./model-src-utils";
 
+/**
+ * Granularity at which a handler's addon can cancel in-flight work.
+ *  - `"request"` — addon cancels a specific in-flight `requestId`.
+ *  - `"model"` — addon cancels whatever is running on the model.
+ *  - `"none"` — no addon cancel surface; SDK falls back to soft-cancel
+ *    (stop yielding, drop result; the C++ work runs to completion).
+ */
+export type PluginHandlerCancelScope = "request" | "model" | "none";
+
+export interface PluginHandlerCancel { // cancel surface a plugin handler advertises
+  scope: PluginHandlerCancelScope; // granularity at which the addon can cancel
+  /**
+   * `true` — `addon.cancel()` interrupts compute; omitted or `false` —
+   * cancel is best-effort. Only meaningful for `scope: "model" | "request"`.
+   */
+  hard?: boolean;
+}
+
 /**
  * Definition for a plugin handler with explicit Zod schemas.
  * Each handler must define its request/response schemas for validation.
@@ -21,6 +39,11 @@ export interface PluginHandlerDefinition<
         ) => Promise<O> | AsyncGenerator<O>
       : never
     : never;
+  /**
+   * Cancel surface this handler advertises. Omitting is equivalent
+   * to `{ scope: "none" }` (soft-cancel fallback).
+   */
+  cancel?: PluginHandlerCancel;
 }
 
 /**
@@ -43,6 +66,8 @@ export interface DuplexPluginHandlerDefinition<
         ) => AsyncGenerator<O>
       : never
     : never;
+  /** See `PluginHandlerDefinition.cancel`. */
+  cancel?: PluginHandlerCancel;
 }
 
 /**
@@ -254,13 +279,23 @@ const zodSchemaLikeRuntimeSchema = z
   })
   .catchall(z.unknown());
 
+const pluginHandlerCancelRuntimeSchema = z // runtime check for a handler's `cancel` descriptor
+  .object({
+    scope: z.enum(["request", "model", "none"], {
+      error: "cancel.scope must be 'request', 'model', or 'none'",
+    }),
+    hard: z.boolean().optional(), // may be omitted
+  })
+  .catchall(z.unknown()); // keys other than `scope`/`hard` are accepted as-is
+
 export const pluginHandlerDefinitionRuntimeSchema = z
   .object({
     requestSchema: zodSchemaLikeRuntimeSchema,
     responseSchema: zodSchemaLikeRuntimeSchema,
     streaming: z.boolean({ error: "streaming must be a boolean" }),
     duplex: z.boolean().optional(),
     handler: functionRuntimeSchema,
+    cancel: pluginHandlerCancelRuntimeSchema.optional(), // optional cancel descriptor; validated only when present
   })
   .catchall(z.unknown());
 
 
@@ -41,6 +41,7 @@ export const SDK_SERVER_ERROR_CODES = {
   REQUEST_ID_CONFLICT: 52417,
   REQUEST_NOT_FOUND: 52418,
   INFERENCE_CANCELLED: 52419,
+  REQUEST_REJECTED_BY_POLICY: 52420,
 
   // RAG Operations (52,800-52,999)
   RAG_SAVE_FAILED: 52800,
@@ -309,6 +310,16 @@ const serverErrorDefinitions: ErrorCodesMap = {
     message: (requestId: string) =>
       `Inference request "${requestId}" was cancelled before it could complete`,
   },
+  [SDK_SERVER_ERROR_CODES.REQUEST_REJECTED_BY_POLICY]: {
+    name: "REQUEST_REJECTED_BY_POLICY",
+    message: (
+      requestId: string,
+      kind: string,
+      modelId: string,
+      reason: string,
+    ) =>
+      `Request "${requestId}" (kind: ${kind}, modelId: ${modelId}) was rejected by registry concurrency policy: ${reason}`,
+  },
 
   // RAG Operations (52,800-52,999)
   [SDK_SERVER_ERROR_CODES.RAG_SAVE_FAILED]: {
 
@@ -42,6 +42,7 @@ import { parseToolCalls } from "@/server/utils/tools";
 import { getResponseFormatJsonSchema } from "@/server/utils/response-format";
 import { buildAutoCacheSaveHistory, type CacheMessage } from "@/server/utils";
 import { getServerLogger } from "@/logging";
+import type { Logger } from "@/logging/types";
 import { AttachmentNotFoundError } from "@/utils/errors-server";
 import { nowMs } from "@/profiling";
 import {
@@ -415,11 +416,21 @@ export async function* completion(
     toolDialect?: ToolDialect;
     responseFormat?: ResponseFormat;
   },
-  opts: { signal: AbortSignal; scope: DisposableScope },
+  opts: {
+    signal: AbortSignal;
+    scope: DisposableScope;
+    /**
+     * Request-scoped logger forwarded to `createKvCacheSession` so
+     * kv-cache lines share the request's lifecycle prefix. Falls
+     * back to the module-level server logger when omitted.
+     */
+    logger?: Logger;
+  },
 ): AsyncGenerator<{ token: string }, CompletionResult, unknown> {
   const { history, modelId, kvCache, tools, generationParams, responseFormat } =
     params;
   const { signal, scope } = opts;
+  const requestLogger = opts.logger ?? logger;
 
   const modelConfig = getModelConfig(modelId);
   const toolsEnabled = (modelConfig as { tools?: boolean }).tools === true;
@@ -470,7 +481,7 @@ export async function* completion(
     const addon = model.addon;
     if (addon?.cancel) {
       addon.cancel.call(addon).catch((err: unknown) => {
-        logger.warn(
+        requestLogger.warn(
           `[cancel] addon.cancel() rejected during abort for modelId=${modelId}: ${err instanceof Error ? err.message : String(err)}`,
         );
       });
@@ -523,7 +534,7 @@ export async function* completion(
   // rollback. Cancellations / zero-token replies / rename failures all
   // unwind through the same `scope.defer` hook. ----
 
-  const session = createKvCacheSession(modelId);
+  const session = createKvCacheSession(modelId, { logger: requestLogger });
   const systemPromptFromHistory = extractSystemPrompt(history);
   // Dynamic mode lets each turn carry its own tool set, so the cache
   // hash must not depend on the tool list — otherwise a tool change
 
@@ -14,8 +14,12 @@ import {
   logCacheStatus,
 } from "@/server/bare/plugins/llamacpp-completion/ops/cache-logger";
 import { getServerLogger } from "@/logging";
+import type { Logger } from "@/logging/types";
 
-const logger = getServerLogger();
+// Used by cross-model paths that have no `RequestContext` (e.g.
+// `deleteKvCacheState`). Per-session call sites receive a logger from
+// the caller — typically one built with `withRequestContext(...)`.
+const moduleLogger = getServerLogger();
 
 /**
  * Single owner of the three KV-cache bookkeeping layers.
@@ -204,7 +208,17 @@ interface InternalTurnState {
 
 // ----- factory -----
 
-export function createKvCacheSession(modelId: string): KvCacheSession {
+/**
+ * Construct a session bound to one `(modelId, turn-owning request)`
+ * scope. `options.logger` is the per-instance logger the session emits
+ * through (typically `withRequestContext(getServerLogger(), ctx)`);
+ * falls back to the module-scoped logger when omitted.
+ */
+export function createKvCacheSession(
+  modelId: string,
+  options?: { logger?: Logger },
+): KvCacheSession {
+  const logger = options?.logger ?? moduleLogger;
   // Per-session map: each `TurnHandle` carries an opaque entry here. A
   // WeakMap so handles drop their state once the handler scope releases
   // the reference; the module-scoped maps above survive.
@@ -254,7 +268,7 @@ export function createKvCacheSession(modelId: string): KvCacheSession {
 
     if (!exists) {
       await input.primeIfMissing(cachePath);
-      await verifyPrimedFile(cachePath);
+      await verifyPrimedFile(cachePath, logger);
       initializedCaches.add(registryKey);
     }
 
@@ -297,7 +311,7 @@ export function createKvCacheSession(modelId: string): KvCacheSession {
 
     if (!cacheExists) {
       await input.primeIfMissing(cachePath);
-      await verifyPrimedFile(cachePath);
+      await verifyPrimedFile(cachePath, logger);
       initializedCaches.add(registryKey);
     }
 
@@ -453,9 +467,9 @@ export async function deleteKvCacheState(
     const removed = await deleteCacheUtil({ all: true });
     cachedMessageCounts.clear();
     initializedCaches.clear();
-    // `removed` is the kv-cache root dir; logging surfaces it for
-    // ops visibility but isn't part of the contract.
-    logger.debug(`[kv-cache] Cleared all caches under ${removed}`);
+    // `removed` is the kv-cache root dir; it is logged for ops
+    // visibility but isn't part of the contract.
+    moduleLogger.debug(`[kv-cache] Cleared all caches under ${removed}`);
     return;
   }
 
@@ -512,7 +526,10 @@ export async function deleteKvCacheState(
  * `completion-stream.ts` lets the error propagate up and no
  * `initializedCaches` entry is recorded.
  */
-async function verifyPrimedFile(cachePath: string): Promise<void> {
+async function verifyPrimedFile(
+  cachePath: string,
+  logger: Logger,
+): Promise<void> {
   let stats: { size: number };
   try {
     stats = await fsPromises.stat(cachePath);
 
@@ -30,8 +30,12 @@ import { attachModelExecutionMs } from "@/profiling/model-execution";
 import { getModelConfig } from "@/server/bare/registry/model-registry";
 import { createCompletionNormalizer } from "@/server/utils/completion-normalizer";
 import { detectToolDialect } from "@/server/utils/tool-integration";
-import { getRequestRegistry } from "@/server/bare/runtime";
+import {
+  getRequestRegistry,
+  withRequestContext,
+} from "@/server/bare/runtime";
 import { generateServerRequestId } from "@/server/bare/runtime/request-id";
+import { getServerLogger } from "@/logging";
 
 
 function createLlmModel(
@@ -96,6 +100,7 @@ export const llmPlugin = definePlugin({
       requestSchema: completionStreamRequestSchema,
       responseSchema: completionStreamResponseSchema,
       streaming: true,
+      cancel: { scope: "model", hard: true },
 
       handler: async function* (request) {
         const filteredHistory = request.history.map(
@@ -141,6 +146,8 @@ export const llmPlugin = definePlugin({
           modelId: request.modelId,
         });
 
+        const requestLogger = withRequestContext(getServerLogger(), ctx);
+
         const stream = completion(
           {
             history: filteredHistory,
@@ -151,7 +158,7 @@ export const llmPlugin = definePlugin({
             ...(toolsActive && { toolDialect: dialect }),
             ...(request.responseFormat && { responseFormat: request.responseFormat }),
           },
-          { signal: ctx.signal, scope: ctx.scope },
+          { signal: ctx.signal, scope: ctx.scope, logger: requestLogger },
         );
 
         try {
@@ -210,6 +217,7 @@ export const llmPlugin = definePlugin({
       requestSchema: finetuneRequestSchema,
       responseSchema: finetuneResponseSchema,
       streaming: false,
+      cancel: { scope: "none" },
 
       handler: function (request) {
         return finetune(request);
@@ -220,6 +228,7 @@ export const llmPlugin = definePlugin({
       requestSchema: translateRequestSchema,
       responseSchema: translateResponseSchema,
       streaming: true,
+      cancel: { scope: "model", hard: true },
 
       handler: async function* (request) {
         const stream = translate(request);
 
@@ -110,6 +110,9 @@ export const embeddingsPlugin = definePlugin({
       requestSchema: embedRequestSchema,
       responseSchema: embedResponseSchema,
       streaming: false,
+      // Model-wide hard cancel via `addon.cancel()` on the llama.cpp
+      // embedding addon. Compute is interrupted when fired.
+      cancel: { scope: "model", hard: true },
 
       handler: async function (request) {
         const embedResult = await embed({