Improve 1B inference quality loop

AbrahamGreenman · AbrahamGreenman · commit 71d415406de8 · 2026-04-24T13:13:30.000-04:00
diff --git a/.changeset/json-repair-quality-routing.md b/.changeset/json-repair-quality-routing.md
@@ -0,0 +1,5 @@
+---
+"@razroo/ray-core": minor
+---
+
+Add JSON repair diagnostics for tiny model classification, task-routing diagnostics, prompt-format benchmark sweeps, scored quality metrics, benchmark history output, and sanitized capability hints.
diff --git a/README.md b/README.md
@@ -239,17 +239,18 @@ pnpm benchmark:assert:cx23
 pnpm benchmark:assert:cax11
 pnpm benchmark:assert:cx23:1b
 pnpm benchmark:assert:8gb:1b
+pnpm benchmark:1b:prompt-formats
 ```
 
-Those commands write the latest report to `.ray/benchmarks/` and compare the run against the baseline JSON in `examples/benchmarks/baselines/`. The 1B workload also checks simple output quality signals such as JSON validity, prompt echo, stop-token leakage, and generic email filler.
+Those commands write the latest report to `.ray/benchmarks/`, append JSONL history when configured, and compare the run against the baseline JSON in `examples/benchmarks/baselines/`. The 1B workload also checks scored output quality signals such as JSON validity, prompt echo, stop-token leakage, call-to-action presence, forbidden wrappers, and generic email filler.
 
 For prompt-family quality checks across cold outreach, follow-up, classification, rewrite, and section generation:
 
 ```bash
 pnpm eval:prompt-families:1b
 ```
 
-The structured benchmark output includes provider diagnostics such as prompt format, request shape, model ref, launch preset, slot reuse, cached tokens, and context window so a quality regression can be tied back to the backend path Ray chose.
+The structured benchmark output includes provider diagnostics such as prompt format, request shape, model ref, launch preset, slot reuse, cached tokens, JSON repair attempts, and context window so a quality regression can be tied back to the backend path Ray chose. `/health` also exposes detected backend capabilities, and `/v1/config` includes sanitized capability hints for the configured profile.
 
 ### Quality gate (matches CI)
 
diff --git a/docs/integrations/razroo-email-ai.md b/docs/integrations/razroo-email-ai.md
@@ -38,8 +38,8 @@ The gateway exposes:
 - `POST /v1/jobs` — async durable submission (same inference fields, plus optional `callbackUrl`). Returns `202 Accepted` and a job location.
 - `GET /v1/jobs/:id` — durable job state and final result/error.
 - `GET /livez` — lightweight unauthenticated liveness for reverse proxies.
-- `GET /health` — detailed queue/provider snapshot, plus `asyncQueue` when enabled. Public profiles require Bearer auth.
-- `GET /v1/config` — non-secret config (sanitized). Public profiles require Bearer auth.
+- `GET /health` — detailed queue/provider snapshot, detected backend capabilities (`applyTemplate`, `chatTemplate`, `jsonMode`, context window, slots), plus `asyncQueue` when enabled. Public profiles require Bearer auth.
+- `GET /v1/config` — non-secret config (sanitized) with capability hints for the configured model/profile. Public profiles require Bearer auth.
 
 With the public profile, a minimal `curl` check is:
 
@@ -63,10 +63,11 @@ Benchmark the 1B email path with:
 ```bash
 pnpm benchmark:assert:cx23:1b
 pnpm benchmark:assert:8gb:1b
+pnpm benchmark:1b:prompt-formats
 pnpm autotune:1b
 ```
 
-The workload in [email-1b-workload.jsonl](../../examples/workloads/email-1b-workload.jsonl) exercises cold outreach, follow-up, reply classification, reply rewrite, and a direct section-generation prompt shaped like the app's product flow. It asserts JSON validity for classification and rejects common prompt echo, stop-token leakage, and generic email filler.
+The workload in [email-1b-workload.jsonl](../../examples/workloads/email-1b-workload.jsonl) exercises cold outreach, follow-up, reply classification, reply rewrite, and a direct section-generation prompt shaped like the app's product flow. It asserts JSON validity for classification and rejects common prompt echo, stop-token leakage, and generic email filler. The `benchmark:*` scripts above pass `--history-dir ./.ray/benchmarks/history`, so each run appends a JSONL history record under `.ray/benchmarks/history` and prompt/config changes can be compared over time.
 
 [email-prompt-families-1b.json](../../examples/evals/email-prompt-families-1b.json) is the smaller golden eval set for prompt wording changes. Run it with `pnpm eval:prompt-families:1b` against a live Ray gateway. The output includes provider diagnostics for `promptFormat`, `promptFormatReason`, `modelRef`, `launchPreset`, cached tokens, slot reuse, and context window.
 
diff --git a/examples/benchmarks/baselines/hetzner-cx23-1b.json b/examples/benchmarks/baselines/hetzner-cx23-1b.json
@@ -11,6 +11,7 @@
     "maxTtftP95Ms": 2800,
     "minCompletionTokensPerSecondAvg": 8,
     "minQualityPassRate": 80,
+    "minQualityScoreAvg": 80,
     "minValidJsonRate": 100
   },
   "notes": [
diff --git a/examples/benchmarks/baselines/single-node-8gb-1b.json b/examples/benchmarks/baselines/single-node-8gb-1b.json
@@ -11,6 +11,7 @@
     "maxTtftP95Ms": 2200,
     "minCompletionTokensPerSecondAvg": 14,
     "minQualityPassRate": 85,
+    "minQualityScoreAvg": 85,
     "minValidJsonRate": 100
   },
   "notes": [
diff --git a/package.json b/package.json
@@ -40,12 +40,13 @@
     "start:hetzner-email-ai:public": "node ./apps/gateway/dist/index.js --config ./examples/config/ray.hetzner-cx23-qwen0.6b.public.json",
     "benchmark": "node --import tsx ./scripts/benchmark.ts",
     "benchmark:email": "node --import tsx ./scripts/benchmark.ts --base-url http://127.0.0.1:3000 --workload ./examples/workloads/email-workload.jsonl --concurrency 2",
-    "benchmark:1b": "node --import tsx ./scripts/benchmark.ts --base-url http://127.0.0.1:3000 --workload ./examples/workloads/email-1b-workload.jsonl --concurrency 1",
-    "eval:prompt-families:1b": "node --import tsx ./scripts/benchmark.ts --base-url http://127.0.0.1:3000 --workload ./examples/evals/email-prompt-families-1b.json --concurrency 1 --requests 5 --label email-prompt-families-1b --output ./.ray/evals/email-prompt-families-1b.latest.json",
+    "benchmark:1b": "node --import tsx ./scripts/benchmark.ts --base-url http://127.0.0.1:3000 --workload ./examples/workloads/email-1b-workload.jsonl --concurrency 1 --history-dir ./.ray/benchmarks/history",
+    "benchmark:1b:prompt-formats": "node --import tsx ./scripts/benchmark.ts --base-url http://127.0.0.1:3000 --workload ./examples/workloads/email-1b-workload.jsonl --concurrency 1 --requests 10 --label email-1b-prompt-formats --prompt-format-sweep --output ./.ray/benchmarks/email-1b-prompt-formats.latest.json --history-dir ./.ray/benchmarks/history",
+    "eval:prompt-families:1b": "node --import tsx ./scripts/benchmark.ts --base-url http://127.0.0.1:3000 --workload ./examples/evals/email-prompt-families-1b.json --concurrency 1 --requests 5 --label email-prompt-families-1b --output ./.ray/evals/email-prompt-families-1b.latest.json --history-dir ./.ray/evals/history",
     "benchmark:assert:cx23": "node --import tsx ./scripts/benchmark.ts --base-url http://127.0.0.1:3000 --workload ./examples/workloads/email-workload.jsonl --concurrency 2 --requests 16 --label hetzner-cx23-sub1b --baseline ./examples/benchmarks/baselines/hetzner-cx23-sub1b.json --assert-baseline --output ./.ray/benchmarks/hetzner-cx23-sub1b.latest.json",
     "benchmark:assert:cax11": "node --import tsx ./scripts/benchmark.ts --base-url http://127.0.0.1:3000 --workload ./examples/workloads/email-workload.jsonl --concurrency 2 --requests 16 --label hetzner-cax11-sub1b --baseline ./examples/benchmarks/baselines/hetzner-cax11-sub1b.json --assert-baseline --output ./.ray/benchmarks/hetzner-cax11-sub1b.latest.json",
-    "benchmark:assert:cx23:1b": "node --import tsx ./scripts/benchmark.ts --base-url http://127.0.0.1:3000 --workload ./examples/workloads/email-1b-workload.jsonl --concurrency 1 --requests 10 --label hetzner-cx23-1b --baseline ./examples/benchmarks/baselines/hetzner-cx23-1b.json --assert-baseline --output ./.ray/benchmarks/hetzner-cx23-1b.latest.json",
-    "benchmark:assert:8gb:1b": "node --import tsx ./scripts/benchmark.ts --base-url http://127.0.0.1:3000 --workload ./examples/workloads/email-1b-workload.jsonl --concurrency 2 --requests 16 --label single-node-8gb-1b --baseline ./examples/benchmarks/baselines/single-node-8gb-1b.json --assert-baseline --output ./.ray/benchmarks/single-node-8gb-1b.latest.json",
+    "benchmark:assert:cx23:1b": "node --import tsx ./scripts/benchmark.ts --base-url http://127.0.0.1:3000 --workload ./examples/workloads/email-1b-workload.jsonl --concurrency 1 --requests 10 --label hetzner-cx23-1b --baseline ./examples/benchmarks/baselines/hetzner-cx23-1b.json --assert-baseline --output ./.ray/benchmarks/hetzner-cx23-1b.latest.json --history-dir ./.ray/benchmarks/history",
+    "benchmark:assert:8gb:1b": "node --import tsx ./scripts/benchmark.ts --base-url http://127.0.0.1:3000 --workload ./examples/workloads/email-1b-workload.jsonl --concurrency 2 --requests 16 --label single-node-8gb-1b --baseline ./examples/benchmarks/baselines/single-node-8gb-1b.json --assert-baseline --output ./.ray/benchmarks/single-node-8gb-1b.latest.json --history-dir ./.ray/benchmarks/history",
     "autotune:hetzner-email-ai": "node --import tsx ./scripts/benchmark.ts --autotune --config ./examples/config/ray.hetzner-cx23-qwen0.6b.json --workload ./examples/workloads/email-workload.jsonl --concurrency 2 --requests 16",
     "autotune:1b": "node --import tsx ./scripts/benchmark.ts --autotune --config ./examples/config/ray.1b.json --workload ./examples/workloads/email-1b-workload.jsonl --concurrency 1 --requests 10",
     "autotune:1b:8gb": "node --import tsx ./scripts/benchmark.ts --autotune --config ./examples/config/ray.1b.8gb.json --workload ./examples/workloads/email-1b-workload.jsonl --concurrency 2 --requests 16",
diff --git a/packages/config/src/defaults.test.ts b/packages/config/src/defaults.test.ts
@@ -94,7 +94,12 @@ test("sanitizeConfig redacts upstream adapter headers", () => {
         headers: Record<string, string>;
       };
     };
+    capabilityHints: {
+      modelId: string;
+      operational?: unknown;
+    };
   };
 
   assert.equal(safe.model.adapter.headers.authorization, "[redacted]");
+  assert.equal(safe.capabilityHints.modelId, "qwen2.5-3b-instruct-q4");
 });
diff --git a/packages/config/src/index.ts b/packages/config/src/index.ts
@@ -641,6 +641,27 @@ export async function loadRayConfig(options: LoadRayConfigOptions = {}): Promise
 
 export function sanitizeConfig(config: RayConfig): Record<string, unknown> {
   const safe = structuredClone(config) as RayConfig;
+  const capabilityHints = {
+    profile: safe.profile,
+    modelId: safe.model.id,
+    family: safe.model.family,
+    quantization: safe.model.quantization,
+    contextWindow: safe.model.contextWindow,
+    maxOutputTokens: safe.model.maxOutputTokens,
+    operational: safe.model.operational,
+    ...(safe.model.adapter.kind === "llama.cpp"
+      ? {
+          llamaCpp: {
+            modelRef: safe.model.adapter.modelRef,
+            launchPreset: safe.model.adapter.launchProfile?.preset,
+            ctxSize: safe.model.adapter.launchProfile?.ctxSize,
+            parallel: safe.model.adapter.launchProfile?.parallel,
+            cacheRamMiB: safe.model.adapter.launchProfile?.cacheRamMiB,
+            cachePrompt: safe.model.adapter.cachePrompt,
+          },
+        }
+      : {}),
+  };
 
   if (
     (safe.model.adapter.kind === "openai-compatible" || safe.model.adapter.kind === "llama.cpp") &&
@@ -651,7 +672,10 @@ export function sanitizeConfig(config: RayConfig): Record<string, unknown> {
     );
   }
 
-  return safe as unknown as Record<string, unknown>;
+  return {
+    ...(safe as unknown as Record<string, unknown>),
+    capabilityHints,
+  };
 }
 
 export { createDefaultConfig, mergeConfig };
diff --git a/packages/core/src/types.ts b/packages/core/src/types.ts
@@ -365,6 +365,8 @@ export interface ProviderDiagnostics {
   requestShape?: "openai-chat" | "llama.cpp-completion";
   promptFormat?: "llama.cpp-template" | "prompt-scaffold" | "ray-chat-fallback";
   promptFormatReason?: string;
+  jsonRepairAttempted?: boolean;
+  jsonRepairSucceeded?: boolean;
   modelRef?: string;
   backendModel?: string;
   launchPreset?: string;
@@ -454,10 +456,18 @@ export interface LearnedOutputCapDiagnostics {
   percentile: number;
 }
 
+export interface TaskRoutingDiagnostics {
+  taskKind: "classification" | "rewrite" | "draft" | "unknown";
+  recommendedModelRole: "classifier" | "drafter" | "general";
+  activeModelRole?: string;
+  matchedActiveRole: boolean;
+}
+
 export interface InferenceDiagnostics {
   promptCompiler?: PromptCompilerDiagnostics;
   learnedOutputCap?: LearnedOutputCapDiagnostics;
   adaptiveTuning?: AdaptiveTuningDiagnostics;
+  taskRouting?: TaskRoutingDiagnostics;
   provider?: ProviderDiagnostics;
 }
 
diff --git a/packages/models/src/llama-cpp.test.ts b/packages/models/src/llama-cpp.test.ts
@@ -471,6 +471,89 @@ test("llama.cpp provider falls back to chat completions for json_object requests
   assert.ok(!seenPaths.includes("/completion"));
 });
 
+test("llama.cpp provider repairs invalid json_object chat responses once", async (t) => {
+  let chatCalls = 0;
+
+  const server = createServer(async (request, response) => {
+    if (request.url === "/apply-template") {
+      response.writeHead(200, { "content-type": "application/json" });
+      response.end(JSON.stringify({ prompt: "<s>json prompt" }));
+      return;
+    }
+
+    if (request.url === "/tokenize") {
+      response.writeHead(200, { "content-type": "application/json" });
+      response.end(JSON.stringify({ tokens: [1, 2, 3] }));
+      return;
+    }
+
+    if (request.url === "/v1/chat/completions") {
+      chatCalls += 1;
+      response.writeHead(200, { "content-type": "application/json" });
+      response.end(
+        JSON.stringify({
+          choices: [
+            {
+              message: {
+                content: chatCalls === 1 ? "intent: positive" : '{"intent":"positive"}',
+              },
+            },
+          ],
+          usage: {
+            prompt_tokens: chatCalls === 1 ? 3 : 5,
+            completion_tokens: chatCalls === 1 ? 4 : 2,
+            total_tokens: chatCalls === 1 ? 7 : 7,
+          },
+        }),
+      );
+      return;
+    }
+
+    response.writeHead(404);
+    response.end();
+  });
+
+  await new Promise<void>((resolve) => server.listen(0, "127.0.0.1", resolve));
+  t.after(() => server.close());
+
+  const address = server.address();
+  if (!address || typeof address === "string") {
+    throw new Error("Expected a TCP server address");
+  }
+
+  const model = createModel(`http://127.0.0.1:${address.port}`, 500);
+  const provider = new LlamaCppProvider(model, model.adapter as LlamaCppProviderConfig);
+  const context = createContext(model, new AbortController().signal);
+  const request = {
+    input: "Classify the reply",
+    system: "Return only compact JSON.",
+    maxTokens: 64,
+    temperature: 0.2,
+    topP: 0.95,
+    cache: true,
+    metadata: {},
+    responseFormat: {
+      type: "json_object" as const,
+    },
+  };
+
+  const preparation = await provider.prepare(request, context);
+  const result = await provider.infer(request, {
+    ...context,
+    preparation,
+  });
+
+  assert.equal(result.output, '{"intent":"positive"}');
+  assert.equal(chatCalls, 2);
+  assert.equal(result.diagnostics?.jsonRepairAttempted, true);
+  assert.equal(result.diagnostics?.jsonRepairSucceeded, true);
+  assert.deepEqual(result.usage?.tokens, {
+    prompt: 8,
+    completion: 6,
+    total: 14,
+  });
+});
+
 test("llama.cpp provider degrades gracefully when slot snapshots time out", async (t) => {
   const seenPaths: string[] = [];
 
diff --git a/packages/models/src/providers/llama-cpp.ts b/packages/models/src/providers/llama-cpp.ts
diff --git a/packages/runtime/src/index.ts b/packages/runtime/src/index.ts
diff --git a/packages/runtime/src/runtime.test.ts b/packages/runtime/src/runtime.test.ts
diff --git a/scripts/benchmark.ts b/scripts/benchmark.ts

-Original file line number
+Diff line change
@@ -0,0 +1,5 @@
+---
+"@razroo/ray-core": minor
+---
+
+Add JSON repair diagnostics for tiny model classification, task-routing diagnostics, prompt-format benchmark sweeps, scored quality metrics, benchmark history output, and sanitized capability hints.