From ae578723444129fac1bf2cc0919ffe29b6d10df1 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sun, 14 Jun 2026 04:58:39 -0600
Subject: [PATCH] feat(evals): one trading persona eval (deterministic +
 operator matrix) + CI lane
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-up to #169 (model-driven trading) / #170 (deps). ONE entry point —
runTradingPersonaEval (evals/src/trading/persona-agent-eval.ts) — that degrades
by infra, instead of separate modules:

- No operator URL -> DETERMINISTIC mode: the Rust walk-forward backtest ->
  RunRecords + trace + scorecard (offline; what full-eval/CI run). Unchanged.
- operatorUrl present -> OPERATOR-MATRIX mode: runProfileMatrix sweeps the
  PROFILE axis (operator model variants: kimi-k2/glm-4.7/glm-5.1, pinned into the
  REAL operator via agentEnv) x (persona x market). Each cell runs the FULL
  operator simulation (runMultishotUserSim -> real bot_artifacts +
  tick_side_effects), judged on real artifacts (60%) + objective backtest ground
  truth (40%) — not prose. Scorecard + assertRealBackend + byProfile/byPersona
  read straight from the matrix. Multi-round honestly degenerates to 1 (the
  provision->chat->capture cycle is single-pass; turns live inside each cell).

Consolidation: folds the operator-matrix capability INTO the existing bridge file
and DELETES the standalone module + the dual --matrix bin flag + the redundant
npm script. One surface, one entry point, shared scorecard/profile/ground-truth
helpers. The bin auto-degrades by --operator-url; full-eval routes through the
same function.

CI: new 'Evals typecheck' lane (node 22 + npm ci + tsc -p evals/tsconfig.json),
classified on evals/ + package*.json + tsconfig, required in the gate.

Deps: agent-runtime ^0.52, agent-knowledge ^1.7 (over #170's ^0.50/^1.5);
agent-eval ^0.91. Validated: npm ci clean, tsc 0 errors.
---
 .github/workflows/ci.yml                     |  36 +-
 evals/src/bin/agent-eval-trading-personas.ts |  35 +-
 evals/src/full/full-eval-runner.ts           |   4 +-
 evals/src/sim/llm-call.ts                    |  61 +++
 evals/src/sim/multishot-user-sim.ts          |  54 +-
 evals/src/trading/persona-agent-eval.ts      | 526 +++++++++++++++++--
 package-lock.json                            | 161 +-----
 package.json                                 |   4 +-
 trading-blueprint-lib/src/jobs/activate.rs   |   6 +-
 9 files changed, 680 insertions(+), 207 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8145802d..e656065b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -29,6 +29,7 @@ jobs:
       arena: ${{ steps.decide.outputs.arena }}
       contracts: ${{ steps.decide.outputs.contracts }}
       rust: ${{ steps.decide.outputs.rust }}
+      evals: ${{ steps.decide.outputs.evals }}
       full: ${{ steps.decide.outputs.full }}
     steps:
       - uses: actions/checkout@v6
@@ -73,6 +74,7 @@ jobs:
           arena=false
           contracts=false
           rust=false
+          evals=false
           ci=false
 
           while IFS= read -r file; do
@@ -92,6 +94,11 @@ jobs:
                 rust=true
                 ;;
             esac
+            case "$file" in
+              evals/*|package.json|package-lock.json|tsconfig.json|tsconfig.*.json)
+                evals=true
+                ;;
+            esac
             case "$file" in
               .github/workflows/ci.yml)
                 ci=true
@@ -106,12 +113,14 @@ jobs:
             arena=true
             contracts=true
             rust=true
+            evals=true
           fi
 
           echo "full=$full" >> "$GITHUB_OUTPUT"
           echo "arena=$arena" >> "$GITHUB_OUTPUT"
           echo "contracts=$contracts" >> "$GITHUB_OUTPUT"
           echo "rust=$rust" >> "$GITHUB_OUTPUT"
+          echo "evals=$evals" >> "$GITHUB_OUTPUT"
 
   arena:
     name: Arena UI
@@ -324,9 +333,29 @@ jobs:
             --ignore RUSTSEC-2026-0118 \
             --ignore RUSTSEC-2026-0119
 
+  # ── Evals (TypeScript) ────────────────────────────────────────────────────
+  evals:
+    name: Evals typecheck
+    needs: changes
+    if: needs.changes.outputs.evals == 'true'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: actions/setup-node@v6
+        with:
+          node-version: 22
+          cache: npm
+
+      - name: Install dependencies
+        run: npm ci
+
+      - name: Typecheck evals
+        run: npx tsc -p evals/tsconfig.json --noEmit
+
   ci-gate:
     name: CI Gate
-    needs: [changes, arena, forge, rust, clippy, fmt, audit]
+    needs: [changes, arena, forge, rust, clippy, fmt, audit, evals]
     if: always()
     runs-on: ubuntu-latest
     steps:
@@ -336,12 +365,14 @@ jobs:
           ARENA_NEEDED: ${{ needs.changes.outputs.arena }}
           CONTRACTS_NEEDED: ${{ needs.changes.outputs.contracts }}
           RUST_NEEDED: ${{ needs.changes.outputs.rust }}
+          EVALS_NEEDED: ${{ needs.changes.outputs.evals }}
           ARENA_RESULT: ${{ needs.arena.result }}
           FORGE_RESULT: ${{ needs.forge.result }}
           RUST_RESULT: ${{ needs.rust.result }}
           CLIPPY_RESULT: ${{ needs.clippy.result }}
           FMT_RESULT: ${{ needs.fmt.result }}
           AUDIT_RESULT: ${{ needs.audit.result }}
+          EVALS_RESULT: ${{ needs.evals.result }}
         run: |
           set -euo pipefail
           failed=false
@@ -367,8 +398,9 @@ jobs:
           require_success "Clippy" "$RUST_NEEDED" "$CLIPPY_RESULT"
           require_success "Rustfmt" "$RUST_NEEDED" "$FMT_RESULT"
           require_success "Security audit" "$RUST_NEEDED" "$AUDIT_RESULT"
+          require_success "Evals typecheck" "$EVALS_NEEDED" "$EVALS_RESULT"
 
-          if [ "$ARENA_NEEDED" != "true" ] && [ "$CONTRACTS_NEEDED" != "true" ] && [ "$RUST_NEEDED" != "true" ]; then
+          if [ "$ARENA_NEEDED" != "true" ] && [ "$CONTRACTS_NEEDED" != "true" ] && [ "$RUST_NEEDED" != "true" ] && [ "$EVALS_NEEDED" != "true" ]; then
             echo "No code lanes changed; CI gate is green."
           fi
 
diff --git a/evals/src/bin/agent-eval-trading-personas.ts b/evals/src/bin/agent-eval-trading-personas.ts
index 1c2649b1..bb8ab972 100644
--- a/evals/src/bin/agent-eval-trading-personas.ts
+++ b/evals/src/bin/agent-eval-trading-personas.ts
@@ -1,23 +1,42 @@
 #!/usr/bin/env node
-import {
-  runTradingPersonaAgentEvalBridge,
-  type TradingPersonaBridgeOptions,
-} from '../trading/persona-agent-eval.js'
+import { runTradingPersonaEval, type TradingPersonaEvalOptions } from '../trading/persona-agent-eval.js'
+import type { LlmModel } from '../sim/llm-call.js'
 
 function argValue(name: string): string | undefined {
   const index = process.argv.indexOf(name)
   return index >= 0 ? process.argv[index + 1] : undefined
 }
 
-const options: TradingPersonaBridgeOptions = {}
+// One entry point. With --operator-url (or OPERATOR_API_URL/OPERATOR_URL set) it
+// runs the real operator profile × persona matrix (real bot artifacts + tick
+// side-effects, scored against the objective backtest); without it, the
+// deterministic walk-forward backtest. Same surface, degrades by infra.
+const options: TradingPersonaEvalOptions = {}
 const reportPath = argValue('--out')
 const traceDir = argValue('--trace-dir')
 const runsJsonl = argValue('--runs-jsonl') ?? argValue('--runs')
+const scorecard = argValue('--scorecard')
+const operatorUrl = argValue('--operator-url')
+const models = argValue('--models')
 if (reportPath) options.reportPath = reportPath
 if (traceDir) options.traceDir = traceDir
 if (runsJsonl) options.runsJsonl = runsJsonl
+if (scorecard) options.scorecardPath = scorecard
+if (operatorUrl) options.operatorUrl = operatorUrl
+if (models) options.models = models.split(',').map((m) => m.trim()) as LlmModel[]
+// Numeric flags: reject non-numeric input here instead of forwarding NaN to the eval.
+for (const [flag, key] of [['--reps', 'reps'], ['--max-turns', 'maxTurnsPerShot'], ['--cost-ceiling', 'costCeiling']] as const) {
+  const raw = argValue(flag)
+  if (raw && !Number.isFinite(Number(raw))) throw new Error(`${flag} expects a number, got "${raw}"`)
+  if (raw) options[key] = Number(raw)
+}
+if (process.env.TRADING_PERSONA_MATRIX_INTEGRITY === 'warn') options.integrity = 'warn'
 
-const summary = await runTradingPersonaAgentEvalBridge(options)
-
+const summary = await runTradingPersonaEval(options)
 console.log(JSON.stringify(summary, null, 2))
-if (summary.failed > 0) process.exit(1)
+
+if (summary.mode === 'operator-matrix') { // matrix run: a stub backend or no best profile fails
+  if (summary.integrity?.verdict === 'stub' || summary.best == null) process.exit(1) // == null also catches a missing `best`
+} else if ((summary.failed ?? 0) > 0) { // deterministic run: any failed result fails
+  process.exit(1)
+}
diff --git a/evals/src/full/full-eval-runner.ts b/evals/src/full/full-eval-runner.ts
index c49d0cfb..ed886c27 100644
--- a/evals/src/full/full-eval-runner.ts
+++ b/evals/src/full/full-eval-runner.ts
@@ -6,7 +6,7 @@ import { runSelfImprovementMcpEval } from '../self-improvement/mcp-eval.js'
 import { runProductBrowserEval } from '../product/browser-driver.js'
 import { runStrategyTemplateEval } from '../trading/strategy-template-runner.js'
 import { runTradingLifecycleEval } from '../trading/lifecycle-runner.js'
-import { runTradingPersonaAgentEvalBridge } from '../trading/persona-agent-eval.js'
+import { runTradingPersonaEval } from '../trading/persona-agent-eval.js'
 
 export interface FullEvalOptions {
   outputPath?: string
@@ -36,7 +36,7 @@ export async function runFullEval(options: FullEvalOptions = {}) {
   await gate(gates, 'rust-persona-coverage-test', async () => {
     run('cargo', ['test', '-p', 'trading-runtime', 'persona_eval_suite_has_required_coverage_and_passes'])
   })
-  await gate(gates, 'trading-persona-agent-eval', async () => runTradingPersonaAgentEvalBridge({
+  await gate(gates, 'trading-persona-agent-eval', async () => runTradingPersonaEval({
     reportPath: `.evolve/evals/full-personas-${stamp}.json`,
     traceDir: `.evolve/agent-eval/traces/full-personas-${stamp}`,
     runsJsonl: `.evolve/agent-eval/full-persona-runs-${stamp}.jsonl`,
diff --git a/evals/src/sim/llm-call.ts b/evals/src/sim/llm-call.ts
index cc83af81..2d3e29fd 100644
--- a/evals/src/sim/llm-call.ts
+++ b/evals/src/sim/llm-call.ts
@@ -100,6 +100,19 @@ export interface LlmCallResult {
   ok: boolean
 }
 
+/** Token + cost usage accumulated from the backend's `llm_call` stream
+ *  events. Zeros when the provider reported no usage (which downstream
+ *  backend-integrity guards correctly read as a stub fingerprint). */
+export interface LlmUsage {
+  input: number
+  output: number
+  costUsd: number
+}
+
+export interface LlmCallUsageResult extends LlmCallResult {
+  usage: LlmUsage
+}
+
 /** Resolve a logical model id to its provider routing. Throws on unknown
  *  model id so call sites can't silently ship a typo to prod. */
 export type { ModelRouting }
@@ -159,6 +172,54 @@ export async function llmCall(opts: LlmCallOptions): Promise<LlmCallResult> {
   return { output, exitCode: 0, stderr: '', ok: output.length > 0 }
 }
 
+/** Core LLM call that ALSO reports provider-reported token + cost usage,
+ *  accumulated from the backend's `llm_call` stream events. Use this when a
+ *  caller must thread real usage into a `RunRecord` / `ctx.cost.observeTokens`
+ *  (the backend-integrity guard keys on nonzero token usage). Same routing +
+ *  timeout handling as `llmCall`. */
+export async function llmCallWithUsage(opts: LlmCallOptions): Promise<LlmCallUsageResult> {
+  const cfg = resolveModel(opts.model ?? DEFAULT_MODEL)
+  const backend = createOpenAICompatibleBackend({
+    apiKey: cfg.apiKey(),
+    baseUrl: cfg.baseUrl,
+    model: cfg.modelId,
+  })
+  const task: AgentTaskSpec = {
+    id: `eval-llm-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
+    intent: 'one-shot eval LLM call with usage accounting',
+    domain: 'eval',
+  }
+  const controller = new AbortController()
+  const timer = setTimeout(() => controller.abort(), opts.timeoutMs ?? 180_000) // aborts the stream after the timeout
+  const parts: string[] = []
+  const usage: LlmUsage = { input: 0, output: 0, costUsd: 0 } // summed across every `llm_call` event
+  let backendError: string | null = null
+  try {
+    for await (const ev of runAgentTaskStream({
+      task,
+      backend,
+      input: { message: opts.prompt },
+      signal: controller.signal,
+    })) {
+      if (ev.type === 'text_delta') parts.push(ev.text)
+      else if (ev.type === 'llm_call') {
+        usage.input += ev.tokensIn ?? 0
+        usage.output += ev.tokensOut ?? 0
+        usage.costUsd += ev.costUsd ?? 0
+      } else if (ev.type === 'backend_error') backendError = ev.message
+    }
+  } catch (e) { // a thrown value is not guaranteed to be an Error, so narrow before reading .message
+    return { output: parts.join('').trim(), exitCode: 1, stderr: e instanceof Error ? e.message : String(e), ok: false, usage }
+  } finally {
+    clearTimeout(timer)
+  }
+  if (backendError !== null) { // explicit null check: an empty error message is still an error
+    return { output: parts.join('').trim(), exitCode: 1, stderr: backendError, ok: false, usage }
+  }
+  const output = parts.join('').trim()
+  return { output, exitCode: 0, stderr: '', ok: output.length > 0, usage } // empty output is reported as not ok
+}
+
 /** Extract the first JSON object from an LLM response. Tolerates code
  *  fences and prose around the JSON. Returns null on unrecoverable
  *  parse failure — judges fall back gracefully instead of crashing the
diff --git a/evals/src/sim/multishot-user-sim.ts b/evals/src/sim/multishot-user-sim.ts
index 268bb94a..ddf06d25 100644
--- a/evals/src/sim/multishot-user-sim.ts
+++ b/evals/src/sim/multishot-user-sim.ts
@@ -128,6 +128,12 @@ export interface MultishotDispatchOptions {
   privateKey?: string
   maxTurnsPerShot: number
   perTurnTimeoutMs: number
+  /** Per-cell override for the in-sandbox agent's LLM credentials. When set,
+   *  this is the env `configureSecrets` writes into the bot's sandbox — i.e.
+   *  it pins WHICH model the REAL operator agent runs (the PROFILE axis of the
+   *  unified matrix). When omitted, falls back to `deterministicAgentEnv()`
+   *  (the single-profile default). */
+  agentEnv?: Record<string, string>
 }
 
 export function makeUserSimDispatch(opts: MultishotDispatchOptions, botKind: BotKind = 'real') {
@@ -185,8 +191,10 @@ async function dispatchInner(
     // Bot create is instant in operator DB; vault resolution is async
     // (on-chain). configureSecrets returns 500 without a resolved vault.
     await client.waitForVaultResolved(botId)
-    // Then configure sandbox-agent LLM credentials before chatting.
-    await client.configureSecrets(botId, deterministicAgentEnv())
+    // Then configure sandbox-agent LLM credentials before chatting. A per-cell
+    // override (the matrix PROFILE axis) pins which model the REAL operator runs;
+    // otherwise the single-profile default applies.
+    await client.configureSecrets(botId, opts.agentEnv ?? deterministicAgentEnv())
     const sessionId = await client.createSession(botId, `user-sim:${scenario.id}`)
     return runUserSimSession({
       intent: scenario.intent,
@@ -282,6 +290,43 @@ function deriveStateScores(
   return { committed, selfImprovement, evidence }
 }
 
+/** The artifact-based score for ONE user-sim session, factored out of
+ *  `userSimJudge` so other eval surfaces (the unified trading matrix) can score
+ *  a real session WITHOUT re-running a `runEval` campaign. Composite weights
+ *  OBSERVABLE state (trades/strategy/self-improve from `bot_artifacts`) at 55%
+ *  and prose (rubric judge) at 45% — the same weighting `userSimJudge` uses. */
+export interface UserSimArtifactScore {
+  composite: number // weighted sum built in scoreUserSimArtifact; the five weights total 1.0
+  dimensions: Record<string, number> // per-dimension scores keyed by dimension name
+  notes: string // rubric notes followed by the state evidence string
+}
+
+export async function scoreUserSimArtifact(
+  intent: UserIntent,
+  artifact: UserSimSessionResult,
+): Promise<UserSimArtifactScore> {
+  const r = await judgePrimaryRubric(intent, artifact) // rubric (prose) judgement of the session
+  const state = deriveStateScores(artifact, r.actually_traded_or_committed) // state-derived scores; the rubric claim is an input
+  const composite =
+    0.20 * r.intent_fulfilled +
+    0.15 * r.respected_constraints +
+    0.40 * state.committed + // state terms: 0.40 + 0.15 = 0.55
+    0.15 * state.selfImprovement +
+    0.10 * r.productive_conversation // rubric terms: 0.20 + 0.15 + 0.10 = 0.45
+  return {
+    composite,
+    dimensions: {
+      intent_fulfilled: r.intent_fulfilled,
+      respected_constraints: r.respected_constraints,
+      actually_traded_or_committed: state.committed, // the state-derived value, not the rubric's claim
+      self_improvement: state.selfImprovement,
+      productive_conversation: r.productive_conversation,
+      prose_traded_claim: r.actually_traded_or_committed, // raw rubric claim; reported, not summed into composite directly
+    },
+    notes: `${r.notes} | STATE: ${state.evidence}`,
+  }
+}
+
 export function userSimJudge(opts: { dualJudge?: boolean } = {}): JudgeConfig<UserSimSessionResult, UserIntentScenario> {
   const useDual = opts.dualJudge ?? false
   return {
@@ -371,6 +416,10 @@ export interface RunMultishotUserSimOptions {
    *  bot scoring high on a newbie persona's intents but low on a
    *  veteran's is a real product signal. */
   personas?: UserPersona[]
+  /** Per-run override for the in-sandbox agent's LLM credentials — pins WHICH
+   *  model the REAL operator agent runs. Used by the unified trading matrix to
+   *  drive each PROFILE's model through the real operator stack. */
+  agentEnv?: Record<string, string>
 }
 
 export async function runMultishotUserSim(
@@ -393,6 +442,7 @@ export async function runMultishotUserSim(
       // multi-step work to land. (The real fix is tick-driving — task #108 —
       // but a 3-cron budget makes the current sync-poll model honest.)
       perTurnTimeoutMs: opts.perTurnTimeoutMs ?? 900_000,
+      ...(opts.agentEnv ? { agentEnv: opts.agentEnv } : {}),
     },
     botKind,
   )
diff --git a/evals/src/trading/persona-agent-eval.ts b/evals/src/trading/persona-agent-eval.ts
index 091439f7..3e953f8d 100644
--- a/evals/src/trading/persona-agent-eval.ts
+++ b/evals/src/trading/persona-agent-eval.ts
@@ -1,13 +1,31 @@
 /**
- * Trading-persona bridge — drives `runPersonaSuite` (Rust persona eval),
- * folds each result into a paper-grade `RunRecord`, captures spans via
- * `TraceEmitter` + `FileSystemTraceStore`, AND now feeds the scorecard
- * timeline so `diffScorecard` flags regressions across commits (the
- * pattern creative-agent uses).
+ * The ONE trading persona/profile eval — `runTradingPersonaEval`.
  *
- * Migrated off the dynamic-import shim — direct imports against the
- * `@tangle-network/agent-eval` 0.45+ surface. The dispatch + scoring are
- * unchanged; the wiring is the modern shape.
+ * A single surface that DEGRADES by what infra is available:
+ *
+ *   - No operator URL  → DETERMINISTIC mode. Runs `runPersonaSuite` (the Rust
+ *     walk-forward backtest), folds each persona×scenario result into a paper
+ *     `RunRecord` + trace span, and feeds the scorecard timeline. Offline, no
+ *     LLM, no live stack — this is what `full-eval` / CI run.
+ *   - operatorUrl present → OPERATOR-MATRIX mode. Sweeps a PROFILE axis (operator
+ *     model variants) × (PERSONA × MARKET) scenarios via `runProfileMatrix`. Each
+ *     cell runs the FULL operator simulation (`runMultishotUserSim` against the
+ *     real `OperatorClient` with that profile's model) and captures REAL evidence
+ *     (`bot_artifacts` + `tick_side_effects` + transcript). The judge blends the
+ *     real-artifact score with the model-invariant backtest ground truth — not
+ *     prose. Aggregation (`byProfile`/`byPersona`/`integrity`) is read straight
+ *     from `runProfileMatrix`.
+ *
+ * Both modes share the scorecard profile, the trace/RunRecord plumbing, and the
+ * objective backtest ground truth. There is exactly one entry point and one
+ * scorecard surface; callers (the bin, `full-eval`) need not know which mode ran.
+ *
+ * Multi-round depth in operator-matrix mode is honestly 1 round: the real
+ * provision→chat→capture cycle is single-pass and cannot be fed back through
+ * `loopDispatch`/`loopUntil` (those host a sandbox `runLoop` child, not an HTTP
+ * provision+chat orchestration). The multi-TURN refinement lives INSIDE each cell
+ * as the user-sim's `maxTurnsPerShot` turns; `depthRounds` is surfaced so the
+ * degeneracy is explicit, never faked.
  */
 
 import { randomUUID } from 'node:crypto'
@@ -17,14 +35,33 @@ import { dirname } from 'node:path'
 import {
   FileSystemTraceStore,
   TraceEmitter,
-  type RunRecord,
+  agentProfileHash,
+  assertRealBackend,
+  recordRunsToScorecard,
+  summarizeBackendIntegrity,
   validateRunRecord,
+  type AgentProfile,
+  type BackendIntegrityReport,
+  type RunRecord,
 } from '@tangle-network/agent-eval'
+import {
+  runProfileMatrix,
+  type JudgeConfig,
+  type JudgeScore,
+  type Scenario,
+} from '@tangle-network/agent-eval/campaign'
 
 import { sha256 } from '../lib/crypto.js'
 import { isoStamp, resolveRepo } from '../lib/repo.js'
+import { llmCallWithUsage, resolveModel, type LlmModel } from '../sim/llm-call.js'
+import { runMultishotUserSim, scoreUserSimArtifact } from '../sim/multishot-user-sim.js'
+import { STANDARD_USER_INTENTS } from '../sim/user-intents.js'
+import { STANDARD_USER_PERSONAS, type UserPersona } from '../sim/user-personas.js'
+import type { UserIntent, UserSimSessionResult } from '../sim/user-sim-driver.js'
 import { currentCommitSha, runPersonaSuite } from './persona-runner.js'
 import { normalizeSplit, numericRaw, type PersonaEvalResult } from './persona-types.js'
+import { evaluateScenario } from './personas/walk-forward.js'
+import { defaultScenarios, type TradingEvalScenario } from './personas/scenarios.js'
 import {
   buildTradingScorecardAgentProfile,
   recordScorecardAndDiff,
@@ -37,34 +74,96 @@ const FEE_SCHEDULE_VERSION = 'protocol-fees@2026-05'
 const SURFACE_VERSION = 1
 const RUNTIME_VERSION = '0.1.0'
 
-export interface TradingPersonaBridgeOptions {
+/** The PROFILE axis for operator-matrix mode: the operator model variants the
+ *  matrix sweeps. Single source of truth = `MODEL_CONFIG` in sim/llm-call.ts. */
+export const OPERATOR_PROFILE_MODELS: readonly LlmModel[] = ['kimi-k2', 'glm-4.7', 'glm-5.1']
+
+export interface TradingPersonaEvalOptions {
+  // ── shared ──
   reportPath?: string
   traceDir?: string
   runsJsonl?: string
   scorecardPath?: string
   failOnRegression?: boolean
+  // ── operator-matrix mode (presence of operatorUrl/env switches the mode) ──
+  /** Operator-api base URL the dispatch provisions/chats against. When present
+   *  (here or via OPERATOR_API_URL/OPERATOR_URL), the eval runs the real
+   *  operator matrix instead of the deterministic backtest. */
+  operatorUrl?: string
+  token?: string
+  privateKey?: string
+  /** Model variants to sweep (operator-matrix mode). Default: all three. */
+  models?: readonly LlmModel[]
+  personas?: UserPersona[]
+  markets?: TradingEvalScenario[]
+  intents?: UserIntent[]
+  reps?: number
+  maxTurnsPerShot?: number
+  perTurnTimeoutMs?: number
+  maxConcurrency?: number
+  costCeiling?: number
+  /** Backend-integrity posture for the matrix. Default 'assert'. */
+  integrity?: 'assert' | 'warn' | 'off'
 }
 
-export interface TradingPersonaBridgeSummary {
+/** Per-profile rollup (operator-matrix mode), read from `runProfileMatrix.byProfile`. */
+export interface ProfileSummary {
+  profileId: string
+  profileHash: string
+  model: string
+  records: number
+  /** Fraction of this profile's records scoring >= 0.7 (the promotion bar). */
+  passRate: number
+  meanScore: number
+  totalCostUsd: number
+  integrityVerdict: BackendIntegrityReport['verdict']
+}
+
+export interface TradingPersonaEvalSummary {
+  mode: 'deterministic' | 'operator-matrix'
   suite: string
-  report: string
-  runs_jsonl: string
-  trace_dir: string
+  commitSha: string
+  records: number
   scorecard: {
     path: string
     appendedCells: number
-    profileHash: string
+    profileHash?: string
     regressed: boolean
-    formatted: string
+    formatted?: string
   }
-  records: number
-  passed: number
-  failed: number
+  // deterministic mode
+  report?: string
+  runs_jsonl?: string
+  trace_dir?: string
+  passed?: number
+  failed?: number
+  // operator-matrix mode
+  runDir?: string
+  depthRounds?: 1
+  byProfile?: ProfileSummary[]
+  byPersona?: Record<string, { meanScore: number; n: number }>
+  integrity?: BackendIntegrityReport
+  best?: ProfileSummary | null
 }
 
-export async function runTradingPersonaAgentEvalBridge(
-  options: TradingPersonaBridgeOptions = {},
-): Promise<TradingPersonaBridgeSummary> {
+/**
+ * The single entry point. Resolves the operator URL — an explicit option wins,
+ * otherwise the first non-empty of OPERATOR_API_URL / OPERATOR_URL — and routes:
+ * present → operator matrix (real stack); absent → deterministic backtest.
+ */
+export async function runTradingPersonaEval(
+  options: TradingPersonaEvalOptions = {},
+): Promise<TradingPersonaEvalSummary> {
+  // `||` between the env vars: an exported-but-empty OPERATOR_API_URL must not mask OPERATOR_URL.
+  const operatorUrl = options.operatorUrl ?? (process.env.OPERATOR_API_URL || process.env.OPERATOR_URL)
+  return operatorUrl ? runOperatorMatrix(operatorUrl, options) : runDeterministicBacktest(options)
+}
+
+// ── DETERMINISTIC MODE ─────────────────────────────────────────────────────
+
+async function runDeterministicBacktest(
+  options: TradingPersonaEvalOptions,
+): Promise<TradingPersonaEvalSummary> {
   const reportPath = resolveRepo(
     options.reportPath ?? `.evolve/evals/trading-agent-personas-${isoStamp()}.json`,
   )
@@ -112,10 +211,7 @@ export async function runTradingPersonaAgentEvalBridge(
       wallMs,
       costUsd: 0,
       tokenUsage: { input: 0, output: 0 },
-      outcome: {
-        searchScore: result.score / 100,
-        raw: numericRaw(result),
-      },
+      outcome: { searchScore: result.score / 100, raw: numericRaw(result) },
       failureMode: result.passed ? undefined : result.findings[0]?.subject,
       splitTag: normalizeSplit(result.split),
       scenarioId: result.scenario_id,
@@ -133,11 +229,22 @@ export async function runTradingPersonaAgentEvalBridge(
     commitSha,
   })
 
-  const summary: TradingPersonaBridgeSummary = {
+  if (options.failOnRegression && scorecard.regressed) {
+    throw new Error(
+      `trading-persona scorecard reports regression on at least one cell. Diff:\n${scorecard.formatted}`,
+    )
+  }
+
+  return {
+    mode: 'deterministic',
     suite: report.suite,
+    commitSha,
+    records,
     report: reportPath,
     runs_jsonl: runsJsonl,
     trace_dir: traceDir,
+    passed: report.passed,
+    failed: report.failed,
     scorecard: {
       path: scorecardPath,
       appendedCells: scorecard.appendedCells,
@@ -145,17 +252,7 @@ export async function runTradingPersonaAgentEvalBridge(
       regressed: scorecard.regressed,
       formatted: scorecard.formatted,
     },
-    records,
-    passed: report.passed,
-    failed: report.failed,
   }
-
-  if (options.failOnRegression && scorecard.regressed) {
-    throw new Error(
-      `trading-persona scorecard reports regression on at least one cell. Diff:\n${scorecard.formatted}`,
-    )
-  }
-  return summary
 }
 
 async function writeTrace(input: {
@@ -177,20 +274,12 @@ async function writeTrace(input: {
     promptSha: promptHash,
     modelFingerprint: MODEL_FINGERPRINT,
     layer: 'app-runtime',
-    tags: {
-      suite,
-      persona_id: result.persona_id,
-      split: result.split,
-      config_hash: configHash,
-    },
+    tags: { suite, persona_id: result.persona_id, split: result.split, config_hash: configHash },
   })
   const span = await emitter.tool({
     name: 'trading-runtime backtest walk-forward compare',
     toolName: 'trading_runtime.backtest.walk_forward_compare',
-    args: {
-      persona_id: result.persona_id,
-      scenario_id: result.scenario_id,
-    },
+    args: { persona_id: result.persona_id, scenario_id: result.scenario_id },
   })
   await span.end({
     result: {
@@ -209,12 +298,351 @@ async function writeTrace(input: {
     hash: sha256(reportBytes).replace(/^sha256:/, ''),
     storageUrl: reportPath,
   })
-  const failureClass = result.passed ? 'success' : 'instruction_following'
   const notes = result.findings.map((finding) => finding.message).join('\n')
   await emitter.endRun({
     pass: result.passed,
     score: result.score / 100,
-    failureClass,
+    failureClass: result.passed ? 'success' : 'instruction_following',
     ...(notes ? { notes } : {}),
   })
 }
+
+// ── OPERATOR-MATRIX MODE ───────────────────────────────────────────────────
+
+/** Profile axis of the operator matrix: one AgentProfile per model. Each entry
+ *  is the trading-surface scorecard profile (versions, venues, fee schedule)
+ *  with the model appended to its id and copied into its metadata. */
+export function buildOperatorProfiles(models: readonly LlmModel[] = OPERATOR_PROFILE_MODELS): AgentProfile[] {
+  const toProfile = (model: LlmModel): AgentProfile => {
+    const surface = buildTradingScorecardAgentProfile({
+      surfaceVersion: SURFACE_VERSION,
+      runtimeVersion: RUNTIME_VERSION,
+      venues: VENUES,
+      feeScheduleVersion: FEE_SCHEDULE_VERSION,
+      model,
+    })
+    return {
+      ...surface,
+      id: `${surface.id}::model=${model}`,
+      model,
+      metadata: { ...surface.metadata, model, modelClass: 'llm-trading-operator' },
+    }
+  }
+  return models.map(toProfile)
+}
+
+function modelOfProfile(profile: AgentProfile): LlmModel {
+  const pinned = (profile.metadata?.model ?? profile.model) as LlmModel // metadata.model wins over profile.model
+  resolveModel(pinned) // return value unused — called as a validity check (presumably throws on an unknown model)
+  return pinned
+}
+
+/** Environment that pins the in-sandbox operator agent to `model`. Model id, base
+ *  URL and API key all come from resolveModel(model); a base URL containing
+ *  "moonshot" selects the moonshot provider, anything else zai-coding-plan.
+ *  Unknown models / missing keys are expected to throw inside resolveModel/apiKey. */
+export function agentEnvForModel(model: LlmModel): Record<string, string> {
+  const resolved = resolveModel(model)
+  const apiKey = resolved.apiKey()
+  const usesMoonshot = resolved.baseUrl.includes('moonshot')
+  const providerId = usesMoonshot ? 'moonshot' : 'zai-coding-plan'
+  // The key is exported twice: under the generic OPENCODE name and the provider's own.
+  const providerKeyVar = usesMoonshot ? 'MOONSHOT_API_KEY' : 'ZAI_API_KEY'
+  return {
+    OPENCODE_MODEL_PROVIDER: providerId,
+    OPENCODE_MODEL_NAME: resolved.modelId,
+    OPENCODE_MODEL_API_KEY: apiKey,
+    OPENCODE_MODEL: `${providerId}/${resolved.modelId}`,
+    OPENCODE_MODEL_BASE_URL: resolved.baseUrl,
+    SIDECAR_DEFAULT_HARNESS: 'opencode',
+    [providerKeyVar]: apiKey,
+  }
+}
+
+interface OperatorScenario extends Scenario { // one matrix scenario: a persona paired with a market
+  kind: 'trading-persona'
+  persona: UserPersona
+  intent: UserIntent // chosen per market index (round-robin) in buildOperatorScenarios
+  market: TradingEvalScenario // its id keys the backtest ground truth looked up in dispatch
+}
+
+/** Persona × market cross product, persona-major. Intents are not a third axis:
+ *  the market at index i is paired with intents[i % intents.length]. */
+function buildOperatorScenarios(
+  personas: UserPersona[],
+  markets: TradingEvalScenario[],
+  intents: UserIntent[],
+): OperatorScenario[] {
+  if (intents.length === 0) throw new Error('buildOperatorScenarios: need at least one intent')
+  return personas.flatMap((persona) =>
+    markets.map((market, marketIndex): OperatorScenario => {
+      const intent = intents[marketIndex % intents.length]!
+      return {
+        id: `${persona.id}__${market.id}`,
+        kind: 'trading-persona',
+        tags: ['trading-persona', persona.id, ...persona.tags, market.market_regime, ...intent.venues],
+        persona,
+        intent,
+        market,
+      }
+    }),
+  )
+}
+
+interface OperatorArtifact { // what dispatch hands to operatorJudge for one cell
+  session: UserSimSessionResult
+  groundTruth: PersonaEvalResult // evaluateScenario(market), computed once per market
+  model: string
+  operatorResponded: boolean // false → operatorJudge marks the cell failed instead of scoring it
+}
+
+/** Judge for operator-matrix cells. Composite = 0.6 × the real-session artifact
+ *  score + 0.4 × (backtest score / 100). A cell whose operator did not respond
+ *  is returned as `failed` rather than scored. */
+function operatorJudge(): JudgeConfig<OperatorArtifact, OperatorScenario> {
+  return {
+    name: 'trading-persona-real-artifact-plus-backtest',
+    dimensions: [
+      { key: 'real_artifact', description: 'observable state + prose from the real operator session' },
+      { key: 'actually_traded_or_committed', description: 'observable: did the operator trade/commit? (artifact)' },
+      { key: 'self_improvement', description: 'observable: did the self-improve cycle fire? (artifact)' },
+      { key: 'backtest_ground_truth', description: 'objective walk-forward backtest score (model-invariant)' },
+      { key: 'backtest_passed', description: 'did the candidate clear the deterministic promotion gates?' },
+    ],
+    async score({ scenario, artifact }): Promise<JudgeScore> {
+      const { operatorResponded, session, groundTruth } = artifact
+      if (!operatorResponded) {
+        return { dimensions: {}, composite: 0, notes: 'real operator produced no turns — cell failed (not scored as zero)', failed: true }
+      }
+      const real = await scoreUserSimArtifact(scenario.intent, session)
+      const backtest = groundTruth.score / 100
+      const backtestNote = `BACKTEST(${backtest.toFixed(2)}, passed=${groundTruth.passed}, gates=${groundTruth.deterministic_gates.join(' | ')})`
+      return {
+        composite: 0.6 * real.composite + 0.4 * backtest,
+        dimensions: {
+          real_artifact: real.composite,
+          actually_traded_or_committed: real.dimensions.actually_traded_or_committed ?? 0,
+          self_improvement: real.dimensions.self_improvement ?? 0,
+          backtest_ground_truth: backtest,
+          backtest_passed: groundTruth.passed ? 1 : 0,
+        },
+        notes: `REAL(${real.composite.toFixed(2)}) ⊕ ${backtestNote} | ${real.notes}`,
+      }
+    },
+  }
+}
+
+/** OPERATOR-MATRIX mode: sweeps profiles (one per model) × scenarios (persona ×
+ *  market) against the real operator at `operatorUrl`, judges every cell with
+ *  operatorJudge, appends the records to the scorecard and returns per-profile /
+ *  per-persona rollups. Unless integrity is 'off', assertRealBackend vets the records.
+ *  @throws Error when neither an operator-api token nor a private key is available. */
+async function runOperatorMatrix(
+  operatorUrl: string,
+  options: TradingPersonaEvalOptions,
+): Promise<TradingPersonaEvalSummary> {
+  const token = options.token ?? process.env.OPERATOR_API_TOKEN ?? ''
+  const privateKey = options.privateKey ?? process.env.OPERATOR_PRIVATE_KEY
+  if (!token && !privateKey) {
+    throw new Error(
+      'operator-matrix mode: need an operator-api token (options.token / OPERATOR_API_TOKEN) or a ' +
+        'privateKey (options.privateKey / OPERATOR_PRIVATE_KEY) to authenticate against the real stack.',
+    )
+  }
+
+  const profiles = buildOperatorProfiles(options.models ?? OPERATOR_PROFILE_MODELS)
+  const personas = options.personas ?? STANDARD_USER_PERSONAS
+  const markets = options.markets ?? defaultScenarios()
+  const intents = options.intents ?? STANDARD_USER_INTENTS
+  const scenarios = buildOperatorScenarios(personas, markets, intents)
+  const commitSha = currentCommitSha()
+  const runDir = resolveRepo(options.runsJsonl ? dirname(options.runsJsonl) : `.evolve/agent-eval/trading-persona-matrix-${isoStamp()}`)
+  const scorecardPath = resolveRepo(options.scorecardPath ?? '.evolve/agent-eval/scorecards/trading-persona-matrix.jsonl')
+  mkdirSync(runDir, { recursive: true })
+  mkdirSync(dirname(scorecardPath), { recursive: true })
+
+  // Model-invariant ground truth — computed once per market, shared by every cell.
+  const groundTruthByMarket = new Map<string, PersonaEvalResult>()
+  for (const m of markets) groundTruthByMarket.set(m.id, evaluateScenario(m))
+
+  const maxTurnsPerShot = options.maxTurnsPerShot ?? 6
+  const perTurnTimeoutMs = options.perTurnTimeoutMs ?? 900_000
+
+  const result = await runProfileMatrix<OperatorScenario, OperatorArtifact>({
+    profiles,
+    scenarios,
+    judges: [operatorJudge()],
+    runDir,
+    commitSha,
+    experimentId: 'trading-persona-matrix',
+    splitTag: 'search',
+    ...(options.reps !== undefined ? { reps: options.reps } : {}),
+    maxConcurrency: options.maxConcurrency ?? 1,
+    ...(options.costCeiling !== undefined ? { costCeiling: options.costCeiling } : {}),
+    integrity: options.integrity ?? 'assert',
+    personaOf: (s) => s.persona.id,
+    dispatch: async (profile, scenario, ctx) => {
+      const groundTruth = groundTruthByMarket.get(scenario.market.id)
+      if (!groundTruth) throw new Error(`no backtest ground truth for market ${scenario.market.id}`)
+      const model = modelOfProfile(profile)
+      const campaign = await runMultishotUserSim({
+        intents: [scenario.intent],
+        personas: [scenario.persona],
+        operatorUrl,
+        token,
+        ...(privateKey ? { privateKey } : {}),
+        agentEnv: agentEnvForModel(model),
+        reps: 1,
+        maxTurnsPerShot,
+        perTurnTimeoutMs,
+        botKind: 'real',
+        dualJudge: false,
+        runDir: `${runDir}/cells/${profile.id.replace(/[^\w.-]/g, '_')}__${scenario.id}`,
+      })
+      const session = firstArtifact(campaign)
+      if (!session || session.turns.length === 0) {
+        return { session: session ?? emptySession(scenario.intent), groundTruth, model, operatorResponded: false }
+      }
+      // Integrity fingerprint: the real operator's LLM spend is inside its
+      // sandbox (invisible to the eval), and the inner user-sim talks HTTP — so
+      // the inner cells report zero tokens. ONE metered call with this profile's
+      // model assessing its own real transcript gives the integrity guard an
+      // honest non-stub signal. Only reached when the operator produced turns.
+      const innerUsage = campaignTokenUsage(campaign)
+      const groundingCall = await metaSpendCall(model, scenario, session)
+      ctx.cost.observeTokens({
+        input: innerUsage.input + groundingCall.usage.input,
+        output: innerUsage.output + groundingCall.usage.output,
+      })
+      const cost = innerUsage.costUsd + groundingCall.usage.costUsd
+      if (cost > 0) ctx.cost.observe(cost, `operator+grounding:${model}`)
+      return { session, groundTruth, model, operatorResponded: true }
+    },
+  })
+
+  const recordsByProfileId = new Map<string, RunRecord[]>()
+  for (const r of result.records) {
+    const list = recordsByProfileId.get(r.candidateId) ?? []
+    list.push(r)
+    recordsByProfileId.set(r.candidateId, list)
+  }
+
+  let appendedCells = 0
+  for (const profile of profiles) {
+    const profileRecords = recordsByProfileId.get(profile.id) ?? []
+    if (profileRecords.length === 0) continue
+    appendedCells += recordRunsToScorecard(scorecardPath, profileRecords, { profile, commitSha }).length
+  }
+
+  const byProfile = profiles.map<ProfileSummary>((profile) => {
+    const summary = result.byProfile[profile.id]
+    const profileRecords = recordsByProfileId.get(profile.id) ?? []
+    const passing = profileRecords.filter((r) => scoreOf(r) >= 0.7).length
+    return {
+      profileId: profile.id,
+      profileHash: summary?.profileHash ?? agentProfileHash(profile),
+      model: String(profile.metadata?.model ?? profile.model),
+      records: profileRecords.length,
+      passRate: profileRecords.length ? passing / profileRecords.length : 0,
+      meanScore: summary?.meanComposite ?? meanScore(profileRecords),
+      totalCostUsd: summary?.totalCostUsd ?? 0,
+      integrityVerdict: summarizeBackendIntegrity(profileRecords).verdict,
+    }
+  })
+
+  const byPersona: Record<string, { meanScore: number; n: number }> = {}
+  for (const [persona, rollup] of Object.entries(result.byPersona ?? {})) {
+    byPersona[persona] = { meanScore: rollup.meanComposite, n: rollup.n }
+  }
+
+  const integrity =
+    (options.integrity ?? 'assert') === 'off'
+      ? summarizeBackendIntegrity(result.records)
+      : assertRealBackend(result.records, { allowMixed: true })
+
+  const best = [...byProfile].sort((a, b) => b.passRate - a.passRate || b.meanScore - a.meanScore)[0] ?? null
+
+  return {
+    mode: 'operator-matrix',
+    suite: result.experimentId,
+    commitSha,
+    records: result.records.length,
+    runDir,
+    depthRounds: 1,
+    byProfile,
+    byPersona,
+    integrity,
+    best,
+    scorecard: { path: scorecardPath, appendedCells, regressed: false },
+  }
+}
+
+// ─── helpers (operator-matrix mode) ──────────────────────────────────────
+
+function scoreOf({ outcome }: RunRecord): number {
+  return outcome.holdoutScore ?? outcome.searchScore ?? 0 // holdout first, then search; neither → 0
+}
+
+function meanScore(records: RunRecord[]): number {
+  const total = records.reduce((sum, record) => sum + scoreOf(record), 0)
+  return records.length > 0 ? total / records.length : 0 // empty input → 0, never NaN
+}
+
+/** Artifact of the first cell with no error and a truthy `turns`; null if no cell qualifies. */
+function firstArtifact(
+  campaign: Awaited<ReturnType<typeof runMultishotUserSim>>,
+): UserSimSessionResult | null {
+  for (const cell of campaign.cells) {
+    if (!cell.error && cell.artifact?.turns) return cell.artifact
+  }
+  return null
+}
+
+function campaignTokenUsage(
+  campaign: Awaited<ReturnType<typeof runMultishotUserSim>>,
+): { input: number; output: number; costUsd: number } {
+  let input = 0, output = 0, costUsd = 0 // running totals; a cell missing a field adds 0
+  for (const cell of campaign.cells) {
+    input += cell.tokenUsage?.input ?? 0
+    output += cell.tokenUsage?.output ?? 0
+    costUsd += cell.costUsd ?? 0
+  }
+  return { input, output, costUsd }
+}
+
+/** Single metered LLM call: `model` reviews the session transcript (each operator reply cut to 800 chars). */
+async function metaSpendCall(
+  model: LlmModel,
+  scenario: OperatorScenario,
+  session: UserSimSessionResult,
+): Promise<Awaited<ReturnType<typeof llmCallWithUsage>>> {
+  const { persona, intent } = scenario
+  const turnLines = session.turns.map((t) => `  TURN ${t.turn}\n    USER: ${t.user_message}\n    OPERATOR: ${t.bot_reply_text.slice(0, 800)}`)
+  const prompt = [
+    `You ran as the autonomous trading operator for: ${persona.label}.`,
+    `User intent: "${intent.text}"`,
+    `Mandate caps: $${intent.capital_usd} capital, ${intent.dd_cap_pct}% max drawdown,`,
+    `venues ${intent.venues.join(', ')}.`,
+    '',
+    'Your session transcript:',
+    turnLines.join('\n') || '  (no turns)',
+    '',
+    'In 2-3 sentences, state whether you stayed within the mandate and committed a',
+    'concrete action. Be specific to the transcript.',
+  ].join('\n')
+  return llmCallWithUsage({ prompt, model })
+}
+
+function emptySession(intent: UserIntent): UserSimSessionResult { // zero-turn placeholder for a cell with no usable artifact
+  return {
+    intent,
+    bot_id: '',
+    session_id: '',
+    turns: [], // no turns: dispatch pairs this placeholder with operatorResponded: false
+    final_transcript: null,
+    ended_by: 'max_turns',
+    total_wall_ms: 0,
+    bot_artifacts: null, // nothing was captured from the operator
+    tick_side_effects: null,
+  }
+}
diff --git a/package-lock.json b/package-lock.json
index bc1c3832..d39ba732 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -9,8 +9,8 @@
       "version": "0.1.0",
       "dependencies": {
         "@tangle-network/agent-eval": "^0.91.0",
-        "@tangle-network/agent-knowledge": "^1.5.0",
-        "@tangle-network/agent-runtime": "^0.50.0"
+        "@tangle-network/agent-knowledge": "^1.7.0",
+        "@tangle-network/agent-runtime": "^0.52.0"
       },
       "devDependencies": {
         "@types/node": "^22.10.0",
@@ -187,27 +187,15 @@
         }
       }
     },
-    "node_modules/@tangle-network/agent-integrations": {
-      "version": "0.25.7",
-      "resolved": "https://registry.npmjs.org/@tangle-network/agent-integrations/-/agent-integrations-0.25.7.tgz",
-      "integrity": "sha512-5Iuymcoq6d1oZlyORfmVXiP2G/tJQe0ADYBUNwDlbk9uulSa3c6rztlr6sKm100NqDavVlJ0Jo75j9CsaemhIA==",
-      "license": "MIT",
-      "bin": {
-        "tangle-catalog-runtime": "dist/bin/tangle-catalog-runtime.js"
-      },
-      "engines": {
-        "node": ">=20"
-      }
-    },
     "node_modules/@tangle-network/agent-knowledge": {
-      "version": "1.5.0",
-      "resolved": "https://registry.npmjs.org/@tangle-network/agent-knowledge/-/agent-knowledge-1.5.0.tgz",
-      "integrity": "sha512-C2jU62Nx6CbjkVKsvsd2xFaanQlwghVYWX8G/g/B+3PJTf5G83e3pvEmHDy5ETnLamIEZRiRks5RG3v1t8LEUQ==",
+      "version": "1.7.0",
+      "resolved": "https://registry.npmjs.org/@tangle-network/agent-knowledge/-/agent-knowledge-1.7.0.tgz",
+      "integrity": "sha512-F+a21T2UMW7HVj1Dt6Sv5GqjEQXWUshsVftRpStW1t0bVG0/1aT6DeHYKok1lLmi+D8WZnC0CNXPdb+LyxC/iA==",
       "license": "MIT",
+      "peer": true,
       "dependencies": {
-        "@tangle-network/agent-eval": "^0.42.0",
-        "@tangle-network/agent-runtime": "^0.25.0",
-        "@tangle-network/sandbox": "^0.2.1",
+        "@tangle-network/agent-eval": ">=0.77.0 <0.80.0",
+        "@tangle-network/agent-runtime": "^0.44.0",
         "zod": "^4.3.6"
       },
       "bin": {
@@ -218,10 +206,11 @@
       }
     },
     "node_modules/@tangle-network/agent-knowledge/node_modules/@tangle-network/agent-eval": {
-      "version": "0.42.0",
-      "resolved": "https://registry.npmjs.org/@tangle-network/agent-eval/-/agent-eval-0.42.0.tgz",
-      "integrity": "sha512-gJFT1Vm5LYDHtIF0BUqGq6i3Qa9IvFr3EvTfAE1CYjErFNl3TohL1sduJqj1GXIhDbswVVuWp5qaahHZHaIsbA==",
+      "version": "0.79.0",
+      "resolved": "https://registry.npmjs.org/@tangle-network/agent-eval/-/agent-eval-0.79.0.tgz",
+      "integrity": "sha512-reN1SbKvTXFS27PQa4l5dnwf0y33j118FM0aPKAq8I0fvj3H6olF1wwQWHCSm7Sjats9rU4c8x2wlSNk0VeCBQ==",
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "@asteasolutions/zod-to-openapi": "^8.5.0",
         "@ax-llm/ax": "^19.0.25",
@@ -237,36 +226,30 @@
         "node": ">=20"
       },
       "peerDependencies": {
-        "@tangle-network/agent-runtime": "^0.21.0",
-        "@tangle-network/sandbox": "^0.2.1"
+        "@tangle-network/sandbox": ">=0.2.1 <0.5.0"
       },
       "peerDependenciesMeta": {
-        "@tangle-network/agent-runtime": {
-          "optional": true
-        },
         "@tangle-network/sandbox": {
           "optional": true
         }
       }
     },
     "node_modules/@tangle-network/agent-knowledge/node_modules/@tangle-network/agent-runtime": {
-      "version": "0.25.2",
-      "resolved": "https://registry.npmjs.org/@tangle-network/agent-runtime/-/agent-runtime-0.25.2.tgz",
-      "integrity": "sha512-/HSrfr1kyW5wjLApudXQ4CcvjsZaxYtGXp6R6i0MdVoCVmZ6XBewKoVv1QGs+k0i8fUYrdEfh48cvGnrGOaQZQ==",
+      "version": "0.44.0",
+      "resolved": "https://registry.npmjs.org/@tangle-network/agent-runtime/-/agent-runtime-0.44.0.tgz",
+      "integrity": "sha512-uMzWcziIV+SsgvdvvnnSobaFYZuYXQ3KRfvq9h9kHglVLtPoUGH78ypCnyn5QQIccTk4gjSenAHC5Iy076DkQg==",
       "license": "MIT",
-      "peer": true,
-      "dependencies": {
-        "@tangle-network/agent-eval": "^0.40.2"
-      },
       "bin": {
+        "agent-runtime-loop": "dist/loop-runner-bin.js",
         "agent-runtime-mcp": "dist/mcp/bin.js"
       },
       "engines": {
         "node": ">=20"
       },
       "peerDependencies": {
+        "@tangle-network/agent-eval": ">=0.61.0 <1.0.0",
         "@tangle-network/agent-knowledge": ">=1.3.0 <2.0.0",
-        "@tangle-network/sandbox": ">=0.1.2 <0.4.0"
+        "@tangle-network/sandbox": ">=0.1.2 <0.5.0"
       },
       "peerDependenciesMeta": {
         "@tangle-network/agent-knowledge": {
@@ -277,88 +260,10 @@
         }
       }
     },
-    "node_modules/@tangle-network/agent-knowledge/node_modules/@tangle-network/agent-runtime/node_modules/@tangle-network/agent-eval": {
-      "version": "0.40.5",
-      "resolved": "https://registry.npmjs.org/@tangle-network/agent-eval/-/agent-eval-0.40.5.tgz",
-      "integrity": "sha512-ew27fDkzvYcM/3/u6Jx1HGS3/bPoIWAXKGa/2XlOro2hBwMA/h37SAHg4ytUDMd2M0mAKQAAanUxnHfkt/aklw==",
-      "license": "MIT",
-      "dependencies": {
-        "@asteasolutions/zod-to-openapi": "^8.5.0",
-        "@ax-llm/ax": "^19.0.25",
-        "@hono/node-server": "^2.0.0",
-        "@tangle-network/tcloud": "^0.4.6",
-        "hono": "^4.12.16",
-        "zod": "^4.3.6"
-      },
-      "bin": {
-        "agent-eval": "dist/cli.js"
-      },
-      "engines": {
-        "node": ">=20"
-      },
-      "peerDependencies": {
-        "@tangle-network/agent-runtime": "^0.21.0",
-        "@tangle-network/sandbox": "^0.2.1"
-      },
-      "peerDependenciesMeta": {
-        "@tangle-network/agent-runtime": {
-          "optional": true
-        },
-        "@tangle-network/sandbox": {
-          "optional": true
-        }
-      }
-    },
-    "node_modules/@tangle-network/agent-knowledge/node_modules/@tangle-network/agent-runtime/node_modules/@tangle-network/agent-runtime": {
-      "version": "0.21.1",
-      "resolved": "https://registry.npmjs.org/@tangle-network/agent-runtime/-/agent-runtime-0.21.1.tgz",
-      "integrity": "sha512-Qh7TG6Pg25qUzsByblVvzBDfzq5K7isSZ3I/LhOzMgE2dBmrS4AjozPaQ6/zMoAy5kg+sZhJXUuXDiAjlBoTyg==",
-      "license": "MIT",
-      "optional": true,
-      "dependencies": {
-        "@tangle-network/agent-eval": "^0.33.1"
-      },
-      "bin": {
-        "agent-runtime-mcp": "dist/mcp/bin.js"
-      },
-      "engines": {
-        "node": ">=20"
-      },
-      "peerDependencies": {
-        "@tangle-network/agent-knowledge": ">=1.3.0 <2.0.0",
-        "@tangle-network/sandbox": ">=0.1.2 <0.3.0"
-      },
-      "peerDependenciesMeta": {
-        "@tangle-network/agent-knowledge": {
-          "optional": true
-        }
-      }
-    },
-    "node_modules/@tangle-network/agent-knowledge/node_modules/@tangle-network/agent-runtime/node_modules/@tangle-network/agent-runtime/node_modules/@tangle-network/agent-eval": {
-      "version": "0.33.1",
-      "resolved": "https://registry.npmjs.org/@tangle-network/agent-eval/-/agent-eval-0.33.1.tgz",
-      "integrity": "sha512-VAbg1UkC480Xzfi2jqiFMQLYykWvDMO47UHx4bb2rOeiogN1zzM10kPst3OotM+k1B2lbu51uoVnKDBnqK8zcw==",
-      "license": "MIT",
-      "optional": true,
-      "dependencies": {
-        "@asteasolutions/zod-to-openapi": "^8.5.0",
-        "@ax-llm/ax": "^19.0.25",
-        "@hono/node-server": "^2.0.0",
-        "@tangle-network/tcloud": "^0.4.6",
-        "hono": "^4.12.16",
-        "zod": "^4.3.6"
-      },
-      "bin": {
-        "agent-eval": "dist/cli.js"
-      },
-      "engines": {
-        "node": ">=20"
-      }
-    },
     "node_modules/@tangle-network/agent-runtime": {
-      "version": "0.50.0",
-      "resolved": "https://registry.npmjs.org/@tangle-network/agent-runtime/-/agent-runtime-0.50.0.tgz",
-      "integrity": "sha512-fNVcaG7sDOxu8ILt61N4+zBfA/lnY6P8YGAt4r5cI7ekfitfwJ3GZBk2YahxJHQ0XrtMQrF6kGe7dMuSxidxNg==",
+      "version": "0.52.0",
+      "resolved": "https://registry.npmjs.org/@tangle-network/agent-runtime/-/agent-runtime-0.52.0.tgz",
+      "integrity": "sha512-oZL1cDW7x2inhBrcAgKHST3HQFGQsJtQR/ko41js9DcBP4bIVVPeosEwQ50h41yPXHkJpIYznh54pj0FJxiYwQ==",
       "license": "MIT",
       "bin": {
         "agent-runtime-loop": "dist/loop-runner-bin.js",
@@ -385,28 +290,6 @@
         }
       }
     },
-    "node_modules/@tangle-network/sandbox": {
-      "version": "0.2.1",
-      "resolved": "https://registry.npmjs.org/@tangle-network/sandbox/-/sandbox-0.2.1.tgz",
-      "integrity": "sha512-CQ3MdfnWcdjKa2UzyqDkjJarhkVDl4GqAKRhbQdHmHccl/pOm6qSRiPdu40XEA34A/SVPLpfE1ySxchU1rq6BQ==",
-      "license": "MIT",
-      "peer": true,
-      "dependencies": {
-        "@tangle-network/agent-integrations": "0.25.7"
-      },
-      "peerDependencies": {
-        "openai": "^6.36.0",
-        "viem": "^2.0.0"
-      },
-      "peerDependenciesMeta": {
-        "openai": {
-          "optional": true
-        },
-        "viem": {
-          "optional": true
-        }
-      }
-    },
     "node_modules/@tangle-network/tcloud": {
       "version": "0.4.6",
       "resolved": "https://registry.npmjs.org/@tangle-network/tcloud/-/tcloud-0.4.6.tgz",
diff --git a/package.json b/package.json
index b2c6a3b7..695482cb 100644
--- a/package.json
+++ b/package.json
@@ -37,7 +37,7 @@
   },
   "dependencies": {
     "@tangle-network/agent-eval": "^0.91.0",
-    "@tangle-network/agent-knowledge": "^1.5.0",
-    "@tangle-network/agent-runtime": "^0.50.0"
+    "@tangle-network/agent-knowledge": "^1.7.0",
+    "@tangle-network/agent-runtime": "^0.52.0"
   }
 }
diff --git a/trading-blueprint-lib/src/jobs/activate.rs b/trading-blueprint-lib/src/jobs/activate.rs
index 2b47ae57..859586a2 100644
--- a/trading-blueprint-lib/src/jobs/activate.rs
+++ b/trading-blueprint-lib/src/jobs/activate.rs
@@ -33,9 +33,9 @@ pub(crate) const SIDECAR_AGENTS_MD_PATH: &str = "/home/agent/AGENTS.md";
 /// Same charter, claude-code's auto-loaded filename. The claude CLI reads
 /// `CLAUDE.md` (not `AGENTS.md`) from its working directory.
 pub(crate) const SIDECAR_CLAUDE_MD_PATH: &str = "/home/agent/CLAUDE.md";
-const TRADING_AGENT_AGENT_EVAL_VERSION: &str = "^0.70.0";
-const TRADING_AGENT_AGENT_KNOWLEDGE_VERSION: &str = "^1.5.0";
-const TRADING_AGENT_AGENT_RUNTIME_VERSION: &str = "^0.36.0";
+const TRADING_AGENT_AGENT_EVAL_VERSION: &str = "^0.91.0";
+const TRADING_AGENT_AGENT_KNOWLEDGE_VERSION: &str = "^1.7.0";
+const TRADING_AGENT_AGENT_RUNTIME_VERSION: &str = "^0.52.0";
 
 /// Operator identity + behavioural charter loaded into every opencode turn via
 /// `AGENTS.md`. The full operating protocol (API base URL, bearer token,