tangle-network · drewstone · Jun 14, 2026 · Jun 14, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -29,6 +29,7 @@ jobs:
       arena: ${{ steps.decide.outputs.arena }}
       contracts: ${{ steps.decide.outputs.contracts }}
       rust: ${{ steps.decide.outputs.rust }}
+      evals: ${{ steps.decide.outputs.evals }}
       full: ${{ steps.decide.outputs.full }}
     steps:
       - uses: actions/checkout@v6
@@ -73,6 +74,7 @@ jobs:
           arena=false
           contracts=false
           rust=false
+          evals=false
           ci=false
 
           while IFS= read -r file; do
@@ -92,6 +94,11 @@ jobs:
                 rust=true
                 ;;
             esac
+            case "$file" in
+              evals/*|package.json|package-lock.json|tsconfig.json|tsconfig.*.json)
+                evals=true
+                ;;
+            esac
             case "$file" in
               .github/workflows/ci.yml)
                 ci=true
@@ -106,12 +113,14 @@ jobs:
             arena=true
             contracts=true
             rust=true
+            evals=true
           fi
 
           echo "full=$full" >> "$GITHUB_OUTPUT"
           echo "arena=$arena" >> "$GITHUB_OUTPUT"
           echo "contracts=$contracts" >> "$GITHUB_OUTPUT"
           echo "rust=$rust" >> "$GITHUB_OUTPUT"
+          echo "evals=$evals" >> "$GITHUB_OUTPUT"
 
   arena:
     name: Arena UI
@@ -324,9 +333,29 @@ jobs:
             --ignore RUSTSEC-2026-0118 \
             --ignore RUSTSEC-2026-0119
 
+  # ── Evals (TypeScript) ────────────────────────────────────────────────────
+  evals:
+    name: Evals typecheck
+    needs: changes
+    if: needs.changes.outputs.evals == 'true' # run only when the changes job flagged the evals lane
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: actions/setup-node@v6
+        with:
+          node-version: 22
+          cache: npm
+
+      - name: Install dependencies
+        run: npm ci # no working-directory is set, so this runs at the repository root
+
+      - name: Typecheck evals
+        run: npx tsc -p evals/tsconfig.json --noEmit # type-check only; --noEmit produces no build output
+
   ci-gate:
     name: CI Gate
-    needs: [changes, arena, forge, rust, clippy, fmt, audit]
+    needs: [changes, arena, forge, rust, clippy, fmt, audit, evals]
     if: always()
     runs-on: ubuntu-latest
     steps:
@@ -336,12 +365,14 @@ jobs:
           ARENA_NEEDED: ${{ needs.changes.outputs.arena }}
           CONTRACTS_NEEDED: ${{ needs.changes.outputs.contracts }}
           RUST_NEEDED: ${{ needs.changes.outputs.rust }}
+          EVALS_NEEDED: ${{ needs.changes.outputs.evals }}
           ARENA_RESULT: ${{ needs.arena.result }}
           FORGE_RESULT: ${{ needs.forge.result }}
           RUST_RESULT: ${{ needs.rust.result }}
           CLIPPY_RESULT: ${{ needs.clippy.result }}
           FMT_RESULT: ${{ needs.fmt.result }}
           AUDIT_RESULT: ${{ needs.audit.result }}
+          EVALS_RESULT: ${{ needs.evals.result }}
         run: |
           set -euo pipefail
           failed=false
@@ -367,8 +398,9 @@ jobs:
           require_success "Clippy" "$RUST_NEEDED" "$CLIPPY_RESULT"
           require_success "Rustfmt" "$RUST_NEEDED" "$FMT_RESULT"
           require_success "Security audit" "$RUST_NEEDED" "$AUDIT_RESULT"
+          require_success "Evals typecheck" "$EVALS_NEEDED" "$EVALS_RESULT"
 
-          if [ "$ARENA_NEEDED" != "true" ] && [ "$CONTRACTS_NEEDED" != "true" ] && [ "$RUST_NEEDED" != "true" ]; then
+          if [ "$ARENA_NEEDED" != "true" ] && [ "$CONTRACTS_NEEDED" != "true" ] && [ "$RUST_NEEDED" != "true" ] && [ "$EVALS_NEEDED" != "true" ]; then
             echo "No code lanes changed; CI gate is green."
           fi
 

diff --git a/evals/src/bin/agent-eval-trading-personas.ts b/evals/src/bin/agent-eval-trading-personas.ts
@@ -1,23 +1,57 @@
 #!/usr/bin/env node
-import {
-  runTradingPersonaAgentEvalBridge,
-  type TradingPersonaBridgeOptions,
-} from '../trading/persona-agent-eval.js'
+import { runTradingPersonaEval, type TradingPersonaEvalOptions } from '../trading/persona-agent-eval.js'
+import type { LlmModel } from '../sim/llm-call.js'
 
 function argValue(name: string): string | undefined {
   const index = process.argv.indexOf(name)
   return index >= 0 ? process.argv[index + 1] : undefined
 }
 
-const options: TradingPersonaBridgeOptions = {}
+/** Reads a numeric flag. Absent or empty values yield `undefined`; a value that
+ *  `Number()` cannot turn into a finite number aborts the run instead of
+ *  silently passing `NaN` into the eval options. */
+function numberArg(name: string): number | undefined {
+  const raw = argValue(name)
+  if (!raw) return undefined
+  const value = Number(raw)
+  if (!Number.isFinite(value)) throw new Error(`${name} expects a finite number, got "${raw}"`)
+  return value
+}
+
+// One entry point. With --operator-url (or OPERATOR_API_URL/OPERATOR_URL set) it
+// runs the real operator profile × persona matrix (real bot artifacts + tick
+// side-effects, scored against the objective backtest); without it, the
+// deterministic walk-forward backtest. Same surface, degrades by infra.
+const options: TradingPersonaEvalOptions = {}
 const reportPath = argValue('--out')
 const traceDir = argValue('--trace-dir')
 const runsJsonl = argValue('--runs-jsonl') ?? argValue('--runs')
+const scorecard = argValue('--scorecard')
+const operatorUrl = argValue('--operator-url')
+// Drop empty entries so `--models a,,b` or a trailing comma never yields '' as a model id.
+const models = argValue('--models')?.split(',').map((m) => m.trim()).filter((m) => m.length > 0)
+const reps = numberArg('--reps')
+const maxTurns = numberArg('--max-turns')
+const costCeiling = numberArg('--cost-ceiling')
 if (reportPath) options.reportPath = reportPath
 if (traceDir) options.traceDir = traceDir
 if (runsJsonl) options.runsJsonl = runsJsonl
+if (scorecard) options.scorecardPath = scorecard
+if (operatorUrl) options.operatorUrl = operatorUrl
+if (models && models.length > 0) options.models = models as LlmModel[]
+// `!== undefined` rather than truthiness: an explicit 0 is still forwarded.
+if (reps !== undefined) options.reps = reps
+if (maxTurns !== undefined) options.maxTurnsPerShot = maxTurns
+if (costCeiling !== undefined) options.costCeiling = costCeiling
+if (process.env.TRADING_PERSONA_MATRIX_INTEGRITY === 'warn') options.integrity = 'warn'
 
-const summary = await runTradingPersonaAgentEvalBridge(options)
-
+const summary = await runTradingPersonaEval(options)
 console.log(JSON.stringify(summary, null, 2))
-if (summary.failed > 0) process.exit(1)
+
+// Exit non-zero when the result must not pass: in operator-matrix mode that is a
+// `stub` integrity verdict or a null `best`; in any other mode, a positive `failed`.
+if (summary.mode === 'operator-matrix') {
+  if (summary.integrity?.verdict === 'stub' || summary.best === null) process.exit(1)
+} else if ((summary.failed ?? 0) > 0) {
+  process.exit(1)
+}
diff --git a/evals/src/full/full-eval-runner.ts b/evals/src/full/full-eval-runner.ts
@@ -6,7 +6,7 @@ import { runSelfImprovementMcpEval } from '../self-improvement/mcp-eval.js'
 import { runProductBrowserEval } from '../product/browser-driver.js'
 import { runStrategyTemplateEval } from '../trading/strategy-template-runner.js'
 import { runTradingLifecycleEval } from '../trading/lifecycle-runner.js'
-import { runTradingPersonaAgentEvalBridge } from '../trading/persona-agent-eval.js'
+import { runTradingPersonaEval } from '../trading/persona-agent-eval.js'
 
 export interface FullEvalOptions {
   outputPath?: string
@@ -36,7 +36,7 @@ export async function runFullEval(options: FullEvalOptions = {}) {
   await gate(gates, 'rust-persona-coverage-test', async () => {
     run('cargo', ['test', '-p', 'trading-runtime', 'persona_eval_suite_has_required_coverage_and_passes'])
   })
-  await gate(gates, 'trading-persona-agent-eval', async () => runTradingPersonaAgentEvalBridge({
+  await gate(gates, 'trading-persona-agent-eval', async () => runTradingPersonaEval({
     reportPath: `.evolve/evals/full-personas-${stamp}.json`,
     traceDir: `.evolve/agent-eval/traces/full-personas-${stamp}`,
     runsJsonl: `.evolve/agent-eval/full-persona-runs-${stamp}.jsonl`,

diff --git a/evals/src/sim/llm-call.ts b/evals/src/sim/llm-call.ts
@@ -100,6 +100,19 @@ export interface LlmCallResult {
   ok: boolean
 }
 
+/** Token + cost usage accumulated from the backend's `llm_call` stream
+ *  events. Zeros when the provider reported no usage (which downstream
+ *  backend-integrity guards correctly read as a stub fingerprint). */
+export interface LlmUsage {
+  input: number // sum of `tokensIn` over the `llm_call` events seen
+  output: number // sum of `tokensOut` over the `llm_call` events seen
+  costUsd: number // sum of `costUsd` over the `llm_call` events seen
+}
+
+export interface LlmCallUsageResult extends LlmCallResult {
+  usage: LlmUsage // set by `llmCallWithUsage` on success and on failure returns alike
+}
+
 /** Resolve a logical model id to its provider routing. Throws on unknown
  *  model id so call sites can't silently ship a typo to prod. */
 export type { ModelRouting }
@@ -159,6 +172,54 @@ export async function llmCall(opts: LlmCallOptions): Promise<LlmCallResult> {
   return { output, exitCode: 0, stderr: '', ok: output.length > 0 }
 }
 
+/** Core LLM call that ALSO reports provider-reported token + cost usage,
+ *  accumulated from the backend's `llm_call` stream events — use it when real
+ *  usage must be threaded into a `RunRecord` / `ctx.cost.observeTokens`. The
+ *  stream is aborted after `opts.timeoutMs` (default 180_000 ms). A thrown
+ *  error or a `backend_error` event yields `ok: false` with partial output + usage. */
+export async function llmCallWithUsage(opts: LlmCallOptions): Promise<LlmCallUsageResult> {
+  const cfg = resolveModel(opts.model ?? DEFAULT_MODEL)
+  const backend = createOpenAICompatibleBackend({
+    apiKey: cfg.apiKey(),
+    baseUrl: cfg.baseUrl,
+    model: cfg.modelId,
+  })
+  const task: AgentTaskSpec = {
+    id: `eval-llm-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
+    intent: 'one-shot eval LLM call with usage accounting',
+    domain: 'eval',
+  }
+  const controller = new AbortController()
+  const timer = setTimeout(() => controller.abort(), opts.timeoutMs ?? 180_000)
+  const parts: string[] = []
+  const usage: LlmUsage = { input: 0, output: 0, costUsd: 0 }
+  let failure: string | null = null
+  try {
+    for await (const ev of runAgentTaskStream({
+      task,
+      backend,
+      input: { message: opts.prompt },
+      signal: controller.signal,
+    })) {
+      if (ev.type === 'text_delta') parts.push(ev.text)
+      else if (ev.type === 'llm_call') {
+        usage.input += ev.tokensIn ?? 0
+        usage.output += ev.tokensOut ?? 0
+        usage.costUsd += ev.costUsd ?? 0
+      } else if (ev.type === 'backend_error') failure = ev.message
+    }
+  } catch (e) {
+    // A thrown value need not be an Error; stringify it so `stderr` is always a string.
+    failure = e instanceof Error ? e.message : String(e)
+  } finally {
+    clearTimeout(timer)
+  }
+  const output = parts.join('').trim()
+  // Compare against null, not truthiness: a backend error with an empty message is still a failure.
+  if (failure !== null) return { output, exitCode: 1, stderr: failure, ok: false, usage }
+  return { output, exitCode: 0, stderr: '', ok: output.length > 0, usage }
+}
+
 /** Extract the first JSON object from an LLM response. Tolerates code
  *  fences and prose around the JSON. Returns null on unrecoverable
  *  parse failure — judges fall back gracefully instead of crashing the

diff --git a/evals/src/sim/multishot-user-sim.ts b/evals/src/sim/multishot-user-sim.ts
@@ -128,6 +128,12 @@ export interface MultishotDispatchOptions {
   privateKey?: string
   maxTurnsPerShot: number
   perTurnTimeoutMs: number
+  /** Per-cell override for the in-sandbox agent's LLM credentials. When set,
+   *  this is the env `configureSecrets` writes into the bot's sandbox — i.e.
+   *  it pins WHICH model the REAL operator agent runs (the PROFILE axis of the
+   *  unified matrix). When omitted, falls back to `deterministicAgentEnv()`
+   *  (the single-profile default). */
+  agentEnv?: Record<string, string>
 }
 
 export function makeUserSimDispatch(opts: MultishotDispatchOptions, botKind: BotKind = 'real') {
@@ -185,8 +191,10 @@ async function dispatchInner(
     // Bot create is instant in operator DB; vault resolution is async
     // (on-chain). configureSecrets returns 500 without a resolved vault.
     await client.waitForVaultResolved(botId)
-    // Then configure sandbox-agent LLM credentials before chatting.
-    await client.configureSecrets(botId, deterministicAgentEnv())
+    // Then configure sandbox-agent LLM credentials before chatting. A per-cell
+    // override (the matrix PROFILE axis) pins which model the REAL operator runs;
+    // otherwise the single-profile default applies.
+    await client.configureSecrets(botId, opts.agentEnv ?? deterministicAgentEnv())
     const sessionId = await client.createSession(botId, `user-sim:${scenario.id}`)
     return runUserSimSession({
       intent: scenario.intent,
@@ -282,6 +290,43 @@ function deriveStateScores(
   return { committed, selfImprovement, evidence }
 }
 
+/** Score for ONE user-sim session: the weighted composite, the per-dimension
+ *  values it was built from, and the judge notes with state evidence appended. */
+export interface UserSimArtifactScore {
+  composite: number
+  dimensions: Record<string, number>
+  notes: string
+}
+
+/** Scores one user-sim session without running a `runEval` campaign, so other
+ *  eval surfaces can reuse it. The composite puts 55% on state-derived scores
+ *  (`committed` 0.40 + `selfImprovement` 0.15, from `deriveStateScores`) and 45%
+ *  on the rubric judge's prose scores (0.20 + 0.15 + 0.10). */
+export async function scoreUserSimArtifact(
+  intent: UserIntent,
+  artifact: UserSimSessionResult,
+): Promise<UserSimArtifactScore> {
+  const rubric = await judgePrimaryRubric(intent, artifact)
+  const { committed, selfImprovement, evidence } = deriveStateScores(artifact, rubric.actually_traded_or_committed)
+  const dimensions: Record<string, number> = {
+    intent_fulfilled: rubric.intent_fulfilled,
+    respected_constraints: rubric.respected_constraints,
+    actually_traded_or_committed: committed,
+    self_improvement: selfImprovement,
+    productive_conversation: rubric.productive_conversation,
+    prose_traded_claim: rubric.actually_traded_or_committed,
+  }
+  // Terms stay in this exact order so the floating-point sum is unchanged.
+  const composite =
+    0.20 * rubric.intent_fulfilled +
+    0.15 * rubric.respected_constraints +
+    0.40 * committed +
+    0.15 * selfImprovement +
+    0.10 * rubric.productive_conversation
+  const notes = `${rubric.notes} | STATE: ${evidence}`
+  return { composite, dimensions, notes }
+}
+
 export function userSimJudge(opts: { dualJudge?: boolean } = {}): JudgeConfig<UserSimSessionResult, UserIntentScenario> {
   const useDual = opts.dualJudge ?? false
   return {
@@ -371,6 +416,10 @@ export interface RunMultishotUserSimOptions {
    *  bot scoring high on a newbie persona's intents but low on a
    *  veteran's is a real product signal. */
   personas?: UserPersona[]
+  /** Per-run override for the in-sandbox agent's LLM credentials — pins WHICH
+   *  model the REAL operator agent runs. Used by the unified trading matrix to
+   *  drive each PROFILE's model through the real operator stack. */
+  agentEnv?: Record<string, string>
 }
 
 export async function runMultishotUserSim(
@@ -393,6 +442,7 @@ export async function runMultishotUserSim(
       // multi-step work to land. (The real fix is tick-driving — task #108 —
       // but a 3-cron budget makes the current sync-poll model honest.)
       perTurnTimeoutMs: opts.perTurnTimeoutMs ?? 900_000,
+      ...(opts.agentEnv ? { agentEnv: opts.agentEnv } : {}),
     },
     botKind,
   )