Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 34 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ jobs:
arena: ${{ steps.decide.outputs.arena }}
contracts: ${{ steps.decide.outputs.contracts }}
rust: ${{ steps.decide.outputs.rust }}
evals: ${{ steps.decide.outputs.evals }}
full: ${{ steps.decide.outputs.full }}
steps:
- uses: actions/checkout@v6
Expand Down Expand Up @@ -73,6 +74,7 @@ jobs:
arena=false
contracts=false
rust=false
evals=false
ci=false

while IFS= read -r file; do
Expand All @@ -92,6 +94,11 @@ jobs:
rust=true
;;
esac
case "$file" in
evals/*|package.json|package-lock.json|tsconfig.json|tsconfig.*.json)
evals=true
;;
esac
case "$file" in
.github/workflows/ci.yml)
ci=true
Expand All @@ -106,12 +113,14 @@ jobs:
arena=true
contracts=true
rust=true
evals=true
fi

echo "full=$full" >> "$GITHUB_OUTPUT"
echo "arena=$arena" >> "$GITHUB_OUTPUT"
echo "contracts=$contracts" >> "$GITHUB_OUTPUT"
echo "rust=$rust" >> "$GITHUB_OUTPUT"
echo "evals=$evals" >> "$GITHUB_OUTPUT"

arena:
name: Arena UI
Expand Down Expand Up @@ -324,9 +333,29 @@ jobs:
--ignore RUSTSEC-2026-0118 \
--ignore RUSTSEC-2026-0119

# ── Evals (TypeScript) ────────────────────────────────────────────────────
# Typecheck-only lane for the TypeScript evals. It waits for the `changes`
# job and runs only when that job's `evals` output is the string 'true'.
evals:
name: Evals typecheck
needs: changes
if: needs.changes.outputs.evals == 'true'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6

# Node 22 with setup-node's npm cache enabled.
- uses: actions/setup-node@v6
with:
node-version: 22
cache: npm

# `npm ci` installs exactly what the lockfile pins.
- name: Install dependencies
run: npm ci

# `--noEmit` makes tsc report type errors for the evals project without
# writing any build output.
- name: Typecheck evals
run: npx tsc -p evals/tsconfig.json --noEmit

ci-gate:
name: CI Gate
needs: [changes, arena, forge, rust, clippy, fmt, audit]
needs: [changes, arena, forge, rust, clippy, fmt, audit, evals]
if: always()
runs-on: ubuntu-latest
steps:
Expand All @@ -336,12 +365,14 @@ jobs:
ARENA_NEEDED: ${{ needs.changes.outputs.arena }}
CONTRACTS_NEEDED: ${{ needs.changes.outputs.contracts }}
RUST_NEEDED: ${{ needs.changes.outputs.rust }}
EVALS_NEEDED: ${{ needs.changes.outputs.evals }}
ARENA_RESULT: ${{ needs.arena.result }}
FORGE_RESULT: ${{ needs.forge.result }}
RUST_RESULT: ${{ needs.rust.result }}
CLIPPY_RESULT: ${{ needs.clippy.result }}
FMT_RESULT: ${{ needs.fmt.result }}
AUDIT_RESULT: ${{ needs.audit.result }}
EVALS_RESULT: ${{ needs.evals.result }}
run: |
set -euo pipefail
failed=false
Expand All @@ -367,8 +398,9 @@ jobs:
require_success "Clippy" "$RUST_NEEDED" "$CLIPPY_RESULT"
require_success "Rustfmt" "$RUST_NEEDED" "$FMT_RESULT"
require_success "Security audit" "$RUST_NEEDED" "$AUDIT_RESULT"
require_success "Evals typecheck" "$EVALS_NEEDED" "$EVALS_RESULT"

if [ "$ARENA_NEEDED" != "true" ] && [ "$CONTRACTS_NEEDED" != "true" ] && [ "$RUST_NEEDED" != "true" ]; then
if [ "$ARENA_NEEDED" != "true" ] && [ "$CONTRACTS_NEEDED" != "true" ] && [ "$RUST_NEEDED" != "true" ] && [ "$EVALS_NEEDED" != "true" ]; then
echo "No code lanes changed; CI gate is green."
fi

Expand Down
35 changes: 27 additions & 8 deletions evals/src/bin/agent-eval-trading-personas.ts
Original file line number Diff line number Diff line change
@@ -1,23 +1,42 @@
#!/usr/bin/env node
import {
runTradingPersonaAgentEvalBridge,
type TradingPersonaBridgeOptions,
} from '../trading/persona-agent-eval.js'
import { runTradingPersonaEval, type TradingPersonaEvalOptions } from '../trading/persona-agent-eval.js'
import type { LlmModel } from '../sim/llm-call.js'

/**
 * Look up the value of a command-line flag in `process.argv`.
 *
 * Two spellings are recognised:
 *   - `--flag value`: the token that follows the first exact occurrence of `name`
 *   - `--flag=value`: the text after `=` in the first token starting with `name=`
 *
 * The space-separated form is checked first, so any invocation that already
 * worked keeps returning exactly the same value; the `=` form is only a
 * fallback for invocations that previously yielded `undefined`.
 *
 * @param name Flag to look up, including its leading dashes (e.g. `--out`).
 * @returns The flag's value, or `undefined` when the flag is absent or is the
 *   last token on the command line with nothing after it.
 */
function argValue(name: string): string | undefined {
  const argv = process.argv
  const index = argv.indexOf(name)
  if (index >= 0) return argv[index + 1]

  // Match on `name=` (not just `name`) so `--runs` never picks up `--runs-jsonl=...`.
  const prefix = `${name}=`
  const inline = argv.find((arg) => arg.startsWith(prefix))
  return inline === undefined ? undefined : inline.slice(prefix.length)
}

const options: TradingPersonaBridgeOptions = {}
// One entry point. With --operator-url (or OPERATOR_API_URL/OPERATOR_URL set) it
// runs the real operator profile × persona matrix (real bot artifacts + tick
// side-effects, scored against the objective backtest); without it, the
// deterministic walk-forward backtest. Same surface, degrades by infra.
const options: TradingPersonaEvalOptions = {}

/**
 * Read a numeric CLI flag via `argValue`.
 *
 * @returns `undefined` when the flag is absent or empty, otherwise the parsed number.
 * @throws Error when the value is not a finite number, so a typo such as
 *   `--reps abc` fails fast instead of reaching the eval as `NaN`.
 */
function numberArg(flag: string): number | undefined {
  const raw = argValue(flag)
  if (!raw) return undefined
  const parsed = Number(raw)
  if (!Number.isFinite(parsed)) {
    throw new Error(`Invalid value for ${flag}: expected a finite number, got "${raw}"`)
  }
  return parsed
}

const reportPath = argValue('--out')
const traceDir = argValue('--trace-dir')
// `--runs` is accepted as a shorter alias for `--runs-jsonl`.
const runsJsonl = argValue('--runs-jsonl') ?? argValue('--runs')
const scorecard = argValue('--scorecard')
const operatorUrl = argValue('--operator-url')
const models = argValue('--models')
const reps = numberArg('--reps')
const maxTurns = numberArg('--max-turns')
const costCeiling = numberArg('--cost-ceiling')

// Only set the keys that were actually supplied, so the eval's own defaults
// apply to everything else.
if (reportPath) options.reportPath = reportPath
if (traceDir) options.traceDir = traceDir
if (runsJsonl) options.runsJsonl = runsJsonl
if (scorecard) options.scorecardPath = scorecard
if (operatorUrl) options.operatorUrl = operatorUrl
if (models) {
  // Drop empty entries produced by stray commas (`a,,b`, trailing `,`).
  const modelIds = models
    .split(',')
    .map((m) => m.trim())
    .filter((m) => m.length > 0)
  // The ids are not validated here; the cast only satisfies the option type.
  if (modelIds.length > 0) options.models = modelIds as LlmModel[]
}
// Compare against undefined rather than truthiness: 0 is a legitimate value.
if (reps !== undefined) options.reps = reps
if (maxTurns !== undefined) options.maxTurnsPerShot = maxTurns
if (costCeiling !== undefined) options.costCeiling = costCeiling
if (process.env.TRADING_PERSONA_MATRIX_INTEGRITY === 'warn') options.integrity = 'warn'

const summary = await runTradingPersonaAgentEvalBridge(options)

const summary = await runTradingPersonaEval(options)
console.log(JSON.stringify(summary, null, 2))
if (summary.failed > 0) process.exit(1)

// Decide the process exit status from the run summary:
//   - operator-matrix mode fails when the integrity verdict is 'stub' or no
//     best result was produced (`best === null`);
//   - any other mode fails when at least one failure was counted.
const runFailed =
  summary.mode === 'operator-matrix'
    ? summary.integrity?.verdict === 'stub' || summary.best === null
    : (summary.failed ?? 0) > 0
if (runFailed) process.exit(1)
4 changes: 2 additions & 2 deletions evals/src/full/full-eval-runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import { runSelfImprovementMcpEval } from '../self-improvement/mcp-eval.js'
import { runProductBrowserEval } from '../product/browser-driver.js'
import { runStrategyTemplateEval } from '../trading/strategy-template-runner.js'
import { runTradingLifecycleEval } from '../trading/lifecycle-runner.js'
import { runTradingPersonaAgentEvalBridge } from '../trading/persona-agent-eval.js'
import { runTradingPersonaEval } from '../trading/persona-agent-eval.js'

export interface FullEvalOptions {
outputPath?: string
Expand Down Expand Up @@ -36,7 +36,7 @@ export async function runFullEval(options: FullEvalOptions = {}) {
await gate(gates, 'rust-persona-coverage-test', async () => {
run('cargo', ['test', '-p', 'trading-runtime', 'persona_eval_suite_has_required_coverage_and_passes'])
})
await gate(gates, 'trading-persona-agent-eval', async () => runTradingPersonaAgentEvalBridge({
await gate(gates, 'trading-persona-agent-eval', async () => runTradingPersonaEval({
reportPath: `.evolve/evals/full-personas-${stamp}.json`,
traceDir: `.evolve/agent-eval/traces/full-personas-${stamp}`,
runsJsonl: `.evolve/agent-eval/full-persona-runs-${stamp}.jsonl`,
Expand Down
61 changes: 61 additions & 0 deletions evals/src/sim/llm-call.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,19 @@ export interface LlmCallResult {
ok: boolean
}

/** Token + cost usage accumulated from the backend's `llm_call` stream
 * events. Zeros when the provider reported no usage (which downstream
 * backend-integrity guards correctly read as a stub fingerprint). */
export interface LlmUsage {
/** Sum of the `tokensIn` values reported across `llm_call` events. */
input: number
/** Sum of the `tokensOut` values reported across `llm_call` events. */
output: number
/** Sum of the `costUsd` values reported across `llm_call` events
 * (presumably US dollars, per the field name — confirm against the backend). */
costUsd: number
}

/** An `LlmCallResult` extended with the usage totals gathered during the
 * call; this is the return type of `llmCallWithUsage`. */
export interface LlmCallUsageResult extends LlmCallResult {
/** Usage totals for the call; all zeros when no usage was reported. */
usage: LlmUsage
}

/** Resolve a logical model id to its provider routing. Throws on unknown
* model id so call sites can't silently ship a typo to prod. */
export type { ModelRouting }
Expand Down Expand Up @@ -159,6 +172,54 @@ export async function llmCall(opts: LlmCallOptions): Promise<LlmCallResult> {
return { output, exitCode: 0, stderr: '', ok: output.length > 0 }
}

/** Core LLM call that ALSO reports provider-reported token + cost usage,
 * accumulated from the backend's `llm_call` stream events. Use this when a
 * caller must thread real usage into a `RunRecord` / `ctx.cost.observeTokens`
 * (the backend-integrity guard keys on nonzero token usage). Routing and
 * timeout handling are intended to mirror `llmCall`.
 *
 * The call never rejects for a streaming failure: a thrown error (including
 * the abort fired after `opts.timeoutMs`, default 180_000 ms) or a
 * `backend_error` event is reported as `ok: false`, `exitCode: 1`, with the
 * message in `stderr`. Text and usage received before the failure are kept.
 *
 * @param opts Prompt, optional logical model id, and optional timeout.
 * @returns The trimmed concatenated text plus accumulated usage. `ok` is true
 *   only when the stream finished without error AND produced non-empty text.
 * @throws Whatever `resolveModel` / backend construction throw, since those
 *   run before the guarded streaming section.
 */
export async function llmCallWithUsage(opts: LlmCallOptions): Promise<LlmCallUsageResult> {
  const cfg = resolveModel(opts.model ?? DEFAULT_MODEL)
  const backend = createOpenAICompatibleBackend({
    apiKey: cfg.apiKey(),
    baseUrl: cfg.baseUrl,
    model: cfg.modelId,
  })
  const task: AgentTaskSpec = {
    id: `eval-llm-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
    intent: 'one-shot eval LLM call with usage accounting',
    domain: 'eval',
  }
  const controller = new AbortController()
  const timer = setTimeout(() => controller.abort(), opts.timeoutMs ?? 180_000)
  const parts: string[] = []
  const usage: LlmUsage = { input: 0, output: 0, costUsd: 0 }
  let backendError: string | null = null

  // Both failure paths return the same shape: partial output and usage so far.
  const failure = (stderr: string): LlmCallUsageResult => ({
    output: parts.join('').trim(),
    exitCode: 1,
    stderr,
    ok: false,
    usage,
  })

  try {
    for await (const ev of runAgentTaskStream({
      task,
      backend,
      input: { message: opts.prompt },
      signal: controller.signal,
    })) {
      if (ev.type === 'text_delta') parts.push(ev.text)
      else if (ev.type === 'llm_call') {
        // Missing fields count as zero so a partial report cannot produce NaN.
        usage.input += ev.tokensIn ?? 0
        usage.output += ev.tokensOut ?? 0
        usage.costUsd += ev.costUsd ?? 0
      } else if (ev.type === 'backend_error') backendError = ev.message
    }
  } catch (e) {
    // A thrown value is not guaranteed to be an Error (e.g. a string or an
    // abort reason); stringify it so `stderr` is always a real string.
    return failure(e instanceof Error ? e.message : String(e))
  } finally {
    // Always disarm the timeout so it cannot fire after the call has settled.
    clearTimeout(timer)
  }
  if (backendError) return failure(backendError)
  const output = parts.join('').trim()
  return { output, exitCode: 0, stderr: '', ok: output.length > 0, usage }
}

/** Extract the first JSON object from an LLM response. Tolerates code
* fences and prose around the JSON. Returns null on unrecoverable
* parse failure — judges fall back gracefully instead of crashing the
Expand Down
54 changes: 52 additions & 2 deletions evals/src/sim/multishot-user-sim.ts
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,12 @@ export interface MultishotDispatchOptions {
privateKey?: string
maxTurnsPerShot: number
perTurnTimeoutMs: number
/** Per-cell override for the in-sandbox agent's LLM credentials. When set,
* this is the env `configureSecrets` writes into the bot's sandbox — i.e.
* it pins WHICH model the REAL operator agent runs (the PROFILE axis of the
* unified matrix). When omitted, falls back to `deterministicAgentEnv()`
* (the single-profile default). */
agentEnv?: Record<string, string>
}

export function makeUserSimDispatch(opts: MultishotDispatchOptions, botKind: BotKind = 'real') {
Expand Down Expand Up @@ -185,8 +191,10 @@ async function dispatchInner(
// Bot create is instant in operator DB; vault resolution is async
// (on-chain). configureSecrets returns 500 without a resolved vault.
await client.waitForVaultResolved(botId)
// Then configure sandbox-agent LLM credentials before chatting.
await client.configureSecrets(botId, deterministicAgentEnv())
// Then configure sandbox-agent LLM credentials before chatting. A per-cell
// override (the matrix PROFILE axis) pins which model the REAL operator runs;
// otherwise the single-profile default applies.
await client.configureSecrets(botId, opts.agentEnv ?? deterministicAgentEnv())
const sessionId = await client.createSession(botId, `user-sim:${scenario.id}`)
return runUserSimSession({
intent: scenario.intent,
Expand Down Expand Up @@ -282,6 +290,43 @@ function deriveStateScores(
return { committed, selfImprovement, evidence }
}

/** The artifact-based score for ONE user-sim session, factored out of
 * `userSimJudge` so other eval surfaces (the unified trading matrix) can score
 * a real session WITHOUT re-running a `runEval` campaign. Composite weights
 * OBSERVABLE state (trades/strategy/self-improve from `bot_artifacts`) at 55%
 * and prose (rubric judge) at 45% — the same weighting `userSimJudge` uses. */
export interface UserSimArtifactScore {
/** Weighted sum of the rubric and observed-state scores. */
composite: number
/** Individual scores keyed by dimension name (e.g. `intent_fulfilled`,
 * `self_improvement`), as populated by `scoreUserSimArtifact`. */
dimensions: Record<string, number>
/** Free-text notes; `scoreUserSimArtifact` writes the rubric notes followed
 * by ` | STATE: ` and the observed-state evidence. */
notes: string
}

/**
 * Score a single user-sim session from its artifact.
 *
 * Runs the primary rubric judge, derives the observed-state scores from the
 * artifact (seeded with the rubric's prose "traded" claim), and blends them:
 * 0.20 intent + 0.15 constraints + 0.40 committed state + 0.15 self-improvement
 * + 0.10 conversation quality.
 *
 * @param intent The user intent the session was meant to satisfy.
 * @param artifact The recorded session result to score.
 * @returns The composite, the per-dimension breakdown, and combined notes.
 */
export async function scoreUserSimArtifact(
  intent: UserIntent,
  artifact: UserSimSessionResult,
): Promise<UserSimArtifactScore> {
  const rubric = await judgePrimaryRubric(intent, artifact)
  const {
    intent_fulfilled: intentFulfilled,
    respected_constraints: respectedConstraints,
    productive_conversation: productiveConversation,
    actually_traded_or_committed: proseTradedClaim,
  } = rubric
  const { committed, selfImprovement, evidence } = deriveStateScores(artifact, proseTradedClaim)

  // Accumulate the weighted terms strictly left-to-right so the floating-point
  // result does not depend on regrouping.
  let composite = 0.20 * intentFulfilled
  composite += 0.15 * respectedConstraints
  composite += 0.40 * committed
  composite += 0.15 * selfImprovement
  composite += 0.10 * productiveConversation

  const dimensions: Record<string, number> = {
    intent_fulfilled: intentFulfilled,
    respected_constraints: respectedConstraints,
    // The "traded" dimension reports OBSERVED state; the judge's prose claim
    // is kept separately as `prose_traded_claim`.
    actually_traded_or_committed: committed,
    self_improvement: selfImprovement,
    productive_conversation: productiveConversation,
    prose_traded_claim: proseTradedClaim,
  }

  return { composite, dimensions, notes: `${rubric.notes} | STATE: ${evidence}` }
}

export function userSimJudge(opts: { dualJudge?: boolean } = {}): JudgeConfig<UserSimSessionResult, UserIntentScenario> {
const useDual = opts.dualJudge ?? false
return {
Expand Down Expand Up @@ -371,6 +416,10 @@ export interface RunMultishotUserSimOptions {
* bot scoring high on a newbie persona's intents but low on a
* veteran's is a real product signal. */
personas?: UserPersona[]
/** Per-run override for the in-sandbox agent's LLM credentials — pins WHICH
* model the REAL operator agent runs. Used by the unified trading matrix to
* drive each PROFILE's model through the real operator stack. */
agentEnv?: Record<string, string>
}

export async function runMultishotUserSim(
Expand All @@ -393,6 +442,7 @@ export async function runMultishotUserSim(
// multi-step work to land. (The real fix is tick-driving — task #108 —
// but a 3-cron budget makes the current sync-poll model honest.)
perTurnTimeoutMs: opts.perTurnTimeoutMs ?? 900_000,
...(opts.agentEnv ? { agentEnv: opts.agentEnv } : {}),
},
botKind,
)
Expand Down
Loading