From ae578723444129fac1bf2cc0919ffe29b6d10df1 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sun, 14 Jun 2026 04:58:39 -0600 Subject: [PATCH] feat(evals): one trading persona eval (deterministic + operator matrix) + CI lane MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to #169 (model-driven trading) / #170 (deps). ONE entry point — runTradingPersonaEval (evals/src/trading/persona-agent-eval.ts) — that degrades by infra, instead of separate modules: - No operator URL -> DETERMINISTIC mode: the Rust walk-forward backtest -> RunRecords + trace + scorecard (offline; what full-eval/CI run). Unchanged. - operatorUrl present -> OPERATOR-MATRIX mode: runProfileMatrix sweeps the PROFILE axis (operator model variants: kimi-k2/glm-4.7/glm-5.1, pinned into the REAL operator via agentEnv) x (persona x market). Each cell runs the FULL operator simulation (runMultishotUserSim -> real bot_artifacts + tick_side_effects), judged on real artifacts (60%) + objective backtest ground truth (40%) — not prose. Scorecard + assertRealBackend + byProfile/byPersona read straight from the matrix. Multi-round depth honestly degenerates to 1 round (the provision->chat->capture cycle is single-pass; turns live inside each cell). Consolidation: folds the operator-matrix capability INTO the existing bridge file and DELETES the standalone module + the dual --matrix bin flag + the redundant npm script. One surface, one entry point, shared scorecard/profile/ground-truth helpers. The bin auto-degrades by --operator-url; full-eval routes through the same function. CI: new 'Evals typecheck' lane (node 22 + npm ci + tsc -p evals/tsconfig.json --noEmit), classified on evals/ + package*.json + tsconfig*.json, required in the gate. Deps: agent-runtime ^0.52, agent-knowledge ^1.7 (over #170's ^0.50/^1.5); agent-eval ^0.91. Validated: npm ci clean, tsc 0 errors. 
--- .github/workflows/ci.yml | 36 +- evals/src/bin/agent-eval-trading-personas.ts | 35 +- evals/src/full/full-eval-runner.ts | 4 +- evals/src/sim/llm-call.ts | 61 +++ evals/src/sim/multishot-user-sim.ts | 54 +- evals/src/trading/persona-agent-eval.ts | 526 +++++++++++++++++-- package-lock.json | 161 +----- package.json | 4 +- trading-blueprint-lib/src/jobs/activate.rs | 6 +- 9 files changed, 680 insertions(+), 207 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8145802d..e656065b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,6 +29,7 @@ jobs: arena: ${{ steps.decide.outputs.arena }} contracts: ${{ steps.decide.outputs.contracts }} rust: ${{ steps.decide.outputs.rust }} + evals: ${{ steps.decide.outputs.evals }} full: ${{ steps.decide.outputs.full }} steps: - uses: actions/checkout@v6 @@ -73,6 +74,7 @@ jobs: arena=false contracts=false rust=false + evals=false ci=false while IFS= read -r file; do @@ -92,6 +94,11 @@ jobs: rust=true ;; esac + case "$file" in + evals/*|package.json|package-lock.json|tsconfig.json|tsconfig.*.json) + evals=true + ;; + esac case "$file" in .github/workflows/ci.yml) ci=true @@ -106,12 +113,14 @@ jobs: arena=true contracts=true rust=true + evals=true fi echo "full=$full" >> "$GITHUB_OUTPUT" echo "arena=$arena" >> "$GITHUB_OUTPUT" echo "contracts=$contracts" >> "$GITHUB_OUTPUT" echo "rust=$rust" >> "$GITHUB_OUTPUT" + echo "evals=$evals" >> "$GITHUB_OUTPUT" arena: name: Arena UI @@ -324,9 +333,29 @@ jobs: --ignore RUSTSEC-2026-0118 \ --ignore RUSTSEC-2026-0119 + # ── Evals (TypeScript) ──────────────────────────────────────────────────── + evals: + name: Evals typecheck + needs: changes + if: needs.changes.outputs.evals == 'true' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - uses: actions/setup-node@v6 + with: + node-version: 22 + cache: npm + + - name: Install dependencies + run: npm ci + + - name: Typecheck evals + run: npx tsc -p evals/tsconfig.json 
--noEmit + ci-gate: name: CI Gate - needs: [changes, arena, forge, rust, clippy, fmt, audit] + needs: [changes, arena, forge, rust, clippy, fmt, audit, evals] if: always() runs-on: ubuntu-latest steps: @@ -336,12 +365,14 @@ jobs: ARENA_NEEDED: ${{ needs.changes.outputs.arena }} CONTRACTS_NEEDED: ${{ needs.changes.outputs.contracts }} RUST_NEEDED: ${{ needs.changes.outputs.rust }} + EVALS_NEEDED: ${{ needs.changes.outputs.evals }} ARENA_RESULT: ${{ needs.arena.result }} FORGE_RESULT: ${{ needs.forge.result }} RUST_RESULT: ${{ needs.rust.result }} CLIPPY_RESULT: ${{ needs.clippy.result }} FMT_RESULT: ${{ needs.fmt.result }} AUDIT_RESULT: ${{ needs.audit.result }} + EVALS_RESULT: ${{ needs.evals.result }} run: | set -euo pipefail failed=false @@ -367,8 +398,9 @@ jobs: require_success "Clippy" "$RUST_NEEDED" "$CLIPPY_RESULT" require_success "Rustfmt" "$RUST_NEEDED" "$FMT_RESULT" require_success "Security audit" "$RUST_NEEDED" "$AUDIT_RESULT" + require_success "Evals typecheck" "$EVALS_NEEDED" "$EVALS_RESULT" - if [ "$ARENA_NEEDED" != "true" ] && [ "$CONTRACTS_NEEDED" != "true" ] && [ "$RUST_NEEDED" != "true" ]; then + if [ "$ARENA_NEEDED" != "true" ] && [ "$CONTRACTS_NEEDED" != "true" ] && [ "$RUST_NEEDED" != "true" ] && [ "$EVALS_NEEDED" != "true" ]; then echo "No code lanes changed; CI gate is green." fi diff --git a/evals/src/bin/agent-eval-trading-personas.ts b/evals/src/bin/agent-eval-trading-personas.ts index 1c2649b1..bb8ab972 100644 --- a/evals/src/bin/agent-eval-trading-personas.ts +++ b/evals/src/bin/agent-eval-trading-personas.ts @@ -1,23 +1,42 @@ #!/usr/bin/env node -import { - runTradingPersonaAgentEvalBridge, - type TradingPersonaBridgeOptions, -} from '../trading/persona-agent-eval.js' +import { runTradingPersonaEval, type TradingPersonaEvalOptions } from '../trading/persona-agent-eval.js' +import type { LlmModel } from '../sim/llm-call.js' function argValue(name: string): string | undefined { const index = process.argv.indexOf(name) return index >= 0 ? 
process.argv[index + 1] : undefined } -const options: TradingPersonaBridgeOptions = {} +// One entry point. With --operator-url (or OPERATOR_API_URL/OPERATOR_URL set) it +// runs the real operator profile × persona matrix (real bot artifacts + tick +// side-effects, scored against the objective backtest); without it, the +// deterministic walk-forward backtest. Same surface, degrades by infra. +const options: TradingPersonaEvalOptions = {} const reportPath = argValue('--out') const traceDir = argValue('--trace-dir') const runsJsonl = argValue('--runs-jsonl') ?? argValue('--runs') +const scorecard = argValue('--scorecard') +const operatorUrl = argValue('--operator-url') +const models = argValue('--models') +const reps = argValue('--reps') +const maxTurns = argValue('--max-turns') +const costCeiling = argValue('--cost-ceiling') if (reportPath) options.reportPath = reportPath if (traceDir) options.traceDir = traceDir if (runsJsonl) options.runsJsonl = runsJsonl +if (scorecard) options.scorecardPath = scorecard +if (operatorUrl) options.operatorUrl = operatorUrl +if (models) options.models = models.split(',').map((m) => m.trim()) as LlmModel[] +if (reps) options.reps = Number(reps) +if (maxTurns) options.maxTurnsPerShot = Number(maxTurns) +if (costCeiling) options.costCeiling = Number(costCeiling) +if (process.env.TRADING_PERSONA_MATRIX_INTEGRITY === 'warn') options.integrity = 'warn' -const summary = await runTradingPersonaAgentEvalBridge(options) - +const summary = await runTradingPersonaEval(options) console.log(JSON.stringify(summary, null, 2)) -if (summary.failed > 0) process.exit(1) + +if (summary.mode === 'operator-matrix') { + if (summary.integrity?.verdict === 'stub' || summary.best === null) process.exit(1) +} else if ((summary.failed ?? 
0) > 0) { + process.exit(1) +} diff --git a/evals/src/full/full-eval-runner.ts b/evals/src/full/full-eval-runner.ts index c49d0cfb..ed886c27 100644 --- a/evals/src/full/full-eval-runner.ts +++ b/evals/src/full/full-eval-runner.ts @@ -6,7 +6,7 @@ import { runSelfImprovementMcpEval } from '../self-improvement/mcp-eval.js' import { runProductBrowserEval } from '../product/browser-driver.js' import { runStrategyTemplateEval } from '../trading/strategy-template-runner.js' import { runTradingLifecycleEval } from '../trading/lifecycle-runner.js' -import { runTradingPersonaAgentEvalBridge } from '../trading/persona-agent-eval.js' +import { runTradingPersonaEval } from '../trading/persona-agent-eval.js' export interface FullEvalOptions { outputPath?: string @@ -36,7 +36,7 @@ export async function runFullEval(options: FullEvalOptions = {}) { await gate(gates, 'rust-persona-coverage-test', async () => { run('cargo', ['test', '-p', 'trading-runtime', 'persona_eval_suite_has_required_coverage_and_passes']) }) - await gate(gates, 'trading-persona-agent-eval', async () => runTradingPersonaAgentEvalBridge({ + await gate(gates, 'trading-persona-agent-eval', async () => runTradingPersonaEval({ reportPath: `.evolve/evals/full-personas-${stamp}.json`, traceDir: `.evolve/agent-eval/traces/full-personas-${stamp}`, runsJsonl: `.evolve/agent-eval/full-persona-runs-${stamp}.jsonl`, diff --git a/evals/src/sim/llm-call.ts b/evals/src/sim/llm-call.ts index cc83af81..2d3e29fd 100644 --- a/evals/src/sim/llm-call.ts +++ b/evals/src/sim/llm-call.ts @@ -100,6 +100,19 @@ export interface LlmCallResult { ok: boolean } +/** Token + cost usage accumulated from the backend's `llm_call` stream + * events. Zeros when the provider reported no usage (which downstream + * backend-integrity guards correctly read as a stub fingerprint). 
*/ +export interface LlmUsage { + input: number + output: number + costUsd: number +} + +export interface LlmCallUsageResult extends LlmCallResult { + usage: LlmUsage +} + /** Resolve a logical model id to its provider routing. Throws on unknown * model id so call sites can't silently ship a typo to prod. */ export type { ModelRouting } @@ -159,6 +172,54 @@ export async function llmCall(opts: LlmCallOptions): Promise { return { output, exitCode: 0, stderr: '', ok: output.length > 0 } } +/** Core LLM call that ALSO reports provider-reported token + cost usage, + * accumulated from the backend's `llm_call` stream events. Use this when a + * caller must thread real usage into a `RunRecord` / `ctx.cost.observeTokens` + * (the backend-integrity guard keys on nonzero token usage). Same routing + + * timeout handling as `llmCall`. */ +export async function llmCallWithUsage(opts: LlmCallOptions): Promise { + const cfg = resolveModel(opts.model ?? DEFAULT_MODEL) + const backend = createOpenAICompatibleBackend({ + apiKey: cfg.apiKey(), + baseUrl: cfg.baseUrl, + model: cfg.modelId, + }) + const task: AgentTaskSpec = { + id: `eval-llm-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`, + intent: 'one-shot eval LLM call with usage accounting', + domain: 'eval', + } + const controller = new AbortController() + const timer = setTimeout(() => controller.abort(), opts.timeoutMs ?? 180_000) + const parts: string[] = [] + const usage: LlmUsage = { input: 0, output: 0, costUsd: 0 } + let backendError: string | null = null + try { + for await (const ev of runAgentTaskStream({ + task, + backend, + input: { message: opts.prompt }, + signal: controller.signal, + })) { + if (ev.type === 'text_delta') parts.push(ev.text) + else if (ev.type === 'llm_call') { + usage.input += ev.tokensIn ?? 0 + usage.output += ev.tokensOut ?? 0 + usage.costUsd += ev.costUsd ?? 
0 + } else if (ev.type === 'backend_error') backendError = ev.message + } + } catch (e) { + return { output: parts.join('').trim(), exitCode: 1, stderr: (e as Error).message, ok: false, usage } + } finally { + clearTimeout(timer) + } + if (backendError) { + return { output: parts.join('').trim(), exitCode: 1, stderr: backendError, ok: false, usage } + } + const output = parts.join('').trim() + return { output, exitCode: 0, stderr: '', ok: output.length > 0, usage } +} + /** Extract the first JSON object from an LLM response. Tolerates code * fences and prose around the JSON. Returns null on unrecoverable * parse failure — judges fall back gracefully instead of crashing the diff --git a/evals/src/sim/multishot-user-sim.ts b/evals/src/sim/multishot-user-sim.ts index 268bb94a..ddf06d25 100644 --- a/evals/src/sim/multishot-user-sim.ts +++ b/evals/src/sim/multishot-user-sim.ts @@ -128,6 +128,12 @@ export interface MultishotDispatchOptions { privateKey?: string maxTurnsPerShot: number perTurnTimeoutMs: number + /** Per-cell override for the in-sandbox agent's LLM credentials. When set, + * this is the env `configureSecrets` writes into the bot's sandbox — i.e. + * it pins WHICH model the REAL operator agent runs (the PROFILE axis of the + * unified matrix). When omitted, falls back to `deterministicAgentEnv()` + * (the single-profile default). */ + agentEnv?: Record } export function makeUserSimDispatch(opts: MultishotDispatchOptions, botKind: BotKind = 'real') { @@ -185,8 +191,10 @@ async function dispatchInner( // Bot create is instant in operator DB; vault resolution is async // (on-chain). configureSecrets returns 500 without a resolved vault. await client.waitForVaultResolved(botId) - // Then configure sandbox-agent LLM credentials before chatting. - await client.configureSecrets(botId, deterministicAgentEnv()) + // Then configure sandbox-agent LLM credentials before chatting. 
A per-cell + // override (the matrix PROFILE axis) pins which model the REAL operator runs; + // otherwise the single-profile default applies. + await client.configureSecrets(botId, opts.agentEnv ?? deterministicAgentEnv()) const sessionId = await client.createSession(botId, `user-sim:${scenario.id}`) return runUserSimSession({ intent: scenario.intent, @@ -282,6 +290,43 @@ function deriveStateScores( return { committed, selfImprovement, evidence } } +/** The artifact-based score for ONE user-sim session, factored out of + * `userSimJudge` so other eval surfaces (the unified trading matrix) can score + * a real session WITHOUT re-running a `runEval` campaign. Composite weights + * OBSERVABLE state (trades/strategy/self-improve from `bot_artifacts`) at 55% + * and prose (rubric judge) at 45% — the same weighting `userSimJudge` uses. */ +export interface UserSimArtifactScore { + composite: number + dimensions: Record + notes: string +} + +export async function scoreUserSimArtifact( + intent: UserIntent, + artifact: UserSimSessionResult, +): Promise { + const r = await judgePrimaryRubric(intent, artifact) + const state = deriveStateScores(artifact, r.actually_traded_or_committed) + const composite = + 0.20 * r.intent_fulfilled + + 0.15 * r.respected_constraints + + 0.40 * state.committed + + 0.15 * state.selfImprovement + + 0.10 * r.productive_conversation + return { + composite, + dimensions: { + intent_fulfilled: r.intent_fulfilled, + respected_constraints: r.respected_constraints, + actually_traded_or_committed: state.committed, + self_improvement: state.selfImprovement, + productive_conversation: r.productive_conversation, + prose_traded_claim: r.actually_traded_or_committed, + }, + notes: `${r.notes} | STATE: ${state.evidence}`, + } +} + export function userSimJudge(opts: { dualJudge?: boolean } = {}): JudgeConfig { const useDual = opts.dualJudge ?? 
false return { @@ -371,6 +416,10 @@ export interface RunMultishotUserSimOptions { * bot scoring high on a newbie persona's intents but low on a * veteran's is a real product signal. */ personas?: UserPersona[] + /** Per-run override for the in-sandbox agent's LLM credentials — pins WHICH + * model the REAL operator agent runs. Used by the unified trading matrix to + * drive each PROFILE's model through the real operator stack. */ + agentEnv?: Record } export async function runMultishotUserSim( @@ -393,6 +442,7 @@ export async function runMultishotUserSim( // multi-step work to land. (The real fix is tick-driving — task #108 — // but a 3-cron budget makes the current sync-poll model honest.) perTurnTimeoutMs: opts.perTurnTimeoutMs ?? 900_000, + ...(opts.agentEnv ? { agentEnv: opts.agentEnv } : {}), }, botKind, ) diff --git a/evals/src/trading/persona-agent-eval.ts b/evals/src/trading/persona-agent-eval.ts index 091439f7..3e953f8d 100644 --- a/evals/src/trading/persona-agent-eval.ts +++ b/evals/src/trading/persona-agent-eval.ts @@ -1,13 +1,31 @@ /** - * Trading-persona bridge — drives `runPersonaSuite` (Rust persona eval), - * folds each result into a paper-grade `RunRecord`, captures spans via - * `TraceEmitter` + `FileSystemTraceStore`, AND now feeds the scorecard - * timeline so `diffScorecard` flags regressions across commits (the - * pattern creative-agent uses). + * The ONE trading persona/profile eval — `runTradingPersonaEval`. * - * Migrated off the dynamic-import shim — direct imports against the - * `@tangle-network/agent-eval` 0.45+ surface. The dispatch + scoring are - * unchanged; the wiring is the modern shape. + * A single surface that DEGRADES by what infra is available: + * + * - No operator URL → DETERMINISTIC mode. Runs `runPersonaSuite` (the Rust + * walk-forward backtest), folds each persona×scenario result into a paper + * `RunRecord` + trace span, and feeds the scorecard timeline. 
Offline, no + * LLM, no live stack — this is what `full-eval` / CI run. + * - operatorUrl present → OPERATOR-MATRIX mode. Sweeps a PROFILE axis (operator + * model variants) × (PERSONA × MARKET) scenarios via `runProfileMatrix`. Each + * cell runs the FULL operator simulation (`runMultishotUserSim` against the + * real `OperatorClient` with that profile's model) and captures REAL evidence + * (`bot_artifacts` + `tick_side_effects` + transcript). The judge blends the + * real-artifact score with the model-invariant backtest ground truth — not + * prose. Aggregation (`byProfile`/`byPersona`/`integrity`) is read straight + * from `runProfileMatrix`. + * + * Both modes share the scorecard profile, the trace/RunRecord plumbing, and the + * objective backtest ground truth. There is exactly one entry point and one + * scorecard surface; callers (the bin, `full-eval`) need not know which mode ran. + * + * Multi-round depth in operator-matrix mode is honestly 1 round: the real + * provision→chat→capture cycle is single-pass and cannot be fed back through + * `loopDispatch`/`loopUntil` (those host a sandbox `runLoop` child, not an HTTP + * provision+chat orchestration). The multi-TURN refinement lives INSIDE each cell + * as the user-sim's `maxTurnsPerShot` turns; `depthRounds` is surfaced so the + * degeneracy is explicit, never faked. 
*/ import { randomUUID } from 'node:crypto' @@ -17,14 +35,33 @@ import { dirname } from 'node:path' import { FileSystemTraceStore, TraceEmitter, - type RunRecord, + agentProfileHash, + assertRealBackend, + recordRunsToScorecard, + summarizeBackendIntegrity, validateRunRecord, + type AgentProfile, + type BackendIntegrityReport, + type RunRecord, } from '@tangle-network/agent-eval' +import { + runProfileMatrix, + type JudgeConfig, + type JudgeScore, + type Scenario, +} from '@tangle-network/agent-eval/campaign' import { sha256 } from '../lib/crypto.js' import { isoStamp, resolveRepo } from '../lib/repo.js' +import { llmCallWithUsage, resolveModel, type LlmModel } from '../sim/llm-call.js' +import { runMultishotUserSim, scoreUserSimArtifact } from '../sim/multishot-user-sim.js' +import { STANDARD_USER_INTENTS } from '../sim/user-intents.js' +import { STANDARD_USER_PERSONAS, type UserPersona } from '../sim/user-personas.js' +import type { UserIntent, UserSimSessionResult } from '../sim/user-sim-driver.js' import { currentCommitSha, runPersonaSuite } from './persona-runner.js' import { normalizeSplit, numericRaw, type PersonaEvalResult } from './persona-types.js' +import { evaluateScenario } from './personas/walk-forward.js' +import { defaultScenarios, type TradingEvalScenario } from './personas/scenarios.js' import { buildTradingScorecardAgentProfile, recordScorecardAndDiff, @@ -37,34 +74,96 @@ const FEE_SCHEDULE_VERSION = 'protocol-fees@2026-05' const SURFACE_VERSION = 1 const RUNTIME_VERSION = '0.1.0' -export interface TradingPersonaBridgeOptions { +/** The PROFILE axis for operator-matrix mode: the operator model variants the + * matrix sweeps. Single source of truth = `MODEL_CONFIG` in sim/llm-call.ts. 
*/ +export const OPERATOR_PROFILE_MODELS: readonly LlmModel[] = ['kimi-k2', 'glm-4.7', 'glm-5.1'] + +export interface TradingPersonaEvalOptions { + // ── shared ── reportPath?: string traceDir?: string runsJsonl?: string scorecardPath?: string failOnRegression?: boolean + // ── operator-matrix mode (presence of operatorUrl/env switches the mode) ── + /** Operator-api base URL the dispatch provisions/chats against. When present + * (here or via OPERATOR_API_URL/OPERATOR_URL), the eval runs the real + * operator matrix instead of the deterministic backtest. */ + operatorUrl?: string + token?: string + privateKey?: string + /** Model variants to sweep (operator-matrix mode). Default: all three. */ + models?: readonly LlmModel[] + personas?: UserPersona[] + markets?: TradingEvalScenario[] + intents?: UserIntent[] + reps?: number + maxTurnsPerShot?: number + perTurnTimeoutMs?: number + maxConcurrency?: number + costCeiling?: number + /** Backend-integrity posture for the matrix. Default 'assert'. */ + integrity?: 'assert' | 'warn' | 'off' } -export interface TradingPersonaBridgeSummary { +/** Per-profile rollup (operator-matrix mode), read from `runProfileMatrix.byProfile`. */ +export interface ProfileSummary { + profileId: string + profileHash: string + model: string + records: number + /** Fraction of this profile's records scoring >= 0.7 (the promotion bar). 
*/ + passRate: number + meanScore: number + totalCostUsd: number + integrityVerdict: BackendIntegrityReport['verdict'] +} + +export interface TradingPersonaEvalSummary { + mode: 'deterministic' | 'operator-matrix' suite: string - report: string - runs_jsonl: string - trace_dir: string + commitSha: string + records: number scorecard: { path: string appendedCells: number - profileHash: string + profileHash?: string regressed: boolean - formatted: string + formatted?: string } - records: number - passed: number - failed: number + // deterministic mode + report?: string + runs_jsonl?: string + trace_dir?: string + passed?: number + failed?: number + // operator-matrix mode + runDir?: string + depthRounds?: 1 + byProfile?: ProfileSummary[] + byPersona?: Record + integrity?: BackendIntegrityReport + best?: ProfileSummary | null } -export async function runTradingPersonaAgentEvalBridge( - options: TradingPersonaBridgeOptions = {}, -): Promise { +/** + * The single entry point. Resolves the operator URL (option or env) and routes: + * present → operator matrix (real stack); absent → deterministic backtest. + */ +export async function runTradingPersonaEval( + options: TradingPersonaEvalOptions = {}, +): Promise { + const operatorUrl = options.operatorUrl ?? process.env.OPERATOR_API_URL ?? process.env.OPERATOR_URL + return operatorUrl + ? runOperatorMatrix(operatorUrl, options) + : runDeterministicBacktest(options) +} + +// ── DETERMINISTIC MODE ───────────────────────────────────────────────────── + +async function runDeterministicBacktest( + options: TradingPersonaEvalOptions, +): Promise { const reportPath = resolveRepo( options.reportPath ?? 
`.evolve/evals/trading-agent-personas-${isoStamp()}.json`, ) @@ -112,10 +211,7 @@ export async function runTradingPersonaAgentEvalBridge( wallMs, costUsd: 0, tokenUsage: { input: 0, output: 0 }, - outcome: { - searchScore: result.score / 100, - raw: numericRaw(result), - }, + outcome: { searchScore: result.score / 100, raw: numericRaw(result) }, failureMode: result.passed ? undefined : result.findings[0]?.subject, splitTag: normalizeSplit(result.split), scenarioId: result.scenario_id, @@ -133,11 +229,22 @@ export async function runTradingPersonaAgentEvalBridge( commitSha, }) - const summary: TradingPersonaBridgeSummary = { + if (options.failOnRegression && scorecard.regressed) { + throw new Error( + `trading-persona scorecard reports regression on at least one cell. Diff:\n${scorecard.formatted}`, + ) + } + + return { + mode: 'deterministic', suite: report.suite, + commitSha, + records, report: reportPath, runs_jsonl: runsJsonl, trace_dir: traceDir, + passed: report.passed, + failed: report.failed, scorecard: { path: scorecardPath, appendedCells: scorecard.appendedCells, @@ -145,17 +252,7 @@ export async function runTradingPersonaAgentEvalBridge( regressed: scorecard.regressed, formatted: scorecard.formatted, }, - records, - passed: report.passed, - failed: report.failed, } - - if (options.failOnRegression && scorecard.regressed) { - throw new Error( - `trading-persona scorecard reports regression on at least one cell. 
Diff:\n${scorecard.formatted}`, - ) - } - return summary } async function writeTrace(input: { @@ -177,20 +274,12 @@ async function writeTrace(input: { promptSha: promptHash, modelFingerprint: MODEL_FINGERPRINT, layer: 'app-runtime', - tags: { - suite, - persona_id: result.persona_id, - split: result.split, - config_hash: configHash, - }, + tags: { suite, persona_id: result.persona_id, split: result.split, config_hash: configHash }, }) const span = await emitter.tool({ name: 'trading-runtime backtest walk-forward compare', toolName: 'trading_runtime.backtest.walk_forward_compare', - args: { - persona_id: result.persona_id, - scenario_id: result.scenario_id, - }, + args: { persona_id: result.persona_id, scenario_id: result.scenario_id }, }) await span.end({ result: { @@ -209,12 +298,351 @@ async function writeTrace(input: { hash: sha256(reportBytes).replace(/^sha256:/, ''), storageUrl: reportPath, }) - const failureClass = result.passed ? 'success' : 'instruction_following' const notes = result.findings.map((finding) => finding.message).join('\n') await emitter.endRun({ pass: result.passed, score: result.score / 100, - failureClass, + failureClass: result.passed ? 'success' : 'instruction_following', ...(notes ? { notes } : {}), }) } + +// ── OPERATOR-MATRIX MODE ─────────────────────────────────────────────────── + +/** Build the profile axis (one profile per model). Carries the trading surface + * identity (so the scorecard keys stably) plus the model under test. 
*/ +export function buildOperatorProfiles( + models: readonly LlmModel[] = OPERATOR_PROFILE_MODELS, +): AgentProfile[] { + return models.map((model) => { + const base = buildTradingScorecardAgentProfile({ + surfaceVersion: SURFACE_VERSION, + runtimeVersion: RUNTIME_VERSION, + venues: VENUES, + feeScheduleVersion: FEE_SCHEDULE_VERSION, + model, + }) + return { + ...base, + id: `${base.id}::model=${model}`, + model, + metadata: { ...base.metadata, model, modelClass: 'llm-trading-operator' }, + } + }) +} + +function modelOfProfile(profile: AgentProfile): LlmModel { + const model = (profile.metadata?.model ?? profile.model) as LlmModel + resolveModel(model) + return model +} + +/** In-sandbox operator agent env that pins the REAL operator to a model — the + * mechanism that makes the PROFILE axis bite the production stack. Resolves + * provider + key through the same MODEL_CONFIG table; throws on unknown + * model/missing key (fail loud, never a silent stub). */ +export function agentEnvForModel(model: LlmModel): Record { + const cfg = resolveModel(model) + const key = cfg.apiKey() + const isMoonshot = cfg.baseUrl.includes('moonshot') + const provider = isMoonshot ? 
'moonshot' : 'zai-coding-plan' + const env: Record = { + OPENCODE_MODEL_PROVIDER: provider, + OPENCODE_MODEL_NAME: cfg.modelId, + OPENCODE_MODEL_API_KEY: key, + OPENCODE_MODEL: `${provider}/${cfg.modelId}`, + OPENCODE_MODEL_BASE_URL: cfg.baseUrl, + SIDECAR_DEFAULT_HARNESS: 'opencode', + } + if (isMoonshot) env.MOONSHOT_API_KEY = key + else env.ZAI_API_KEY = key + return env +} + +interface OperatorScenario extends Scenario { + kind: 'trading-persona' + persona: UserPersona + intent: UserIntent + market: TradingEvalScenario +} + +function buildOperatorScenarios( + personas: UserPersona[], + markets: TradingEvalScenario[], + intents: UserIntent[], +): OperatorScenario[] { + if (intents.length === 0) throw new Error('buildOperatorScenarios: need at least one intent') + const out: OperatorScenario[] = [] + for (const persona of personas) { + markets.forEach((market, i) => { + const intent = intents[i % intents.length]! + out.push({ + id: `${persona.id}__${market.id}`, + kind: 'trading-persona', + tags: ['trading-persona', persona.id, ...persona.tags, market.market_regime, ...intent.venues], + persona, + intent, + market, + }) + }) + } + return out +} + +interface OperatorArtifact { + session: UserSimSessionResult + groundTruth: PersonaEvalResult + model: string + operatorResponded: boolean +} + +/** Combined judge: real-artifact score (60%) ⊕ objective backtest ground truth + * (40%). A cell whose operator produced no turns is `failed`, never a zero. */ +function operatorJudge(): JudgeConfig { + return { + name: 'trading-persona-real-artifact-plus-backtest', + dimensions: [ + { key: 'real_artifact', description: 'observable state + prose from the real operator session' }, + { key: 'actually_traded_or_committed', description: 'observable: did the operator trade/commit? (artifact)' }, + { key: 'self_improvement', description: 'observable: did the self-improve cycle fire? 
(artifact)' }, + { key: 'backtest_ground_truth', description: 'objective walk-forward backtest score (model-invariant)' }, + { key: 'backtest_passed', description: 'did the candidate clear the deterministic promotion gates?' }, + ], + async score({ scenario, artifact }): Promise { + if (!artifact.operatorResponded) { + return { dimensions: {}, composite: 0, notes: 'real operator produced no turns — cell failed (not scored as zero)', failed: true } + } + const real = await scoreUserSimArtifact(scenario.intent, artifact.session) + const gt = artifact.groundTruth + const groundTruthScore = gt.score / 100 + return { + composite: 0.6 * real.composite + 0.4 * groundTruthScore, + dimensions: { + real_artifact: real.composite, + actually_traded_or_committed: real.dimensions.actually_traded_or_committed ?? 0, + self_improvement: real.dimensions.self_improvement ?? 0, + backtest_ground_truth: groundTruthScore, + backtest_passed: gt.passed ? 1 : 0, + }, + notes: + `REAL(${real.composite.toFixed(2)}) ⊕ BACKTEST(${groundTruthScore.toFixed(2)}, passed=${gt.passed}, ` + + `gates=${gt.deterministic_gates.join(' | ')}) | ${real.notes}`, + } + }, + } +} + +async function runOperatorMatrix( + operatorUrl: string, + options: TradingPersonaEvalOptions, +): Promise { + const token = options.token ?? process.env.OPERATOR_API_TOKEN ?? '' + const privateKey = options.privateKey ?? process.env.OPERATOR_PRIVATE_KEY + if (!token && !privateKey) { + throw new Error( + 'operator-matrix mode: need an operator-api token (options.token / OPERATOR_API_TOKEN) or a ' + + 'privateKey (options.privateKey / OPERATOR_PRIVATE_KEY) to authenticate against the real stack.', + ) + } + + const profiles = buildOperatorProfiles(options.models ?? OPERATOR_PROFILE_MODELS) + const personas = options.personas ?? STANDARD_USER_PERSONAS + const markets = options.markets ?? defaultScenarios() + const intents = options.intents ?? 
STANDARD_USER_INTENTS + /* runOperatorMatrix, setup phase: build the scenario list from the resolved personas, markets and intents, then resolve the commit sha, the run directory and the scorecard path before any cell runs. */ const scenarios = buildOperatorScenarios(personas, markets, intents) + const commitSha = currentCommitSha() + /* Run directory: the directory containing options.runsJsonl when that option is set, otherwise a directory under .evolve/agent-eval whose name ends with isoStamp(). */ const runDir = resolveRepo(options.runsJsonl ? dirname(options.runsJsonl) : `.evolve/agent-eval/trading-persona-matrix-${isoStamp()}`) + const scorecardPath = resolveRepo( + options.scorecardPath ?? '.evolve/agent-eval/scorecards/trading-persona-matrix.jsonl', + ) + /* Create both output directories up front; recursive creation also tolerates directories that already exist. */ mkdirSync(runDir, { recursive: true }) + mkdirSync(dirname(scorecardPath), { recursive: true }) + + // Model-invariant ground truth — computed once per market, shared by every cell. + const groundTruthByMarket = new Map() + for (const m of markets) groundTruthByMarket.set(m.id, evaluateScenario(m)) + + /* Defaults: 6 turns per shot and 900000 ms (15 minutes) per turn. */ const maxTurnsPerShot = options.maxTurnsPerShot ?? 6 + const perTurnTimeoutMs = options.perTurnTimeoutMs ?? 900_000 + + /* runProfileMatrix receives the profiles, the scenarios and a single judge (operatorJudge); reps and costCeiling are forwarded only when the caller set them, maxConcurrency defaults to 1 and integrity defaults to assert. */ const result = await runProfileMatrix({ + profiles, + scenarios, + judges: [operatorJudge()], + runDir, + commitSha, + experimentId: 'trading-persona-matrix', + splitTag: 'search', + ...(options.reps !== undefined ? { reps: options.reps } : {}), + maxConcurrency: options.maxConcurrency ?? 1, + ...(options.costCeiling !== undefined ? { costCeiling: options.costCeiling } : {}), + integrity: options.integrity ?? 'assert', + personaOf: (s) => s.persona.id, + /* dispatch handles one cell: it looks up the precomputed ground truth for the scenario market (throwing if absent) and runs one user-sim campaign against the operator for this single intent and persona. */ dispatch: async (profile, scenario, ctx) => { + const groundTruth = groundTruthByMarket.get(scenario.market.id) + if (!groundTruth) throw new Error(`no backtest ground truth for market ${scenario.market.id}`) + const model = modelOfProfile(profile) + const campaign = await runMultishotUserSim({ + intents: [scenario.intent], + personas: [scenario.persona], + operatorUrl, + token, + ...(privateKey ? 
{ privateKey } : {}), + /* agentEnv is derived from the profile model */ agentEnv: agentEnvForModel(model), + reps: 1, + maxTurnsPerShot, + perTurnTimeoutMs, + botKind: 'real', + dualJudge: false, + /* Per-cell output directory: characters in profile.id other than word characters, dot and dash are replaced by underscores; scenario.id is appended unchanged. */ runDir: `${runDir}/cells/${profile.id.replace(/[^\w.-]/g, '_')}__${scenario.id}`, + }) + const session = firstArtifact(campaign) + /* No usable artifact in the campaign: return an empty session with operatorResponded set to false. */ if (!session) { + return { session: emptySession(scenario.intent), groundTruth, model, operatorResponded: false } + } + // Integrity fingerprint: the real operator's LLM spend is inside its + // sandbox (invisible to the eval), and the inner user-sim talks HTTP — so + // the inner cells report zero tokens. ONE metered call with this profile's + // model assessing its own real transcript gives the integrity guard an + // honest non-stub signal (genuine model work on genuine evidence). + const innerUsage = campaignTokenUsage(campaign) + const groundingCall = await metaSpendCall(model, scenario, session) + /* Report the combined token counts of the inner campaign and the grounding call; a dollar cost is reported only when it is greater than zero. */ ctx.cost.observeTokens({ + input: innerUsage.input + groundingCall.usage.input, + output: innerUsage.output + groundingCall.usage.output, + }) + const cost = innerUsage.costUsd + groundingCall.usage.costUsd + if (cost > 0) ctx.cost.observe(cost, `operator+grounding:${model}`) + /* operatorResponded is true only when the session has at least one turn. */ return { session, groundTruth, model, operatorResponded: session.turns.length > 0 } + }, + }) + + /* Group the result records by candidateId; the lookups below use profile.id as the key. */ const recordsByProfileId = new Map() + for (const r of result.records) { + const list = recordsByProfileId.get(r.candidateId) ?? [] + list.push(r) + recordsByProfileId.set(r.candidateId, list) + } + + /* Append the records of each profile to the scorecard file, skipping profiles without records; appendedCells adds up the lengths of the arrays returned by recordRunsToScorecard. */ let appendedCells = 0 + for (const profile of profiles) { + const profileRecords = recordsByProfileId.get(profile.id) ?? [] + if (profileRecords.length === 0) continue + appendedCells += recordRunsToScorecard(scorecardPath, profileRecords, { profile, commitSha }).length + } + + /* Per-profile rollup: values from the matrix summary are preferred, with fallbacks computed from the grouped records. */ const byProfile = profiles.map((profile) => { + const summary = result.byProfile[profile.id] + const profileRecords = recordsByProfileId.get(profile.id) ?? 
[] + /* A record counts as passing when scoreOf(record) is at least 0.7. */ const passing = profileRecords.filter((r) => scoreOf(r) >= 0.7).length + return { + profileId: profile.id, + profileHash: summary?.profileHash ?? agentProfileHash(profile), + model: String(profile.metadata?.model ?? profile.model), + records: profileRecords.length, + /* 0 when the profile has no records, which also avoids dividing by zero */ passRate: profileRecords.length ? passing / profileRecords.length : 0, + meanScore: summary?.meanComposite ?? meanScore(profileRecords), + totalCostUsd: summary?.totalCostUsd ?? 0, + integrityVerdict: summarizeBackendIntegrity(profileRecords).verdict, + } + }) + + /* Per-persona rollup copied from result.byPersona; stays empty when the matrix result has none. */ const byPersona: Record = {} + for (const [persona, rollup] of Object.entries(result.byPersona ?? {})) { + byPersona[persona] = { meanScore: rollup.meanComposite, n: rollup.n } + } + + /* When options.integrity is off the records are only summarized; in every other case assertRealBackend is called with allowMixed set to true. */ const integrity = + (options.integrity ?? 'assert') === 'off' + ? summarizeBackendIntegrity(result.records) + : assertRealBackend(result.records, { allowMixed: true }) + + /* Best profile: highest passRate, ties broken by higher meanScore; null when there are no profiles. The array is copied before sorting, so byProfile keeps its original order. */ const best = + byProfile.length === 0 + ? null + : [...byProfile].sort((a, b) => b.passRate - a.passRate || b.meanScore - a.meanScore)[0] ?? null + + return { + mode: 'operator-matrix', + suite: result.experimentId, + commitSha, + records: result.records.length, + runDir, + depthRounds: 1, + byProfile, + byPersona, + integrity, + best, + /* regressed is always reported as false on this path */ scorecard: { path: scorecardPath, appendedCells, regressed: false }, + } +} + +// ─── helpers (operator-matrix mode) ────────────────────────────────────── + +/** Score of one record: holdoutScore when it is not null or undefined, else searchScore, else 0. */ function scoreOf(record: RunRecord): number { + return record.outcome.holdoutScore ?? record.outcome.searchScore ?? 
0 +} + +/** Arithmetic mean of scoreOf over the given records; returns 0 for an empty list. */ function meanScore(records: RunRecord[]): number { + if (records.length === 0) return 0 + return records.reduce((acc, r) => acc + scoreOf(r), 0) / records.length +} + +/** Returns the artifact of the first campaign cell that has no error and whose artifact has a truthy turns field; returns null when no cell qualifies. NOTE(review): the type arguments after Awaited look stripped in this copy, presumably the awaited return type of runMultishotUserSim; confirm against the real file. */ function firstArtifact( + campaign: Awaited>, +): UserSimSessionResult | null { + for (const cell of campaign.cells) { + if (cell.error) continue + if (cell.artifact && cell.artifact.turns) return cell.artifact + } + return null +} + +/** Sums input tokens, output tokens and costUsd over every campaign cell (cells with an error included), counting missing values as 0. */ function campaignTokenUsage( + campaign: Awaited>, +): { input: number; output: number; costUsd: number } { + const usage = { input: 0, output: 0, costUsd: 0 } + for (const cell of campaign.cells) { + usage.input += cell.tokenUsage?.input ?? 0 + usage.output += cell.tokenUsage?.output ?? 0 + usage.costUsd += cell.costUsd ?? 0 + } + return usage +} + +/** Builds one prompt from the scenario mandate (persona label, intent text, capital, drawdown cap, venues) and the session transcript, with each operator reply cut to its first 800 characters, and sends it through llmCallWithUsage using the given model. Resolves to whatever llmCallWithUsage returns; the caller reads its usage field. */ async function metaSpendCall( + model: LlmModel, + scenario: OperatorScenario, + session: UserSimSessionResult, +): Promise>> { + const transcript = session.turns + .map((t) => ` TURN ${t.turn}\n USER: ${t.user_message}\n OPERATOR: ${t.bot_reply_text.slice(0, 800)}`) + .join('\n') + const prompt = [ + `You ran as the autonomous trading operator for: ${scenario.persona.label}.`, + `User intent: "${scenario.intent.text}"`, + `Mandate caps: $${scenario.intent.capital_usd} capital, ${scenario.intent.dd_cap_pct}% max drawdown,`, + `venues ${scenario.intent.venues.join(', ')}.`, + '', + 'Your session transcript:', + transcript || ' (no turns)', + '', + 'In 2-3 sentences, state whether you stayed within the mandate and committed a', + 'concrete action. 
Be specific to the transcript.', + ].join('\n') + return llmCallWithUsage({ prompt, model }) +} + +/** Placeholder session for the given intent: empty ids, no turns, null transcript and artifacts, zero wall time, and ended_by set to max_turns. runOperatorMatrix returns it when the campaign yields no usable artifact. */ function emptySession(intent: UserIntent): UserSimSessionResult { + return { + intent, + bot_id: '', + session_id: '', + turns: [], + final_transcript: null, + ended_by: 'max_turns', + total_wall_ms: 0, + bot_artifacts: null, + tick_side_effects: null, + } +} diff --git a/package-lock.json b/package-lock.json index bc1c3832..d39ba732 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,8 +9,8 @@ "version": "0.1.0", "dependencies": { "@tangle-network/agent-eval": "^0.91.0", - "@tangle-network/agent-knowledge": "^1.5.0", - "@tangle-network/agent-runtime": "^0.50.0" + "@tangle-network/agent-knowledge": "^1.7.0", + "@tangle-network/agent-runtime": "^0.52.0" }, "devDependencies": { "@types/node": "^22.10.0", @@ -187,27 +187,15 @@ } } }, - "node_modules/@tangle-network/agent-integrations": { - "version": "0.25.7", - "resolved": "https://registry.npmjs.org/@tangle-network/agent-integrations/-/agent-integrations-0.25.7.tgz", - "integrity": "sha512-5Iuymcoq6d1oZlyORfmVXiP2G/tJQe0ADYBUNwDlbk9uulSa3c6rztlr6sKm100NqDavVlJ0Jo75j9CsaemhIA==", - "license": "MIT", - "bin": { - "tangle-catalog-runtime": "dist/bin/tangle-catalog-runtime.js" - }, - "engines": { - "node": ">=20" - } - }, "node_modules/@tangle-network/agent-knowledge": { - "version": "1.5.0", - "resolved": "https://registry.npmjs.org/@tangle-network/agent-knowledge/-/agent-knowledge-1.5.0.tgz", - "integrity": "sha512-C2jU62Nx6CbjkVKsvsd2xFaanQlwghVYWX8G/g/B+3PJTf5G83e3pvEmHDy5ETnLamIEZRiRks5RG3v1t8LEUQ==", + "version": "1.7.0", + "resolved": "https://registry.npmjs.org/@tangle-network/agent-knowledge/-/agent-knowledge-1.7.0.tgz", + "integrity": "sha512-F+a21T2UMW7HVj1Dt6Sv5GqjEQXWUshsVftRpStW1t0bVG0/1aT6DeHYKok1lLmi+D8WZnC0CNXPdb+LyxC/iA==", "license": "MIT", + "peer": true, "dependencies": { - "@tangle-network/agent-eval": "^0.42.0", - "@tangle-network/agent-runtime": "^0.25.0", - "@tangle-network/sandbox": 
"^0.2.1", + "@tangle-network/agent-eval": ">=0.77.0 <0.80.0", + "@tangle-network/agent-runtime": "^0.44.0", "zod": "^4.3.6" }, "bin": { @@ -218,10 +206,11 @@ } }, "node_modules/@tangle-network/agent-knowledge/node_modules/@tangle-network/agent-eval": { - "version": "0.42.0", - "resolved": "https://registry.npmjs.org/@tangle-network/agent-eval/-/agent-eval-0.42.0.tgz", - "integrity": "sha512-gJFT1Vm5LYDHtIF0BUqGq6i3Qa9IvFr3EvTfAE1CYjErFNl3TohL1sduJqj1GXIhDbswVVuWp5qaahHZHaIsbA==", + "version": "0.79.0", + "resolved": "https://registry.npmjs.org/@tangle-network/agent-eval/-/agent-eval-0.79.0.tgz", + "integrity": "sha512-reN1SbKvTXFS27PQa4l5dnwf0y33j118FM0aPKAq8I0fvj3H6olF1wwQWHCSm7Sjats9rU4c8x2wlSNk0VeCBQ==", "license": "MIT", + "peer": true, "dependencies": { "@asteasolutions/zod-to-openapi": "^8.5.0", "@ax-llm/ax": "^19.0.25", @@ -237,36 +226,30 @@ "node": ">=20" }, "peerDependencies": { - "@tangle-network/agent-runtime": "^0.21.0", - "@tangle-network/sandbox": "^0.2.1" + "@tangle-network/sandbox": ">=0.2.1 <0.5.0" }, "peerDependenciesMeta": { - "@tangle-network/agent-runtime": { - "optional": true - }, "@tangle-network/sandbox": { "optional": true } } }, "node_modules/@tangle-network/agent-knowledge/node_modules/@tangle-network/agent-runtime": { - "version": "0.25.2", - "resolved": "https://registry.npmjs.org/@tangle-network/agent-runtime/-/agent-runtime-0.25.2.tgz", - "integrity": "sha512-/HSrfr1kyW5wjLApudXQ4CcvjsZaxYtGXp6R6i0MdVoCVmZ6XBewKoVv1QGs+k0i8fUYrdEfh48cvGnrGOaQZQ==", + "version": "0.44.0", + "resolved": "https://registry.npmjs.org/@tangle-network/agent-runtime/-/agent-runtime-0.44.0.tgz", + "integrity": "sha512-uMzWcziIV+SsgvdvvnnSobaFYZuYXQ3KRfvq9h9kHglVLtPoUGH78ypCnyn5QQIccTk4gjSenAHC5Iy076DkQg==", "license": "MIT", - "peer": true, - "dependencies": { - "@tangle-network/agent-eval": "^0.40.2" - }, "bin": { + "agent-runtime-loop": "dist/loop-runner-bin.js", "agent-runtime-mcp": "dist/mcp/bin.js" }, "engines": { "node": ">=20" }, "peerDependencies": { 
+ "@tangle-network/agent-eval": ">=0.61.0 <1.0.0", "@tangle-network/agent-knowledge": ">=1.3.0 <2.0.0", - "@tangle-network/sandbox": ">=0.1.2 <0.4.0" + "@tangle-network/sandbox": ">=0.1.2 <0.5.0" }, "peerDependenciesMeta": { "@tangle-network/agent-knowledge": { @@ -277,88 +260,10 @@ } } }, - "node_modules/@tangle-network/agent-knowledge/node_modules/@tangle-network/agent-runtime/node_modules/@tangle-network/agent-eval": { - "version": "0.40.5", - "resolved": "https://registry.npmjs.org/@tangle-network/agent-eval/-/agent-eval-0.40.5.tgz", - "integrity": "sha512-ew27fDkzvYcM/3/u6Jx1HGS3/bPoIWAXKGa/2XlOro2hBwMA/h37SAHg4ytUDMd2M0mAKQAAanUxnHfkt/aklw==", - "license": "MIT", - "dependencies": { - "@asteasolutions/zod-to-openapi": "^8.5.0", - "@ax-llm/ax": "^19.0.25", - "@hono/node-server": "^2.0.0", - "@tangle-network/tcloud": "^0.4.6", - "hono": "^4.12.16", - "zod": "^4.3.6" - }, - "bin": { - "agent-eval": "dist/cli.js" - }, - "engines": { - "node": ">=20" - }, - "peerDependencies": { - "@tangle-network/agent-runtime": "^0.21.0", - "@tangle-network/sandbox": "^0.2.1" - }, - "peerDependenciesMeta": { - "@tangle-network/agent-runtime": { - "optional": true - }, - "@tangle-network/sandbox": { - "optional": true - } - } - }, - "node_modules/@tangle-network/agent-knowledge/node_modules/@tangle-network/agent-runtime/node_modules/@tangle-network/agent-runtime": { - "version": "0.21.1", - "resolved": "https://registry.npmjs.org/@tangle-network/agent-runtime/-/agent-runtime-0.21.1.tgz", - "integrity": "sha512-Qh7TG6Pg25qUzsByblVvzBDfzq5K7isSZ3I/LhOzMgE2dBmrS4AjozPaQ6/zMoAy5kg+sZhJXUuXDiAjlBoTyg==", - "license": "MIT", - "optional": true, - "dependencies": { - "@tangle-network/agent-eval": "^0.33.1" - }, - "bin": { - "agent-runtime-mcp": "dist/mcp/bin.js" - }, - "engines": { - "node": ">=20" - }, - "peerDependencies": { - "@tangle-network/agent-knowledge": ">=1.3.0 <2.0.0", - "@tangle-network/sandbox": ">=0.1.2 <0.3.0" - }, - "peerDependenciesMeta": { - 
"@tangle-network/agent-knowledge": { - "optional": true - } - } - }, - "node_modules/@tangle-network/agent-knowledge/node_modules/@tangle-network/agent-runtime/node_modules/@tangle-network/agent-runtime/node_modules/@tangle-network/agent-eval": { - "version": "0.33.1", - "resolved": "https://registry.npmjs.org/@tangle-network/agent-eval/-/agent-eval-0.33.1.tgz", - "integrity": "sha512-VAbg1UkC480Xzfi2jqiFMQLYykWvDMO47UHx4bb2rOeiogN1zzM10kPst3OotM+k1B2lbu51uoVnKDBnqK8zcw==", - "license": "MIT", - "optional": true, - "dependencies": { - "@asteasolutions/zod-to-openapi": "^8.5.0", - "@ax-llm/ax": "^19.0.25", - "@hono/node-server": "^2.0.0", - "@tangle-network/tcloud": "^0.4.6", - "hono": "^4.12.16", - "zod": "^4.3.6" - }, - "bin": { - "agent-eval": "dist/cli.js" - }, - "engines": { - "node": ">=20" - } - }, "node_modules/@tangle-network/agent-runtime": { - "version": "0.50.0", - "resolved": "https://registry.npmjs.org/@tangle-network/agent-runtime/-/agent-runtime-0.50.0.tgz", - "integrity": "sha512-fNVcaG7sDOxu8ILt61N4+zBfA/lnY6P8YGAt4r5cI7ekfitfwJ3GZBk2YahxJHQ0XrtMQrF6kGe7dMuSxidxNg==", + "version": "0.52.0", + "resolved": "https://registry.npmjs.org/@tangle-network/agent-runtime/-/agent-runtime-0.52.0.tgz", + "integrity": "sha512-oZL1cDW7x2inhBrcAgKHST3HQFGQsJtQR/ko41js9DcBP4bIVVPeosEwQ50h41yPXHkJpIYznh54pj0FJxiYwQ==", "license": "MIT", "bin": { "agent-runtime-loop": "dist/loop-runner-bin.js", @@ -385,28 +290,6 @@ } } }, - "node_modules/@tangle-network/sandbox": { - "version": "0.2.1", - "resolved": "https://registry.npmjs.org/@tangle-network/sandbox/-/sandbox-0.2.1.tgz", - "integrity": "sha512-CQ3MdfnWcdjKa2UzyqDkjJarhkVDl4GqAKRhbQdHmHccl/pOm6qSRiPdu40XEA34A/SVPLpfE1ySxchU1rq6BQ==", - "license": "MIT", - "peer": true, - "dependencies": { - "@tangle-network/agent-integrations": "0.25.7" - }, - "peerDependencies": { - "openai": "^6.36.0", - "viem": "^2.0.0" - }, - "peerDependenciesMeta": { - "openai": { - "optional": true - }, - "viem": { - "optional": true - } - } - 
}, "node_modules/@tangle-network/tcloud": { "version": "0.4.6", "resolved": "https://registry.npmjs.org/@tangle-network/tcloud/-/tcloud-0.4.6.tgz", diff --git a/package.json b/package.json index b2c6a3b7..695482cb 100644 --- a/package.json +++ b/package.json @@ -37,7 +37,7 @@ }, "dependencies": { "@tangle-network/agent-eval": "^0.91.0", - "@tangle-network/agent-knowledge": "^1.5.0", - "@tangle-network/agent-runtime": "^0.50.0" + "@tangle-network/agent-knowledge": "^1.7.0", + "@tangle-network/agent-runtime": "^0.52.0" } } diff --git a/trading-blueprint-lib/src/jobs/activate.rs b/trading-blueprint-lib/src/jobs/activate.rs index 2b47ae57..859586a2 100644 --- a/trading-blueprint-lib/src/jobs/activate.rs +++ b/trading-blueprint-lib/src/jobs/activate.rs @@ -33,9 +33,9 @@ pub(crate) const SIDECAR_AGENTS_MD_PATH: &str = "/home/agent/AGENTS.md"; /// Same charter, claude-code's auto-loaded filename. The claude CLI reads /// `CLAUDE.md` (not `AGENTS.md`) from its working directory. pub(crate) const SIDECAR_CLAUDE_MD_PATH: &str = "/home/agent/CLAUDE.md"; -const TRADING_AGENT_AGENT_EVAL_VERSION: &str = "^0.70.0"; -const TRADING_AGENT_AGENT_KNOWLEDGE_VERSION: &str = "^1.5.0"; -const TRADING_AGENT_AGENT_RUNTIME_VERSION: &str = "^0.36.0"; +const TRADING_AGENT_AGENT_EVAL_VERSION: &str = "^0.91.0"; +const TRADING_AGENT_AGENT_KNOWLEDGE_VERSION: &str = "^1.7.0"; +const TRADING_AGENT_AGENT_RUNTIME_VERSION: &str = "^0.52.0"; /// Operator identity + behavioural charter loaded into every opencode turn via /// `AGENTS.md`. The full operating protocol (API base URL, bearer token,