Skip to content

Commit eb0afdc

Browse files
authored
feat(campaign): playback dispatch — product-flow E2E playback on the existing matrix (#188)
Closes the fleet's launch-readiness gap (1-of-7 apps had real product-flow playback): a substrate adapter that plugs a PlaybackDriver into runProfileMatrix. - UserStory { id, title, steps[], requirements } extends Scenario — each requirement is one Jira ticket line. - PlaybackDriver { run(story, ctx) => RuntimeEventLike[] } — the contract; concrete SandboxPlaybackDriver / PlaywrightPlaybackDriver live in consumers (they need runtime/browser infra the substrate must not import). - makePlaybackDispatch(driver): ProfileDispatchFn<UserStory, ProducedState> — drives the real product, pipes events through extractProducedState. - scoreUserStory + userStoryScoreboard — verifyCompletion per requirement → storyId×reqId PASS/FAIL, the literal tick-off. Background-cheap by construction: runProfileMatrix is headless, durable, cost-ceilinged, cron-dispatchable; pin AgentProfile.model to a cheap router model. Zero upward deps. typecheck + 4 tests + biome green.
1 parent d9f36a6 commit eb0afdc

3 files changed

Lines changed: 249 additions & 0 deletions

File tree

src/campaign/index.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,17 @@ export {
108108
type OptimizerEntryConfig,
109109
skillOptEntry,
110110
} from './presets/compare-drivers'
111+
export {
112+
makePlaybackDispatch,
113+
type PlaybackContext,
114+
type PlaybackDriver,
115+
type PlaybackStep,
116+
type ScoreboardRow,
117+
scoreUserStory,
118+
type UserStory,
119+
type UserStoryVerdict,
120+
userStoryScoreboard,
121+
} from './presets/playback'
111122
export { type RunEvalOptions, runEval } from './presets/run-eval'
112123
export {
113124
defaultRenderDiff,
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
import { describe, expect, it } from 'vitest'
2+
3+
import type { AgentProfile } from '../../agent-profile'
4+
import type { CorrectnessChecker } from '../../completion-verifier'
5+
import type { RuntimeEventLike } from '../../produced-state'
6+
import type { DispatchContext } from '../types'
7+
import {
8+
makePlaybackDispatch,
9+
type PlaybackContext,
10+
type PlaybackDriver,
11+
scoreUserStory,
12+
type UserStory,
13+
userStoryScoreboard,
14+
} from './playback'
15+
16+
// Fixture story with two requirements. The fake driver in this file emits the
// artifact that r1 asks for but never the proposal that r2 asks for, so scoring
// it must yield exactly one PASS and one FAIL — the Jira tick-off under test.
const STORY: UserStory = {
  id: 'tax-filing-flow',
  kind: 'user-story',
  title: 'Produce a tax filing and route it for approval',
  steps: [
    { action: 'open a new return' },
    { action: 'send: prepare the 1065 for the LLC' },
  ],
  requirements: [
    { reqId: 'r1', title: 'tax filing artifact', satisfiedBy: 'artifact' },
    { reqId: 'r2', title: 'approval proposal', satisfiedBy: 'proposal' },
  ],
}
29+
30+
// Stand-in driver: its event stream holds a single artifact event (enough for
// r1) and no proposal event (so r2 cannot be satisfied).
const fakeDriver: PlaybackDriver = {
  run: async (): Promise<readonly RuntimeEventLike[]> => {
    const filing: RuntimeEventLike = {
      type: 'artifact',
      artifactId: 'a1',
      name: 'tax-filing-1065.md',
      mimeType: 'text/markdown',
      content:
        'A complete tax filing: Form 1065 for the LLC with partner capital ' +
        'account allocations, Schedule K-1s, and the balance sheet reconciled.',
    }
    return [filing]
  },
}
46+
47+
// Deterministic stand-in for the correctness checker: it ignores its input and
// always reports `correct: true` with the fixed reason 'stub'.
const passingChecker: CorrectnessChecker = async () => {
  return { correct: true, reason: 'stub' }
}
49+
50+
// Dispatch context handed to every dispatch call in these tests. Only the four
// fields below are populated; presumably that is not a complete
// DispatchContext, hence the double cast — confirm against '../types'.
const ctx = {
  cellId: 'c0',
  rep: 0,
  seed: 1,
  signal: new AbortController().signal,
} as unknown as DispatchContext

// Profile under test: only `id` and `model` are set, and the double cast
// covers whatever else AgentProfile declares.
const profile = {
  id: 'haiku',
  model: 'anthropic/claude-haiku-4-5',
} as unknown as AgentProfile
57+
58+
describe('makePlaybackDispatch', () => {
  it('pipes driver events through extractProducedState into ProducedState', async () => {
    // Run the fixture story through the fake driver and inspect the extracted state.
    const { artifacts, proposals } = await makePlaybackDispatch(fakeDriver)(profile, STORY, ctx)

    expect(artifacts).toHaveLength(1)
    expect(artifacts[0]!.path).toBe('tax-filing-1065.md')
    expect(proposals).toHaveLength(0)
  })

  it('forwards the profile to the driver as PlaybackContext', async () => {
    // Record whatever context the dispatch hands to the driver.
    let received: PlaybackContext | undefined
    const recordingDriver: PlaybackDriver = {
      run: async (_story, playbackCtx) => {
        received = playbackCtx
        return []
      },
    }

    const dispatch = makePlaybackDispatch(recordingDriver)
    await dispatch(profile, STORY, ctx)

    expect(received?.profile.model).toBe('anthropic/claude-haiku-4-5')
    expect(received?.cellId).toBe('c0')
  })
})
80+
81+
describe('scoreUserStory + userStoryScoreboard', () => {
  it('produces a per-requirement PASS/FAIL tick-off', async () => {
    const dispatch = makePlaybackDispatch(fakeDriver)
    const produced = await dispatch(profile, STORY, ctx)
    const verdict = await scoreUserStory(STORY, produced, passingChecker)

    // One of the two requirements is met, so the story is half complete.
    expect(verdict.title).toBe(STORY.title)
    expect(verdict.fullyComplete).toBe(false)
    expect(verdict.completionRate).toBeCloseTo(0.5)

    const board = userStoryScoreboard([verdict])
    expect(board).toHaveLength(2)
    const byReq = Object.fromEntries(board.map((r) => [r.reqId, r.status]))
    expect(byReq).toEqual({ r1: 'PASS', r2: 'FAIL' })
    expect(board.every((r) => r.storyId === 'tax-filing-flow')).toBe(true)
  })

  it('every row carries evidence for the verdict', async () => {
    const produced = await makePlaybackDispatch(fakeDriver)(profile, STORY, ctx)
    const board = userStoryScoreboard([await scoreUserStory(STORY, produced, passingChecker)])

    // Guard against a vacuous pass: with zero rows the loop below asserts nothing.
    expect(board).toHaveLength(STORY.requirements.length)
    for (const row of board) expect(Array.isArray(row.evidence)).toBe(true)
  })
})

src/campaign/presets/playback.ts

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
/**
2+
* Product-flow playback — drive the REAL product through a user story and
3+
* score the produced state per requirement (the launch "Jira tick-off").
4+
*
5+
* This is the substrate adapter + contract only. It plugs a `PlaybackDriver`
6+
* into the existing `runProfileMatrix` dispatch seam: a driver drives the real
7+
* product (a Playwright UI session or a sandbox workspace) and returns the
8+
* runtime event stream; `extractProducedState` + `verifyCompletion` then score
9+
* each requirement PASS/FAIL. The concrete drivers live in consumers — they
10+
* depend on browser / runtime infra the substrate must not import — so
11+
* agent-eval owns the seam, the `UserStory` contract, and the scoreboard.
12+
*/
13+
14+
import type { AgentProfile } from '../../agent-profile'
15+
import {
16+
type CompletionRequirement,
17+
type CompletionVerdict,
18+
type CorrectnessChecker,
19+
type ProducedState,
20+
verifyCompletion,
21+
} from '../../completion-verifier'
22+
import { extractProducedState, type RuntimeEventLike } from '../../produced-state'
23+
import type { DispatchContext, Scenario } from '../types'
24+
import type { ProfileDispatchFn } from './run-profile-matrix'
25+
26+
/**
 * A single user action within a story. `action` is the human-readable
 * description; `payload`, when present, is interpreted by the driver (a
 * Playwright selector + action, or a sandbox chat turn).
 */
export interface PlaybackStep {
  /** Human-readable action, recorded verbatim in the UX narrative. */
  action: string
  /** Optional driver-specific data, e.g. `{ selector, fill }` or `{ turn }`. */
  payload?: Record<string, unknown>
}
34+
35+
/**
 * A runnable product journey together with the requirements that decide
 * whether "this story works" — one requirement per Jira ticket line. Because it
 * extends `Scenario`, a catalog of stories can be passed straight to
 * `runProfileMatrix({ scenarios })`.
 */
export interface UserStory extends Scenario {
  /** The ticket headline, in human-readable form. */
  title: string
  /** Steps for the driver to execute, in order. */
  steps: PlaybackStep[]
  /** Conditions the produced state must meet for the story to pass. */
  requirements: CompletionRequirement[]
}
48+
49+
/**
 * What a driver receives alongside the story: the matrix's dispatch context
 * extended with the agent profile under test (e.g. which cheap model to use).
 */
export interface PlaybackContext extends DispatchContext {
  /** The agent profile this run is exercising. */
  profile: AgentProfile
}
53+
54+
/**
 * Contract for whatever drives the real product through a story. `run` returns
 * the runtime event stream that `extractProducedState` consumes.
 *
 * Implementations belong to CONSUMERS — `SandboxPlaybackDriver` (real API /
 * sandbox workspace) and `PlaywrightPlaybackDriver` (real UI) — because they
 * need runtime / browser infra the substrate must not import.
 *
 * A driver MUST report its LLM usage through `ctx.cost.observeTokens`, so the
 * backend-integrity guard sees real tokens; a run that never reports tokens
 * reads as a stub.
 */
export interface PlaybackDriver<TStory extends UserStory = UserStory> {
  /** Execute `story` under `ctx` and resolve with the events it produced. */
  run(story: TStory, ctx: PlaybackContext): Promise<readonly RuntimeEventLike[]>
}
66+
67+
/**
 * Wrap a `PlaybackDriver` as a `runProfileMatrix` dispatch function. What the
 * matrix then scores is the `ProducedState` extracted from the driver's event
 * stream; grade it with `scoreUserStory` (or a judge wrapping it).
 *
 * @param driver - The driver that executes each story.
 * @returns A dispatch that runs the driver and resolves with the extracted state.
 */
export function makePlaybackDispatch<TStory extends UserStory>(
  driver: PlaybackDriver<TStory>,
): ProfileDispatchFn<TStory, ProducedState> {
  const dispatch: ProfileDispatchFn<TStory, ProducedState> = async (profile, scenario, ctx) => {
    // The driver sees the matrix's dispatch context with the profile added to it.
    const playbackCtx: PlaybackContext = { ...ctx, profile }
    const events = await driver.run(scenario, playbackCtx)
    return extractProducedState(events)
  }
  return dispatch
}
80+
81+
/**
 * The result of scoring one user story: everything in the completion verdict,
 * plus the story's human-readable title.
 */
export interface UserStoryVerdict extends CompletionVerdict {
  /** Title of the story this verdict was produced for. */
  title: string
}
85+
86+
/**
 * Score the state one story produced against that story's requirements. This
 * is a thin wrapper over `verifyCompletion`: it builds the gold from the
 * story's id and requirements, then attaches the story title to the
 * per-requirement PASS/FAIL verdict.
 *
 * @param story - The story whose requirements form the gold.
 * @param state - The produced state to check.
 * @param checkCorrectness - Injected checker: a deterministic stub in tests,
 *   `createLlmCorrectnessChecker` in production.
 * @returns The completion verdict plus the story's title.
 */
export async function scoreUserStory(
  story: UserStory,
  state: ProducedState,
  checkCorrectness: CorrectnessChecker,
): Promise<UserStoryVerdict> {
  const { id: taskId, requirements } = story
  const completion = await verifyCompletion({ taskId, requirements }, state, checkCorrectness)
  return { ...completion, title: story.title }
}
104+
105+
/**
 * A single line of the launch scoreboard: one (story, requirement) pair and
 * whether it passed.
 */
export interface ScoreboardRow {
  /** Id of the story the row belongs to. */
  storyId: string
  /** Human-readable title of that story. */
  storyTitle: string
  /** Id of the requirement being ticked off. */
  reqId: string
  /** Human-readable title of that requirement. */
  reqTitle: string
  /** Outcome for this requirement. */
  status: 'PASS' | 'FAIL'
  /** The evidence behind the outcome. */
  evidence: string[]
}
114+
115+
/**
 * Flatten story verdicts into the per-requirement scoreboard — the literal
 * Jira tick-off. Rows come out in input order: every requirement of the first
 * verdict, then every requirement of the next, and so on.
 *
 * @param verdicts - Scored stories; an empty list yields an empty scoreboard.
 * @returns One row per (story, requirement) with PASS/FAIL and the evidence
 *   behind it. Each row owns its own copy of the evidence array.
 */
export function userStoryScoreboard(verdicts: readonly UserStoryVerdict[]): ScoreboardRow[] {
  return verdicts.flatMap((verdict) =>
    verdict.requirements.map(
      (req): ScoreboardRow => ({
        storyId: verdict.taskId,
        storyTitle: verdict.title,
        reqId: req.reqId,
        reqTitle: req.title,
        status: req.satisfied ? 'PASS' : 'FAIL',
        // Copy rather than alias: `ScoreboardRow.evidence` is mutable, and
        // editing a row must not rewrite the verdict it was derived from.
        evidence: [...req.evidence],
      }),
    ),
  )
}

0 commit comments

Comments
 (0)