feat(campaign): playback dispatch — product-flow E2E playback on the existing matrix (#188)

drewstone · web-flow · commit eb0afdc37a08 · 2026-06-02T13:45:34.000+03:00
Closes the fleet's launch-readiness gap (1-of-7 apps had real product-flow
playback): a substrate adapter that plugs a PlaybackDriver into runProfileMatrix.

- UserStory { id, title, steps[], requirements } extends Scenario — each
  requirement is one Jira ticket line.
- PlaybackDriver { run(story, ctx) => RuntimeEventLike[] } — the contract;
  concrete SandboxPlaybackDriver / PlaywrightPlaybackDriver live in consumers
  (they need runtime/browser infra the substrate must not import).
- makePlaybackDispatch(driver): ProfileDispatchFn<UserStory, ProducedState> —
  drives the real product, pipes events through extractProducedState.
- scoreUserStory + userStoryScoreboard — verifyCompletion per requirement →
  storyId×reqId PASS/FAIL, the literal tick-off.

Background-cheap by construction: runProfileMatrix is headless, durable,
cost-ceilinged, cron-dispatchable; pin AgentProfile.model to a cheap router
model. Zero upward deps. typecheck + 4 tests + biome green.
diff --git a/src/campaign/index.ts b/src/campaign/index.ts
@@ -108,6 +108,17 @@ export {
   type OptimizerEntryConfig,
   skillOptEntry,
 } from './presets/compare-drivers'
+export {
+  makePlaybackDispatch,
+  type PlaybackContext,
+  type PlaybackDriver,
+  type PlaybackStep,
+  type ScoreboardRow,
+  scoreUserStory,
+  type UserStory,
+  type UserStoryVerdict,
+  userStoryScoreboard,
+} from './presets/playback'
 export { type RunEvalOptions, runEval } from './presets/run-eval'
 export {
   defaultRenderDiff,
diff --git a/src/campaign/presets/playback.test.ts b/src/campaign/presets/playback.test.ts
@@ -0,0 +1,103 @@
+import { describe, expect, it } from 'vitest'
+
+import type { AgentProfile } from '../../agent-profile'
+import type { CorrectnessChecker } from '../../completion-verifier'
+import type { RuntimeEventLike } from '../../produced-state'
+import type { DispatchContext } from '../types'
+import {
+  makePlaybackDispatch,
+  type PlaybackContext,
+  type PlaybackDriver,
+  scoreUserStory,
+  type UserStory,
+  userStoryScoreboard,
+} from './playback'
+
+// Fixture story with two requirements: r1 is satisfied by an artifact, r2 by
+// an approval proposal. The fake driver below emits only the artifact, so the
+// scoreboard is expected to show r1 PASS and r2 FAIL (the Jira tick-off).
+const STORY: UserStory = {
+  id: 'tax-filing-flow',
+  kind: 'user-story',
+  title: 'Produce a tax filing and route it for approval',
+  steps: [{ action: 'open a new return' }, { action: 'send: prepare the 1065 for the LLC' }],
+  requirements: [
+    { reqId: 'r1', title: 'tax filing artifact', satisfiedBy: 'artifact' },
+    { reqId: 'r2', title: 'approval proposal', satisfiedBy: 'proposal' },
+  ],
+}
+
+// Stub driver: ignores its arguments and returns one 'artifact' event, never a proposal.
+const fakeDriver: PlaybackDriver = {
+  async run(): Promise<readonly RuntimeEventLike[]> {
+    return [
+      {
+        type: 'artifact',
+        artifactId: 'a1',
+        name: 'tax-filing-1065.md',
+        mimeType: 'text/markdown',
+        content:
+          'A complete tax filing: Form 1065 for the LLC with partner capital ' +
+          'account allocations, Schedule K-1s, and the balance sheet reconciled.',
+      },
+    ]
+  },
+}
+
+// Deterministic stub checker — always resolves to { correct: true, reason: 'stub' }.
+const passingChecker: CorrectnessChecker = async () => ({ correct: true, reason: 'stub' })
+
+const ctx = {
+  cellId: 'c0', // asserted on by the PlaybackContext forwarding test
+  rep: 0,
+  seed: 1,
+  signal: new AbortController().signal,
+} as unknown as DispatchContext // double cast: presumably a partial stub — TODO confirm
+const profile = { id: 'haiku', model: 'anthropic/claude-haiku-4-5' } as unknown as AgentProfile
+
+describe('makePlaybackDispatch', () => {
+  it('pipes driver events through extractProducedState into ProducedState', async () => {
+    // fakeDriver emits exactly one artifact event and no proposal events.
+    const produced = await makePlaybackDispatch(fakeDriver)(profile, STORY, ctx)
+    expect(produced.artifacts).toHaveLength(1)
+    expect(produced.artifacts[0]?.path).toBe('tax-filing-1065.md')
+    expect(produced.proposals).toHaveLength(0)
+  })
+
+  it('forwards the profile to the driver as PlaybackContext', async () => {
+    let seen: PlaybackContext | undefined
+    const spy: PlaybackDriver = {
+      async run(_story, c) {
+        seen = c
+        return []
+      },
+    }
+    await makePlaybackDispatch(spy)(profile, STORY, ctx)
+    expect(seen?.profile.model).toBe('anthropic/claude-haiku-4-5')
+    expect(seen?.cellId).toBe('c0')
+  })
+})
+
+describe('scoreUserStory + userStoryScoreboard', () => {
+  it('produces a per-requirement PASS/FAIL tick-off', async () => {
+    // Expect r1 (artifact emitted) to pass and r2 (no proposal emitted) to fail: rate 0.5.
+    const produced = await makePlaybackDispatch(fakeDriver)(profile, STORY, ctx)
+    const verdict = await scoreUserStory(STORY, produced, passingChecker)
+
+    expect(verdict.title).toBe(STORY.title)
+    expect(verdict.fullyComplete).toBe(false)
+    expect(verdict.completionRate).toBeCloseTo(0.5)
+
+    const board = userStoryScoreboard([verdict])
+    expect(board).toHaveLength(2)
+    const statusByReq = Object.fromEntries(board.map((row) => [row.reqId, row.status]))
+    expect(statusByReq).toEqual({ r1: 'PASS', r2: 'FAIL' })
+    expect(board.every((row) => row.storyId === 'tax-filing-flow')).toBe(true)
+  })
+
+  it('every row carries evidence for the verdict', async () => {
+    const produced = await makePlaybackDispatch(fakeDriver)(profile, STORY, ctx)
+    const verdict = await scoreUserStory(STORY, produced, passingChecker)
+    for (const row of userStoryScoreboard([verdict])) expect(Array.isArray(row.evidence)).toBe(true)
+  })
+})
diff --git a/src/campaign/presets/playback.ts b/src/campaign/presets/playback.ts
@@ -0,0 +1,135 @@
+/**
+ * Product-flow playback — drive the REAL product through a user story and
+ * score the produced state per requirement (the launch "Jira tick-off").
+ *
+ * This is the substrate adapter + contract only. It plugs a `PlaybackDriver`
+ * into the existing `runProfileMatrix` dispatch seam: a driver drives the real
+ * product (a Playwright UI session or a sandbox workspace) and returns the
+ * runtime event stream; `extractProducedState` + `verifyCompletion` then score
+ * each requirement PASS/FAIL. The concrete drivers live in consumers — they
+ * depend on browser / runtime infra the substrate must not import — so
+ * agent-eval owns the seam, the `UserStory` contract, and the scoreboard.
+ */
+
+import type { AgentProfile } from '../../agent-profile'
+import {
+  type CompletionRequirement,
+  type CompletionVerdict,
+  type CorrectnessChecker,
+  type ProducedState,
+  verifyCompletion,
+} from '../../completion-verifier'
+import { extractProducedState, type RuntimeEventLike } from '../../produced-state'
+import type { DispatchContext, Scenario } from '../types'
+import type { ProfileDispatchFn } from './run-profile-matrix'
+
+/** One step of a user story — what the user does. `payload` is opaque here;
+ *  the driver interprets it (a Playwright selector + action, or a chat turn). */
+export interface PlaybackStep {
+  /** Human-readable action, captured verbatim in the UX narrative. */
+  action: string
+  /** Optional driver-specific payload (e.g. `{ selector, fill }` or `{ turn }`). */
+  payload?: Record<string, unknown>
+}
+
+/**
+ * A runnable product journey (`steps`) plus the `requirements` that define
+ * "this story works" — each requirement is one Jira ticket line. Extends
+ * `Scenario` so a catalog drops straight into `runProfileMatrix({ scenarios })`.
+ */
+export interface UserStory extends Scenario {
+  /** Human-readable title (ticket headline); `scoreUserStory` copies it onto the verdict. */
+  title: string
+  /** Ordered steps the driver executes. */
+  steps: PlaybackStep[]
+  /** Requirements `scoreUserStory` hands to `verifyCompletion` as the gold. */
+  requirements: CompletionRequirement[]
+}
+
+/** The matrix `DispatchContext` plus the profile under test (which cheap model, etc.). */
+export interface PlaybackContext extends DispatchContext {
+  profile: AgentProfile // filled in by `makePlaybackDispatch` from its `profile` argument
+}
+
+/**
+ * Contract for driving the real product through a story; `run` resolves to the
+ * event stream `extractProducedState` consumes. Implemented by CONSUMERS —
+ * `SandboxPlaybackDriver` (real API / sandbox workspace) and
+ * `PlaywrightPlaybackDriver` (real UI) — because they depend on runtime /
+ * browser infra the substrate must not import. The driver MUST report LLM
+ * usage via `ctx.cost.observeTokens` so the backend-integrity guard sees real
+ * tokens (a run that never reports tokens reads as a stub).
+ */
+export interface PlaybackDriver<TStory extends UserStory = UserStory> {
+  run(story: TStory, ctx: PlaybackContext): Promise<readonly RuntimeEventLike[]>
+}
+
+/**
+ * Wrap a `PlaybackDriver` as a `runProfileMatrix` dispatch. Each call runs the
+ * driver with the dispatch context plus `profile` and resolves to the
+ * `ProducedState` extracted from its events — grade that with `scoreUserStory`.
+ */
+export function makePlaybackDispatch<TStory extends UserStory>(
+  driver: PlaybackDriver<TStory>,
+): ProfileDispatchFn<TStory, ProducedState> {
+  return async (profile, story, dispatchCtx) => {
+    const playbackCtx: PlaybackContext = { ...dispatchCtx, profile }
+    return extractProducedState(await driver.run(story, playbackCtx))
+  }
+}
+
+/** A scored user story — the `CompletionVerdict` plus the story's human title. */
+export interface UserStoryVerdict extends CompletionVerdict {
+  title: string // copied from `UserStory.title` by `scoreUserStory`
+}
+
+/**
+ * Grade a story's produced state against its own requirements: builds the gold
+ * `{ taskId, requirements }` from the story, delegates to `verifyCompletion`,
+ * and returns that verdict with the story title attached. `checkCorrectness`
+ * is injected — a stub in tests, `createLlmCorrectnessChecker` in production.
+ */
+export async function scoreUserStory(
+  story: UserStory,
+  state: ProducedState,
+  checkCorrectness: CorrectnessChecker,
+): Promise<UserStoryVerdict> {
+  const { id: taskId, requirements } = story
+  const gold = { taskId, requirements }
+  const completion = await verifyCompletion(gold, state, checkCorrectness)
+
+  // `title` comes after the spread so the story's title always wins.
+  return { ...completion, title: story.title }
+}
+
+/** One row of the launch scoreboard — story × requirement → PASS/FAIL. */
+export interface ScoreboardRow {
+  storyId: string // the story verdict's `taskId`
+  storyTitle: string // the story verdict's `title`
+  reqId: string // the requirement verdict's `reqId`
+  reqTitle: string // the requirement verdict's `title`
+  status: 'PASS' | 'FAIL' // 'PASS' when the requirement verdict is `satisfied`, else 'FAIL'
+  evidence: string[] // the requirement verdict's `evidence`, passed through by reference
+}
+
+/**
+ * Expand story verdicts into the per-requirement scoreboard — the literal
+ * Jira tick-off. Emits one row per (story, requirement) pair, in input order;
+ * a truthy `satisfied` becomes 'PASS', anything else 'FAIL', and each row
+ * carries the evidence behind its verdict.
+ */
+export function userStoryScoreboard(verdicts: readonly UserStoryVerdict[]): ScoreboardRow[] {
+  // One row per requirement verdict, tagged with its parent story.
+  const toRows = ({ taskId, title, requirements }: UserStoryVerdict): ScoreboardRow[] =>
+    requirements.map((requirement) => ({
+      storyId: taskId,
+      storyTitle: title,
+      reqId: requirement.reqId,
+      reqTitle: requirement.title,
+      status: requirement.satisfied ? 'PASS' : 'FAIL',
+      evidence: requirement.evidence,
+    }))
+
+  // flatMap keeps input order: stories first, then requirements within each.
+  return verdicts.flatMap((verdict) => toRows(verdict))
+}