Merge pull request #7 from lewisnsmith/feat/slash-commands

lewisnsmith · web-flow · commit 75332b218e38 · 2026-04-18T18:49:34.000-07:00
feat: add /flight-review, /flight-compare, /flight-annotate slash commands
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,12 @@
 # Changelog
 
+## Unreleased
+
+### Added
+- `/flight-review` slash command: structured session critique (retries, errors, tool overuse, good decisions) via `flight show` and `flight logs verbose`.
+- `/flight-compare` slash command: 3-bullet experiment diff (winner, biggest delta, suggested next test) via `flight experiment diff`.
+- `/flight-annotate` slash command: per-turn labelling with strict one-command-per-turn output for persisting annotations via `flight annotate`.
+
 ## 1.5.0
 
 ### Breaking
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -106,6 +106,9 @@ Installed in `~/.claude/settings.json` by `flight claude setup`:
 Installed in `~/.claude/commands/` by `flight claude setup`:
 - **`/flight`** — quick session audit (runs `flight logs audit`)
 - **`/flight-log`** — comprehensive view (runs `flight logs verbose`)
+- **`/flight-review`** — structured critique of a session: retries, errors, tool overuse, and good decisions (runs `flight show`, plus `flight logs verbose` when more detail is needed)
+- **`/flight-compare`** — diffs two experiments with a 3-bullet summary: winner, biggest delta, next test (runs `flight experiment diff`)
+- **`/flight-annotate`** — labels each turn and emits `flight annotate` shell commands to persist labels (runs `flight logs verbose`)
 
 ### Data Locations
 - `~/.flight/experiments/<name>.json` — experiment registry (one JSON file per experiment)
diff --git a/README.md b/README.md
@@ -126,6 +126,16 @@ flight claude init code --apply       # Wrap MCP servers for full traffic record
 flight logs tail
 ```
 
+**Slash commands** (installed by `flight claude setup`):
+
+| Command | What it does |
+|---|---|
+| `/flight` | Quick session audit — overview, tool breakdown, issues |
+| `/flight-log` | Full session view with complete input/output payloads |
+| `/flight-review` | Structured critique: retries, errors, tool overuse, good decisions |
+| `/flight-compare` | 3-bullet experiment diff: winner, biggest delta, suggested next test |
+| `/flight-annotate` | Label each turn and emit `flight annotate` commands to persist labels |
+
 ```
 ● Tailing session_20260315_142201
 
diff --git a/src/cli.ts b/src/cli.ts
@@ -1051,7 +1051,7 @@ claude
         console.log(`\x1b[32m✓\x1b[0m Restored original MCP config from backup`);
       }
       if (result.slashCommandRemoved) {
-        console.log(`\x1b[32m✓\x1b[0m Removed /flight and /flight-log slash commands`);
+        console.log(`\x1b[32m✓\x1b[0m Removed Flight slash commands`);
       }
       return;
     }
diff --git a/src/setup.ts b/src/setup.ts
@@ -38,6 +38,60 @@ Read the output carefully and present it to the user. This is the detailed view
 If the output is very long, focus on errors and notable calls first, then offer to walk through specific sections.
 
 For a quick summary instead, the user can run \`/flight\`.
+`,
+  },
+  {
+    filename: "flight-review.md",
+    content: `Run \`flight show $ARGUMENTS\` to load the session. If you need more detail (full payloads, turn-by-turn breakdown), also run \`flight logs verbose $ARGUMENTS\`.
+
+Analyse the session output and produce a structured critique:
+
+## Session Review
+
+**Retries** — identify any tool calls that were retried; note the original failure reason and whether the retry succeeded.
+
+**Errors** — list every error with: tool name, timestamp, error message, and a diagnosis (transient vs. logic bug vs. permission issue).
+
+**Tool overuse** — flag any tool called more than 3 times in a row for the same purpose, or any redundant read→read sequences where the file did not change between calls.
+
+**Good decisions** — call out at least one thing the agent did well (e.g. correct tool selection, efficient batching, clean error recovery).
+
+**Overall verdict** — one sentence: was this session efficient, acceptable, or problematic?
+
+Be specific. Reference turn IDs and tool names from the output, not vague generalities.
+`,
+  },
+  {
+    filename: "flight-compare.md",
+    content: `Run \`flight experiment diff $ARGUMENTS\` where $ARGUMENTS is two experiment names separated by a space (e.g. \`bench-a bench-b\`).
+
+Read the diff output and produce a 3-bullet summary:
+
+- **Winner** — which experiment performed better overall, and on which primary metric (e.g. total tokens, error rate, latency).
+- **Biggest delta** — the single metric with the largest absolute or relative difference between the two experiments; include the numbers.
+- **Suggested next test** — one concrete follow-up experiment to run, based on what the diff reveals (e.g. "isolate the model change", "test with a smaller tool set", "re-run with stricter system prompt").
+
+Be specific about which metrics differ. Do not produce prose summaries — use the three bullets only.
+`,
+  },
+  {
+    filename: "flight-annotate.md",
+    content: `Run \`flight logs verbose $ARGUMENTS\` to load the full session.
+
+For each turn in the output, assign exactly one label from: \`good\`, \`bad\`, \`redundant_call\`, \`hallucination\`, \`correct_tool\`.
+
+Emit exactly one shell command per turn, in this format:
+\`\`\`
+flight annotate <turn-id> --label <label> --type turn
+\`\`\`
+
+Rules:
+- No prose between the shell commands.
+- Do not skip any turn — every turn gets exactly one command.
+- Use the turn ID from the verbose output (e.g. \`turn_001\`).
+- Choose the most specific label: prefer \`hallucination\` or \`redundant_call\` over \`bad\` when they apply.
+
+After emitting all commands, say: "Run the above commands to persist labels."
 `,
   },
 ] as const;
@@ -319,9 +373,9 @@ export async function runSetupWizard(
 
   if (features.slashCommands) {
     if (result.slashCommandInstalled) {
-      console.log(`${C.green}  ✓${C.reset} Installed /flight and /flight-log slash commands`);
+      console.log(`${C.green}  ✓${C.reset} Installed Flight slash commands`);
     } else {
-      console.log(`${C.yellow}  !${C.reset} /flight and /flight-log slash commands already installed`);
+      console.log(`${C.yellow}  !${C.reset} Flight slash commands already installed`);
     }
   } else {
     console.log(`${C.dim}  - Slash commands: skipped${C.reset}`);
diff --git a/test/setup.test.ts b/test/setup.test.ts
@@ -1,5 +1,5 @@
 import { describe, it, expect, afterEach } from "vitest";
-import { writeFile, mkdir, rm, readFile } from "node:fs/promises";
+import { writeFile, mkdir, rm, readFile, access } from "node:fs/promises";
 import { join } from "node:path";
 import { tmpdir } from "node:os";
 import { runSetup, runRemove } from "../src/setup.js";
@@ -90,3 +90,125 @@ describe("runRemove", () => {
     expect(JSON.parse(restored).mcpServers.myserver.command).toBe("my-mcp");
   });
 });
+
+describe("slash commands", () => {
+  const testDir = join(tmpdir(), `flight-slash-${Date.now()}`);
+  const commandsDir = join(testDir, ".claude", "commands");
+  const expectedFiles = [
+    "flight.md",
+    "flight-log.md",
+    "flight-review.md",
+    "flight-compare.md",
+    "flight-annotate.md",
+  ];
+
+  afterEach(async () => {
+    try { await rm(testDir, { recursive: true }); } catch { /* ignore */ }
+  });
+
+  it("installs all five slash command files when slashCommands: true", async () => {
+    const claudeDir = join(testDir, ".claude");
+    await mkdir(claudeDir, { recursive: true });
+    await writeFile(join(claudeDir, "settings.json"), JSON.stringify({}));
+
+    await runSetup(
+      {
+        homeDir: testDir,
+        settingsPath: join(claudeDir, "settings.json"),
+        claudeCodeConfigPath: join(testDir, ".claude.json"),
+      },
+      { hooks: false, proxy: false, pd: false, slashCommands: true, banner: true },
+    );
+
+    for (const filename of expectedFiles) {
+      const filePath = join(commandsDir, filename);
+      await expect(access(filePath)).resolves.toBeUndefined();
+    }
+  });
+
+  it("each command body references flight logs (not flight log)", async () => {
+    const claudeDir = join(testDir, ".claude");
+    await mkdir(claudeDir, { recursive: true });
+    await writeFile(join(claudeDir, "settings.json"), JSON.stringify({}));
+
+    await runSetup(
+      {
+        homeDir: testDir,
+        settingsPath: join(claudeDir, "settings.json"),
+        claudeCodeConfigPath: join(testDir, ".claude.json"),
+      },
+      { hooks: false, proxy: false, pd: false, slashCommands: true, banner: true },
+    );
+
+    for (const filename of expectedFiles) {
+      const body = await readFile(join(commandsDir, filename), "utf-8");
+      // No body may contain the old singular form: a backtick followed by "flight log " (the trailing space keeps "flight logs" from matching)
+      expect(body).not.toMatch(/`flight log /);
+    }
+  });
+
+  it("command bodies reference the correct CLI verbs", async () => {
+    const claudeDir = join(testDir, ".claude");
+    await mkdir(claudeDir, { recursive: true });
+    await writeFile(join(claudeDir, "settings.json"), JSON.stringify({}));
+
+    await runSetup(
+      {
+        homeDir: testDir,
+        settingsPath: join(claudeDir, "settings.json"),
+        claudeCodeConfigPath: join(testDir, ".claude.json"),
+      },
+      { hooks: false, proxy: false, pd: false, slashCommands: true, banner: true },
+    );
+
+    const flightBody = await readFile(join(commandsDir, "flight.md"), "utf-8");
+    expect(flightBody).toContain("flight logs audit");
+
+    const flightLogBody = await readFile(join(commandsDir, "flight-log.md"), "utf-8");
+    expect(flightLogBody).toContain("flight logs verbose");
+
+    const reviewBody = await readFile(join(commandsDir, "flight-review.md"), "utf-8");
+    expect(reviewBody).toContain("flight show $ARGUMENTS");
+    expect(reviewBody).toContain("flight logs verbose $ARGUMENTS");
+
+    const compareBody = await readFile(join(commandsDir, "flight-compare.md"), "utf-8");
+    expect(compareBody).toContain("flight experiment diff $ARGUMENTS");
+
+    const annotateBody = await readFile(join(commandsDir, "flight-annotate.md"), "utf-8");
+    expect(annotateBody).toContain("flight logs verbose $ARGUMENTS");
+    expect(annotateBody).toContain("flight annotate <turn-id> --label <label> --type turn");
+  });
+
+  it("removes all five slash command files on runRemove", async () => {
+    const claudeDir = join(testDir, ".claude");
+    await mkdir(claudeDir, { recursive: true });
+    await writeFile(join(claudeDir, "settings.json"), JSON.stringify({}));
+
+    // Install first
+    await runSetup(
+      {
+        homeDir: testDir,
+        settingsPath: join(claudeDir, "settings.json"),
+        claudeCodeConfigPath: join(testDir, ".claude.json"),
+      },
+      { hooks: false, proxy: false, pd: false, slashCommands: true, banner: true },
+    );
+
+    // Verify they exist
+    for (const filename of expectedFiles) {
+      await expect(access(join(commandsDir, filename))).resolves.toBeUndefined();
+    }
+
+    // Remove
+    await runRemove({
+      homeDir: testDir,
+      settingsPath: join(claudeDir, "settings.json"),
+      claudeCodeConfigPath: join(testDir, ".claude.json"),
+    });
+
+    // Verify they're gone
+    for (const filename of expectedFiles) {
+      await expect(access(join(commandsDir, filename))).rejects.toThrow();
+    }
+  });
+});

Original file line number	Diff line number	Diff line change
`@@ -1051,7 +1051,7 @@ claude`
`1051`	`1051`	console.log(`\x1b[32m✓\x1b[0m Restored original MCP config from backup`);
`1052`	`1052`	`}`
`1053`	`1053`	`if (result.slashCommandRemoved) {`
`1054`		- console.log(`\x1b[32m✓\x1b[0m Removed /flight and /flight-log slash commands`);
	`1054`	+ console.log(`\x1b[32m✓\x1b[0m Removed Flight slash commands`);
`1055`	`1055`	`}`
`1056`	`1056`	`return;`
`1057`	`1057`	`}`