storybookjs · JReinhold · Mar 11, 2026 · Mar 3, 2026 · Mar 3, 2026 · Mar 5, 2026
diff --git a/.changeset/mcp-server-instructions.md b/.changeset/mcp-server-instructions.md
@@ -0,0 +1,8 @@
+---
+'@storybook/mcp': minor
+'@storybook/addon-mcp': minor
+---
+
+Add MCP server-level instructions to both packages
+
+Both `@storybook/mcp` and `@storybook/addon-mcp` now include server instructions in the MCP `initialize` response. These instructions guide agents on how to use the available tools effectively without requiring explicit prompting from users.
diff --git a/eval/README.md b/eval/README.md
@@ -33,7 +33,7 @@ node eval.ts
 node advanced-eval.ts
 
 # With all options specified (advanced-eval)
-node advanced-eval.ts --agent claude-code --model claude-sonnet-4.5 --context components.json --upload-id batch-1 100-flight-booking-plain
+node advanced-eval.ts --agent claude-code --model claude-sonnet-4.6 --context components.json --upload-id batch-1 100-flight-booking-plain
 ```
 
 ## CLI Options (Advanced Eval)
@@ -62,7 +62,7 @@ Different agents support different models:
 | ---------------------- | :-------------: | :---------: |
 | `claude-opus-4.6`      |       ✅        |     ✅      |
 | `claude-opus-4.5`      |       ❌        |     ✅      |
-| `claude-sonnet-4.5`    |       ✅        |     ✅      |
+| `claude-sonnet-4.6`    |       ✅        |     ✅      |
 | `claude-haiku-4.5`     |       ✅        |     ✅      |
 | `gpt-5.2`              |       ❌        |     ✅      |
 | `gpt-5.2-codex`        |       ❌        |     ✅      |
@@ -82,7 +82,7 @@ node advanced-eval.ts --agent copilot-cli --model gpt-5.2 100-flight-booking-pla
 > [!IMPORTANT]
 > **GitHub Copilot CLI Model Configuration**
 >
-> To use models other than `claude-sonnet-4.5` with the Copilot CLI, you must first enable them in your GitHub account settings:
+> To use models other than `claude-sonnet-4.6` with the Copilot CLI, you must first enable them in your GitHub account settings:
 >
 > 1. Go to [GitHub Copilot Features Settings](https://github.com/settings/copilot/features)
 > 2. Enable the models you want to use (e.g., GPT-5.1 Codex Max, GPT-5.2, Claude Opus 4.5)
@@ -121,7 +121,7 @@ Variant configs live under `eval/variant-configs/` and define a base setup plus
 // eval/variant-configs/storybook-mcp-comparison.ts
 const base = {
 	agent: 'claude-code',
-	model: 'claude-sonnet-4.5',
+	model: 'claude-sonnet-4.6',
 };
 
 export default {

diff --git a/eval/lib/agents/claude-code-cli.ts b/eval/lib/agents/claude-code-cli.ts
@@ -16,7 +16,7 @@ import type {
  */
 const TOKENIZER_MODEL_MAP: Record<ClaudeModel, keyof typeof models> = {
 	'claude-opus-4.6': 'anthropic/claude-opus-4.5',
-	'claude-sonnet-4.5': 'anthropic/claude-sonnet-4.5',
+	'claude-sonnet-4.6': 'anthropic/claude-sonnet-4.5',
 	'claude-haiku-4.5': 'anthropic/claude-haiku-4.5',
 };
 

diff --git a/eval/tasks/901-create-component-atom-reshaped/README.md b/eval/tasks/901-create-component-atom-reshaped/README.md
@@ -0,0 +1,27 @@
+# 901 - Create Component (Atom, Reshaped)
+
+## Purpose
+
+Tests whether the agent can build a small, accessible atomic component from scratch, add stories, and validate behavior using Storybook MCP tooling.
+
+## Setup
+
+- Reshaped is installed in trial setup.
+- The task asks for a new `ToggleSwitch` component at `src/components/ToggleSwitch.tsx`.
+- Prompt variants include concise, detailed, and explicit-story guidance.
+
+## Prompt
+
+The prompt asks the agent to create an accessible `ToggleSwitch` with keyboard interaction and disabled behavior. The concise prompt (`prompt.concise.md`) is intentionally brief.
+
+## Quality Signal
+
+| Metric                                                  | Weight |
+| ------------------------------------------------------- | ------ |
+| MCP tools coverage (`get-storybook-story-instructions`) | 50 %   |
+| MCP tools coverage (`preview-stories`)                  | 50 %   |
+
+## Expected MCP Tools
+
+- `get-storybook-story-instructions` (at least 1 call)
+- `preview-stories` (at least 1 call)
diff --git a/eval/tasks/901-create-component-atom-reshaped/config.json b/eval/tasks/901-create-component-atom-reshaped/config.json
@@ -0,0 +1,10 @@
+{
+	"expectedMcpTools": {
+		"get-storybook-story-instructions": {
+			"minCalls": 1
+		},
+		"preview-stories": {
+			"minCalls": 1
+		}
+	}
+}
diff --git a/eval/tasks/901-create-component-atom-reshaped/hooks.ts b/eval/tasks/901-create-component-atom-reshaped/hooks.ts
@@ -1,10 +1,15 @@
 import * as path from 'node:path';
 import * as fs from 'node:fs/promises';
 import type { Hooks } from '../../types.ts';
+import { fromMcpToolsCoverage } from '../../lib/quality/index.ts';
 import { addDependency } from 'nypm';
 import { log } from '@clack/prompts';
 
 const hooks: Hooks = {
+	// TODO: This quality signal is incomplete. It currently relies on MCP tool-call
+	// expectations only, and should also verify that the agent actually wrote the
+	// expected stories for the task.
+	calculateQuality: fromMcpToolsCoverage,
 	postPrepareTrial: async (trialArgs) => {
 		log.message('Installing the reshaped package');
 		await addDependency('reshaped@latest', {
@@ -19,7 +24,7 @@ export default config;
 `,
 		);
 
-		await fs.unlink(path.join(trialArgs.projectPath, '.storybook', 'preview.ts'));
+		await fs.unlink(path.join(trialArgs.projectPath, '.storybook', 'preview.tsx'));
 
 		log.success('Reshaped package installed, PostCSS config added');
 	},

diff --git a/eval/tasks/902-create-component-composite-reshaped/README.md b/eval/tasks/902-create-component-composite-reshaped/README.md
@@ -0,0 +1,27 @@
+# 902 - Create Component (Composite, Reshaped)
+
+## Purpose
+
+Tests whether the agent can create a composite UI component (multiple subparts and optional sections), write stories for key states, and verify outcomes with Storybook MCP tools.
+
+## Setup
+
+- Reshaped is installed in trial setup.
+- The task targets a new `ProfileCard` at `src/components/ProfileCard.tsx`.
+- Stories should cover avatar fallback, tags, and actions.
+
+## Prompt
+
+The prompt asks the agent to build a `ProfileCard` with avatar/initials fallback, content sections, and accessible action buttons. The concise prompt (`prompt.concise.md`) is intentionally minimal.
+
+## Quality Signal
+
+| Metric                                                  | Weight |
+| ------------------------------------------------------- | ------ |
+| MCP tools coverage (`get-storybook-story-instructions`) | 50 %   |
+| MCP tools coverage (`run-story-tests`)                  | 50 %   |
+
+## Expected MCP Tools
+
+- `get-storybook-story-instructions` (at least 1 call)
+- `run-story-tests` (at least 1 call)
diff --git a/eval/tasks/902-create-component-composite-reshaped/config.json b/eval/tasks/902-create-component-composite-reshaped/config.json
@@ -0,0 +1,10 @@
+{
+	"expectedMcpTools": {
+		"get-storybook-story-instructions": {
+			"minCalls": 1
+		},
+		"run-story-tests": {
+			"minCalls": 1
+		}
+	}
+}
diff --git a/eval/tasks/902-create-component-composite-reshaped/hooks.ts b/eval/tasks/902-create-component-composite-reshaped/hooks.ts
@@ -1,10 +1,15 @@
 import * as path from 'node:path';
 import * as fs from 'node:fs/promises';
 import type { Hooks } from '../../types.ts';
+import { fromMcpToolsCoverage } from '../../lib/quality/index.ts';
 import { addDependency } from 'nypm';
 import { log } from '@clack/prompts';
 
 const hooks: Hooks = {
+	// TODO: This quality signal is incomplete. It currently relies on MCP tool-call
+	// expectations only, and should also verify that the agent actually wrote the
+	// expected stories for the task.
+	calculateQuality: fromMcpToolsCoverage,
 	postPrepareTrial: async (trialArgs) => {
 		log.message('Installing the reshaped package');
 		await addDependency('reshaped@latest', {
@@ -19,7 +24,7 @@ export default config;
 `,
 		);
 
-		await fs.unlink(path.join(trialArgs.projectPath, '.storybook', 'preview.ts'));
+		await fs.unlink(path.join(trialArgs.projectPath, '.storybook', 'preview.tsx'));
 
 		log.success('Reshaped package installed, PostCSS config added');
 	},

diff --git a/eval/tasks/903-create-component-async-fetch-reshaped/README.md b/eval/tasks/903-create-component-async-fetch-reshaped/README.md
@@ -0,0 +1,27 @@
+# 903 - Create Component (Async Fetch, Reshaped)
+
+## Purpose
+
+Tests whether the agent can implement async data-fetching behavior with loading, empty, and error states, then create reliable stories that mock network behavior.
+
+## Setup
+
+- Reshaped, `msw`, and `msw-storybook-addon` are installed in trial setup.
+- The task targets a new `NotificationsList` at `src/components/NotificationsList.tsx`.
+- Stories are expected to mock request outcomes (loaded, empty, error) without live calls.
+
+## Prompt
+
+The prompt asks the agent to build an async list component that fetches `/api/notifications`, handles aborts and errors, and wires up `onSelect`. The concise prompt (`prompt.concise.md`) is short and under-specified.
+
+## Quality Signal
+
+| Metric                                                  | Weight |
+| ------------------------------------------------------- | ------ |
+| MCP tools coverage (`get-storybook-story-instructions`) | 50 %   |
+| MCP tools coverage (`run-story-tests`)                  | 50 %   |
+
+## Expected MCP Tools
+
+- `get-storybook-story-instructions` (at least 1 call)
+- `run-story-tests` (at least 1 call)
diff --git a/eval/tasks/903-create-component-async-fetch-reshaped/config.json b/eval/tasks/903-create-component-async-fetch-reshaped/config.json
@@ -0,0 +1,10 @@
+{
+	"expectedMcpTools": {
+		"get-storybook-story-instructions": {
+			"minCalls": 1
+		},
+		"run-story-tests": {
+			"minCalls": 1
+		}
+	}
+}
diff --git a/eval/tasks/903-create-component-async-fetch-reshaped/hooks.ts b/eval/tasks/903-create-component-async-fetch-reshaped/hooks.ts
@@ -1,11 +1,16 @@
 import * as path from 'node:path';
 import * as fs from 'node:fs/promises';
 import type { Hooks } from '../../types.ts';
+import { fromMcpToolsCoverage } from '../../lib/quality/index.ts';
 import { addDependency } from 'nypm';
 import { log } from '@clack/prompts';
 import { exec } from 'node:child_process';
 
 const hooks: Hooks = {
+	// TODO: This quality signal is incomplete. It currently relies on MCP tool-call
+	// expectations only, and should also verify that the agent actually wrote the
+	// expected stories for the task.
+	calculateQuality: fromMcpToolsCoverage,
 	postPrepareTrial: async (trialArgs) => {
 		log.message('Installing reshaped, msw-storybook-addon, and msw packages');
 		await addDependency(['reshaped@latest', 'msw-storybook-addon@latest', 'msw@latest'], {
@@ -25,7 +30,7 @@ export default config;
 			cwd: trialArgs.projectPath,
 		});
 
-		await fs.unlink(path.join(trialArgs.projectPath, '.storybook', 'preview.ts'));
+		await fs.unlink(path.join(trialArgs.projectPath, '.storybook', 'preview.tsx'));
 
 		log.success(
 			'Reshaped package installed, PostCSS config added, MSW packages installed and MSW for Storybook configured',

diff --git a/eval/tasks/904-create-component-async-module-reshaped/README.md b/eval/tasks/904-create-component-async-module-reshaped/README.md
@@ -0,0 +1,27 @@
+# 904 - Create Component (Async Module, Reshaped)
+
+## Purpose
+
+Tests whether the agent can build an async component that depends on an imported service module, represent its loading, empty, and error states, and author testable stories with mocked module behavior.
+
+## Setup
+
+- Reshaped is installed in trial setup.
+- The task targets `InventoryList` in `src/components/InventoryList.tsx`.
+- Data comes from `getInventory()` in `src/services/inventoryApi.ts`.
+
+## Prompt
+
+The prompt asks the agent to implement a service-backed async list with robust loading, empty, and error states and to add stories for the loaded, empty, and error paths. The concise prompt (`prompt.concise.md`) states only the core objective.
+
+## Quality Signal
+
+| Metric                                                  | Weight |
+| ------------------------------------------------------- | ------ |
+| MCP tools coverage (`get-storybook-story-instructions`) | 50 %   |
+| MCP tools coverage (`run-story-tests`)                  | 50 %   |
+
+## Expected MCP Tools
+
+- `get-storybook-story-instructions` (at least 1 call)
+- `run-story-tests` (at least 1 call)
diff --git a/eval/tasks/904-create-component-async-module-reshaped/config.json b/eval/tasks/904-create-component-async-module-reshaped/config.json
@@ -0,0 +1,10 @@
+{
+	"expectedMcpTools": {
+		"get-storybook-story-instructions": {
+			"minCalls": 1
+		},
+		"run-story-tests": {
+			"minCalls": 1
+		}
+	}
+}
diff --git a/eval/tasks/904-create-component-async-module-reshaped/hooks.ts b/eval/tasks/904-create-component-async-module-reshaped/hooks.ts
@@ -1,10 +1,15 @@
 import * as path from 'node:path';
 import * as fs from 'node:fs/promises';
 import type { Hooks } from '../../types.ts';
+import { fromMcpToolsCoverage } from '../../lib/quality/index.ts';
 import { addDependency } from 'nypm';
 import { log } from '@clack/prompts';
 
 const hooks: Hooks = {
+	// TODO: This quality signal is incomplete. It currently relies on MCP tool-call
+	// expectations only, and should also verify that the agent actually wrote the
+	// expected stories for the task.
+	calculateQuality: fromMcpToolsCoverage,
 	postPrepareTrial: async (trialArgs) => {
 		log.message('Installing the reshaped package');
 		await addDependency('reshaped@latest', {
@@ -19,7 +24,7 @@ export default config;
 `,
 		);
 
-		await fs.unlink(path.join(trialArgs.projectPath, '.storybook', 'preview.ts'));
+		await fs.unlink(path.join(trialArgs.projectPath, '.storybook', 'preview.tsx'));
 
 		log.success('Reshaped package installed, PostCSS config added');
 	},

diff --git a/eval/tasks/905-existing-component-write-story-reshaped/README.md b/eval/tasks/905-existing-component-write-story-reshaped/README.md
@@ -0,0 +1,27 @@
+# 905 - Existing Component: Write Story (Reshaped)
+
+## Purpose
+
+Tests whether the agent can discover and document states for an existing component by writing complete stories from scratch.
+
+## Setup
+
+- Reshaped is installed in trial setup.
+- `AlertBanner` already exists at `src/components/AlertBanner.tsx`.
+- The task is to create `stories/AlertBanner.stories.tsx` with key variants.
+
+## Prompt
+
+The prompt asks the agent to author stories for the `info`, `success`, `warning`, and `error` variants and for dismissible behavior. The concise prompt (`prompt.concise.md`) gives only the target file and component.
+
+## Quality Signal
+
+| Metric                                                  | Weight |
+| ------------------------------------------------------- | ------ |
+| MCP tools coverage (`get-storybook-story-instructions`) | 50 %   |
+| MCP tools coverage (`run-story-tests`)                  | 50 %   |
+
+## Expected MCP Tools
+
+- `get-storybook-story-instructions` (at least 1 call)
+- `run-story-tests` (at least 1 call)
diff --git a/eval/tasks/905-existing-component-write-story-reshaped/config.json b/eval/tasks/905-existing-component-write-story-reshaped/config.json
@@ -0,0 +1,10 @@
+{
+	"expectedMcpTools": {
+		"get-storybook-story-instructions": {
+			"minCalls": 1
+		},
+		"run-story-tests": {
+			"minCalls": 1
+		}
+	}
+}
diff --git a/eval/tasks/905-existing-component-write-story-reshaped/hooks.ts b/eval/tasks/905-existing-component-write-story-reshaped/hooks.ts
@@ -1,10 +1,15 @@
 import * as path from 'node:path';
 import * as fs from 'node:fs/promises';
 import type { Hooks } from '../../types.ts';
+import { fromMcpToolsCoverage } from '../../lib/quality/index.ts';
 import { addDependency } from 'nypm';
 import { log } from '@clack/prompts';
 
 const hooks: Hooks = {
+	// TODO: This quality signal is incomplete. It currently relies on MCP tool-call
+	// expectations only, and should also verify that the agent actually wrote the
+	// expected stories for the task.
+	calculateQuality: fromMcpToolsCoverage,
 	postPrepareTrial: async (trialArgs) => {
 		log.message('Installing the reshaped package');
 		await addDependency('reshaped@latest', {
@@ -19,7 +24,7 @@ export default config;
 `,
 		);
 
-		await fs.unlink(path.join(trialArgs.projectPath, '.storybook', 'preview.ts'));
+		await fs.unlink(path.join(trialArgs.projectPath, '.storybook', 'preview.tsx'));
 
 		log.success('Reshaped package installed, PostCSS config added');
 	},