Skip to content

Commit 0d487bb

Browse files
authored
Merge branch 'main' into docs
2 parents 7b7f573 + fd88a99 commit 0d487bb

61 files changed

Lines changed: 128511 additions & 6218 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

apps/internal-storybook/pnpm-lock.yaml

Lines changed: 74 additions & 74 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

eval/README.md

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@ node eval.ts
3333
node advanced-eval.ts
3434

3535
# With all options specified (advanced-eval)
36-
node advanced-eval.ts --agent claude-code --model claude-sonnet-4.5 --context components.json --upload-id batch-1 100-flight-booking-plain
36+
node advanced-eval.ts --agent claude-code --model claude-sonnet-4.6 --context components.json --upload-id batch-1 100-flight-booking-plain
3737
```
3838

3939
## CLI Options (Advanced Eval)
@@ -62,7 +62,7 @@ Different agents support different models:
6262
| ---------------------- | :-------------: | :---------: |
6363
| `claude-opus-4.6` |||
6464
| `claude-opus-4.5` |||
65-
| `claude-sonnet-4.5` |||
65+
| `claude-sonnet-4.6` |||
6666
| `claude-haiku-4.5` |||
6767
| `gpt-5.2` |||
6868
| `gpt-5.2-codex` |||
@@ -82,7 +82,7 @@ node advanced-eval.ts --agent copilot-cli --model gpt-5.2 100-flight-booking-pla
8282
> [!IMPORTANT]
8383
> **GitHub Copilot CLI Model Configuration**
8484
>
85-
> To use models other than `claude-sonnet-4.5` with the Copilot CLI, you must first enable them in your GitHub account settings:
85+
> To use models other than `claude-sonnet-4.6` with the Copilot CLI, you must first enable them in your GitHub account settings:
8686
>
8787
> 1. Go to [GitHub Copilot Features Settings](https://github.com/settings/copilot/features)
8888
> 2. Enable the models you want to use (e.g., GPT-5.1 Codex Max, GPT-5.2, Claude Opus 4.5)
@@ -121,7 +121,7 @@ Variant configs live under `eval/variant-configs/` and define a base setup plus
121121
// eval/variant-configs/storybook-mcp-comparison.ts
122122
const base = {
123123
agent: 'claude-code',
124-
model: 'claude-sonnet-4.5',
124+
model: 'claude-sonnet-4.6',
125125
};
126126

127127
export default {

eval/lib/agents/claude-code-cli.ts

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,8 @@ import type {
1616
*/
1717
const TOKENIZER_MODEL_MAP: Record<ClaudeModel, keyof typeof models> = {
1818
'claude-opus-4.6': 'anthropic/claude-opus-4.5',
19-
'claude-sonnet-4.5': 'anthropic/claude-sonnet-4.5',
19+
// tokenizer doesn't support 4.6 models yet
20+
'claude-sonnet-4.6': 'anthropic/claude-sonnet-4.5',
2021
'claude-haiku-4.5': 'anthropic/claude-haiku-4.5',
2122
};
2223

eval/lib/graders/mcp-tools.ts

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -283,8 +283,9 @@ export async function gradeMcpTools(trialArgs: TrialArgs): Promise<McpToolsSumma
283283
taskConfig.expectedMcpTools,
284284
);
285285

286-
// Only return if there were any MCP tool calls
287-
if (mcpToolsSummary.totalCalls > 0) {
286+
// Return if there were any MCP tool calls, or if expectations were configured
287+
// (so a run that ignores MCP entirely scores 0 rather than undefined)
288+
if (mcpToolsSummary.totalCalls > 0 || mcpToolsSummary.expectedToolCount) {
288289
return mcpToolsSummary;
289290
}
290291

eval/pnpm-lock.yaml

Lines changed: 51 additions & 51 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)