Skip to content

Commit 00af2ed

Browse files
committed
use vitest cli instead of node for evals
1 parent b3c88e2 commit 00af2ed

15 files changed

Lines changed: 1782 additions & 742 deletions

File tree

eval/lib/agents/claude-code-cli.ts

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -285,8 +285,7 @@ function getTodoProgress(
285285
messages: ClaudeCodeStreamMessage[],
286286
): TodoProgress | null {
287287
// Find the most recent TodoWrite message
288-
for (let i = messages.length - 1; i >= 0; i--) {
289-
const message = messages[i];
288+
for (const message of messages.toReversed()) {
290289
if (message.type === 'assistant') {
291290
const todoWrite = message.message.content.find(
292291
(c): c is ToolUseContent =>

eval/lib/collect-args.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -362,7 +362,7 @@ export async function collectArgs() {
362362
const config: McpServerConfig = {
363363
[mcpServerName]: {
364364
type: 'stdio',
365-
command,
365+
command: command!,
366366
args: argsParts.length > 0 ? argsParts : undefined,
367367
},
368368
};

eval/lib/evaluations/prepare-evaluations.ts

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,20 @@ export async function prepareEvaluations({
2828
filter: (source) =>
2929
!source.includes('node_modules') && !source.includes('dist'),
3030
});
31+
32+
const { default: pkgJson } = await import(
33+
path.join(projectPath, 'package.json'),
34+
{
35+
with: { type: 'json' },
36+
}
37+
);
38+
// add the storybook script after agent execution, so it does not taint the experiment
39+
pkgJson.scripts.storybook = 'storybook dev --port 6006';
40+
await fs.writeFile(
41+
path.join(projectPath, 'package.json'),
42+
JSON.stringify(pkgJson, null, 2),
43+
);
44+
3145
await fs
3246
.cp(
3347
path.join(evalPath, 'expected', 'stories'),

eval/lib/evaluations/test-stories.ts

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,41 @@
1-
import { startVitest } from 'vitest/node';
21
import * as path from 'node:path';
32
import * as fs from 'node:fs/promises';
43
import type { EvaluationSummary, ExperimentArgs } from '../../types';
54
import type { JsonTestResults } from 'vitest/reporters';
5+
import { x } from 'tinyexec';
6+
import { dedent } from 'ts-dedent';
67

78
export async function testStories({
89
projectPath,
910
resultsPath,
1011
}: ExperimentArgs): Promise<Pick<EvaluationSummary, 'test' | 'a11y'>> {
1112
const testResultsPath = path.join(resultsPath, 'tests.json');
1213

13-
const vitest = await startVitest('test', undefined, {
14-
root: projectPath,
15-
watch: false,
16-
silent: true,
17-
reporters: ['json'],
18-
outputFile: testResultsPath,
14+
const result = await x('pnpm', ['test'], {
15+
nodeOptions: {
16+
cwd: projectPath,
17+
},
1918
});
2019

21-
await vitest.close();
20+
await fs.writeFile(
21+
path.join(resultsPath, 'tests.md'),
22+
dedent`# Test Results
23+
24+
**Exit Code:** ${result.exitCode}
25+
26+
## stdout
27+
28+
\`\`\`sh
29+
${result.stdout}
30+
\`\`\`
31+
32+
## stderr
33+
34+
\`\`\`
35+
${result.stderr}
36+
\`\`\`
37+
`,
38+
);
2239

2340
const { default: jsonTestResults } = (await import(testResultsPath, {
2441
with: { type: 'json' },

eval/lib/evaluations/typecheck.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,8 @@ if (import.meta.main) {
109109
}
110110
console.log({
111111
typeErrors: await checkTypes({
112-
projectPath: path.join(experimentPath[0], 'project'),
113-
resultsPath: path.join(experimentPath[0], 'results'),
112+
projectPath: path.join(experimentPath[0]!, 'project'),
113+
resultsPath: path.join(experimentPath[0]!, 'results'),
114114
} as ExperimentArgs),
115115
});
116116
}

eval/lib/save/google-sheet.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,9 @@ function getContextDetails(context: Context): string {
4444
const mcpConfig = Object.values(context.mcpServerConfig)[0];
4545
if (mcpConfig?.type === 'stdio' && mcpConfig.args) {
4646
const manifestIndex = mcpConfig.args.indexOf('--manifestPath');
47-
if (manifestIndex !== -1 && mcpConfig.args[manifestIndex + 1]) {
48-
return path.basename(mcpConfig.args[manifestIndex + 1]);
47+
const manifestIndexValue = mcpConfig.args[manifestIndex + 1];
48+
if (manifestIndex !== -1 && manifestIndexValue) {
49+
return path.basename(manifestIndexValue);
4950
}
5051
}
5152
return 'unknown manifest name';

eval/package.json

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,13 @@
77
"scripts": {
88
"build-storybook": "storybook build",
99
"eval": "node eval.ts",
10-
"storybook": "storybook dev -p 6006"
10+
"storybook": "storybook dev -p 6006",
11+
"typecheck": "tsc"
1112
},
1213
"devDependencies": {
1314
"@anthropic-ai/claude-agent-sdk": "^0.1.30",
1415
"@clack/prompts": "1.0.0-alpha.6",
1516
"@radix-ui/colors": "^3.0.0",
16-
"chromatic": "^13.3.3",
1717
"@radix-ui/react-popover": "^1.1.15",
1818
"@radix-ui/react-toggle": "^1.1.10",
1919
"@radix-ui/react-toggle-group": "^1.1.11",
@@ -24,9 +24,11 @@
2424
"@tsconfig/node24": "^24.0.1",
2525
"@types/envinfo": "^7.8.4",
2626
"@types/eslint": "^9.6.1",
27+
"@types/node": "^24.10.1",
2728
"@types/react": "^18.3.26",
2829
"@vitejs/plugin-react-swc": "^4.2.0",
2930
"ai-tokenizer": "^1.0.3",
31+
"chromatic": "^13.3.3",
3032
"envinfo": "^7.20.0",
3133
"eslint": "^9.36.0",
3234
"globals": "^16.4.0",

eval/templates/evaluation/eslint.config.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ export default defineConfig([
1212
extends: [
1313
js.configs.recommended,
1414
tseslint.configs.recommendedTypeChecked,
15-
reactHooks.configs['recommended-latest'],
15+
reactHooks.configs.flat['recommended-latest'],
1616
reactRefresh.configs.vite,
1717
],
1818
languageOptions: {

eval/templates/project/package.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,7 @@
99
"dev": "vite",
1010
"lint": "eslint .",
1111
"preview": "vite preview",
12-
"test": "vitest",
13-
"storybook": "storybook dev --port 6006",
12+
"test": "vitest run --reporter json --outputFile ../results/tests.json",
1413
"typecheck": "tsc --noEmit --project ./tsconfig.app.json"
1514
},
1615
"dependencies": {

eval/templates/result-docs/conversation.tsx

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -172,8 +172,8 @@ const CodeBlock = ({
172172
const codeRef = useRef<HTMLElement>(null);
173173

174174
useEffect(() => {
175-
if (codeRef.current && (window as any).hljs) {
176-
(window as any).hljs.highlightElement(codeRef.current);
175+
if (codeRef.current && (globalThis as any).hljs) {
176+
(globalThis as any).hljs.highlightElement(codeRef.current);
177177
}
178178
}, [content, isTruncated]);
179179

@@ -410,6 +410,15 @@ const ElapsedTime = ({
410410
</div>
411411
);
412412

413+
const TYPE_COLORS = {
414+
assistant: { bg: '#dbeafe', text: '#1e40af' },
415+
user: { bg: '#f3e8ff', text: '#6b21a8' },
416+
system: { bg: '#e0e7ff', text: '#3730a3' },
417+
result: { bg: '#dcfce7', text: '#166534' },
418+
tool: { bg: '#fef3c7', text: '#92400e' },
419+
prompt: { bg: '#fce7f3', text: '#9f1239' },
420+
} as const;
421+
413422
const Turn = ({
414423
children,
415424
type,
@@ -420,7 +429,7 @@ const Turn = ({
420429
isMCP = false,
421430
}: {
422431
children: React.ReactNode;
423-
type: string;
432+
type: keyof typeof TYPE_COLORS;
424433
title: string;
425434
subtitle?: string;
426435
tokenCount?: string;
@@ -429,16 +438,7 @@ const Turn = ({
429438
}) => {
430439
const [isExpanded, setIsExpanded] = useState(false);
431440

432-
const typeColors: Record<string, { bg: string; text: string }> = {
433-
assistant: { bg: '#dbeafe', text: '#1e40af' },
434-
user: { bg: '#f3e8ff', text: '#6b21a8' },
435-
system: { bg: '#e0e7ff', text: '#3730a3' },
436-
result: { bg: '#dcfce7', text: '#166534' },
437-
tool: { bg: '#fef3c7', text: '#92400e' },
438-
prompt: { bg: '#fce7f3', text: '#9f1239' },
439-
};
440-
441-
const colors = typeColors[type] || typeColors.assistant;
441+
const colors = TYPE_COLORS[type] ?? TYPE_COLORS.assistant;
442442

443443
return (
444444
<div
@@ -771,10 +771,9 @@ function groupToolCallsWithResults(turns: ConversationMessage[]): Array<{
771771
}> = [];
772772
const usedResultIndices = new Set<number>();
773773

774-
for (let i = 0; i < turns.length; i++) {
774+
for (const [i, turn] of turns.entries()) {
775775
if (usedResultIndices.has(i)) continue;
776776

777-
const turn = turns[i];
778777
const toolUseContent =
779778
turn.type === 'assistant' &&
780779
'message' in turn &&

0 commit comments

Comments
 (0)