Skip to content

Commit 35bf6df

Browse files
authored
feat: add a way to evaluate llms for our usecase against our prompts (#1700)
1 parent b4b9dd8 commit 35bf6df

File tree

6 files changed

+1541
-6
lines changed

6 files changed

+1541
-6
lines changed
Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
import { evalite } from "evalite";
2+
import { openai } from "@ai-sdk/openai";
3+
import { streamText } from "ai";
4+
import { traceAISDKModel } from "evalite/ai-sdk";
5+
import { Factuality, Levenshtein } from "autoevals";
6+
import { AiChatPrompt, GmailSearchAssistantSystemPrompt, StyledEmailAssistantSystemPrompt } from "../src/lib/prompts";
7+
import { generateObject } from "ai";
8+
import { z } from "zod";
9+
10+
// Untraced base model, used by the test-case generators below so that their
// helper calls do not go through the tracing wrapper (avoids trace errors).
11+
// Swap in your own model here.
12+
const baseModel = openai("gpt-4o-mini");
13+
14+
// Traced wrapper around the same model; this is the one the eval tasks call.
15+
const model = traceAISDKModel(baseModel);
16+
17+
/**
 * Runs `streamText` and resolves to the complete generated text, falling back
 * to the sentinel string "ERROR" when the LLM call fails.
 *
 * The stream is drained inside the `try` block on purpose: returning the
 * unread `textStream` would let any error raised while the stream is being
 * consumed escape this function's `catch`, making the fallback unreachable.
 *
 * @param config - Options forwarded unchanged to `streamText`.
 * @returns The concatenated text chunks, or "ERROR" if the call or the stream threw.
 */
const safeStreamText = async (config: Parameters<typeof streamText>[0]): Promise<string> => {
  try {
    // `await` is a no-op when `streamText` returns its result synchronously.
    const res = await streamText(config);

    let text = "";
    for await (const chunk of res.textStream) {
      text += chunk;
    }
    return text;
  } catch (err) {
    console.error("LLM call failed", err);
    return "ERROR";
  }
};
27+
28+
/**
29+
* Basic tests covering all major capabilities (the average score is 30%; anything above that is a strong result):
30+
* - mail search and filtering
31+
* - label management and organization
32+
* - bulk operations (archive, delete, mark read/unread)
33+
* - email composition and sending
34+
* - smart categorization (subscriptions, newsletters, meetings)
35+
* - web search integration
36+
* - user interaction patterns
37+
*/
38+
39+
40+
// Standing TODO: make the expected outputs much more specific.
41+
42+
/**
 * Asks the untraced base model for a batch of Gmail-search eval cases.
 *
 * Each case pairs a natural-language request (`input`) with a short
 * Gmail-syntax fragment (`expected`) that a correct answer should contain.
 *
 * @returns The `cases` array of the object produced by `generateObject`.
 */
const buildGmailSearchTestCases = async (): Promise<{ input: string; expected: string }[]> => {
  // Shape the model is asked to produce: { cases: [{ input, expected }, ...] }.
  const gmailCaseSchema = z.object({
    cases: z.array(
      z.object({
        input: z.string().min(5),
        expected: z.string().min(3),
      }),
    ),
  });

  const generatorInstructions = `You are a JSON test-case generator for Gmail search query conversions.
Return ONLY a JSON object with a single key "cases" mapping to an array. Each array element has exactly the keys {input, expected}.
Guidelines:
• input – natural-language requests about searching/filtering email.
• expected – a short Gmail-syntax fragment (e.g., "is:unread", "has:attachment", "after:") that MUST appear in a correct answer.
• Cover diverse filters: sender, subject, attachments, labels, dates, read/unread.
• Array length: 8-12.
• No comments or additional keys.`;

  const generated = await generateObject({
    model: baseModel,
    system: generatorInstructions,
    prompt: "Generate Gmail search conversion test cases",
    schema: gmailCaseSchema,
  });

  return generated.object.cases;
};
67+
68+
// One generated eval case: the request sent to the assistant and the keyword
// expected in a correct reply.
type TestCase = { input: string; expected: string };

/**
 * Creates a lazy test-case generator for a single AI-chat topic.
 *
 * Nothing is requested from the model until the returned function is called;
 * each call asks the untraced base model for a fresh batch of cases.
 *
 * @param topic - Human-readable topic description interpolated into the prompts.
 * @returns An async function resolving to the generated `cases` array.
 */
const makeAiChatTestCaseBuilder = (topic: string): (() => Promise<TestCase[]>) => {
  const buildCases = async () => {
    const topicCaseSchema = z.object({
      cases: z.array(
        z.object({
          input: z.string().min(5),
          expected: z.string().min(2),
        }),
      ),
    });

    const generatorInstructions = `You are a JSON test-case generator for the topic: ${topic}.
Return ONLY a JSON object with key "cases" whose value is an array of objects {input, expected}.
Guidelines:
• input – natural-language request related to ${topic}.
• expected – short keyword (≤3 words) expected in correct assistant reply.
• Array length: 6-10.
• No extra keys or comments.`;

    const generated = await generateObject({
      model: baseModel,
      system: generatorInstructions,
      prompt: `Generate ${topic} test cases`,
      schema: topicCaseSchema,
    });

    return generated.object.cases;
  };

  return buildCases;
};
97+
98+
// Every suite has the same shape: generate cases, send each input to the
// traced model under a given system prompt, and score the reply with
// Factuality and Levenshtein. Only the name, the case source and the system
// prompt differ, so the suites are declared as data and registered in a loop.
type EvalSuite = {
  name: string;
  data: () => Promise<{ input: string; expected: string }[]>;
  // Kept as a thunk so the prompt is built inside the task, once per case.
  system: () => string;
};

const aiChatSystem = () => AiChatPrompt("test-thread-id", "inbox", "");
const gmailSearchSystem = () => GmailSearchAssistantSystemPrompt();
const styledEmailSystem = () => StyledEmailAssistantSystemPrompt();

// Registration order matches the order in which the suites should appear.
const evalSuites: EvalSuite[] = [
  {
    name: "AI Chat – Basic Responses",
    data: makeAiChatTestCaseBuilder("basic responses (greetings, capabilities, quick help)"),
    system: aiChatSystem,
  },
  {
    name: "Gmail Search Query – Natural Language",
    data: buildGmailSearchTestCases,
    system: gmailSearchSystem,
  },
  {
    name: "AI Chat – Label Management",
    data: makeAiChatTestCaseBuilder("label management (create, delete, list, apply labels)"),
    system: aiChatSystem,
  },
  {
    name: "AI Chat – Email Organization",
    data: makeAiChatTestCaseBuilder("email organization (archive, mark read/unread, bulk actions)"),
    system: aiChatSystem,
  },
  {
    name: "AI Chat – Email Composition",
    data: makeAiChatTestCaseBuilder("email composition tasks (compose, reply, send, draft)"),
    system: aiChatSystem,
  },
  {
    name: "AI Chat – Smart Categorization",
    data: makeAiChatTestCaseBuilder("smart categorization (subscriptions, newsletters, meetings, bills)"),
    system: aiChatSystem,
  },
  {
    name: "AI Chat – Information Queries",
    data: makeAiChatTestCaseBuilder("information queries (summaries, web search, tax docs, recent activity)"),
    system: aiChatSystem,
  },
  {
    name: "AI Chat – Complex Workflows",
    data: makeAiChatTestCaseBuilder("complex workflows (multi-step actions, automation)"),
    system: aiChatSystem,
  },
  {
    name: "AI Chat – User Intent Recognition",
    data: makeAiChatTestCaseBuilder("user intent recognition (help, overwhelm, search, cleanup)"),
    system: aiChatSystem,
  },
  {
    name: "AI Chat – Error Handling & Edge Cases",
    data: makeAiChatTestCaseBuilder("error handling & edge cases (invalid, bulk actions, very old queries)"),
    system: aiChatSystem,
  },
  {
    name: "Gmail Search Query Building",
    data: buildGmailSearchTestCases,
    system: gmailSearchSystem,
  },
  {
    name: "Email Composition with Style Matching",
    data: makeAiChatTestCaseBuilder("styled email composition (follow-up, thank you, meeting, apology)"),
    system: styledEmailSystem,
  },
];

for (const suite of evalSuites) {
  evalite(suite.name, {
    data: suite.data,
    task: async (input) => {
      return safeStreamText({
        model: model,
        system: suite.system(),
        prompt: input,
      });
    },
    scorers: [Factuality, Levenshtein],
  });
}

apps/server/package.json

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010
"db:generate": "drizzle-kit generate",
1111
"db:migrate": "drizzle-kit migrate",
1212
"db:push": "drizzle-kit push",
13-
"db:studio": "drizzle-kit studio"
13+
"db:studio": "drizzle-kit studio",
14+
"eval": "evalite",
15+
"eval:dev": "evalite watch"
1416
},
1517
"exports": {
1618
"./trpc": "./src/trpc/index.ts",
@@ -83,9 +85,13 @@
8385
"@types/uuid": "10.0.0",
8486
"@zero/eslint-config": "workspace:*",
8587
"@zero/tsconfig": "workspace:*",
88+
"autoevals": "0.0.130",
8689
"drizzle-kit": "catalog:",
8790
"eslint": "^9.27.0",
91+
"evalite": "0.11.4",
8892
"jiti": "2.4.2",
89-
"typescript": "catalog:"
93+
"typescript": "catalog:",
94+
"vite": "^6.3.5",
95+
"vitest": "3.2.4"
9096
}
9197
}

apps/server/tsconfig.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
{
22
"extends": "@zero/tsconfig/base",
3-
"include": ["src/**/*.ts", "src/overrides.d.ts", "worker-configuration.d.ts", "drizzle.config.ts"]
3+
"include": ["src/**/*.ts", "src/overrides.d.ts", "worker-configuration.d.ts", "drizzle.config.ts", "tests/**/*.ts", "evals/**/*.ts"]
44
}

apps/server/vite.config.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
import { defineConfig } from "vite";
2+
3+
// Single limit shared by the test, hook and teardown timeouts below.
const timeoutLimit = 120000;

// NOTE(review): `test` is a Vitest option while `defineConfig` is imported
// from "vite" — confirm the types accept it (Vitest documents importing
// `defineConfig` from "vitest/config" for this).
export default defineConfig({
  test: {
    testTimeout: timeoutLimit,
    hookTimeout: timeoutLimit,
    teardownTimeout: timeoutLimit,
  },
});

package.json

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,11 @@
2828
"db:push": "dotenv -- pnpm run -C apps/server db:push",
2929
"db:studio": "dotenv -- pnpm run -C apps/server db:studio",
3030
"sentry:sourcemaps": "sentry-cli sourcemaps inject --org zero-7y --project nextjs ./apps/mail/.next && sentry-cli sourcemaps upload --org zero-7y --project nextjs ./apps/mail/.next",
31-
"scripts": "dotenv -- pnpx tsx ./scripts/run.ts"
31+
"scripts": "dotenv -- pnpx tsx ./scripts/run.ts",
32+
"test:ai": "dotenv -- pnpm --filter=@zero/server run test:ai",
33+
"eval": "dotenv -- pnpm --filter=@zero/server run eval",
34+
"eval:dev": "dotenv -- pnpm --filter=@zero/server run eval:dev",
35+
"eval:ci": "dotenv -- pnpm --filter=@zero/server run eval:ci"
3236
},
3337
"devDependencies": {
3438
"@types/node": "22.15.29",

0 commit comments

Comments
 (0)