1+ import { evalite } from "evalite" ;
2+ import { openai } from "@ai-sdk/openai" ;
3+ import { streamText } from "ai" ;
4+ import { traceAISDKModel } from "evalite/ai-sdk" ;
5+ import { Factuality , Levenshtein } from "autoevals" ;
6+ import { AiChatPrompt , GmailSearchAssistantSystemPrompt , StyledEmailAssistantSystemPrompt } from "../src/lib/prompts" ;
7+ import { generateObject } from "ai" ;
8+ import { z } from "zod" ;
9+
// Model id handed to the `openai` provider. Swap in your own model here.
const BASE_MODEL_ID = "gpt-4o-mini";

// Untraced model handle: internal helpers use this one to avoid trace errors.
const baseModel = openai(BASE_MODEL_ID);

// Traced wrapper around the same model, used for the actual task under test.
const model = traceAISDKModel(baseModel);
16+
/**
 * Runs `streamText` and resolves with the complete generated text.
 *
 * The stream is drained inside the `try` block on purpose: `textStream` is
 * consumed lazily, so handing the stream back to the caller would let any
 * failure raised while reading it escape the `catch` below.
 *
 * NOTE(review): some AI SDK versions may report stream failures through an
 * `onError` callback instead of throwing from `textStream` — confirm against
 * the installed version.
 *
 * @param config - Options forwarded unchanged to `streamText`.
 * @returns The concatenated text chunks, or the sentinel string `"ERROR"` when
 *   starting or reading the stream throws.
 */
const safeStreamText = async (
  config: Parameters<typeof streamText>[0],
): Promise<string> => {
  try {
    // `await` works whether `streamText` returns its result directly or a promise of it.
    const result = await streamText(config);

    let text = "";
    for await (const chunk of result.textStream) {
      text += chunk;
    }
    return text;
  } catch (err: unknown) {
    console.error("LLM call failed", err);
    return "ERROR";
  }
};
27+
/**
 * Basic evals covering all major capabilities. The average score is around 30%;
 * anything above that is a strong result.
 * - mail search and filtering
 * - label management and organization
 * - bulk operations (archive, delete, mark read/unread)
 * - email composition and sending
 * - smart categorization (subscriptions, newsletters, meetings)
 * - web search integration
 * - user interaction patterns
 */
38+
39+
// Ongoing TODO: make the expected outputs much more specific.
41+
/**
 * Asks the untraced base model to generate Gmail-search eval cases on the fly.
 *
 * Each case pairs a natural-language request (`input`) with a short
 * Gmail-syntax fragment (`expected`), as instructed by the system prompt below.
 *
 * @returns The `cases` array of the object produced by `generateObject`.
 */
const buildGmailSearchTestCases = async (): Promise<{ input: string; expected: string }[]> => {
  // Shape requested from the model: { cases: [{ input, expected }, ...] }.
  const caseSchema = z.object({
    input: z.string().min(5),
    expected: z.string().min(3),
  });
  const responseSchema = z.object({ cases: z.array(caseSchema) });

  const systemPrompt = `You are a JSON test-case generator for Gmail search query conversions.
Return ONLY a JSON object with a single key "cases" mapping to an array. Each array element has exactly the keys {input, expected}.
Guidelines:
• input – natural-language requests about searching/filtering email.
• expected – a short Gmail-syntax fragment (e.g., "is:unread", "has:attachment", "after:") that MUST appear in a correct answer.
• Cover diverse filters: sender, subject, attachments, labels, dates, read/unread.
• Array length: 8-12.
• No comments or additional keys.`;

  const generation = await generateObject({
    model: baseModel,
    system: systemPrompt,
    prompt: "Generate Gmail search conversion test cases",
    schema: responseSchema,
  });

  return generation.object.cases;
};
67+
/** One generated eval case: the request sent to the task and the text expected in the reply. */
type TestCase = { input: string; expected: string };

/**
 * Generic factory for topic-specific test-case generators.
 *
 * @param topic - Free-text description of the capability to cover; it is
 *   interpolated into both the system prompt and the user prompt.
 * @returns An async function that, on each call, asks the untraced base model
 *   for a fresh `cases` array about `topic`.
 */
const makeAiChatTestCaseBuilder = (topic: string): (() => Promise<TestCase[]>) => {
  const buildCases = async (): Promise<TestCase[]> => {
    // Shape requested from the model: { cases: [{ input, expected }, ...] }.
    const caseSchema = z.object({
      input: z.string().min(5),
      expected: z.string().min(2),
    });
    const responseSchema = z.object({ cases: z.array(caseSchema) });

    const systemPrompt = `You are a JSON test-case generator for the topic: ${topic}.
Return ONLY a JSON object with key "cases" whose value is an array of objects {input, expected}.
Guidelines:
• input – natural-language request related to ${topic}.
• expected – short keyword (≤3 words) expected in correct assistant reply.
• Array length: 6-10.
• No extra keys or comments.`;

    const generation = await generateObject({
      model: baseModel,
      system: systemPrompt,
      prompt: `Generate ${topic} test cases`,
      schema: responseSchema,
    });

    return generation.object.cases;
  };

  return buildCases;
};
97+
/**
 * Description of one eval suite.
 * - `name`: title passed to `evalite`.
 * - `data`: async producer of the `{ input, expected }` cases.
 * - `system`: thunk building the system prompt; it is called once per task run.
 */
type EvalSuite = {
  name: string;
  data: () => Promise<{ input: string; expected: string }[]>;
  system: () => string;
};

// System prompt shared by every "AI Chat" suite.
const aiChatSystem = (): string => AiChatPrompt("test-thread-id", "inbox", "");

// Suite that exercises the chat assistant on generated cases about `topic`.
const aiChatSuite = (name: string, topic: string): EvalSuite => ({
  name,
  data: makeAiChatTestCaseBuilder(topic),
  system: aiChatSystem,
});

// Suite that exercises the Gmail search assistant on generated search cases.
const gmailSearchSuite = (name: string): EvalSuite => ({
  name,
  data: buildGmailSearchTestCases,
  system: () => GmailSearchAssistantSystemPrompt(),
});

/**
 * Every eval suite in this file, in registration order.
 *
 * The two Gmail search suites use the same case generator and the same system
 * prompt; both names are kept so the reported suites stay the same.
 */
const evalSuites: EvalSuite[] = [
  aiChatSuite("AI Chat – Basic Responses", "basic responses (greetings, capabilities, quick help)"),
  gmailSearchSuite("Gmail Search Query – Natural Language"),
  aiChatSuite("AI Chat – Label Management", "label management (create, delete, list, apply labels)"),
  aiChatSuite("AI Chat – Email Organization", "email organization (archive, mark read/unread, bulk actions)"),
  aiChatSuite("AI Chat – Email Composition", "email composition tasks (compose, reply, send, draft)"),
  aiChatSuite("AI Chat – Smart Categorization", "smart categorization (subscriptions, newsletters, meetings, bills)"),
  aiChatSuite("AI Chat – Information Queries", "information queries (summaries, web search, tax docs, recent activity)"),
  aiChatSuite("AI Chat – Complex Workflows", "complex workflows (multi-step actions, automation)"),
  aiChatSuite("AI Chat – User Intent Recognition", "user intent recognition (help, overwhelm, search, cleanup)"),
  aiChatSuite("AI Chat – Error Handling & Edge Cases", "error handling & edge cases (invalid, bulk actions, very old queries)"),
  gmailSearchSuite("Gmail Search Query Building"),
  {
    name: "Email Composition with Style Matching",
    data: makeAiChatTestCaseBuilder("styled email composition (follow-up, thank you, meeting, apology)"),
    system: () => StyledEmailAssistantSystemPrompt(),
  },
];

// Register each suite: the traced model answers `input` under the suite's
// system prompt, and the output is scored with Factuality and Levenshtein.
for (const suite of evalSuites) {
  evalite(suite.name, {
    data: suite.data,
    task: async (input: string) =>
      safeStreamText({
        model: model,
        system: suite.system(),
        prompt: input,
      }),
    scorers: [Factuality, Levenshtein],
  });
}
0 commit comments