@@ -26,22 +26,76 @@ describe('evaluateAnswer', () => {
2626 const { response } = await evaluateAnswer (
2727 'What is TypeScript?' ,
2828 'TypeScript is a strongly typed programming language that builds on JavaScript.' ,
29+ [ 'definitive' ] ,
2930 tokenTracker
3031 ) ;
31- expect ( response ) . toHaveProperty ( 'is_definitive' ) ;
32- expect ( response ) . toHaveProperty ( 'reasoning' ) ;
32+ expect ( response ) . toHaveProperty ( 'pass' ) ;
33+ expect ( response ) . toHaveProperty ( 'think' ) ;
34+ expect ( response . type ) . toBe ( 'definitive' ) ;
35+ expect ( response . pass ) . toBe ( true ) ;
36+ } ) ;
37+
38+ it ( 'should evaluate answer freshness' , async ( ) => {
39+ const tokenTracker = new TokenTracker ( ) ;
40+ const { response } = await evaluateAnswer (
41+ 'What is the latest version of Node.js?' ,
42+ 'The latest version of Node.js is 14.0.0, released in April 2020.' ,
43+ [ 'freshness' ] ,
44+ tokenTracker
45+ ) ;
46+ expect ( response ) . toHaveProperty ( 'pass' ) ;
47+ expect ( response ) . toHaveProperty ( 'think' ) ;
48+ expect ( response . type ) . toBe ( 'freshness' ) ;
49+ expect ( response . freshness_analysis ) . toBeDefined ( ) ;
50+ expect ( response . freshness_analysis ?. likely_outdated ) . toBe ( true ) ;
51+ expect ( response . freshness_analysis ?. dates_mentioned ) . toContain ( '2020-04' ) ;
52+ expect ( response . freshness_analysis ?. current_time ) . toBeDefined ( ) ;
53+ expect ( response . pass ) . toBe ( false ) ;
54+ } ) ;
55+
56+ it ( 'should evaluate answer plurality' , async ( ) => {
57+ const tokenTracker = new TokenTracker ( ) ;
58+ const { response } = await evaluateAnswer (
59+ 'List three programming languages.' ,
60+ 'Python is a programming language.' ,
61+ [ 'plurality' ] ,
62+ tokenTracker
63+ ) ;
64+ expect ( response ) . toHaveProperty ( 'pass' ) ;
65+ expect ( response ) . toHaveProperty ( 'think' ) ;
66+ expect ( response . type ) . toBe ( 'plurality' ) ;
67+ expect ( response . plurality_analysis ) . toBeDefined ( ) ;
68+ expect ( response . plurality_analysis ?. expects_multiple ) . toBe ( true ) ;
69+ expect ( response . plurality_analysis ?. provides_multiple ) . toBe ( false ) ;
70+ expect ( response . plurality_analysis ?. count_expected ) . toBe ( 3 ) ;
71+ expect ( response . plurality_analysis ?. count_provided ) . toBe ( 1 ) ;
72+ expect ( response . pass ) . toBe ( false ) ;
73+ } ) ;
74+
75+ it ( 'should evaluate in order and stop at first failure' , async ( ) => {
76+ const tokenTracker = new TokenTracker ( ) ;
77+ const { response } = await evaluateAnswer (
78+ 'List the latest Node.js versions.' ,
79+ 'I am not sure about the Node.js versions.' ,
80+ [ 'definitive' , 'freshness' , 'plurality' ] ,
81+ tokenTracker
82+ ) ;
83+ expect ( response . type ) . toBe ( 'definitive' ) ;
84+ expect ( response . pass ) . toBe ( false ) ;
85+ expect ( response . freshness_analysis ) . toBeUndefined ( ) ;
86+ expect ( response . plurality_analysis ) . toBeUndefined ( ) ;
3387 } ) ;
3488
3589 it ( 'should track token usage' , async ( ) => {
3690 const tokenTracker = new TokenTracker ( ) ;
3791 const spy = jest . spyOn ( tokenTracker , 'trackUsage' ) ;
38- const { tokens } = await evaluateAnswer (
92+ await evaluateAnswer (
3993 'What is TypeScript?' ,
4094 'TypeScript is a strongly typed programming language that builds on JavaScript.' ,
95+ [ 'definitive' , 'freshness' , 'plurality' ] ,
4196 tokenTracker
4297 ) ;
43- expect ( spy ) . toHaveBeenCalledWith ( 'evaluator' , tokens ) ;
44- expect ( tokens ) . toBeGreaterThan ( 0 ) ;
98+ expect ( spy ) . toHaveBeenCalledWith ( 'evaluator' , expect . any ( Number ) ) ;
4599 } ) ;
46100 } ) ;
47101 } ) ;
0 commit comments