@@ -11,7 +11,6 @@ Options:
1111 --scenario=<name> Run only specific scenario
1212 --tool=<name> Run only scenarios for a specific tool
1313 --delay=<ms> Stagger delay between starting runs in milliseconds (default: 0)
14- --verbose Show detailed output
1514--]]
1615
1716local SCRIPT_DIR = vim .fn .fnamemodify (debug.getinfo (1 , " S" ).source :sub (2 ), " :h" )
@@ -252,7 +251,6 @@ local function parse_args()
252251 model = nil ,
253252 scenario = nil ,
254253 tool = nil ,
255- verbose = false ,
256254 }
257255
258256 for _ , arg in ipairs (vim .v .argv ) do
@@ -270,8 +268,6 @@ local function parse_args()
270268 args .csv = true
271269 elseif arg == " --log" then
272270 args .log = true
273- elseif arg == " --verbose" then
274- args .verbose = true
275271 end
276272 end
277273
@@ -286,30 +282,10 @@ local function setup_output_dir(config)
286282 return dir
287283end
288284
289- --- Run `diff -u` between expected and actual strings, returning the diff output.
290- --- @param actual string
291- --- @param expected string
292- --- @return string
293- local function unified_diff (actual , expected )
294- local tmp_expected = vim .fn .tempname ()
295- local tmp_actual = vim .fn .tempname ()
296- vim .fn .writefile (vim .split (expected , " \n " , { plain = true }), tmp_expected )
297- vim .fn .writefile (vim .split (actual , " \n " , { plain = true }), tmp_actual )
298- local output = vim .fn .system (string.format (" diff -u --label expected --label actual %s %s" , tmp_expected , tmp_actual ))
299- vim .fn .delete (tmp_expected )
300- vim .fn .delete (tmp_actual )
301- return vim .trim (output )
302- end
303-
304- --- @param opts { msg : string , level ?: string , verbose_only ?: boolean }
285+ --- @param opts { msg : string , level ?: string }
305286local function log (opts )
306287 local msg = opts .msg
307288 local level = opts .level or " INFO"
308- local verbose_only = opts .verbose_only
309-
310- if verbose_only and not _G ._test_verbose then
311- return
312- end
313289
314290 if level == " PASS" then
315291 print (string.format (" PASS %s" , msg ))
@@ -347,7 +323,7 @@ local function write_csv_row(opts)
347323 return
348324 end
349325 if not file_exists then
350- f :write (" run_at,id,adapter,model,scenario,result ,duration_s,tool_calls,tokens,error\n " )
326+ f :write (" run_at,id,adapter,model,scenario,pass ,duration_s,tool_calls,tokens,error\n " )
351327 end
352328
353329 local row = {
@@ -356,7 +332,7 @@ local function write_csv_row(opts)
356332 csv_escape (result .adapter ),
357333 csv_escape (result .model ),
358334 csv_escape (result .scenario ),
359- csv_escape (result .success and " pass " or " fail " ),
335+ csv_escape (result .success and " 1 " or " 0 " ),
360336 csv_escape (string.format (" %.2f" , result .duration_ms / 1000 )),
361337 csv_escape (tostring (# (result .tool_calls or {}))),
362338 csv_escape (tostring (result .tokens or 0 )),
@@ -411,7 +387,6 @@ local function start_scenario_run(opts)
411387 timestamp = os.date (" %Y-%m-%d %H:%M:%S" ),
412388 tokens = 0 ,
413389 tool_calls = {},
414- validation = nil ,
415390 },
416391 scenario = scenario ,
417392 start_time = vim .uv .hrtime (),
@@ -529,7 +504,7 @@ local function start_scenario_run(opts)
529504 return run
530505end
531506
532- --- Validate, build messages, and set result.success on a completed run .
507+ --- Check that every tool listed in scenario.tools was called, then run the scenario's test function and set result.success / result.error.
533508--- @param run table
534509local function finalize_run (run )
535510 local scenario = run .scenario
@@ -555,10 +530,10 @@ local function finalize_run(run)
555530 result .duration_ms = (vim .uv .hrtime () - run .start_time ) / 1000000
556531
557532 if run .completed then
558- local should_validate = true
533+ local should_test = true
559534
560- if scenario .tools_required then
561- for _ , required in ipairs (scenario .tools_required ) do
535+ if scenario .tools then
536+ for _ , required in ipairs (scenario .tools ) do
562537 local was_called = false
563538 for _ , call in ipairs (run .tool_calls ) do
564539 if call .name == required then
@@ -568,36 +543,24 @@ local function finalize_run(run)
568543 end
569544 if not was_called then
570545 result .error = string.format (" Required tool '%s' was not called" , required )
571- should_validate = false
546+ should_test = false
572547 break
573548 end
574549 end
575550 end
576551
577- if should_validate then
552+ if should_test then
578553 local run_data = { response_content = run .response_content , tool_calls = run .tool_calls }
579- local validate_ok , validate_success , validation_details = pcall (scenario .validate , run .context , run_data )
580- if validate_ok then
581- result .success = validate_success
582- result .validation = validation_details
583- if not validate_success and not result .error then
584- result .error = " Validation failed"
585- elseif validate_success then
586- result .error = nil
587- end
554+ local test_ok , test_success , test_msg = pcall (scenario .test , run .context , run_data )
555+ if test_ok then
556+ result .success = test_success
557+ result .error = not test_success and (test_msg or " Test failed" ) or nil
588558 else
589- if not result .error then
590- result .error = " Validation error: " .. tostring (validate_success )
591- end
559+ result .error = " Test error: " .. tostring (test_success )
592560 end
593561 end
594562 end
595563
596- if not result .success and # run .tool_calls > 0 then
597- result .error = (result .error or " Unknown error" )
598- .. string.format (" (tool called: %s, executed: %s)" , # run .tool_calls > 0 , run .tool_executed )
599- end
600-
601564 result .response_content = run .response_content
602565 result .tool_calls = run .tool_calls
603566
@@ -695,7 +658,6 @@ local function run_tests(opts)
695658 timestamp = os.date (" %Y-%m-%d %H:%M:%S" ),
696659 tokens = 0 ,
697660 tool_calls = {},
698- validation = nil ,
699661 })
700662 end
701663 else
@@ -789,22 +751,6 @@ local function run_tests(opts)
789751 scenario_name = scenario .name ,
790752 })
791753 result .result_file = result_file
792- log ({ msg = " Result saved to: " .. result_file , verbose_only = true })
793- end
794-
795- if not result .success and result .validation then
796- local val = result .validation
797- if type (val .actual ) == " string" and type (val .expected ) == " string" and val .actual ~= val .expected then
798- local diff = unified_diff (val .actual , val .expected )
799- if diff ~= " " then
800- log ({ msg = " Diff:" , verbose_only = true })
801- for _ , diff_line in ipairs (vim .split (diff , " \n " , { plain = true })) do
802- log ({ msg = " " .. diff_line , verbose_only = true })
803- end
804- end
805- else
806- log ({ msg = " Validation: " .. vim .inspect (val ), verbose_only = true })
807- end
808754 end
809755
810756 table.insert (all_results , result )
@@ -839,7 +785,7 @@ local function run_tests(opts)
839785 if args .log then
840786 local summary_file = vim .fs .joinpath (results_dir , " summary_" .. os.date (" %Y%m%d_%H%M%S" ) .. " .json" )
841787 vim .fn .writefile (vim .split (vim .json .encode ({ results = all_results , summary = summary }), " \n " ), summary_file )
842- log ({ msg = " Summary saved to: " .. summary_file , verbose_only = true })
788+ log ({ msg = " Summary saved to: " .. summary_file })
843789 end
844790
845791 vim .cmd (string.format (" cquit %d" , summary .failed + summary .errors ))
@@ -851,13 +797,6 @@ load_env_file()
851797local config = load_config ()
852798local args = parse_args ()
853799
854- if args .verbose then
855- config .output .verbose = true
856- _G ._test_verbose = true
857- else
858- _G ._test_verbose = false
859- end
860-
861800local ok , err = pcall (run_tests , { config = config , args = args })
862801if not ok then
863802 log ({ msg = " Fatal error: " .. tostring (err ), level = " FATAL" })
0 commit comments