Skip to content

Commit e19a5a2

Browse files
committed
feat: tweak tests
1 parent dad1dff commit e19a5a2

57 files changed

Lines changed: 1064 additions & 3177 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

tests/scripts/tool_testing/run_tests.lua

Lines changed: 15 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ Options:
1111
--scenario=<name> Run only specific scenario
1212
--tool=<name> Run only scenarios for a specific tool
1313
--delay=<ms> Stagger delay between starting runs in milliseconds (default: 0)
14-
--verbose Show detailed output
1514
--]]
1615

1716
local SCRIPT_DIR = vim.fn.fnamemodify(debug.getinfo(1, "S").source:sub(2), ":h")
@@ -252,7 +251,6 @@ local function parse_args()
252251
model = nil,
253252
scenario = nil,
254253
tool = nil,
255-
verbose = false,
256254
}
257255

258256
for _, arg in ipairs(vim.v.argv) do
@@ -270,8 +268,6 @@ local function parse_args()
270268
args.csv = true
271269
elseif arg == "--log" then
272270
args.log = true
273-
elseif arg == "--verbose" then
274-
args.verbose = true
275271
end
276272
end
277273

@@ -286,30 +282,10 @@ local function setup_output_dir(config)
286282
return dir
287283
end
288284

289-
---Run `diff -u` between expected and actual strings, returning the diff output.
290-
---@param actual string
291-
---@param expected string
292-
---@return string
293-
local function unified_diff(actual, expected)
294-
local tmp_expected = vim.fn.tempname()
295-
local tmp_actual = vim.fn.tempname()
296-
vim.fn.writefile(vim.split(expected, "\n", { plain = true }), tmp_expected)
297-
vim.fn.writefile(vim.split(actual, "\n", { plain = true }), tmp_actual)
298-
local output = vim.fn.system(string.format("diff -u --label expected --label actual %s %s", tmp_expected, tmp_actual))
299-
vim.fn.delete(tmp_expected)
300-
vim.fn.delete(tmp_actual)
301-
return vim.trim(output)
302-
end
303-
304-
---@param opts {msg: string, level?: string, verbose_only?: boolean}
285+
---@param opts {msg: string, level?: string}
305286
local function log(opts)
306287
local msg = opts.msg
307288
local level = opts.level or "INFO"
308-
local verbose_only = opts.verbose_only
309-
310-
if verbose_only and not _G._test_verbose then
311-
return
312-
end
313289

314290
if level == "PASS" then
315291
print(string.format(" PASS %s", msg))
@@ -347,7 +323,7 @@ local function write_csv_row(opts)
347323
return
348324
end
349325
if not file_exists then
350-
f:write("run_at,id,adapter,model,scenario,result,duration_s,tool_calls,tokens,error\n")
326+
f:write("run_at,id,adapter,model,scenario,pass,duration_s,tool_calls,tokens,error\n")
351327
end
352328

353329
local row = {
@@ -356,7 +332,7 @@ local function write_csv_row(opts)
356332
csv_escape(result.adapter),
357333
csv_escape(result.model),
358334
csv_escape(result.scenario),
359-
csv_escape(result.success and "pass" or "fail"),
335+
csv_escape(result.success and "1" or "0"),
360336
csv_escape(string.format("%.2f", result.duration_ms / 1000)),
361337
csv_escape(tostring(#(result.tool_calls or {}))),
362338
csv_escape(tostring(result.tokens or 0)),
@@ -411,7 +387,6 @@ local function start_scenario_run(opts)
411387
timestamp = os.date("%Y-%m-%d %H:%M:%S"),
412388
tokens = 0,
413389
tool_calls = {},
414-
validation = nil,
415390
},
416391
scenario = scenario,
417392
start_time = vim.uv.hrtime(),
@@ -529,7 +504,7 @@ local function start_scenario_run(opts)
529504
return run
530505
end
531506

532-
---Validate, build messages, and set result.success on a completed run.
507+
---Run the scenario's test function and set result.success.
533508
---@param run table
534509
local function finalize_run(run)
535510
local scenario = run.scenario
@@ -555,10 +530,10 @@ local function finalize_run(run)
555530
result.duration_ms = (vim.uv.hrtime() - run.start_time) / 1000000
556531

557532
if run.completed then
558-
local should_validate = true
533+
local should_test = true
559534

560-
if scenario.tools_required then
561-
for _, required in ipairs(scenario.tools_required) do
535+
if scenario.tools then
536+
for _, required in ipairs(scenario.tools) do
562537
local was_called = false
563538
for _, call in ipairs(run.tool_calls) do
564539
if call.name == required then
@@ -568,36 +543,24 @@ local function finalize_run(run)
568543
end
569544
if not was_called then
570545
result.error = string.format("Required tool '%s' was not called", required)
571-
should_validate = false
546+
should_test = false
572547
break
573548
end
574549
end
575550
end
576551

577-
if should_validate then
552+
if should_test then
578553
local run_data = { response_content = run.response_content, tool_calls = run.tool_calls }
579-
local validate_ok, validate_success, validation_details = pcall(scenario.validate, run.context, run_data)
580-
if validate_ok then
581-
result.success = validate_success
582-
result.validation = validation_details
583-
if not validate_success and not result.error then
584-
result.error = "Validation failed"
585-
elseif validate_success then
586-
result.error = nil
587-
end
554+
local test_ok, test_success, test_msg = pcall(scenario.test, run.context, run_data)
555+
if test_ok then
556+
result.success = test_success
557+
result.error = not test_success and (test_msg or "Test failed") or nil
588558
else
589-
if not result.error then
590-
result.error = "Validation error: " .. tostring(validate_success)
591-
end
559+
result.error = "Test error: " .. tostring(test_success)
592560
end
593561
end
594562
end
595563

596-
if not result.success and #run.tool_calls > 0 then
597-
result.error = (result.error or "Unknown error")
598-
.. string.format(" (tool called: %s, executed: %s)", #run.tool_calls > 0, run.tool_executed)
599-
end
600-
601564
result.response_content = run.response_content
602565
result.tool_calls = run.tool_calls
603566

@@ -695,7 +658,6 @@ local function run_tests(opts)
695658
timestamp = os.date("%Y-%m-%d %H:%M:%S"),
696659
tokens = 0,
697660
tool_calls = {},
698-
validation = nil,
699661
})
700662
end
701663
else
@@ -789,22 +751,6 @@ local function run_tests(opts)
789751
scenario_name = scenario.name,
790752
})
791753
result.result_file = result_file
792-
log({ msg = " Result saved to: " .. result_file, verbose_only = true })
793-
end
794-
795-
if not result.success and result.validation then
796-
local val = result.validation
797-
if type(val.actual) == "string" and type(val.expected) == "string" and val.actual ~= val.expected then
798-
local diff = unified_diff(val.actual, val.expected)
799-
if diff ~= "" then
800-
log({ msg = " Diff:", verbose_only = true })
801-
for _, diff_line in ipairs(vim.split(diff, "\n", { plain = true })) do
802-
log({ msg = " " .. diff_line, verbose_only = true })
803-
end
804-
end
805-
else
806-
log({ msg = " Validation: " .. vim.inspect(val), verbose_only = true })
807-
end
808754
end
809755

810756
table.insert(all_results, result)
@@ -839,7 +785,7 @@ local function run_tests(opts)
839785
if args.log then
840786
local summary_file = vim.fs.joinpath(results_dir, "summary_" .. os.date("%Y%m%d_%H%M%S") .. ".json")
841787
vim.fn.writefile(vim.split(vim.json.encode({ results = all_results, summary = summary }), "\n"), summary_file)
842-
log({ msg = "Summary saved to: " .. summary_file, verbose_only = true })
788+
log({ msg = "Summary saved to: " .. summary_file })
843789
end
844790

845791
vim.cmd(string.format("cquit %d", summary.failed + summary.errors))
@@ -851,13 +797,6 @@ load_env_file()
851797
local config = load_config()
852798
local args = parse_args()
853799

854-
if args.verbose then
855-
config.output.verbose = true
856-
_G._test_verbose = true
857-
else
858-
_G._test_verbose = false
859-
end
860-
861800
local ok, err = pcall(run_tests, { config = config, args = args })
862801
if not ok then
863802
log({ msg = "Fatal error: " .. tostring(err), level = "FATAL" })
Lines changed: 20 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,50 +1,22 @@
1-
-- Three independent edits clustered close together in a config block.
2-
-- All three must land in a single tool call.
1+
local files = require("codecompanion.utils.files")
2+
local FIXTURES = vim.fn.fnamemodify(debug.getinfo(1, "S").source:sub(2), ":h")
33

4-
local CONTENT = {
5-
"local config = {",
6-
" database = {",
7-
" host = 'localhost',",
8-
" name = 'myapp_dev',",
9-
" pool_size = 5,",
10-
" port = 5432,",
11-
" ssl = false,",
12-
" timeout = 15,",
13-
" },",
14-
"}",
15-
"",
16-
"return config",
17-
}
18-
19-
local EXPECTED = {
20-
"local config = {",
21-
" database = {",
22-
" host = 'db.production.internal',",
23-
" name = 'myapp_prod',",
24-
" pool_size = 5,",
25-
" port = 5432,",
26-
" ssl = true,",
27-
" timeout = 15,",
28-
" },",
29-
"}",
30-
"",
31-
"return config",
32-
}
4+
local input_file = "adjacent_edits.lua.input"
335

346
return {
357
cleanup = function(ctx)
36-
vim.fn.delete(ctx.test_file)
8+
files.delete(ctx.test_file)
379
end,
3810

39-
description = "insert_edit_into_file: three adjacent single-line changes in a single tool call",
11+
description = "Make three adjacent single-line changes in a single tool call",
4012
name = "Adjacent edits",
4113
tools = { "insert_edit_into_file" },
42-
tools_required = { "insert_edit_into_file" },
4314

4415
setup = function()
16+
local input_path = vim.fs.joinpath(FIXTURES, input_file)
4517
local test_file = vim.fn.tempname() .. ".lua"
46-
vim.fn.writefile(CONTENT, test_file)
47-
return { test_file = test_file }
18+
files.write_to_path(test_file, files.read(input_path))
19+
return { input_path = input_path, test_file = test_file }
4820
end,
4921

5022
prompt = function(ctx)
@@ -63,16 +35,21 @@ Make all three changes in a single tool call with three edits:
6335
6436
Do not ask for permission — call the tool directly.]],
6537
ctx.test_file,
66-
table.concat(CONTENT, "\n")
38+
files.read(ctx.input_path)
6739
)
6840
end,
6941

70-
validate = function(ctx, _run)
71-
local actual = vim.fn.readfile(ctx.test_file)
72-
if actual[#actual] == "" then
73-
actual[#actual] = nil
42+
test = function(ctx)
43+
if vim.fn.executable("nvim") == 0 then
44+
return false, "nvim not available"
45+
end
46+
local result = vim.system({ "nvim", "-l", ctx.test_file }):wait()
47+
if result.code ~= 0 then
48+
return false, "execution failed: " .. vim.trim(result.stderr or "")
7449
end
75-
local ok = vim.deep_equal(actual, EXPECTED)
76-
return ok, { actual = table.concat(actual, "\n"), expected = table.concat(EXPECTED, "\n") }
50+
local output = vim.trim(result.stderr)
51+
return output == "db.production.internal/myapp_prod",
52+
output ~= "db.production.internal/myapp_prod" and "expected 'db.production.internal/myapp_prod', got: " .. output
53+
or nil
7754
end,
7855
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
local config = {
2+
database = {
3+
host = 'db.production.internal',
4+
name = 'myapp_prod',
5+
pool_size = 5,
6+
port = 5432,
7+
ssl = true,
8+
timeout = 15,
9+
},
10+
}
11+
12+
print(config.database.host .. "/" .. config.database.name)
13+
return config
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
local config = {
2+
database = {
3+
host = 'localhost',
4+
name = 'myapp_dev',
5+
pool_size = 5,
6+
port = 5432,
7+
ssl = false,
8+
timeout = 15,
9+
},
10+
}
11+
12+
print(config.database.host .. "/" .. config.database.name)
13+
return config

0 commit comments

Comments
 (0)