Skip to content

Commit 300fd8b

Browse files
authored
Merge pull request #881 from dgageot/improve-evals
Improve evals
2 parents 95b781a + 644c749 commit 300fd8b

File tree

9 files changed

+308
-131
lines changed

9 files changed

+308
-131
lines changed

cmd/root/eval.go

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,10 @@ func newEvalCmd() *cobra.Command {
1919
var flags evalFlags
2020

2121
cmd := &cobra.Command{
22-
Use: "eval <agent-file>|<registry-ref> <eval-dir>",
22+
Use: "eval <agent-file>|<registry-ref> [<eval-dir>|./evals]",
2323
Short: "Run evaluations for an agent",
2424
GroupID: "advanced",
25-
Args: cobra.ExactArgs(2),
25+
Args: cobra.RangeArgs(1, 2),
2626
RunE: flags.runEvalCommand,
2727
}
2828

@@ -36,27 +36,26 @@ func (f *evalFlags) runEvalCommand(cmd *cobra.Command, args []string) error {
3636

3737
ctx := cmd.Context()
3838
out := cli.NewPrinter(cmd.OutOrStdout())
39-
40-
agentFilename, err := agentfile.Resolve(ctx, out, args[0])
41-
if err != nil {
42-
return err
39+
agentFilename := args[0]
40+
evalsDir := "./evals"
41+
if len(args) >= 2 {
42+
evalsDir = args[1]
4343
}
4444

45-
agents, err := teamloader.Load(cmd.Context(), agentFilename, &f.runConfig)
45+
agentFilename, err := agentfile.Resolve(ctx, out, agentFilename)
4646
if err != nil {
4747
return err
4848
}
4949

50-
evalResults, err := evaluation.Evaluate(cmd.Context(), agents, args[1])
50+
agents, err := teamloader.Load(ctx, agentFilename, &f.runConfig)
5151
if err != nil {
5252
return err
5353
}
5454

55-
for _, evalResult := range evalResults {
56-
out.Printf("Eval file: %s\n", evalResult.EvalFile)
57-
out.Printf("Tool trajectory score: %f\n", evalResult.Score.ToolTrajectoryScore)
58-
out.Printf("Rouge-1 score: %f\n", evalResult.Score.Rouge1Score)
59-
}
60-
61-
return nil
55+
_, err = evaluation.Evaluate(ctx, agents, evalsDir, func(result evaluation.Result) {
56+
out.Printf("Eval file: %s\n", result.EvalFile)
57+
out.Printf("Tool trajectory score: %f\n", result.Score.ToolTrajectoryScore)
58+
out.Printf("Rouge-1 score: %f\n", result.Score.Rouge1Score)
59+
})
60+
return err
6261
}

examples/eval/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
!evals

examples/eval/README.md

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
# Simple eval
22

3-
This is a simple agent that have an eval session saved in the `evals` directory, to run the eval you can:
3+
This is a simple agent that has two eval sessions saved in the `evals` directory. To run the evals, you can:
44

55
```console
6-
$ cagent eval agent.yaml ./evals
6+
$ cagent eval demo.yaml ./evals
77
```
88

99
This will output something like:
1010

1111
```console
12-
Eval file: 0463e7b4-d332-4bdb-97c2-8c092a32cde3
12+
Eval file: 41b179a2-ed19-4ae2-a45d-95775aaa90f7
1313
Tool trajectory score: 1.000000
14-
Rouge-1 score: 0.695652
14+
Rouge-1 score: 0.521739
15+
Eval file: 5d83e247-061f-4462-9b2d-240facde45f3
16+
Tool trajectory score: 1.000000
17+
Rouge-1 score: 0.829268
1518
```

examples/eval/demo.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
agents:
2+
root:
3+
model: openai/gpt-4o
4+
description: Test agent
5+
instruction: You know how to read/write and list files.
6+
toolsets:
7+
- type: filesystem
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
{
2+
"id": "41b179a2-ed19-4ae2-a45d-95775aaa90f7",
3+
"title": "Counting Files in Local Folder",
4+
"messages": [
5+
{
6+
"message": {
7+
"agentFilename": "./agent.yaml",
8+
"agentName": "",
9+
"message": {
10+
"role": "user",
11+
"content": "How many files in the local folder?",
12+
"created_at": "2025-11-20T12:51:05+01:00"
13+
}
14+
}
15+
},
16+
{
17+
"message": {
18+
"agentFilename": "",
19+
"agentName": "root",
20+
"message": {
21+
"role": "assistant",
22+
"content": "",
23+
"tool_calls": [
24+
{
25+
"id": "call_VUQE0PAUhRbKpFFoVdNY1XTx",
26+
"type": "function",
27+
"function": {
28+
"name": "list_directory",
29+
"arguments": "{\"path\":\"./\"}"
30+
}
31+
}
32+
],
33+
"created_at": "2025-11-20T12:51:06+01:00"
34+
}
35+
}
36+
},
37+
{
38+
"message": {
39+
"agentFilename": "",
40+
"agentName": "root",
41+
"message": {
42+
"role": "tool",
43+
"content": "FILE README.md\nFILE agent.yaml\n",
44+
"tool_call_id": "call_VUQE0PAUhRbKpFFoVdNY1XTx",
45+
"created_at": "2025-11-20T12:51:06+01:00"
46+
}
47+
}
48+
},
49+
{
50+
"message": {
51+
"agentFilename": "",
52+
"agentName": "root",
53+
"message": {
54+
"role": "assistant",
55+
"content": "There are 2 files in the local folder:\n\n1. `README.md`\n2. `agent.yaml`",
56+
"created_at": "2025-11-20T12:51:06+01:00"
57+
}
58+
}
59+
}
60+
],
61+
"created_at": "2025-11-20T12:50:48.32142+01:00",
62+
"tools_approved": false,
63+
"SendUserMessage": true,
64+
"max_iterations": 0,
65+
"input_tokens": 918,
66+
"output_tokens": 23,
67+
"cost": 0.0048850000000000005
68+
}
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
{
2+
"id": "5d83e247-061f-4462-9b2d-240facde45f3",
3+
"title": "Checking the Content of README.md File",
4+
"messages": [
5+
{
6+
"message": {
7+
"agentFilename": "./demo.yaml",
8+
"agentName": "",
9+
"message": {
10+
"role": "user",
11+
"content": "Is README.md empty?",
12+
"created_at": "2025-11-20T12:53:33+01:00"
13+
}
14+
}
15+
},
16+
{
17+
"message": {
18+
"agentFilename": "",
19+
"agentName": "root",
20+
"message": {
21+
"role": "assistant",
22+
"content": "",
23+
"tool_calls": [
24+
{
25+
"id": "call_JRY589fOnxlErcyqhH6UG4tJ",
26+
"type": "function",
27+
"function": {
28+
"name": "read_file",
29+
"arguments": "{\"path\":\"README.md\"}"
30+
}
31+
}
32+
],
33+
"created_at": "2025-11-20T12:53:34+01:00"
34+
}
35+
}
36+
},
37+
{
38+
"message": {
39+
"agentFilename": "",
40+
"agentName": "root",
41+
"message": {
42+
"role": "tool",
43+
"content": "# Simple eval\n\nThis is a simple agent that has two eval sessions saved in the `evals` directory, to run the eval you can:\n\n```console\n$ cagent eval demo.yaml ./evals\n```\n\nThis will output something like\n\n```console\nEval file: 41b179a2-ed19-4ae2-a45d-95775aaa90f7\nTool trajectory score: 1.000000\nRouge-1 score: 0.521739\n```\n",
44+
"tool_call_id": "call_JRY589fOnxlErcyqhH6UG4tJ",
45+
"created_at": "2025-11-20T12:53:34+01:00"
46+
}
47+
}
48+
},
49+
{
50+
"message": {
51+
"agentFilename": "",
52+
"agentName": "root",
53+
"message": {
54+
"role": "assistant",
55+
"content": "The `README.md` file is not empty. It contains information about running a simple evaluation using a specific agent and includes example output.",
56+
"created_at": "2025-11-20T12:53:35+01:00"
57+
}
58+
}
59+
}
60+
],
61+
"created_at": "2025-11-20T12:53:32.334516+01:00",
62+
"tools_approved": false,
63+
"SendUserMessage": true,
64+
"max_iterations": 0,
65+
"input_tokens": 1012,
66+
"output_tokens": 28,
67+
"cost": 0.0051725
68+
}

0 commit comments

Comments
 (0)