Merge pull request #881 from dgageot/improve-evals

dgageot · web-flow · commit 300fd8ba4202 · 2025-11-20T13:26:22.000+01:00
Improve evals
diff --git a/cmd/root/eval.go b/cmd/root/eval.go
@@ -19,10 +19,10 @@ func newEvalCmd() *cobra.Command {
 	var flags evalFlags
 
 	cmd := &cobra.Command{
-		Use:     "eval <agent-file>|<registry-ref> <eval-dir>",
+		Use:     "eval <agent-file>|<registry-ref> [<eval-dir>|./evals]",
 		Short:   "Run evaluations for an agent",
 		GroupID: "advanced",
-		Args:    cobra.ExactArgs(2),
+		Args:    cobra.RangeArgs(1, 2),
 		RunE:    flags.runEvalCommand,
 	}
 
@@ -36,27 +36,26 @@ func (f *evalFlags) runEvalCommand(cmd *cobra.Command, args []string) error {
 
 	ctx := cmd.Context()
 	out := cli.NewPrinter(cmd.OutOrStdout())
-
-	agentFilename, err := agentfile.Resolve(ctx, out, args[0])
-	if err != nil {
-		return err
+	agentFilename := args[0]
+	evalsDir := "./evals"
+	if len(args) >= 2 {
+		evalsDir = args[1]
 	}
 
-	agents, err := teamloader.Load(cmd.Context(), agentFilename, &f.runConfig)
+	agentFilename, err := agentfile.Resolve(ctx, out, agentFilename)
 	if err != nil {
 		return err
 	}
 
-	evalResults, err := evaluation.Evaluate(cmd.Context(), agents, args[1])
+	agents, err := teamloader.Load(ctx, agentFilename, &f.runConfig)
 	if err != nil {
 		return err
 	}
 
-	for _, evalResult := range evalResults {
-		out.Printf("Eval file: %s\n", evalResult.EvalFile)
-		out.Printf("Tool trajectory score: %f\n", evalResult.Score.ToolTrajectoryScore)
-		out.Printf("Rouge-1 score: %f\n", evalResult.Score.Rouge1Score)
-	}
-
-	return nil
+	_, err = evaluation.Evaluate(ctx, agents, evalsDir, func(result evaluation.Result) {
+		out.Printf("Eval file: %s\n", result.EvalFile)
+		out.Printf("Tool trajectory score: %f\n", result.Score.ToolTrajectoryScore)
+		out.Printf("Rouge-1 score: %f\n", result.Score.Rouge1Score)
+	})
+	return err
 }
diff --git a/examples/eval/.gitignore b/examples/eval/.gitignore
@@ -0,0 +1 @@
+!evals
diff --git a/examples/eval/README.md b/examples/eval/README.md
@@ -1,15 +1,18 @@
 # Simple eval
 
-This is a simple agent that have an eval session saved in the `evals` directory, to run the eval you can:
+This is a simple agent that has two eval sessions saved in the `evals` directory. To run the evals:
 
 ```console
-$ cagent eval agent.yaml ./evals
+$ cagent eval demo.yaml ./evals
 ```
 
 This will output something like
 
 ```console
-Eval file: 0463e7b4-d332-4bdb-97c2-8c092a32cde3
+Eval file: 41b179a2-ed19-4ae2-a45d-95775aaa90f7
 Tool trajectory score: 1.000000
-Rouge-1 score: 0.695652
+Rouge-1 score: 0.521739
+Eval file: 5d83e247-061f-4462-9b2d-240facde45f3
+Tool trajectory score: 1.000000
+Rouge-1 score: 0.829268
 ```
diff --git a/examples/eval/demo.yaml b/examples/eval/demo.yaml
@@ -0,0 +1,7 @@
+agents:
+  root:
+    model: openai/gpt-4o
+    description: Test agent
+    instruction: You know how to read/write and list files.
+    toolsets:
+      - type: filesystem
diff --git a/examples/eval/evals/41b179a2-ed19-4ae2-a45d-95775aaa90f7.json b/examples/eval/evals/41b179a2-ed19-4ae2-a45d-95775aaa90f7.json
@@ -0,0 +1,68 @@
+{
+    "id": "41b179a2-ed19-4ae2-a45d-95775aaa90f7",
+    "title": "Counting Files in Local Folder",
+    "messages": [
+        {
+            "message": {
+                "agentFilename": "./agent.yaml",
+                "agentName": "",
+                "message": {
+                    "role": "user",
+                    "content": "How many files in the local folder?",
+                    "created_at": "2025-11-20T12:51:05+01:00"
+                }
+            }
+        },
+        {
+            "message": {
+                "agentFilename": "",
+                "agentName": "root",
+                "message": {
+                    "role": "assistant",
+                    "content": "",
+                    "tool_calls": [
+                        {
+                            "id": "call_VUQE0PAUhRbKpFFoVdNY1XTx",
+                            "type": "function",
+                            "function": {
+                                "name": "list_directory",
+                                "arguments": "{\"path\":\"./\"}"
+                            }
+                        }
+                    ],
+                    "created_at": "2025-11-20T12:51:06+01:00"
+                }
+            }
+        },
+        {
+            "message": {
+                "agentFilename": "",
+                "agentName": "root",
+                "message": {
+                    "role": "tool",
+                    "content": "FILE README.md\nFILE agent.yaml\n",
+                    "tool_call_id": "call_VUQE0PAUhRbKpFFoVdNY1XTx",
+                    "created_at": "2025-11-20T12:51:06+01:00"
+                }
+            }
+        },
+        {
+            "message": {
+                "agentFilename": "",
+                "agentName": "root",
+                "message": {
+                    "role": "assistant",
+                    "content": "There are 2 files in the local folder:\n\n1. `README.md`\n2. `agent.yaml`",
+                    "created_at": "2025-11-20T12:51:06+01:00"
+                }
+            }
+        }
+    ],
+    "created_at": "2025-11-20T12:50:48.32142+01:00",
+    "tools_approved": false,
+    "SendUserMessage": true,
+    "max_iterations": 0,
+    "input_tokens": 918,
+    "output_tokens": 23,
+    "cost": 0.0048850000000000005
+}
diff --git a/examples/eval/evals/5d83e247-061f-4462-9b2d-240facde45f3.json b/examples/eval/evals/5d83e247-061f-4462-9b2d-240facde45f3.json
@@ -0,0 +1,68 @@
+{
+    "id": "5d83e247-061f-4462-9b2d-240facde45f3",
+    "title": "Checking the Content of README.md File",
+    "messages": [
+        {
+            "message": {
+                "agentFilename": "./demo.yaml",
+                "agentName": "",
+                "message": {
+                    "role": "user",
+                    "content": "Is README.md empty?",
+                    "created_at": "2025-11-20T12:53:33+01:00"
+                }
+            }
+        },
+        {
+            "message": {
+                "agentFilename": "",
+                "agentName": "root",
+                "message": {
+                    "role": "assistant",
+                    "content": "",
+                    "tool_calls": [
+                        {
+                            "id": "call_JRY589fOnxlErcyqhH6UG4tJ",
+                            "type": "function",
+                            "function": {
+                                "name": "read_file",
+                                "arguments": "{\"path\":\"README.md\"}"
+                            }
+                        }
+                    ],
+                    "created_at": "2025-11-20T12:53:34+01:00"
+                }
+            }
+        },
+        {
+            "message": {
+                "agentFilename": "",
+                "agentName": "root",
+                "message": {
+                    "role": "tool",
+                    "content": "# Simple eval\n\nThis is a simple agent that has two eval sessions saved in the `evals` directory, to run the eval you can:\n\n```console\n$ cagent eval demo.yaml ./evals\n```\n\nThis will output something like\n\n```console\nEval file: 41b179a2-ed19-4ae2-a45d-95775aaa90f7\nTool trajectory score: 1.000000\nRouge-1 score: 0.521739\n```\n",
+                    "tool_call_id": "call_JRY589fOnxlErcyqhH6UG4tJ",
+                    "created_at": "2025-11-20T12:53:34+01:00"
+                }
+            }
+        },
+        {
+            "message": {
+                "agentFilename": "",
+                "agentName": "root",
+                "message": {
+                    "role": "assistant",
+                    "content": "The `README.md` file is not empty. It contains information about running a simple evaluation using a specific agent and includes example output.",
+                    "created_at": "2025-11-20T12:53:35+01:00"
+                }
+            }
+        }
+    ],
+    "created_at": "2025-11-20T12:53:32.334516+01:00",
+    "tools_approved": false,
+    "SendUserMessage": true,
+    "max_iterations": 0,
+    "input_tokens": 1012,
+    "output_tokens": 28,
+    "cost": 0.0051725
+}
diff --git a/pkg/evaluation/evaluation.go b/pkg/evaluation/evaluation.go
diff --git a/pkg/evaluation/save.go b/pkg/evaluation/save.go
diff --git a/pkg/evaluation/score.go b/pkg/evaluation/score.go