Skip to content

Commit e54e194

Browse files
add per eval time metrics to CLI
1 parent 3ded9b3 commit e54e194

4 files changed

Lines changed: 140 additions & 4 deletions

File tree

src/agentevals/output.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,20 @@
88
from .runner import MetricResult, RunResult
99

1010

11+
def _format_duration(ms: float | None) -> str:
12+
if ms is None:
13+
return ""
14+
ms = round(ms)
15+
if ms < 1000:
16+
return f"{ms}ms"
17+
seconds = ms / 1000
18+
if seconds < 60:
19+
return f"{seconds:.1f}s"
20+
minutes = int(seconds // 60)
21+
remaining = seconds - minutes * 60
22+
return f"{minutes}m {remaining:.0f}s"
23+
24+
1125
def format_results(run_result: RunResult, fmt: str = "table") -> str:
1226
if fmt == "json":
1327
return _format_json(run_result)
@@ -56,14 +70,15 @@ def _format_table(run_result: RunResult) -> str:
5670
score_str,
5771
mr.eval_status,
5872
per_inv,
73+
_format_duration(mr.duration_ms),
5974
error_str,
6075
]
6176
)
6277

6378
if rows:
6479
table = tabulate(
6580
rows,
66-
headers=["", "Metric", "Score", "Status", "Per-Invocation", "Error"],
81+
headers=["", "Metric", "Score", "Status", "Per-Invocation", "Time", "Error"],
6782
tablefmt="simple",
6883
)
6984
lines.append(table)
@@ -131,6 +146,7 @@ def _format_json(run_result: RunResult) -> str:
131146
"score": mr.score,
132147
"eval_status": mr.eval_status,
133148
"per_invocation_scores": mr.per_invocation_scores,
149+
"duration_ms": mr.duration_ms,
134150
"error": mr.error,
135151
}
136152
if mr.details:
@@ -159,12 +175,13 @@ def _format_summary(run_result: RunResult) -> str:
159175
lines.append(f"Trace {tr.trace_id} ({tr.num_invocations} invocations):")
160176
for mr in tr.metric_results:
161177
icon = _status_icon(mr.eval_status)
178+
duration_suffix = f" [{_format_duration(mr.duration_ms)}]" if mr.duration_ms is not None else ""
162179
if mr.error:
163-
lines.append(f" {icon} {mr.metric_name}: ERROR - {mr.error}")
180+
lines.append(f" {icon} {mr.metric_name}: ERROR - {mr.error}{duration_suffix}")
164181
elif mr.score is not None:
165-
lines.append(f" {icon} {mr.metric_name}: {mr.score:.4f} ({mr.eval_status})")
182+
lines.append(f" {icon} {mr.metric_name}: {mr.score:.4f} ({mr.eval_status}){duration_suffix}")
166183
else:
167-
lines.append(f" {icon} {mr.metric_name}: N/A ({mr.eval_status})")
184+
lines.append(f" {icon} {mr.metric_name}: N/A ({mr.eval_status}){duration_suffix}")
168185
lines.append("")
169186

170187
return "\n".join(lines)

src/agentevals/runner.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import asyncio
66
import json
77
import logging
8+
import time
89
from collections.abc import Awaitable, Callable
910
from typing import Any
1011

@@ -39,6 +40,7 @@ class MetricResult(BaseModel):
3940
per_invocation_scores: list[float | None] = Field(default_factory=list)
4041
error: str | None = None
4142
details: dict[str, Any] | None = None
43+
duration_ms: float | None = None
4244

4345

4446
class TraceResult(BaseModel):
@@ -234,13 +236,15 @@ async def _eval_builtin_with_semaphore(metric_name: str) -> MetricResult:
234236
async with eval_semaphore:
235237
if progress_callback:
236238
await progress_callback(f"Running {metric_name}...")
239+
t0 = time.monotonic()
237240
result = await evaluate_builtin_metric(
238241
metric_name=metric_name,
239242
actual_invocations=actual_invocations,
240243
expected_invocations=expected_invocations,
241244
judge_model=judge_model,
242245
threshold=threshold,
243246
)
247+
result.duration_ms = (time.monotonic() - t0) * 1000
244248
return await _append_result(result)
245249

246250
async def _eval_custom_with_semaphore(evaluator_def: CustomEvaluatorDef) -> MetricResult:
@@ -249,11 +253,13 @@ async def _eval_custom_with_semaphore(evaluator_def: CustomEvaluatorDef) -> Metr
249253
await progress_callback(f"Running {evaluator_def.name}...")
250254
from .custom_evaluators import evaluate_custom_evaluator
251255

256+
t0 = time.monotonic()
252257
result = await evaluate_custom_evaluator(
253258
evaluator_def=evaluator_def,
254259
actual_invocations=actual_invocations,
255260
expected_invocations=expected_invocations,
256261
)
262+
result.duration_ms = (time.monotonic() - t0) * 1000
257263
return await _append_result(result)
258264

259265
tasks = [_eval_builtin_with_semaphore(m) for m in metrics]

tests/test_output.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
import json
2+
3+
from agentevals.output import _format_duration, format_results
4+
from agentevals.runner import MetricResult, RunResult, TraceResult
5+
6+
7+
class TestFormatDuration:
    """Unit checks for the strings produced by ``_format_duration``."""

    def test_none(self):
        # A missing duration renders as an empty cell.
        rendered = _format_duration(None)
        assert rendered == ""

    def test_milliseconds(self):
        rendered = _format_duration(42.3)
        assert rendered == "42ms"

    def test_zero(self):
        rendered = _format_duration(0.0)
        assert rendered == "0ms"

    def test_seconds(self):
        rendered = _format_duration(1500.0)
        assert rendered == "1.5s"

    def test_exact_one_second(self):
        rendered = _format_duration(1000.0)
        assert rendered == "1.0s"

    def test_minutes(self):
        rendered = _format_duration(125000.0)
        assert rendered == "2m 5s"

    def test_just_under_one_second(self):
        # 999.4 rounds down and stays in the millisecond form.
        rendered = _format_duration(999.4)
        assert rendered == "999ms"

    def test_rounding_boundary(self):
        # 999.5 rounds up to 1000 ms and switches to the seconds form.
        rendered = _format_duration(999.5)
        assert rendered == "1.0s"
31+
32+
33+
class TestTableFormatTiming:
    """The table format should carry a ``Time`` column fed by ``duration_ms``."""

    def _make_result(self, duration_ms: float | None = None) -> RunResult:
        """Build a run with one trace holding one passing metric."""
        metric = MetricResult(
            metric_name="test_metric",
            score=0.95,
            eval_status="PASSED",
            per_invocation_scores=[0.95],
            duration_ms=duration_ms,
        )
        trace = TraceResult(
            trace_id="abc123",
            num_invocations=1,
            metric_results=[metric],
        )
        return RunResult(trace_results=[trace])

    def test_time_column_in_table(self):
        run = self._make_result(duration_ms=1234.5)
        output = format_results(run, fmt="table")
        assert "Time" in output
        assert "1.2s" in output

    def test_time_column_milliseconds(self):
        run = self._make_result(duration_ms=42.0)
        output = format_results(run, fmt="table")
        assert "42ms" in output

    def test_time_column_none(self):
        # The header is still expected when no duration was recorded.
        run = self._make_result(duration_ms=None)
        output = format_results(run, fmt="table")
        assert "Time" in output
61+
62+
63+
class TestJsonFormatTiming:
    """The JSON format should expose ``duration_ms`` on every metric entry."""

    def test_duration_ms_in_json(self):
        metric = MetricResult(
            metric_name="test_metric",
            score=0.95,
            eval_status="PASSED",
            duration_ms=1234.5,
        )
        trace = TraceResult(
            trace_id="abc123",
            num_invocations=1,
            metric_results=[metric],
        )
        run = RunResult(trace_results=[trace])
        payload = json.loads(format_results(run, fmt="json"))
        first_metric = payload["traces"][0]["metrics"][0]
        assert first_metric["duration_ms"] == 1234.5

    def test_duration_ms_null_in_json(self):
        # With no duration supplied, the key is present and holds null.
        metric = MetricResult(metric_name="test_metric", score=0.5, eval_status="PASSED")
        trace = TraceResult(trace_id="abc123", num_invocations=1, metric_results=[metric])
        run = RunResult(trace_results=[trace])
        payload = json.loads(format_results(run, fmt="json"))
        first_metric = payload["traces"][0]["metrics"][0]
        assert first_metric["duration_ms"] is None
88+
89+
90+
class TestSummaryFormatTiming:
    """The summary format should append a bracketed duration when one exists."""

    def test_duration_in_summary(self):
        metric = MetricResult(
            metric_name="test_metric",
            score=0.95,
            eval_status="PASSED",
            duration_ms=820.0,
        )
        trace = TraceResult(trace_id="abc123", num_invocations=1, metric_results=[metric])
        run = RunResult(trace_results=[trace])
        output = format_results(run, fmt="summary")
        assert "[820ms]" in output

    def test_no_duration_no_brackets(self):
        # Without a duration the metric line must end at the status.
        metric = MetricResult(metric_name="test_metric", score=0.5, eval_status="PASSED")
        trace = TraceResult(trace_id="abc123", num_invocations=1, metric_results=[metric])
        run = RunResult(trace_results=[trace])
        output = format_results(run, fmt="summary")
        matching_lines = [text for text in output.splitlines() if "test_metric" in text]
        metric_line = matching_lines[0]
        assert metric_line.rstrip().endswith("(PASSED)")

tests/test_runner.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ def test_trajectory_eval_pass(self):
3939
assert mr.score == 1.0
4040
assert mr.eval_status == "PASSED"
4141
assert mr.error is None
42+
assert mr.duration_ms is not None
43+
assert mr.duration_ms >= 0
4244

4345
def test_missing_eval_set_error(self):
4446
"""Trajectory metric without eval set should report a clear error."""
@@ -51,6 +53,8 @@ def test_missing_eval_set_error(self):
5153
mr = result.trace_results[0].metric_results[0]
5254
assert mr.error is not None
5355
assert "requires expected invocations" in mr.error
56+
assert mr.duration_ms is not None
57+
assert mr.duration_ms >= 0
5458

5559
def test_bad_trace_file(self):
5660
config = EvalRunConfig(

0 commit comments

Comments
 (0)