Skip to content

Commit 9d9736c

Browse files
cleanup
1 parent 0974346 commit 9d9736c

7 files changed

Lines changed: 69 additions & 86 deletions

File tree

src/agentevals/api/streaming_routes.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from fastapi import APIRouter, Depends, HTTPException
1111
from fastapi.responses import FileResponse
12-
from pydantic import BaseModel, Field
12+
from pydantic import BaseModel, ConfigDict, Field
1313

1414
from ..config import BuiltinMetricDef, EvalRunConfig, EvaluatorDef
1515
from ..converter import convert_traces
@@ -42,6 +42,8 @@ class CreateEvalSetRequest(BaseModel):
4242

4343

4444
class EvaluateSessionsRequest(BaseModel):
45+
model_config = ConfigDict(extra="forbid")
46+
4547
golden_session_id: str
4648
eval_set_id: str
4749
evaluators: list[EvaluatorDef] = Field(default_factory=lambda: [BuiltinMetricDef(name="tool_trajectory_avg_score")])

src/agentevals/cli.py

Lines changed: 28 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -52,23 +52,6 @@ def _relative_time(iso_str: str | None) -> str:
5252
return ""
5353

5454

55-
def _apply_builtin_overrides(evaluators, *, judge_model, threshold, trajectory_match_type):
56-
updated = []
57-
for evaluator in evaluators:
58-
if getattr(evaluator, "type", None) == "builtin":
59-
payload = evaluator.model_dump(by_alias=False)
60-
if judge_model is not None:
61-
payload["judge_model"] = judge_model
62-
if threshold is not None:
63-
payload["threshold"] = threshold
64-
if trajectory_match_type is not None:
65-
payload["trajectory_match_type"] = trajectory_match_type
66-
updated.append(type(evaluator).model_validate(payload))
67-
else:
68-
updated.append(evaluator)
69-
return updated
70-
71-
7255
@click.group()
7356
@click.version_option(version=__version__, prog_name="agentevals")
7457
@click.option(
@@ -160,61 +143,54 @@ def run(
160143
config_file: str | None,
161144
) -> None:
162145
"""Evaluate trace file(s) against the configured evaluators."""
163-
from .config import EvalRunConfig, make_builtin_evaluator_entries
146+
from .config import EvalRunConfig, apply_builtin_overrides, make_builtin_evaluator_entries
164147
from .output import format_results
165148
from .runner import run_evaluation
166149

167150
explicit_metrics = list(metric) if metric else []
168151

169152
if config_file:
170-
from .eval_config_loader import load_eval_config, merge_configs
153+
from .eval_config_loader import load_eval_config
171154

172-
file_config = load_eval_config(config_file)
173-
config = file_config
155+
config = load_eval_config(config_file)
174156
if explicit_metrics:
175-
cli_config = EvalRunConfig(
176-
trace_files=[],
177-
evaluators=make_builtin_evaluator_entries(
178-
explicit_metrics,
179-
judge_model=judge_model,
180-
threshold=threshold,
181-
trajectory_match_type=trajectory_match_type,
182-
),
157+
cli_evaluators = make_builtin_evaluator_entries(
158+
explicit_metrics,
159+
judge_model=judge_model,
160+
threshold=threshold,
161+
trajectory_match_type=trajectory_match_type,
183162
)
184-
config = merge_configs(file_config, cli_config)
163+
by_name = {e.name: e for e in config.evaluators}
164+
for ev in cli_evaluators:
165+
by_name[ev.name] = ev
166+
config.evaluators = list(by_name.values())
185167
elif judge_model is not None or threshold is not None or trajectory_match_type is not None:
186-
config = config.model_copy(
187-
update={
188-
"evaluators": _apply_builtin_overrides(
189-
config.evaluators,
190-
judge_model=judge_model,
191-
threshold=threshold,
192-
trajectory_match_type=trajectory_match_type,
193-
)
194-
}
168+
config.evaluators = apply_builtin_overrides(
169+
config.evaluators,
170+
judge_model=judge_model,
171+
threshold=threshold,
172+
trajectory_match_type=trajectory_match_type,
195173
)
196-
if trace_files:
197-
config.trace_files = list(trace_files)
198-
if eval_set is not None:
199-
config.eval_set_file = eval_set
200-
if trace_format is not None:
201-
config.trace_format = trace_format
202-
if output != "table":
203-
config.output_format = output
204174
else:
205175
config = EvalRunConfig(
206-
trace_files=list(trace_files),
207-
eval_set_file=eval_set,
176+
trace_files=[],
208177
evaluators=make_builtin_evaluator_entries(
209-
explicit_metrics if explicit_metrics else None,
178+
explicit_metrics or None,
210179
judge_model=judge_model,
211180
threshold=threshold,
212181
trajectory_match_type=trajectory_match_type,
213182
),
214-
trace_format=trace_format,
215-
output_format=output,
216183
)
217184

185+
if trace_files:
186+
config.trace_files = list(trace_files)
187+
if eval_set is not None:
188+
config.eval_set_file = eval_set
189+
if trace_format is not None:
190+
config.trace_format = trace_format
191+
if output != "table":
192+
config.output_format = output
193+
218194
result = asyncio.run(run_evaluation(config))
219195
formatted = format_results(result, fmt=config.output_format)
220196
click.echo(formatted)

src/agentevals/config.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,34 @@ def make_builtin_evaluator_entries(
138138
return evaluators
139139

140140

141+
def apply_builtin_overrides(
    evaluators: list[EvaluatorDef],
    *,
    judge_model: str | None = None,
    threshold: float | None = None,
    trajectory_match_type: str | None = None,
) -> list[EvaluatorDef]:
    """Return a new evaluator list with run-level overrides applied to built-ins.

    Non-builtin entries pass through unchanged (the same objects are reused).
    Each override is only applied when the corresponding argument is not None,
    so callers can pass any subset. Built-in entries are always rebuilt from a
    dumped payload, so the input objects themselves are never mutated.

    Args:
        evaluators: Evaluator definitions to process, in order.
        judge_model: Replacement ``judge_model`` for built-in entries, or None
            to keep each entry's current value.
        threshold: Replacement ``threshold`` for built-in entries, or None to
            keep each entry's current value.
        trajectory_match_type: Replacement ``trajectory_match_type`` for
            built-in entries, or None to keep each entry's current value.

    Returns:
        A new list with the same length and order as ``evaluators``.

    Any error raised by ``model_validate`` for an invalid override propagates
    to the caller.
    """
    # The overrides are identical for every entry, so collect them once
    # instead of re-testing each argument on every loop iteration.
    overrides = {
        key: value
        for key, value in (
            ("judge_model", judge_model),
            ("threshold", threshold),
            ("trajectory_match_type", trajectory_match_type),
        )
        if value is not None
    }

    updated: list[EvaluatorDef] = []
    for evaluator in evaluators:
        if isinstance(evaluator, BuiltinMetricDef):
            payload = evaluator.model_dump(by_alias=False)
            payload.update(overrides)
            # isinstance() also matches subclasses of BuiltinMetricDef, so
            # re-validate with the entry's own class; validating against the
            # base class would silently downgrade a subclass instance.
            updated.append(type(evaluator).model_validate(payload))
        else:
            updated.append(evaluator)
    return updated
167+
168+
141169
class EvalParams(BaseModel):
142170
"""Evaluation parameters independent of how traces are provided.
143171

src/agentevals/eval_config_loader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def load_eval_config(path: str | Path) -> EvalRunConfig:
6666

6767
legacy_keys = {
6868
"metrics",
69-
"custom_graders",
69+
"custom_evaluators",
7070
"judge_model",
7171
"threshold",
7272
"trajectory_match_type",

src/agentevals/mcp_server.py

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from mcp.server import FastMCP
99
from pydantic import BaseModel, Field
1010

11-
from agentevals.config import EvalRunConfig, make_builtin_evaluator_entries
11+
from agentevals.config import EvalRunConfig, apply_builtin_overrides, make_builtin_evaluator_entries
1212
from agentevals.runner import run_evaluation
1313

1414
_DEFAULT_SERVER_URL = "http://localhost:8001"
@@ -89,23 +89,6 @@ class EvaluateSessionsResponse(BaseModel):
8989
results: list[SessionEvalResultResponse]
9090

9191

92-
def _apply_builtin_overrides(evaluators, *, judge_model=None, threshold=None, trajectory_match_type=None):
93-
updated = []
94-
for evaluator in evaluators:
95-
if getattr(evaluator, "type", None) == "builtin":
96-
payload = evaluator.model_dump(by_alias=False)
97-
if judge_model is not None:
98-
payload["judge_model"] = judge_model
99-
if threshold is not None:
100-
payload["threshold"] = threshold
101-
if trajectory_match_type is not None:
102-
payload["trajectory_match_type"] = trajectory_match_type
103-
updated.append(type(evaluator).model_validate(payload))
104-
else:
105-
updated.append(evaluator)
106-
return updated
107-
108-
10992
# ---------------------------------------------------------------------------
11093
# Result transformation
11194
# ---------------------------------------------------------------------------
@@ -301,7 +284,7 @@ async def evaluate_traces(
301284
elif judge_model is not None or threshold is not None:
302285
config = config.model_copy(
303286
update={
304-
"evaluators": _apply_builtin_overrides(
287+
"evaluators": apply_builtin_overrides(
305288
config.evaluators,
306289
judge_model=judge_model,
307290
threshold=threshold,

tests/test_eval_config_loader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def test_load_eval_config_rejects_legacy_keys(tmp_path):
1515
"""
1616
metrics:
1717
- tool_trajectory_avg_score
18-
custom_graders:
18+
custom_evaluators:
1919
- name: tool_call_checker
2020
type: code
2121
path: ./examples/custom_evaluators/tool_call_checker.py

ui/src/components/upload/MetricSelector.tsx

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -146,18 +146,13 @@ const selectorStyle = css`
146146

147147
let cachedMetrics: MetricMetadata[] | null = null;
148148

149-
function isSupportedBuiltinMetric(metric: MetricMetadata): boolean {
150-
return metric.working !== false && metric.requiresRubrics !== true;
151-
}
152-
153149
export const MetricSelector: React.FC<MetricSelectorProps> = ({
154150
selectedEvaluatorNames,
155151
onToggleEvaluatorName,
156152
loadFromAPI = false,
157153
}) => {
158154
const [metrics, setMetrics] = useState<MetricMetadata[]>(cachedMetrics ?? AVAILABLE_METRICS);
159-
const supportedMetrics = metrics.filter(isSupportedBuiltinMetric);
160-
const unsupportedCount = metrics.length - supportedMetrics.length;
155+
const hasCaveatedMetrics = metrics.some((m) => m.requiresRubrics === true || m.working === false);
161156

162157
useEffect(() => {
163158
if (!loadFromAPI || cachedMetrics) return;
@@ -175,7 +170,7 @@ export const MetricSelector: React.FC<MetricSelectorProps> = ({
175170
return () => { cancelled = true; };
176171
}, [loadFromAPI]);
177172

178-
const categorizedSupportedMetrics = supportedMetrics.reduce(
173+
const categorizedMetrics = metrics.reduce(
179174
(acc, metric) => {
180175
if (!acc[metric.category]) {
181176
acc[metric.category] = [];
@@ -187,7 +182,7 @@ export const MetricSelector: React.FC<MetricSelectorProps> = ({
187182
);
188183

189184
const handleSelectAll = () => {
190-
supportedMetrics.forEach((metric) => {
185+
metrics.forEach((metric) => {
191186
if (!selectedEvaluatorNames.includes(metric.name)) {
192187
onToggleEvaluatorName(metric.name);
193188
}
@@ -203,7 +198,7 @@ export const MetricSelector: React.FC<MetricSelectorProps> = ({
203198
return (
204199
<div css={selectorStyle}>
205200
<div className="metric-categories">
206-
{Object.entries(categorizedSupportedMetrics).map(([category, metrics]) => (
201+
{Object.entries(categorizedMetrics).map(([category, metrics]) => (
207202
<div key={category} className="metric-category">
208203
<div className="category-title">{category}</div>
209204
<div className="metric-list">
@@ -250,10 +245,9 @@ export const MetricSelector: React.FC<MetricSelectorProps> = ({
250245
Clear All
251246
</Button>
252247
</div>
253-
{unsupportedCount > 0 && (
248+
{hasCaveatedMetrics && (
254249
<div className="selector-note">
255-
Hidden {unsupportedCount} unsupported built-in evaluator{unsupportedCount === 1 ? '' : 's'} that require
256-
rubric configuration or are marked incomplete.
250+
Some evaluators require rubric configuration or are work-in-progress; see badges.
257251
</div>
258252
)}
259253
</div>

0 commit comments

Comments
 (0)