|
18 | 18 | from agentevals import __version__ |
19 | 19 |
|
20 | 20 | from ..builtin_metrics import METRICS_NEEDING_EXPECTED, METRICS_NEEDING_GCP, METRICS_NEEDING_LLM |
21 | | -from ..config import ( |
22 | | - BuiltinMetricDef, |
23 | | - CodeEvaluatorDef, |
24 | | - CustomEvaluatorDef, |
25 | | - EvalParams, |
26 | | - EvalRunConfig, |
27 | | - OpenAIEvalDef, |
28 | | -) |
| 21 | +from ..config import EvalParams, EvalRunConfig |
29 | 22 | from ..converter import convert_traces |
30 | 23 | from ..extraction import get_extractor |
31 | 24 | from ..loader import load_traces |
@@ -121,24 +114,6 @@ async def _maybe_persist_evaluate_run( |
121 | 114 |
|
122 | 115 | _MAX_JSON_BODY_BYTES = 50 * 1024 * 1024 # 50 MB (multipart endpoints allow 10 MB per file) |
123 | 116 |
|
124 | | -_TYPE_TO_MODEL = { |
125 | | - "builtin": BuiltinMetricDef, |
126 | | - "code": CodeEvaluatorDef, |
127 | | - "openai_eval": OpenAIEvalDef, |
128 | | -} |
129 | | - |
130 | | - |
131 | | -def _parse_custom_evaluators(raw: list[dict]) -> list[CustomEvaluatorDef]: |
132 | | - """Parse a list of custom evaluator dicts from the API config JSON.""" |
133 | | - defs: list[CustomEvaluatorDef] = [] |
134 | | - for entry in raw: |
135 | | - evaluator_type = entry.get("type", "builtin") |
136 | | - model_cls = _TYPE_TO_MODEL.get(evaluator_type) |
137 | | - if not model_cls: |
138 | | - raise ValueError(f"Unknown custom evaluator type: {evaluator_type}") |
139 | | - defs.append(model_cls.model_validate(entry)) |
140 | | - return defs |
141 | | - |
142 | 117 |
|
143 | 118 | @router.get("/health", response_model=StandardResponse[HealthData]) |
144 | 119 | async def health_check(): |
@@ -489,10 +464,10 @@ async def evaluate_traces( |
489 | 464 | eval_set_file: UploadFile | None = File(None), |
490 | 465 | ): |
491 | 466 | """ |
492 | | - Evaluate agent traces using specified metrics. |
| 467 | + Evaluate agent traces using the provided evaluator configuration. |
493 | 468 |
|
494 | 469 | Args: |
495 | | - trace_files: List of Jaeger JSON trace files |
| 470 | + trace_files: List of Jaeger or OTLP JSON trace files |
496 | 471 | config: JSON string with evaluation configuration |
497 | 472 | eval_set_file: Optional golden eval set file |
498 | 473 |
|
@@ -556,40 +531,23 @@ async def evaluate_traces( |
556 | 531 | ) |
557 | 532 | f.write(content) |
558 | 533 |
|
559 | | - metrics = config_dict.get("metrics", ["tool_trajectory_avg_score"]) |
560 | | - if not metrics or not isinstance(metrics, list): |
561 | | - raise HTTPException( |
562 | | - status_code=400, |
563 | | - detail="Config must include 'metrics' as a non-empty array", |
564 | | - ) |
565 | | - |
566 | | - threshold = config_dict.get("threshold") |
567 | | - if threshold is not None and (threshold < 0 or threshold > 1): |
568 | | - raise HTTPException( |
569 | | - status_code=400, |
570 | | - detail="Threshold must be between 0 and 1", |
| 534 | + try: |
| 535 | + eval_config = EvalRunConfig.model_validate( |
| 536 | + { |
| 537 | + **config_dict, |
| 538 | + "traceFiles": trace_paths, |
| 539 | + "evalSetFile": eval_set_path, |
| 540 | + "traceFormat": trace_format, |
| 541 | + } |
571 | 542 | ) |
| 543 | + except Exception as exc: |
| 544 | + raise HTTPException(status_code=400, detail=f"Invalid config: {exc}") from exc |
572 | 545 |
|
573 | | - custom_evaluators: list[CustomEvaluatorDef] = [] |
574 | | - raw_custom = config_dict.get("customEvaluators", config_dict.get("customMetrics", [])) |
575 | | - if raw_custom: |
576 | | - try: |
577 | | - custom_evaluators = _parse_custom_evaluators(raw_custom) |
578 | | - except Exception as exc: |
579 | | - raise HTTPException(status_code=400, detail=f"Invalid customEvaluators: {exc}") from exc |
580 | | - |
581 | | - eval_config = EvalRunConfig( |
582 | | - trace_files=trace_paths, |
583 | | - eval_set_file=eval_set_path, |
584 | | - metrics=metrics, |
585 | | - custom_evaluators=custom_evaluators, |
586 | | - trace_format=trace_format, |
587 | | - judge_model=config_dict.get("judgeModel"), |
588 | | - threshold=threshold, |
589 | | - trajectory_match_type=config_dict.get("trajectoryMatchType"), |
| 546 | + logger.info( |
| 547 | + "Evaluating %d trace file(s) with evaluators: %s", |
| 548 | + len(trace_paths), |
| 549 | + [e.name for e in eval_config.evaluators], |
590 | 550 | ) |
591 | | - |
592 | | - logger.info(f"Evaluating {len(trace_paths)} trace file(s) with metrics: {metrics}") |
593 | 551 | result = await run_evaluation(eval_config) |
594 | 552 |
|
595 | 553 | run_id = await _maybe_persist_evaluate_run( |
@@ -675,36 +633,19 @@ async def event_generator(): |
675 | 633 | return |
676 | 634 | f.write(content) |
677 | 635 |
|
678 | | - metrics = config_dict.get("metrics", ["tool_trajectory_avg_score"]) |
679 | | - if not metrics or not isinstance(metrics, list): |
680 | | - yield f"data: {SSEErrorEvent(error='Config must include metrics as a non-empty array').model_dump_json(by_alias=True)}\n\n" |
681 | | - return |
682 | | - |
683 | | - threshold = config_dict.get("threshold") |
684 | | - if threshold is not None and (threshold < 0 or threshold > 1): |
685 | | - yield f"data: {SSEErrorEvent(error='Threshold must be between 0 and 1').model_dump_json(by_alias=True)}\n\n" |
| 636 | + try: |
| 637 | + eval_config = EvalRunConfig.model_validate( |
| 638 | + { |
| 639 | + **config_dict, |
| 640 | + "traceFiles": trace_paths, |
| 641 | + "evalSetFile": eval_set_path, |
| 642 | + "traceFormat": trace_format, |
| 643 | + } |
| 644 | + ) |
| 645 | + except Exception as exc: |
| 646 | + yield f"data: {SSEErrorEvent(error=f'Invalid config: {exc}').model_dump_json(by_alias=True)}\n\n" |
686 | 647 | return |
687 | 648 |
|
688 | | - custom_evaluators: list[CustomEvaluatorDef] = [] |
689 | | - raw_custom = config_dict.get("customEvaluators", config_dict.get("customMetrics", [])) |
690 | | - if raw_custom: |
691 | | - try: |
692 | | - custom_evaluators = _parse_custom_evaluators(raw_custom) |
693 | | - except Exception as exc: |
694 | | - yield f"data: {SSEErrorEvent(error=f'Invalid customEvaluators: {exc}').model_dump_json(by_alias=True)}\n\n" |
695 | | - return |
696 | | - |
697 | | - eval_config = EvalRunConfig( |
698 | | - trace_files=trace_paths, |
699 | | - eval_set_file=eval_set_path, |
700 | | - metrics=metrics, |
701 | | - custom_evaluators=custom_evaluators, |
702 | | - trace_format=trace_format, |
703 | | - judge_model=config_dict.get("judgeModel"), |
704 | | - threshold=threshold, |
705 | | - trajectory_match_type=config_dict.get("trajectoryMatchType"), |
706 | | - ) |
707 | | - |
708 | 649 | for trace_file_path in trace_paths: |
709 | 650 | try: |
710 | 651 | traces = load_traces(trace_file_path, format=eval_config.trace_format) |
|
0 commit comments