Skip to content

Commit 572321b

Browse files
Merge pull request #123 from agentevals-dev/feature/trace-loader-extensions
Improve trace loading logic
2 parents d0955cc + d020d3b commit 572321b

8 files changed

Lines changed: 783 additions & 63 deletions

File tree

src/agentevals/api/app.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,7 @@
1010
from pathlib import Path
1111
from typing import TYPE_CHECKING
1212

13-
from fastapi import FastAPI, Request
14-
from fastapi import WebSocket
13+
from fastapi import FastAPI, Request, WebSocket
1514
from fastapi.middleware.cors import CORSMiddleware
1615
from fastapi.responses import StreamingResponse
1716

src/agentevals/api/models.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
from pydantic import BaseModel, ConfigDict, Field
1212
from pydantic.alias_generators import to_camel
1313

14+
from ..config import EvalParams
15+
1416
T = TypeVar("T")
1517

1618

@@ -134,6 +136,14 @@ class ConvertTracesData(CamelModel):
134136
traces: list[TraceConversionEntry]
135137

136138

139+
class EvaluateJsonRequest(CamelModel):
    """JSON payload accepted by the ``POST /evaluate/json`` endpoints.

    Carries the traces inline (instead of as uploaded files) together with
    the evaluation parameters and an optional eval set.
    """

    # Raw OTLP JSON export; parsed later by the OTLP loader.
    traces: dict = Field(
        description="OTLP JSON export with resourceSpans structure.",
    )
    # Falls back to a default-constructed EvalParams when omitted.
    config: EvalParams = Field(
        default_factory=EvalParams,
        description="Evaluation parameters.",
    )
    # Omitted / null means "evaluate without an eval set".
    eval_set: dict | None = Field(
        default=None,
        description="Optional ADK EvalSet JSON.",
    )
145+
146+
137147
# ---------------------------------------------------------------------------
138148
# SSE evaluation event models
139149
# ---------------------------------------------------------------------------

src/agentevals/api/routes.py

Lines changed: 158 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import tempfile
1212
from typing import Any
1313

14-
from fastapi import APIRouter, File, Form, HTTPException, UploadFile
14+
from fastapi import APIRouter, File, Form, HTTPException, Request, UploadFile
1515
from fastapi.responses import StreamingResponse
1616
from pydantic.alias_generators import to_camel
1717

@@ -27,13 +27,22 @@
2727
)
2828
from ..converter import convert_traces
2929
from ..extraction import get_extractor
30-
from ..runner import RunResult, get_loader, load_eval_set, run_evaluation
30+
from ..loader.otlp import OtlpJsonLoader
31+
from ..runner import (
32+
RunResult,
33+
get_loader,
34+
load_eval_set,
35+
load_eval_set_from_dict,
36+
run_evaluation,
37+
run_evaluation_from_traces,
38+
)
3139
from ..trace_metrics import extract_performance_metrics, extract_trace_metadata
3240
from .models import (
3341
ApiKeyStatus,
3442
ConfigData,
3543
ConvertTracesData,
3644
EvalSetValidation,
45+
EvaluateJsonRequest,
3746
HealthData,
3847
MetricInfo,
3948
SSEDoneEvent,
@@ -61,6 +70,8 @@ def _camel_keys(obj: Any) -> Any:
6170

6271
router = APIRouter()
6372

73+
_MAX_JSON_BODY_BYTES = 50 * 1024 * 1024 # 50 MB (multipart endpoints allow 10 MB per file)
74+
6475
_TYPE_TO_MODEL = {
6576
"builtin": BuiltinMetricDef,
6677
"code": CodeEvaluatorDef,
@@ -729,3 +740,148 @@ async def run_with_progress():
729740
"Connection": "keep-alive",
730741
},
731742
)
743+
744+
745+
def _parse_json_request(request: EvaluateJsonRequest):
    """Load the traces and optional eval set carried by a JSON evaluation request.

    Returns:
        A ``(traces, eval_set)`` tuple. ``eval_set`` is ``None`` when the
        request has no eval set (or an empty one).

    Raises:
        HTTPException: status 400 when the OTLP payload is rejected by the
            loader, when it yields no traces, or when the eval set cannot
            be loaded.
    """
    try:
        loaded_traces = OtlpJsonLoader().load_from_dict(request.traces)
    except ValueError as err:
        raise HTTPException(status_code=400, detail=str(err)) from err

    if not loaded_traces:
        raise HTTPException(status_code=400, detail="No traces found in OTLP JSON")

    # Nothing further to parse when no eval set was supplied.
    if not request.eval_set:
        return loaded_traces, None

    try:
        parsed_eval_set = load_eval_set_from_dict(request.eval_set)
    except Exception as err:
        raise HTTPException(status_code=400, detail=f"Invalid eval set: {err}") from err

    return loaded_traces, parsed_eval_set
766+
767+
768+
def _check_json_body_size(raw_request: Request) -> None:
    """Reject requests whose declared ``Content-Length`` exceeds the JSON body limit.

    Args:
        raw_request: The incoming request; only its headers are inspected.

    Raises:
        HTTPException: status 400 when the ``Content-Length`` header is not
            an integer; status 413 when it exceeds ``_MAX_JSON_BODY_BYTES``.
    """
    # NOTE(review): this only inspects the *declared* length. A request without
    # a Content-Length header is treated as size 0 and passes the check. The
    # endpoints in this file also call it from inside the handler, i.e. after
    # the framework has already parsed the body model -- confirm whether an
    # earlier (middleware/dependency) check is wanted.
    declared = raw_request.headers.get("content-length", 0)
    try:
        content_length = int(declared)
    except (TypeError, ValueError) as exc:
        # A malformed header is a client error, not an internal failure.
        raise HTTPException(status_code=400, detail="Invalid Content-Length header") from exc

    if content_length > _MAX_JSON_BODY_BYTES:
        raise HTTPException(
            status_code=413,
            detail=f"Request body exceeds {_MAX_JSON_BODY_BYTES // (1024 * 1024)}MB limit",
        )
775+
776+
777+
def _sse_error(message: str) -> str:
    """Return *message* wrapped in an ``SSEErrorEvent`` and framed as one SSE ``data:`` record."""
    payload = SSEErrorEvent(error=message).model_dump_json(by_alias=True)
    return f"data: {payload}\n\n"
779+
780+
781+
@router.post("/evaluate/json", response_model=StandardResponse[RunResult])
async def evaluate_traces_json(request: EvaluateJsonRequest, raw_request: Request):
    """Evaluate OTLP JSON traces passed in the request body.

    Args:
        request: Parsed JSON body (traces, evaluation config, optional eval set).
        raw_request: The underlying request, used only for the body-size check.

    Returns:
        A ``StandardResponse`` whose ``data`` is the camel-cased run result.

    Raises:
        HTTPException: propagated from the size check and request parsing;
            status 500 when the evaluation itself (or serializing its
            result) fails.
    """
    _check_json_body_size(raw_request)
    parsed_traces, parsed_eval_set = _parse_json_request(request)

    try:
        run_result = await run_evaluation_from_traces(
            traces=parsed_traces,
            config=request.config,
            eval_set=parsed_eval_set,
        )
        response_data = _camel_keys(run_result.model_dump(by_alias=True))
        return StandardResponse(data=response_data)
    except Exception as err:
        logger.exception("JSON evaluation failed")
        raise HTTPException(status_code=500, detail=f"Internal error: {err!s}") from err
797+
798+
799+
@router.post("/evaluate/json/stream")
async def evaluate_traces_json_stream(request: EvaluateJsonRequest, raw_request: Request):
    """Evaluate OTLP JSON traces with real-time progress via SSE.

    The response is a ``text/event-stream``. Per trace, an early
    ``performance_metrics`` event is emitted (best effort), followed by
    progress / trace-progress events while the evaluation runs, and finally
    either a done event carrying the full result or an error event.

    Args:
        request: Parsed JSON body (traces, evaluation config, optional eval set).
        raw_request: The underlying request, used only for the body-size check.

    Raises:
        HTTPException: propagated from the body-size check. Failures after
            that point are reported inside the stream as SSE error events.
    """
    _check_json_body_size(raw_request)

    async def event_generator():
        try:
            try:
                traces, eval_set = _parse_json_request(request)
            except HTTPException as exc:
                # Parsing runs inside the stream, so report the failure as an
                # SSE error frame rather than as an HTTP error response.
                yield _sse_error(exc.detail)
                return

            # Best-effort early metrics: a failure for one trace is logged and
            # must not abort the evaluation of the others.
            for trace in traces:
                try:
                    extractor = get_extractor(trace)
                    perf_metrics = _camel_keys(extract_performance_metrics(trace, extractor))
                    trace_metadata = _camel_keys(extract_trace_metadata(trace, extractor))
                    evt = SSEPerformanceMetricsEvent(
                        trace_id=trace.trace_id,
                        performance_metrics=perf_metrics,
                        trace_metadata=trace_metadata,
                    )
                    yield f"event: performance_metrics\ndata: {evt.model_dump_json(by_alias=True)}\n\n"
                except Exception as e:
                    logger.error(f"Failed to extract early performance metrics: {e}")

            # Channel between the background evaluation task (producer) and
            # this generator (consumer). Items are ``(tag, payload)`` tuples.
            queue: asyncio.Queue = asyncio.Queue()

            async def progress_callback(message: str):
                await queue.put(("progress", message))

            async def trace_progress_callback(trace_result):
                await queue.put(("trace_progress", trace_result))

            async def run_with_progress():
                try:
                    result = await run_evaluation_from_traces(
                        traces=traces,
                        config=request.config,
                        eval_set=eval_set,
                        progress_callback=progress_callback,
                        trace_progress_callback=trace_progress_callback,
                    )
                except Exception as exc:
                    # Forward the failure to the consumer. Without this the
                    # consumer would wait on an empty queue forever and the
                    # stream would hang instead of reporting the error.
                    await queue.put(("error", exc))
                else:
                    await queue.put(("done", result))

            eval_task = asyncio.create_task(run_with_progress())

            try:
                while True:
                    tag, payload = await queue.get()

                    if tag == "done":
                        evt = SSEDoneEvent(
                            result=_camel_keys(payload.model_dump(by_alias=True)),
                        )
                        yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
                        break
                    elif tag == "error":
                        # Re-raise so the outer handler logs it and emits the
                        # SSE error frame, same as any other stream failure.
                        raise payload
                    elif tag == "trace_progress":
                        evt = SSETraceProgressEvent(
                            trace_progress=SSETraceProgress(
                                trace_id=payload.trace_id,
                                partial_result=_camel_keys(payload.model_dump(by_alias=True)),
                            )
                        )
                        yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
                    elif tag == "progress":
                        evt = SSEProgressEvent(message=payload)
                        yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
            finally:
                # Runs on normal completion, on error, and when the generator
                # is closed early: make sure the background task does not
                # outlive the stream.
                if not eval_task.done():
                    eval_task.cancel()
                    try:
                        await eval_task
                    except asyncio.CancelledError:
                        pass

        except Exception as exc:
            logger.exception("JSON evaluation stream failed")
            yield _sse_error(str(exc))

    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
        },
    )

src/agentevals/config.py

Lines changed: 35 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
from pathlib import Path
66
from typing import Annotated, Any, Literal
77

8-
from pydantic import BaseModel, Field, field_validator
8+
from pydantic import BaseModel, ConfigDict, Field, field_validator
9+
from pydantic.alias_generators import to_camel
910

1011

1112
class BuiltinMetricDef(BaseModel):
@@ -99,13 +100,14 @@ def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
99100
]
100101

101102

102-
class EvalRunConfig(BaseModel):
103-
trace_files: list[str] = Field(description="Paths to trace files (Jaeger JSON or OTLP JSON).")
103+
class EvalParams(BaseModel):
104+
"""Evaluation parameters independent of how traces are provided.
104105
105-
eval_set_file: str | None = Field(
106-
default=None,
107-
description="Path to a golden eval set JSON file (ADK EvalSet format).",
108-
)
106+
Used by ``run_evaluation_from_traces`` for programmatic / API-driven
107+
evaluation. ``EvalRunConfig`` inherits from this and adds file I/O fields.
108+
"""
109+
110+
model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
109111

110112
metrics: list[str] = Field(
111113
default_factory=lambda: ["tool_trajectory_avg_score"],
@@ -117,19 +119,16 @@ class EvalRunConfig(BaseModel):
117119
description="Custom evaluator definitions.",
118120
)
119121

120-
trace_format: str = Field(
121-
default="jaeger-json",
122-
description="Format of the trace files (jaeger-json or otlp-json).",
123-
)
124-
125122
judge_model: str | None = Field(
126123
default=None,
127124
description="LLM model for judge-based metrics.",
128125
)
129126

130127
threshold: float | None = Field(
131128
default=None,
132-
description="Score threshold for pass/fail.",
129+
ge=0,
130+
le=1,
131+
description="Score threshold for pass/fail (0.0 to 1.0).",
133132
)
134133

135134
trajectory_match_type: str | None = Field(
@@ -145,17 +144,35 @@ def _validate_trajectory_match_type(cls, v: str | None) -> str | None:
145144
raise ValueError(f"Invalid trajectory_match_type '{v}'. Valid values: {sorted(valid)}")
146145
return v.upper() if v is not None else v
147146

148-
output_format: str = Field(
149-
default="table",
150-
description="Output format: 'table', 'json', or 'summary'.",
151-
)
152-
153147
max_concurrent_traces: int = Field(
154148
default=10,
149+
ge=1,
155150
description="Maximum number of traces to evaluate concurrently.",
156151
)
157152

158153
max_concurrent_evals: int = Field(
159154
default=5,
155+
ge=1,
160156
description="Maximum number of concurrent metric evaluations (LLM API calls).",
161157
)
158+
159+
160+
class EvalRunConfig(EvalParams):
    """Configuration for an evaluation run whose inputs are read from files.

    Extends ``EvalParams`` with the file locations, the trace file format,
    and the output format.
    """

    # Required: there is no default, so at least the field must be supplied.
    trace_files: list[str] = Field(description="Paths to trace files (Jaeger JSON or OTLP JSON).")

    eval_set_file: str | None = Field(default=None, description="Path to a golden eval set JSON file (ADK EvalSet format).")

    trace_format: str = Field(default="jaeger-json", description="Format of the trace files (jaeger-json or otlp-json).")

    output_format: str = Field(default="table", description="Output format: 'table', 'json', or 'summary'.")

0 commit comments

Comments
 (0)