clean up and simplify

krisztianfekete · krisztianfekete · commit 5ee51227079f · 2026-05-05T14:05:39.000+02:00
diff --git a/src/agentevals/api/routes.py b/src/agentevals/api/routes.py
@@ -95,39 +95,22 @@ async def _maybe_persist_evaluate_run(
     upload_filenames: list[str] | None,
     run_result: "RunResult",
 ) -> str | None:
-    """Persist a synchronously-completed eval as a Run + Result rows when
-    ``app.state.run_service`` is configured (i.e. ``backend=postgres``).
-
-    Returns the synthesized ``run_id`` so the caller can attach it to the
-    response (UI / SSE clients can then ``GET /api/runs/{id}/results`` to
-    pull historical context). Returns None on the memory backend so callers
-    keep their existing zero-config behavior. Errors are logged but never
-    propagated; if persistence fails the eval result is still returned to
-    the caller.
-    """
+    """Best-effort: persist the just-completed eval as a Run row + Result
+    rows when ``app.state.run_service`` is configured (postgres backend).
+    Returns the synthesized ``run_id`` for inclusion in the response, or
+    ``None`` on the memory backend or on persistence failure (eval result
+    is still returned to the caller in that case)."""
     service = getattr(request.app.state, "run_service", None)
     if service is None:
         return None
     try:
-        from ..run.service import RunService
-        from ..storage.models import RunSpec, TraceTarget
-
-        filenames = list(upload_filenames or [])
-        target = TraceTarget(
-            kind="uploaded",
-            trace_format=trace_format if trace_format in ("jaeger-json", "otlp-json") else None,
-            trace_count=len(filenames),
-            trace_files=filenames,
-        )
-        spec_payload = params.model_dump(by_alias=False)
-        spec = RunSpec(
-            approach="trace_replay",
-            target=target,
-            eval_config=spec_payload,
-            eval_set=eval_set_dict,
+        run = await service.record_eval_run(
+            params=params,
+            eval_set_dict=eval_set_dict,
+            trace_format=trace_format,
+            upload_filenames=upload_filenames,
+            run_result=run_result,
         )
-        assert isinstance(service, RunService)
-        run = await service.record_completed_eval(spec=spec, params=params, run_result=run_result)
         return str(run.run_id)
     except Exception:
         logger.exception("failed to persist /api/evaluate run; eval result still returned to caller")
diff --git a/src/agentevals/api/runs_routes.py b/src/agentevals/api/runs_routes.py
@@ -103,12 +103,11 @@ async def list_run_results(run_id: UUID, request: Request):
 
 @runs_router.post("/runs/{run_id}/cancel", response_model=StandardResponse[Run])
 async def cancel_run(run_id: UUID, request: Request):
+    """Mark a run cancel-requested. Idempotent: cancelling an already-terminal
+    run is a no-op and the current state is returned to the caller."""
     service = _service(request)
-    cancelled = await service.cancel(run_id)
+    await service.cancel(run_id)
     run = await service.get(run_id)
     if run is None:
         raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"run {run_id} not found")
-    if not cancelled and run.status not in (RunStatus.QUEUED, RunStatus.RUNNING):
-        # Already terminal; surface that to the caller without an error.
-        return StandardResponse(data=run)
     return StandardResponse(data=run)
diff --git a/src/agentevals/run/result_builder.py b/src/agentevals/run/result_builder.py
@@ -13,8 +13,8 @@
 from uuid import UUID
 
 from ..config import EvalParams
-from ..runner import RunResult
-from ..storage.models import Result
+from ..runner import MetricResult, RunResult
+from ..storage.models import Result, ResultStatus, compute_result_id
 
 EvaluatorType = Literal["builtin", "code", "remote", "openai_eval"]
 
@@ -29,6 +29,50 @@ def classify_evaluator(metric_name: str, params: EvalParams) -> EvaluatorType:
     return "builtin"
 
 
+def result_from_metric_result(
+    *,
+    run_id: UUID,
+    eval_set_item_id: str,
+    eval_set_item_name: str,
+    trace_id: str | None,
+    evaluator_type: EvaluatorType,
+    metric_result: MetricResult,
+) -> Result:
+    """Project a runner :class:`MetricResult` onto a persistable :class:`Result`.
+
+    A non-empty ``error`` field maps to ``ERRORED`` even when ``eval_status``
+    would have been ``PASSED`` / ``FAILED``; any other or missing
+    ``eval_status`` maps to ``SKIPPED``. Downstream consumers can therefore
+    filter on status alone without special-casing the error column.
+    """
+    if metric_result.error:
+        status = ResultStatus.ERRORED
+    else:
+        raw = (metric_result.eval_status or "NOT_EVALUATED").upper()
+        status = {
+            "PASSED": ResultStatus.PASSED,
+            "FAILED": ResultStatus.FAILED,
+        }.get(raw, ResultStatus.SKIPPED)
+
+    latency_ms = int(metric_result.duration_ms) if metric_result.duration_ms is not None else None
+
+    return Result(
+        result_id=compute_result_id(run_id, eval_set_item_id, metric_result.metric_name),
+        run_id=run_id,
+        eval_set_item_id=eval_set_item_id,
+        eval_set_item_name=eval_set_item_name,
+        evaluator_name=metric_result.metric_name,
+        evaluator_type=evaluator_type,
+        status=status,
+        score=metric_result.score,
+        per_invocation_scores=list(metric_result.per_invocation_scores or []),
+        trace_id=trace_id,
+        details=metric_result.details or {},
+        error_text=metric_result.error,
+        latency_ms=latency_ms,
+    )
+
+
 def build_results(run_id: UUID, params: EvalParams, run_result: RunResult) -> list[Result]:
     """Flatten ``run_result.trace_results[*].metric_results[*]`` into a list
     of persistable :class:`Result` rows.
@@ -41,13 +85,12 @@ def build_results(run_id: UUID, params: EvalParams, run_result: RunResult) -> li
     out: list[Result] = []
     for trace_result in run_result.trace_results:
         item_id = trace_result.trace_id
-        item_name = trace_result.trace_id
         for mr in trace_result.metric_results:
             out.append(
-                Result.from_metric_result(
+                result_from_metric_result(
                     run_id=run_id,
                     eval_set_item_id=item_id,
-                    eval_set_item_name=item_name,
+                    eval_set_item_name=item_id,
                     trace_id=trace_result.trace_id,
                     evaluator_type=classify_evaluator(mr.metric_name, params),
                     metric_result=mr,
diff --git a/src/agentevals/run/service.py b/src/agentevals/run/service.py
@@ -1,35 +1,28 @@
-"""Synchronous control surface used by ``/api/runs`` HTTP handlers.
+"""Synchronous control surface used by ``/api/runs`` and ``/api/evaluate``.
 
 Wraps the :class:`agentevals.storage.repos.RunRepository` with submit
-idempotency, list pagination, and the 409 spec-mismatch path.
-
-Also provides :meth:`RunService.record_completed_eval` for the
-``/api/evaluate`` path: that handler executes synchronously (the trace was
-already supplied as multipart and the result is being streamed back over
-SSE), so we synthesize a Run row for visibility in run history rather than
-queueing work for the worker.
+idempotency, list pagination, and the 409 spec-mismatch path. Also exposes
+:meth:`RunService.record_eval_run` for the ``/api/evaluate`` path, which
+executes synchronously and synthesizes a Run row for visibility in run
+history rather than queueing work for the worker.
 """
 
 from __future__ import annotations
 
-import json
 import logging
 from datetime import datetime, timezone
+from typing import Any
 from uuid import UUID, uuid4
 
 from ..config import EvalParams
 from ..runner import RunResult
-from ..storage.models import Run, RunSpec, RunStatus
+from ..storage.models import Run, RunSpec, RunStatus, TraceTarget
 from ..storage.repos import ResultRepository, RunRepository
 from .result_builder import build_results, summarize_run_result
 
 logger = logging.getLogger(__name__)
 
 
-def _now() -> datetime:
-    return datetime.now(timezone.utc)
-
-
 class RunSubmitConflict(Exception):
     """Raised when a re-submission's spec differs from the persisted one.
 
@@ -54,7 +47,7 @@ async def submit(self, *, run_id: UUID | None, spec: RunSpec) -> Run:
             spec=spec,
         )
         persisted = await self._runs.create(run)
-        if persisted.run_id == run.run_id and not _specs_equal(persisted.spec, spec):
+        if persisted.run_id == run.run_id and persisted.spec != spec:
             raise RunSubmitConflict(persisted)
         return persisted
 
@@ -76,30 +69,46 @@ async def list_results(self, run_id: UUID):
     async def cancel(self, run_id: UUID) -> bool:
         return await self._runs.cancel(run_id)
 
-    async def record_completed_eval(
+    async def record_eval_run(
         self,
         *,
-        spec: RunSpec,
         params: EvalParams,
+        eval_set_dict: dict[str, Any] | None,
+        trace_format: str | None,
+        upload_filenames: list[str] | None,
         run_result: RunResult,
     ) -> Run:
-        """Persist a synchronously-completed eval as a Run row plus Result rows.
-
-        The run is created already in ``running`` state (so the row passes the
-        ``run_running_has_worker`` check is sidestepped via a synthetic worker
-        id), then transitioned to a terminal state in the same call. Two
-        writes per eval, but using the public :class:`RunRepository` API
-        avoids leaking an executor-only schema requirement into this layer.
+        """Persist a synchronously-completed ``/api/evaluate`` call as a Run
+        row plus Result rows.
+
+        Builds an ``uploaded`` :class:`TraceTarget` from the request metadata,
+        creates a queued run, persists results, then transitions the run to
+        a terminal status. Two writes (create + update_status), but the
+        public :class:`RunRepository` API stays clean of executor-only
+        schema knowledge.
         """
+        filenames = list(upload_filenames or [])
+        target = TraceTarget(
+            kind="uploaded",
+            trace_format=trace_format if trace_format in ("jaeger-json", "otlp-json") else None,
+            trace_count=len(filenames),
+            trace_files=filenames,
+        )
+        spec = RunSpec(
+            approach="trace_replay",
+            target=target,
+            eval_config=params.model_dump(by_alias=False),
+            eval_set=eval_set_dict,
+        )
+
         run_id = uuid4()
-        worker_id = "sync:/api/evaluate"
         run = Run(
             run_id=run_id,
             status=RunStatus.QUEUED,
             spec=spec,
             attempt=1,
-            worker_id=worker_id,
-            started_at=_now(),
+            worker_id="sync:/api/evaluate",
+            started_at=datetime.now(timezone.utc),
         )
         await self._runs.create(run)
 
@@ -117,11 +126,3 @@ async def record_completed_eval(
             run.status = RunStatus.SUCCEEDED
         run.summary = summary
         return run
-
-
-def _specs_equal(a: RunSpec, b: RunSpec) -> bool:
-    """Deep equality on the JSON projection. Pydantic equality compares model
-    instances by class identity, which trips up the round-trip from JSONB."""
-    return json.dumps(a.model_dump(by_alias=False), sort_keys=True) == json.dumps(
-        b.model_dump(by_alias=False), sort_keys=True
-    )
diff --git a/src/agentevals/run/worker.py b/src/agentevals/run/worker.py
@@ -15,7 +15,7 @@
 import asyncio
 import logging
 import socket
-from datetime import datetime, timedelta, timezone
+from datetime import timedelta
 from uuid import UUID
 
 from google.adk.evaluation.eval_set import EvalSet
@@ -32,10 +32,6 @@
 logger = logging.getLogger(__name__)
 
 
-def _now() -> datetime:
-    return datetime.now(timezone.utc)
-
-
 class _CancelledByRequest(Exception):
     """Raised inside the worker task when the heartbeat observes cancel_requested."""
 
diff --git a/src/agentevals/storage/models.py b/src/agentevals/storage/models.py
@@ -121,49 +121,3 @@ class Result(BaseModel):
     tokens_used: dict[str, Any] | None = None
     latency_ms: int | None = None
     created_at: datetime = Field(default_factory=_now)
-
-    @classmethod
-    def from_metric_result(
-        cls,
-        *,
-        run_id: UUID,
-        eval_set_item_id: str,
-        eval_set_item_name: str,
-        trace_id: str | None,
-        evaluator_type: Literal["builtin", "code", "remote", "openai_eval"],
-        metric_result: Any,
-    ) -> Result:
-        """Project an in-pipeline MetricResult onto the persisted shape.
-
-        ADK emits ``eval_status`` strings ``PASSED`` / ``FAILED`` /
-        ``NOT_EVALUATED``; we additionally map presence of ``error`` to
-        ``errored`` so downstream consumers don't have to special-case
-        evaluator failures.
-        """
-        if metric_result.error:
-            status = ResultStatus.ERRORED
-        else:
-            raw = (metric_result.eval_status or "NOT_EVALUATED").upper()
-            status = {
-                "PASSED": ResultStatus.PASSED,
-                "FAILED": ResultStatus.FAILED,
-            }.get(raw, ResultStatus.SKIPPED)
-
-        scores: list[float | None] = list(metric_result.per_invocation_scores or [])
-        latency_ms = int(metric_result.duration_ms) if metric_result.duration_ms is not None else None
-
-        return cls(
-            result_id=compute_result_id(run_id, eval_set_item_id, metric_result.metric_name),
-            run_id=run_id,
-            eval_set_item_id=eval_set_item_id,
-            eval_set_item_name=eval_set_item_name,
-            evaluator_name=metric_result.metric_name,
-            evaluator_type=evaluator_type,
-            status=status,
-            score=metric_result.score,
-            per_invocation_scores=scores,
-            trace_id=trace_id,
-            details=metric_result.details or {},
-            error_text=metric_result.error,
-            latency_ms=latency_ms,
-        )
diff --git a/src/agentevals/storage/repos/memory.py b/src/agentevals/storage/repos/memory.py
@@ -20,10 +20,6 @@
     from ...streaming.session import TraceSession
 
 
-def _now() -> datetime:
-    return datetime.now(timezone.utc)
-
-
 class MemorySessionRepository:
     def __init__(self) -> None:
         self._sessions: dict[str, TraceSession] = {}
@@ -87,7 +83,7 @@ async def list(
         return runs[:limit]
 
     async def claim_next(self, *, worker_id: str, lease: timedelta, max_attempts: int) -> Run | None:
-        now = _now()
+        now = datetime.now(timezone.utc)
         async with self._lock:
             candidates = [r for r in self._runs.values() if r.status == RunStatus.QUEUED and r.attempt < max_attempts]
             candidates.sort(key=lambda r: r.created_at)
@@ -125,7 +121,7 @@ async def update_status(
             if summary is not None:
                 run.summary = summary
             if status in (RunStatus.SUCCEEDED, RunStatus.FAILED, RunStatus.CANCELLED):
-                run.finished_at = _now()
+                run.finished_at = datetime.now(timezone.utc)
 
     async def cancel(self, run_id: UUID) -> bool:
         async with self._lock:
@@ -135,7 +131,7 @@ async def cancel(self, run_id: UUID) -> bool:
             run.cancel_requested = True
             if run.status == RunStatus.QUEUED:
                 run.status = RunStatus.CANCELLED
-                run.finished_at = _now()
+                run.finished_at = datetime.now(timezone.utc)
             return True
 
 
diff --git a/src/agentevals/storage/repos/postgres.py b/src/agentevals/storage/repos/postgres.py
@@ -10,7 +10,7 @@
 
 import json
 import logging
-from datetime import datetime, timedelta, timezone
+from datetime import datetime, timedelta
 from typing import TYPE_CHECKING
 from uuid import UUID
 
@@ -25,10 +25,6 @@
 logger = logging.getLogger(__name__)
 
 
-def _now() -> datetime:
-    return datetime.now(timezone.utc)
-
-
 def _row_to_session(row: "asyncpg.Record") -> "TraceSession":
     from ...streaming.session import TraceSession
 
diff --git a/tests/api/test_evaluate_persistence.py b/tests/api/test_evaluate_persistence.py
diff --git a/tests/run/test_result_builder.py b/tests/run/test_result_builder.py
diff --git a/tests/run/test_service.py b/tests/run/test_service.py
diff --git a/tests/storage/test_models.py b/tests/storage/test_models.py