agentevals-dev
diff --git a/‎DEVELOPMENT.md‎
Lines changed: 49 additions & 2 deletions b/‎DEVELOPMENT.md‎
Lines changed: 49 additions & 2 deletions
diff --git a/‎Makefile‎
Lines changed: 10 additions & 1 deletion b/‎Makefile‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎pyproject.toml‎
Lines changed: 7 additions & 0 deletions b/‎pyproject.toml‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎src/agentevals/api/app.py‎
Lines changed: 22 additions & 23 deletions b/‎src/agentevals/api/app.py‎
Lines changed: 22 additions & 23 deletions
diff --git a/‎src/agentevals/streaming/ws_server.py‎
Lines changed: 23 additions & 4 deletions b/‎src/agentevals/streaming/ws_server.py‎
Lines changed: 23 additions & 4 deletions
diff --git a/‎tests/integration/__init__.py‎ b/‎tests/integration/__init__.py‎
@@ -37,13 +37,60 @@ make build-ui          # build React app only → ui/dist/
 
 Both `build` and `build-bundle` produce `dist/agentevals-*.whl` with the same package name and version. The difference is that `build-bundle` embeds `ui/dist/` as `agentevals/_static/` inside the wheel. The hatchling `artifacts` config ensures the gitignored `_static/` directory is included.
 
-### Testing and cleanup
+### Testing
+
+```bash
+make test              # run the whole suite (unit + integration; e2e tests are skipped unless OPENAI_API_KEY is set)
+make test-unit         # unit tests only (fast, no server startup)
+make test-integration  # integration tests — OTLP pipeline, session grouping, timing (no API keys)
+make test-e2e          # E2E tests — real agents as subprocesses (requires OPENAI_API_KEY)
+```
+
+### Cleanup
 
 ```bash
-make test              # run pytest
 make clean             # remove dist/, build/, ui/dist/, src/agentevals/_static/
 ```
 
+## Testing
+
+### Test tiers
+
+Tests are organized into three tiers with different trade-offs:
+
+| Tier | Location | Transport | API keys | What it verifies |
+|------|----------|-----------|----------|------------------|
+| **Unit** | `tests/` (excl. integration) | `TestClient` / mocks | None | Business logic, route handlers, converters |
+| **Integration** | `tests/integration/` | ASGI in-process | None | OTLP session grouping, timing, concurrent batches, eval pipeline |
+| **E2E** | `tests/integration/test_live_agents.py` | Real uvicorn servers | `OPENAI_API_KEY` | Full pipeline — real agent → OTLP export → session creation → invocation extraction → API visibility |
+
+Integration tests use `httpx.ASGITransport` to hit the OTLP and streaming API routes in-process (no ports, no real HTTP). Timers are set to short values (0.1s grace, 0.5s idle) so the tests run quickly and deterministically.
+
+E2E tests start real uvicorn servers on ephemeral ports in a background thread, then run example agent scripts as subprocesses that emit real OTLP traces with `BatchSpanProcessor`/`BatchLogRecordProcessor` flush timing.
+
+### Running E2E tests
+
+E2E tests require `OPENAI_API_KEY` (used by LangChain and Strands agents). They are skipped automatically when the key is not set.
+
+```bash
+# Source your .env and run
+set -a && source .env && set +a && make test-e2e
+```
+
+### Adding tests for new examples
+
+When adding a new example agent to `examples/`, add corresponding E2E tests to ensure the full OTLP pipeline works:
+
+1. Add a test class in `tests/integration/test_live_agents.py` following the existing pattern (`TestLangchainZeroCode`, `TestStrandsZeroCode`)
+2. Each agent should have at least three tests:
+   - **Session creation** — agent runs successfully, session is created with spans (and logs if applicable)
+   - **Invocation extraction** — invocations are extracted with user/agent content
+   - **API visibility** — session appears in `GET /api/streaming/sessions`
+3. Use `_run_agent()` to run the example as a subprocess with the test OTLP endpoint
+4. Use `wait_for_session_complete_sync()` to poll until the session finalizes
+5. Mark the test class with the appropriate skip condition (e.g., `_skip_no_openai`)
+6. Use unique `session_name` values per test to avoid collisions within the session-scoped server fixture
+
 ## Runtime behavior
 
 The serve command auto-detects the active mode:
 
@@ -1,7 +1,7 @@
 VERSION := $(shell grep '^version' pyproject.toml | cut -d'"' -f2)
 WHEEL := dist/agentevals-$(VERSION)-py3-none-any.whl
 
-.PHONY: build build-bundle build-ui release clean dev-backend dev-frontend dev-bundle test
+.PHONY: build build-bundle build-ui release clean dev-backend dev-frontend dev-bundle test test-unit test-integration test-e2e
 
 build:
 	uv build
@@ -47,6 +47,15 @@ dev-bundle: build-ui
 test:
 	uv run pytest
 
+test-unit:
+	uv run pytest tests/ --ignore=tests/integration
+
+test-integration:
+	uv run pytest tests/integration/ -m "integration and not e2e" -v
+
+test-e2e:
+	uv run pytest tests/integration/ -m "e2e" -v
+
 clean:
 	rm -rf dist/ build/ src/agentevals/_static/ ui/dist/
 	find . -name '*.egg-info' -type d -exec rm -rf {} + 2>/dev/null || true
@@ -47,8 +47,15 @@ members = []
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 pythonpath = ["src"]
+markers = [
+    "integration: OTLP pipeline tests with ASGI apps (no API keys)",
+    "e2e: End-to-end tests requiring API keys and real agents",
+]
+asyncio_mode = "auto"
 
 [dependency-groups]
 dev = [
     "pytest>=9.0.2",
+    "pytest-asyncio>=0.24.0",
+    "httpx>=0.27.0",
 ]
@@ -4,6 +4,7 @@
 import json
 import logging
 import os
+from contextlib import asynccontextmanager
 from pathlib import Path
 
 from fastapi import FastAPI
@@ -23,10 +24,31 @@
 except ImportError:
     pass
 
+@asynccontextmanager
+async def lifespan(app: FastAPI):  # code before `yield` runs at startup, code after it at shutdown
+    log_level_str = os.getenv("AGENTEVALS_LOG_LEVEL", "INFO").upper()
+    log_level = getattr(logging, log_level_str, logging.INFO)  # unknown level names fall back to INFO
+    logging.basicConfig(
+        level=log_level,
+        format="%(levelname)s:%(name)s:%(message)s",
+        force=True,  # replace any handlers already attached to the root logger
+    )
+    ae_logger = logging.getLogger("agentevals")
+    ae_logger.setLevel(log_level)
+    log_buffer.setFormatter(logging.Formatter("%(levelname)s:%(name)s:%(message)s"))  # same format as the root config above
+    ae_logger.addHandler(log_buffer)
+    if _trace_manager:  # nothing to start when _trace_manager is falsy
+        _trace_manager.start_cleanup_task()
+    yield  # suspended here for the lifetime of the application
+    if _trace_manager:
+        await _trace_manager.shutdown()
+
+
 app = FastAPI(
     title="agentevals API",
     version=__version__,
     description="REST API for evaluating agent traces using ADK's scoring framework",
+    lifespan=lifespan,
 )
 
 app.add_middleware(
@@ -105,26 +127,3 @@ async def spa_fallback(path: str):
         if file_path.is_file():
             return FileResponse(file_path)
         return FileResponse(_static_dir / "index.html")
-
-
-@app.on_event("startup")
-async def on_startup():
-    log_level_str = os.getenv("AGENTEVALS_LOG_LEVEL", "INFO").upper()
-    log_level = getattr(logging, log_level_str, logging.INFO)
-    logging.basicConfig(
-        level=log_level,
-        format="%(levelname)s:%(name)s:%(message)s",
-        force=True,
-    )
-    ae_logger = logging.getLogger("agentevals")
-    ae_logger.setLevel(log_level)
-    log_buffer.setFormatter(logging.Formatter("%(levelname)s:%(name)s:%(message)s"))
-    ae_logger.addHandler(log_buffer)
-    if _trace_manager:
-        _trace_manager.start_cleanup_task()
-
-
-@app.on_event("shutdown")
-async def on_shutdown():
-    if _trace_manager:
-        await _trace_manager.shutdown()
@@ -36,14 +36,27 @@ class StreamingTraceManager:
     Args:
         session_ttl_hours: How long to keep completed sessions in memory (default: 2 hours)
         max_sessions: Maximum number of sessions to keep (default: 100)
+        completion_grace_seconds: Delay after root span before completing session (default: 3.0)
+        idle_timeout_seconds: Complete session after this many seconds of inactivity (default: 30.0)
+        reextraction_delay_seconds: Debounce delay for late-log re-extraction (default: 2.0)
     """
 
-    def __init__(self, session_ttl_hours: int = 2, max_sessions: int = 100):
+    def __init__(
+        self,
+        session_ttl_hours: int = 2,
+        max_sessions: int = 100,
+        completion_grace_seconds: float = 3.0,
+        idle_timeout_seconds: float = 30.0,
+        reextraction_delay_seconds: float = 2.0,
+    ):
         self.sessions: dict[str, TraceSession] = {}
         self.incremental_extractors: dict[str, IncrementalInvocationExtractor] = {}
         self.sse_queues: list[asyncio.Queue] = []
         self.session_ttl = timedelta(hours=session_ttl_hours)
         self.max_sessions = max_sessions
+        self.completion_grace_seconds = completion_grace_seconds
+        self.idle_timeout_seconds = idle_timeout_seconds
+        self.reextraction_delay_seconds = reextraction_delay_seconds
         self._cleanup_task: asyncio.Task | None = None
         self._completion_timers: dict[str, asyncio.Task] = {}
         self._idle_timers: dict[str, asyncio.Task] = {}
@@ -72,6 +85,12 @@ async def shutdown(self) -> None:
         """Gracefully shut down: close SSE clients and cancel background tasks."""
         for queue in self.sse_queues:
             queue.put_nowait(None)
+        for task in self._completion_timers.values():
+            task.cancel()
+        self._completion_timers.clear()
+        for task in self._idle_timers.values():
+            task.cancel()
+        self._idle_timers.clear()
         if self._cleanup_task:
             self._cleanup_task.cancel()
             try:
@@ -274,7 +293,7 @@ def schedule_session_completion(self, session_id: str) -> None:
             self._completion_timers[session_id].cancel()
 
         self._completion_timers[session_id] = asyncio.create_task(
-            self._delayed_complete(session_id, 3.0)
+            self._delayed_complete(session_id, self.completion_grace_seconds)
         )
 
     def reset_idle_timer(self, session_id: str) -> None:
@@ -289,7 +308,7 @@ def reset_idle_timer(self, session_id: str) -> None:
             self._idle_timers[session_id].cancel()
 
         self._idle_timers[session_id] = asyncio.create_task(
-            self._delayed_complete(session_id, 30.0)
+            self._delayed_complete(session_id, self.idle_timeout_seconds)
         )
 
     def schedule_log_reextraction(self, session_id: str) -> None:
@@ -304,7 +323,7 @@ def schedule_log_reextraction(self, session_id: str) -> None:
             self._completion_timers[key].cancel()
 
         self._completion_timers[key] = asyncio.create_task(
-            self._delayed_reextract(session_id, 2.0)
+            self._delayed_reextract(session_id, self.reextraction_delay_seconds)
         )
 
     async def _delayed_complete(self, session_id: str, delay: float) -> None: