Skip to content

Commit 37f44a0

Browse files
refactor trace processing/conversion, update CONTRIBUTING.md on how to extend what we have
1 parent a8ca8a6 commit 37f44a0

14 files changed

Lines changed: 1141 additions & 549 deletions

CONTRIBUTING.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,41 @@ samples/ # Example traces and eval sets
147147
docs/ # Documentation
148148
```
149149

150+
## Trace Processing Architecture
151+
152+
agentevals converts OTel traces from agent frameworks into a common `Invocation` format for evaluation. If you're adding support for a new framework or changing how we extract data from spans, this section will help you find your way around.
153+
154+
### Key Modules
155+
156+
| Module | What it does |
157+
|--------|--------------|
158+
| `trace_attrs.py` | Single source of truth for OTel attribute key constants (`OTEL_GENAI_*` for standard semconv, `ADK_*` for Google ADK) |
159+
| `extraction.py` | Shared extraction functions, span classifiers, and the `TraceFormatExtractor` protocol with `AdkExtractor` / `GenAIExtractor` |
160+
| `converter.py` | Orchestrates batch conversion and turns ADK traces into `Invocation` objects |
161+
| `genai_converter.py` | Batch conversion for GenAI semconv traces (single-turn and multi-turn) |
162+
| `streaming/incremental_processor.py` | Real-time span processing for the live UI; uses the same shared extraction functions |
163+
| `utils/log_enrichment.py` | Reconstructs `gen_ai.input/output.messages` from OTel log records into span attributes |
164+
165+
### Adding a new attribute constant
166+
167+
Add it to `trace_attrs.py` and import from there. Don't use hardcoded attribute key strings elsewhere.
168+
169+
### Adding or modifying extraction logic
170+
171+
The extraction functions in `extraction.py` accept flat `dict[str, Any]` attribute maps. This means they work with both the `Span`-based batch converters (via `span.tags`) and the incremental processor, which operates on raw OTLP dicts. When extracting data, check ADK-specific attributes first (they contain richer data), then fall back to GenAI semconv.
172+
173+
### Supporting a new trace format
174+
175+
1. Add a new `TraceFormatExtractor` implementation in `extraction.py` with `format_name()`, `detect()`, `find_invocation_spans()`, `find_llm_spans_in()`, `find_tool_spans_in()`, and `classify_span()`
176+
2. Register it in the `_EXTRACTORS` list. Order matters here: more specific formats should come first so they get detected before the generic GenAI fallback
177+
3. If the format introduces new attribute keys, add them to `trace_attrs.py`
178+
4. If you need conversion logic that the shared extraction functions don't cover, add a dedicated converter module (see `genai_converter.py` for an example)
179+
5. Add tests to `tests/test_extraction.py` for detection and span classification
180+
181+
### Adding an SDK example
182+
183+
Each example directory under `examples/` is self-contained with its own `requirements.txt`. The example needs to actually produce OTel spans. For OpenAI-based agents this means including `opentelemetry-instrumentation-openai-v2` in the requirements. Make sure all framework-specific OTel dependencies are listed in the example's `requirements.txt`.
184+
150185
## Getting Help
151186

152187
- Open an [issue](https://github.com/agentevals-dev/agentevals/issues) for bugs or questions
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
langchain-openai>=1.1.10
2+
opentelemetry-instrumentation-openai-v2
3+
opentelemetry-sdk>=1.39.1
4+
python-dotenv>=1.0.0

src/agentevals/api/streaming_routes.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from ..loader.otlp import OtlpJsonLoader
1616
from ..runner import run_evaluation
1717
from ..config import EvalRunConfig
18+
from ..trace_attrs import OTEL_GENAI_INPUT_MESSAGES, OTEL_GENAI_REQUEST_MODEL
1819
from ..utils.log_enrichment import enrich_spans_with_logs
1920

2021
logger = logging.getLogger(__name__)
@@ -79,11 +80,22 @@ async def create_eval_set_from_session(request: CreateEvalSetRequest):
7980

8081
try:
8182
trace_file = await trace_manager._save_spans_to_temp_file(session)
83+
logger.debug(
84+
"Session %s: %d spans, %d logs saved to %s",
85+
request.session_id, len(session.spans), len(session.logs), trace_file,
86+
)
8287
loader = OtlpJsonLoader()
8388
traces = loader.load(str(trace_file))
8489

8590
if not traces:
86-
raise HTTPException(status_code=400, detail="No traces found in session")
91+
raise HTTPException(
92+
status_code=400,
93+
detail=(
94+
f"No traces found in session (spans={len(session.spans)}, "
95+
f"logs={len(session.logs)}). If using the SDK with langchain/openai, "
96+
f"ensure opentelemetry-instrumentation-openai-v2 is installed."
97+
),
98+
)
8799

88100
conversion_results = convert_traces(traces)
89101
if not conversion_results:
@@ -126,6 +138,8 @@ async def create_eval_set_from_session(request: CreateEvalSetRequest):
126138
"num_invocations": len(all_invocations),
127139
}
128140

141+
except HTTPException:
142+
raise
129143
except Exception as exc:
130144
logger.exception("Failed to create eval set")
131145
raise HTTPException(status_code=500, detail=str(exc))
@@ -212,6 +226,8 @@ async def eval_one_session(session_id: str, session) -> dict:
212226
"results": results,
213227
}
214228

229+
except HTTPException:
230+
raise
215231
except Exception as exc:
216232
logger.exception("Failed to evaluate sessions")
217233
raise HTTPException(status_code=500, detail=str(exc))
@@ -262,6 +278,8 @@ async def prepare_evaluation(request: PrepareEvaluationRequest):
262278
"num_traces": len(trace_files),
263279
}
264280

281+
except HTTPException:
282+
raise
265283
except Exception as exc:
266284
logger.exception("Failed to prepare evaluation")
267285
raise HTTPException(status_code=500, detail=str(exc))
@@ -304,7 +322,7 @@ async def get_trace(request: GetTraceRequest):
304322
has_genai_spans = any(
305323
span.get("attributes", [])
306324
and any(
307-
attr.get("key") in ("gen_ai.request.model", "gen_ai.input.messages")
325+
attr.get("key") in (OTEL_GENAI_REQUEST_MODEL, OTEL_GENAI_INPUT_MESSAGES)
308326
for attr in span.get("attributes", [])
309327
)
310328
for span in session.spans

src/agentevals/converter.py

Lines changed: 25 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -19,24 +19,23 @@
1919
from google.adk.evaluation.eval_case import IntermediateData, Invocation
2020
from google.genai import types as genai_types
2121

22+
from .extraction import get_extractor
2223
from .loader.base import Span, Trace
24+
from .trace_attrs import (
25+
ADK_INVOCATION_ID,
26+
ADK_LLM_REQUEST,
27+
ADK_LLM_RESPONSE,
28+
ADK_SCOPE_VALUE,
29+
ADK_TOOL_CALL_ARGS,
30+
ADK_TOOL_RESPONSE,
31+
OTEL_GENAI_AGENT_NAME,
32+
OTEL_GENAI_TOOL_CALL_ID,
33+
OTEL_GENAI_TOOL_NAME,
34+
OTEL_SCOPE,
35+
)
2336

2437
logger = logging.getLogger(__name__)
2538

26-
FORMAT_DETECTION_SPAN_LIMIT = 10
27-
28-
# Tag keys used by the ADK OTel instrumentation (gcp.vertex.agent scope).
29-
_TAG_SCOPE = "otel.scope.name"
30-
_ADK_SCOPE = "gcp.vertex.agent"
31-
_TAG_OP = "gen_ai.operation.name"
32-
_TAG_AGENT_NAME = "gen_ai.agent.name"
33-
_TAG_LLM_REQUEST = "gcp.vertex.agent.llm_request"
34-
_TAG_LLM_RESPONSE = "gcp.vertex.agent.llm_response"
35-
_TAG_TOOL_NAME = "gen_ai.tool.name"
36-
_TAG_TOOL_CALL_ID = "gen_ai.tool.call.id"
37-
_TAG_TOOL_CALL_ARGS = "gcp.vertex.agent.tool_call_args"
38-
_TAG_TOOL_RESPONSE = "gcp.vertex.agent.tool_response"
39-
4039

4140
@dataclass
4241
class ConversionResult:
@@ -70,52 +69,8 @@ def convert_trace(trace: Trace, format: str | None = None) -> ConversionResult:
7069

7170

7271
def _detect_trace_format(trace: Trace) -> str:
73-
"""Detect trace format by inspecting span attributes.
74-
75-
Checks spans for format indicators:
76-
- ADK: otel.scope.name == "gcp.vertex.agent"
77-
- GenAI: gen_ai.request.model or gen_ai.input.messages attributes
78-
79-
First checks a limited number of spans for performance, then falls back
80-
to checking all spans if inconclusive.
81-
82-
Returns:
83-
"adk" or "genai"
84-
"""
85-
def check_spans(spans: list[Span]) -> str | None:
86-
has_genai = False
87-
for span in spans:
88-
if span.get_tag(_TAG_SCOPE) == _ADK_SCOPE:
89-
return "adk"
90-
if not has_genai and (
91-
span.get_tag("gen_ai.request.model") or span.get_tag("gen_ai.input.messages")
92-
):
93-
has_genai = True
94-
return "genai" if has_genai else None
95-
96-
initial_check = check_spans(trace.all_spans[:FORMAT_DETECTION_SPAN_LIMIT])
97-
if initial_check:
98-
logger.debug(
99-
f"Trace {trace.trace_id}: detected {initial_check} format "
100-
f"in first {FORMAT_DETECTION_SPAN_LIMIT} spans"
101-
)
102-
return initial_check
103-
104-
if len(trace.all_spans) > FORMAT_DETECTION_SPAN_LIMIT:
105-
logger.debug(
106-
f"Trace {trace.trace_id}: checking all {len(trace.all_spans)} spans "
107-
f"for format detection"
108-
)
109-
full_check = check_spans(trace.all_spans)
110-
if full_check:
111-
logger.debug(f"Trace {trace.trace_id}: detected {full_check} format in full scan")
112-
return full_check
113-
114-
logger.warning(
115-
f"Trace {trace.trace_id}: no format indicators found in {len(trace.all_spans)} spans, "
116-
f"defaulting to ADK format"
117-
)
118-
return "adk"
72+
"""Detect trace format by delegating to the extractor registry."""
73+
return get_extractor(trace).format_name()
11974

12075

12176
def _convert_adk_trace(trace: Trace) -> ConversionResult:
@@ -149,7 +104,7 @@ def _find_adk_spans(trace: Trace, operation: str) -> list[Span]:
149104
"""Find spans with ``otel.scope.name == "gcp.vertex.agent"`` matching an operation prefix."""
150105
matches = []
151106
for span in trace.all_spans:
152-
if span.get_tag(_TAG_SCOPE) != _ADK_SCOPE:
107+
if span.get_tag(OTEL_SCOPE) != ADK_SCOPE_VALUE:
153108
continue
154109
# operationName is e.g. "invoke_agent helm_agent" or "call_llm"
155110
if span.operation_name.startswith(operation):
@@ -159,7 +114,7 @@ def _find_adk_spans(trace: Trace, operation: str) -> list[Span]:
159114

160115

161116
def _convert_invoke_span(invoke_span: Span) -> Invocation:
162-
agent_name = invoke_span.get_tag(_TAG_AGENT_NAME, "unknown")
117+
agent_name = invoke_span.get_tag(OTEL_GENAI_AGENT_NAME, "unknown")
163118

164119
call_llm_spans = _find_children_by_op(invoke_span, "call_llm")
165120
if not call_llm_spans:
@@ -178,9 +133,7 @@ def _convert_invoke_span(invoke_span: Span) -> Invocation:
178133
tool_responses=tool_responses,
179134
)
180135

181-
invocation_id = invoke_span.get_tag(
182-
"gcp.vertex.agent.invocation_id", invoke_span.span_id
183-
)
136+
invocation_id = invoke_span.get_tag(ADK_INVOCATION_ID, invoke_span.span_id)
184137

185138
return Invocation(
186139
invocation_id=invocation_id,
@@ -207,7 +160,7 @@ def _walk(span: Span, op_prefix: str, acc: list[Span]) -> None:
207160

208161
def _extract_user_content(first_call_llm: Span) -> genai_types.Content:
209162
"""Extract user input from the first call_llm span's llm_request tag."""
210-
llm_request_raw = first_call_llm.get_tag(_TAG_LLM_REQUEST, "{}")
163+
llm_request_raw = first_call_llm.get_tag(ADK_LLM_REQUEST, "{}")
211164
llm_request = _parse_json_tag(llm_request_raw, "llm_request")
212165
contents = llm_request.get("contents", [])
213166

@@ -234,7 +187,7 @@ def _extract_user_content(first_call_llm: Span) -> genai_types.Content:
234187

235188
def _extract_final_response(last_call_llm: Span) -> genai_types.Content:
236189
"""Extract final text response from the last call_llm span's llm_response tag."""
237-
llm_response_raw = last_call_llm.get_tag(_TAG_LLM_RESPONSE, "{}")
190+
llm_response_raw = last_call_llm.get_tag(ADK_LLM_RESPONSE, "{}")
238191
llm_response = _parse_json_tag(llm_response_raw, "llm_response")
239192

240193
content_dict = llm_response.get("content", {})
@@ -293,8 +246,8 @@ def _extract_tool_trajectory(
293246
def _extract_from_tool_span(
294247
tool_span: Span,
295248
) -> tuple[genai_types.FunctionCall | None, genai_types.FunctionResponse | None]:
296-
tool_name = tool_span.get_tag(_TAG_TOOL_NAME)
297-
tool_call_id = tool_span.get_tag(_TAG_TOOL_CALL_ID)
249+
tool_name = tool_span.get_tag(OTEL_GENAI_TOOL_NAME)
250+
tool_call_id = tool_span.get_tag(OTEL_GENAI_TOOL_CALL_ID)
298251

299252
if not tool_name:
300253
# Fallback: parse tool name from operationName "execute_tool <name>"
@@ -307,7 +260,7 @@ def _extract_from_tool_span(
307260
)
308261
return None, None
309262

310-
args_raw = tool_span.get_tag(_TAG_TOOL_CALL_ARGS, "{}")
263+
args_raw = tool_span.get_tag(ADK_TOOL_CALL_ARGS, "{}")
311264
args = _parse_json_tag(args_raw, "tool_call_args")
312265

313266
fc = genai_types.FunctionCall(
@@ -316,7 +269,7 @@ def _extract_from_tool_span(
316269
id=tool_call_id,
317270
)
318271

319-
response_raw = tool_span.get_tag(_TAG_TOOL_RESPONSE)
272+
response_raw = tool_span.get_tag(ADK_TOOL_RESPONSE)
320273
fr = None
321274
if response_raw:
322275
response_data = _parse_json_tag(response_raw, "tool_response")
@@ -334,7 +287,7 @@ def _extract_from_tool_span(
334287
def _extract_function_calls_from_llm_response(
335288
call_llm: Span,
336289
) -> list[genai_types.FunctionCall]:
337-
llm_response_raw = call_llm.get_tag(_TAG_LLM_RESPONSE, "{}")
290+
llm_response_raw = call_llm.get_tag(ADK_LLM_RESPONSE, "{}")
338291
llm_response = _parse_json_tag(llm_response_raw, "llm_response")
339292

340293
content_dict = llm_response.get("content", {})

0 commit comments

Comments
 (0)