Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 19 additions & 15 deletions src/agentevals/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
extract_tool_call_from_span,
extract_tool_result_from_span,
extract_user_text_from_attrs,
find_adk_llm_spans_in,
get_extractor,
has_adk_descendant,
is_adk_scope,
Expand Down Expand Up @@ -127,15 +128,18 @@ def _find_adk_spans(trace: Trace, operation: str) -> list[Span]:


def _convert_invoke_span(invoke_span: Span) -> Invocation:
call_llm_spans = _find_children_by_op(invoke_span, "call_llm")
if not call_llm_spans:
raise ValueError(f"invoke_agent span {invoke_span.span_id} has no child call_llm spans")
llm_spans = find_adk_llm_spans_in(invoke_span)
if not llm_spans:
raise ValueError(
f"invoke_agent span {invoke_span.span_id} has no converter-compatible ADK LLM descendants; "
"expected call_llm or ADK generate_content spans"
)

tool_spans = _find_children_by_op(invoke_span, "execute_tool")

user_content = _extract_user_content(call_llm_spans[0])
final_response = _extract_final_response(call_llm_spans[-1])
tool_uses, tool_responses = _extract_tool_trajectory(call_llm_spans, tool_spans)
user_content = _extract_user_content(llm_spans[0])
final_response = _extract_final_response(llm_spans[-1])
tool_uses, tool_responses = _extract_tool_trajectory(llm_spans, tool_spans)

intermediate_data = IntermediateData(
tool_uses=tool_uses,
Expand Down Expand Up @@ -177,7 +181,7 @@ def _extract_user_content(first_call_llm: Span) -> genai_types.Content:
)
llm_request_raw = first_call_llm.get_tag(ADK_LLM_REQUEST, "{}")
llm_request = parse_json(llm_request_raw)
for content_dict in llm_request.get("contents", []):
for content_dict in llm_request.get("contents", llm_request.get("Contents", [])):
if content_dict.get("role") == "user":
return _content_from_dict(content_dict)
raise ValueError(f"call_llm span {first_call_llm.span_id}: no user content found in llm_request")
Expand All @@ -193,7 +197,7 @@ def _extract_final_response(last_call_llm: Span) -> genai_types.Content:
)
llm_response_raw = last_call_llm.get_tag(ADK_LLM_RESPONSE, "{}")
llm_response = parse_json(llm_response_raw)
content_dict = llm_response.get("content", {})
content_dict = llm_response.get("content", llm_response.get("Content", {}))
if not content_dict:
raise ValueError(f"call_llm span {last_call_llm.span_id}: no content in llm_response")
logger.warning(
Expand Down Expand Up @@ -263,12 +267,12 @@ def _extract_function_calls_from_llm_response(
llm_response_raw = call_llm.get_tag(ADK_LLM_RESPONSE, "{}")
llm_response = parse_json(llm_response_raw)

content_dict = llm_response.get("content", {})
content_dict = llm_response.get("content", llm_response.get("Content", {}))
parts = content_dict.get("parts", [])

calls = []
for part in parts:
fc_dict = part.get("function_call")
fc_dict = part.get("function_call", part.get("functionCall"))
if fc_dict:
calls.append(
genai_types.FunctionCall(
Expand All @@ -288,9 +292,9 @@ def _content_from_dict(content_dict: dict[str, Any]) -> genai_types.Content:
parts: list[genai_types.Part] = []
for p in parts_dicts:
if "text" in p:
parts.append(genai_types.Part(text=p["text"]))
elif "function_call" in p:
fc = p["function_call"]
parts.append(genai_types.Part(text=p.get("text")))
elif "function_call" in p or "functionCall" in p:
fc = p.get("function_call", p.get("functionCall"))
parts.append(
genai_types.Part(
function_call=genai_types.FunctionCall(
Expand All @@ -300,8 +304,8 @@ def _content_from_dict(content_dict: dict[str, Any]) -> genai_types.Content:
)
)
)
elif "function_response" in p:
fr = p["function_response"]
elif "function_response" in p or "functionResponse" in p:
fr = p.get("function_response", p.get("functionResponse"))
parts.append(
genai_types.Part(
function_response=genai_types.FunctionResponse(
Expand Down
46 changes: 38 additions & 8 deletions src/agentevals/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,15 @@ def extract_user_text_from_attrs(attrs: dict[str, Any]) -> str | None:
if llm_request_raw:
llm_request = parse_json(llm_request_raw)
if isinstance(llm_request, dict):
for content_dict in reversed(llm_request.get("contents", [])):
contents = llm_request.get("contents", llm_request.get("Contents", []))
Comment thread
erauner12 marked this conversation as resolved.
for content_dict in reversed(contents):
if content_dict.get("role") != "user":
continue
parts = content_dict.get("parts", [])
text_parts = [p for p in parts if "text" in p]
if text_parts:
return " ".join(p["text"] for p in text_parts)
for content_dict in llm_request.get("contents", []):
for content_dict in contents:
if content_dict.get("role") == "user":
parts = content_dict.get("parts", [])
if parts:
Expand All @@ -101,7 +102,7 @@ def extract_agent_response_from_attrs(attrs: dict[str, Any]) -> str | None:
if llm_response_raw:
llm_response = parse_json(llm_response_raw)
if isinstance(llm_response, dict):
content_dict = llm_response.get("content", {})
content_dict = llm_response.get("content", llm_response.get("Content", {}))
if content_dict:
parts_dicts = content_dict.get("parts", [])
text_parts = [p for p in parts_dicts if "text" in p]
Expand Down Expand Up @@ -392,6 +393,38 @@ def is_adk_scope(span: Span) -> bool:
return False


def is_adk_generate_content_llm_span(span: Span) -> bool:
    """Return True when *span* is a ``generate_content`` span with an ADK LLM payload.

    A span qualifies when its operation name starts with ``generate_content``
    or its ``OTEL_GENAI_OP`` tag equals ``"generate_content"``, and it also
    carries a truthy ``ADK_LLM_REQUEST`` or ``ADK_LLM_RESPONSE`` tag.
    """
    named_generate_content = span.operation_name.startswith("generate_content")
    if not named_generate_content and span.get_tag(OTEL_GENAI_OP) != "generate_content":
        return False
    # Either payload tag is enough; the response is only consulted when the
    # request tag is missing or empty.
    if span.get_tag(ADK_LLM_REQUEST):
        return True
    return bool(span.get_tag(ADK_LLM_RESPONSE))


def is_adk_llm_span(span: Span) -> bool:
    """Return True for ``call_llm`` spans and for ADK ``generate_content`` LLM spans."""
    if span.operation_name.startswith("call_llm"):
        return True
    return is_adk_generate_content_llm_span(span)


def find_adk_llm_spans_in(root: Span) -> list[Span]:
    """Return the ADK LLM spans below *root*, ordered by ``start_time``.

    ``call_llm`` descendants take precedence: ADK ``generate_content`` spans
    are returned only when no ``call_llm`` descendant exists. *root* itself is
    never included.
    """
    descendants: list[Span] = []
    _walk_descendants(root, descendants.append)

    primary: list[Span] = []
    fallback: list[Span] = []
    for candidate in descendants:
        if candidate.operation_name.startswith("call_llm"):
            primary.append(candidate)
        elif is_adk_generate_content_llm_span(candidate):
            fallback.append(candidate)

    # Both groups are ordered, even though only one of them is returned.
    primary_sorted = sorted(primary, key=lambda s: s.start_time)
    fallback_sorted = sorted(fallback, key=lambda s: s.start_time)
    return primary_sorted if primary_sorted else fallback_sorted


def _walk_descendants(span: Span, visit) -> None:
    """Call *visit* on every descendant of *span* in depth-first pre-order.

    *span* itself is not visited. Each node's ``children`` are read after the
    node has been visited, and siblings are visited in list order.

    The traversal uses an explicit stack instead of recursion so that very
    deep span chains cannot raise ``RecursionError``.

    Args:
        span: Root whose descendants are walked.
        visit: Callable invoked with each descendant span; its return value
            is ignored.
    """
    # Push children reversed so that pop() yields them in their original order.
    pending = list(reversed(span.children))
    while pending:
        node = pending.pop()
        visit(node)
        pending.extend(reversed(node.children))


def is_llm_span(span: Span) -> bool:
    """Return True when *span* has an ``OTEL_GENAI_REQUEST_MODEL`` tag set."""
    request_model = span.get_tag(OTEL_GENAI_REQUEST_MODEL)
    return request_model is not None

Expand Down Expand Up @@ -477,10 +510,7 @@ def find_invocation_spans(self, trace: Trace) -> list[Span]:
return matches

def find_llm_spans_in(self, root: Span) -> list[Span]:
results: list[Span] = []
self._walk(root, lambda s: s.operation_name.startswith("call_llm"), results)
results.sort(key=lambda s: s.start_time)
return results
return find_adk_llm_spans_in(root)

def find_tool_spans_in(self, root: Span) -> list[Span]:
results: list[Span] = []
Expand All @@ -493,7 +523,7 @@ def classify_span(self, span: Span) -> str | None:
return None
if span.operation_name.startswith("invoke_agent"):
return "invocation"
if span.operation_name.startswith("call_llm"):
if is_adk_llm_span(span):
return "llm"
if span.operation_name.startswith("execute_tool"):
return "tool"
Expand Down
131 changes: 131 additions & 0 deletions tests/test_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,108 @@ def test_convert_traces_multiple(self):
assert len(results) == 2
assert all(r.trace_id == "t1" for r in results)

def test_convert_adk_generate_content_llm_spans(self):
# Builds one invoke_agent span whose LLM children are "generate_content"
# spans (no call_llm spans) and checks that convert_trace still produces a
# single invocation without warnings.
Comment thread
erauner12 marked this conversation as resolved.
invoke = Span(
trace_id="t-gc",
span_id="invoke1",
parent_span_id=None,
operation_name="invoke_agent query_agent",
start_time=1000,
duration=10000,
tags={"gen_ai.operation.name": "invoke_agent"},
)
# First LLM span: request and response use PascalCase outer keys
# ("Contents" / "Content").
llm_1 = Span(
trace_id="t-gc",
span_id="llm1",
parent_span_id="invoke1",
operation_name="generate_content mockllm-deterministic",
start_time=2000,
duration=1000,
tags={
"gen_ai.operation.name": "generate_content",
"gcp.vertex.agent.llm_request": json.dumps(
{"Contents": [{"role": "user", "parts": [{"text": "inspect pods"}]}]}
),
"gcp.vertex.agent.llm_response": json.dumps(
{"Content": {"role": "model", "parts": [{"text": "Calling tools."}]}}
),
},
)
tool_1 = Span(
trace_id="t-gc",
span_id="tool1",
parent_span_id="invoke1",
operation_name="execute_tool list_pods",
start_time=3000,
duration=500,
tags={
"gen_ai.tool.name": "list_pods",
"gen_ai.tool.call.id": "call_1",
"gcp.vertex.agent.tool_call_args": json.dumps({"namespace": "default"}),
"gcp.vertex.agent.tool_response": json.dumps({"pods": []}),
},
)
# Second (last) LLM span: its response part uses the camelCase
# "functionCall" key; the final_response assertions below read it.
llm_2 = Span(
trace_id="t-gc",
span_id="llm2",
parent_span_id="invoke1",
operation_name="generate_content mockllm-deterministic",
start_time=4000,
duration=1000,
tags={
"gen_ai.operation.name": "generate_content",
"gcp.vertex.agent.llm_request": json.dumps({"contents": []}),
"gcp.vertex.agent.llm_response": json.dumps(
{
"Content": {
"role": "model",
"parts": [
{
"functionCall": {
"name": "summarize_pods",
"args": {"namespace": "default"},
"id": "call_final",
}
}
],
}
}
),
},
)
tool_2 = Span(
trace_id="t-gc",
span_id="tool2",
parent_span_id="invoke1",
operation_name="execute_tool get_events",
start_time=5000,
duration=500,
tags={
"gen_ai.tool.name": "get_events",
"gen_ai.tool.call.id": "call_2",
"gcp.vertex.agent.tool_call_args": json.dumps({"namespace": "default"}),
"gcp.vertex.agent.tool_response": json.dumps({"events": []}),
},
)
invoke.children.extend([llm_1, tool_1, llm_2, tool_2])
trace = Trace(
trace_id="t-gc",
root_spans=[invoke],
all_spans=[invoke, llm_1, tool_1, llm_2, tool_2],
)

result = convert_trace(trace)

assert result.warnings == []
assert len(result.invocations) == 1
inv = result.invocations[0]
# User content is taken from the first LLM span's request.
assert inv.user_content.parts[0].text == "inspect pods"
# The final response is the function call carried by the last LLM span.
final_call = inv.final_response.parts[0].function_call
assert final_call.name == "summarize_pods"
assert final_call.args == {"namespace": "default"}
assert final_call.id == "call_final"
# Tool uses are expected to match the two execute_tool spans, in order.
assert [t.name for t in inv.intermediate_data.tool_uses] == ["list_pods", "get_events"]

def test_no_invoke_agent_warns(self):
trace = Trace(
trace_id="empty",
Expand All @@ -207,6 +309,35 @@ def test_no_invoke_agent_warns(self):
assert len(result.warnings) == 1
assert "no invoke_agent" in result.warnings[0]

def test_no_llm_descendants_warns_with_compatible_shapes(self):
    """An invoke_agent span with no LLM descendants yields one descriptive warning.

    The warning must name the span and list the LLM span shapes the
    converter accepts.
    """
    lonely_invoke = Span(
        trace_id="no-llm",
        span_id="invoke-no-llm",
        parent_span_id=None,
        operation_name="invoke_agent test_agent",
        start_time=1000,
        duration=1000,
        tags={
            "otel.scope.name": "gcp.vertex.agent",
            "gen_ai.operation.name": "invoke_agent",
        },
    )

    outcome = convert_trace(
        Trace(
            trace_id="no-llm",
            root_spans=[lonely_invoke],
            all_spans=[lonely_invoke],
        )
    )

    assert outcome.invocations == []
    assert len(outcome.warnings) == 1
    message = outcome.warnings[0]
    expected_fragments = (
        "invoke-no-llm",
        "no converter-compatible ADK LLM descendants",
        "call_llm",
        "ADK generate_content",
    )
    for fragment in expected_fragments:
        assert fragment in message

def test_no_tool_spans_fallback_to_llm_response(self):
"""When no execute_tool spans exist, function_calls should be
extracted from call_llm responses instead."""
Expand Down
50 changes: 50 additions & 0 deletions tests/test_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,18 @@ def test_adk_llm_request_prefers_last_user(self):
}
assert extract_user_text_from_attrs(attrs) == "Second"

def test_adk_llm_request_outer_contents_pascalcase(self):
    """User text is found when the request's outer key is PascalCase ``Contents``."""
    user_turn = {"role": "user", "parts": [{"text": "Outer PascalCase only"}]}
    request_payload = json.dumps({"Contents": [user_turn]})

    extracted = extract_user_text_from_attrs({ADK_LLM_REQUEST: request_payload})

    assert extracted == "Outer PascalCase only"

def test_genai_content_based(self):
attrs = {
OTEL_GENAI_INPUT_MESSAGES: json.dumps(
Expand Down Expand Up @@ -170,6 +182,10 @@ def test_adk_llm_response(self):
attrs = {ADK_LLM_RESPONSE: json.dumps({"content": {"parts": [{"text": "ADK response"}]}})}
assert extract_agent_response_from_attrs(attrs) == "ADK response"

def test_adk_llm_response_outer_content_pascalcase(self):
    """Response text is found when the outer key is PascalCase ``Content``."""
    response_payload = json.dumps(
        {"Content": {"parts": [{"text": "Outer Content only"}]}}
    )

    extracted = extract_agent_response_from_attrs({ADK_LLM_RESPONSE: response_payload})

    assert extracted == "Outer Content only"

def test_genai_content_based(self):
attrs = {
OTEL_GENAI_OUTPUT_MESSAGES: json.dumps(
Expand Down Expand Up @@ -519,6 +535,39 @@ def test_find_llm_spans_in(self):
ext = AdkExtractor()
assert [s.span_id for s in ext.find_llm_spans_in(root)] == ["llm1"]

def test_find_llm_spans_in_falls_back_to_adk_generate_content(self):
    """A generate_content span with an ADK payload is returned when no call_llm span exists."""
    generate_content = _span(
        op="generate_content mockllm-deterministic",
        tags={ADK_LLM_REQUEST: "{}"},
        span_id="llm1",
    )
    tool = _span(op="execute_tool search", span_id="tool1")
    invoke = _span(op="invoke_agent a", children=[generate_content, tool])

    found = AdkExtractor().find_llm_spans_in(invoke)

    assert [s.span_id for s in found] == ["llm1"]

def test_find_llm_spans_in_ignores_provider_generate_content_without_adk_payload(self):
    """A generate_content span lacking ADK request/response tags is not an ADK LLM span."""
    provider_span = _span(
        op="generate_content gpt-4",
        tags={OTEL_GENAI_REQUEST_MODEL: "gpt-4"},
        span_id="llm1",
    )
    invoke = _span(op="invoke_agent a", children=[provider_span])

    found = AdkExtractor().find_llm_spans_in(invoke)

    assert found == []

def test_find_llm_spans_in_prefers_call_llm_over_generate_content(self):
    """When both span shapes are present, only the call_llm span is returned."""
    # The generate_content span starts earlier and is listed first, so the
    # expected result cannot come from ordering alone.
    earlier_generate_content = _span(
        op="generate_content gemini",
        tags={ADK_LLM_REQUEST: "{}"},
        span_id="llm2",
        start_time=10,
    )
    later_call_llm = _span(op="call_llm gemini", span_id="llm1", start_time=20)
    invoke = _span(
        op="invoke_agent a",
        children=[earlier_generate_content, later_call_llm],
    )

    found_ids = [s.span_id for s in AdkExtractor().find_llm_spans_in(invoke)]

    assert found_ids == ["llm1"]

def test_find_tool_spans_in(self):
child_llm = _span(op="call_llm gemini", span_id="llm1")
child_tool = _span(op="execute_tool search", span_id="tool1")
Expand All @@ -530,6 +579,7 @@ def test_classify_span(self):
ext = AdkExtractor()
assert ext.classify_span(_span(op="invoke_agent a", tags={OTEL_SCOPE: ADK_SCOPE_VALUE})) == "invocation"
assert ext.classify_span(_span(op="call_llm", tags={OTEL_SCOPE: ADK_SCOPE_VALUE})) == "llm"
assert ext.classify_span(_span(op="generate_content", tags={ADK_LLM_REQUEST: "{}"})) == "llm"
assert ext.classify_span(_span(op="execute_tool x", tags={OTEL_SCOPE: ADK_SCOPE_VALUE})) == "tool"
assert ext.classify_span(_span(op="random")) is None

Expand Down