Merge pull request #79 from agentevals-dev/chore/content-extraction-dedup

krisztianfekete · web-flow · commit 743389ae90f8 · 2026-03-31T12:33:39.000+02:00
Consolidate text extraction into a single source of truth
diff --git a/src/agentevals/converter.py b/src/agentevals/converter.py
@@ -18,7 +18,12 @@
 from google.adk.evaluation.eval_case import IntermediateData, Invocation
 from google.genai import types as genai_types
 
-from .extraction import get_extractor, parse_json
+from .extraction import (
+    extract_agent_response_from_attrs,
+    extract_user_text_from_attrs,
+    get_extractor,
+    parse_json,
+)
 from .loader.base import Span, Trace
 from .trace_attrs import (
     ADK_INVOCATION_ID,
@@ -152,50 +157,34 @@ def _walk(span: Span, op_prefix: str, acc: list[Span]) -> None:
 
 
 def _extract_user_content(first_call_llm: Span) -> genai_types.Content:
-    """Extract user input from the first call_llm span's llm_request tag."""
+    """Extract user input from the first call_llm span's attributes via shared extractor."""
+    text = extract_user_text_from_attrs(first_call_llm.tags)
+    if text:
+        return genai_types.Content(
+            role="user",
+            parts=[genai_types.Part(text=text)],
+        )
     llm_request_raw = first_call_llm.get_tag(ADK_LLM_REQUEST, "{}")
     llm_request = parse_json(llm_request_raw)
-    contents = llm_request.get("contents", [])
-
-    for content_dict in reversed(contents):
-        if content_dict.get("role") != "user":
-            continue
-        parts = content_dict.get("parts", [])
-        # Skip function_response parts — only want actual user text messages
-        text_parts = [p for p in parts if "text" in p]
-        if text_parts:
-            return genai_types.Content(
-                role="user",
-                parts=[genai_types.Part(text=p["text"]) for p in text_parts],
-            )
-
-    for content_dict in contents:
+    for content_dict in llm_request.get("contents", []):
         if content_dict.get("role") == "user":
             return _content_from_dict(content_dict)
-
     raise ValueError(f"call_llm span {first_call_llm.span_id}: no user content found in llm_request")
 
 
 def _extract_final_response(last_call_llm: Span) -> genai_types.Content:
-    """Extract final text response from the last call_llm span's llm_response tag."""
+    """Extract final text response from the last call_llm span's attributes via shared extractor."""
+    text = extract_agent_response_from_attrs(last_call_llm.tags)
+    if text:
+        return genai_types.Content(
+            role="model",
+            parts=[genai_types.Part(text=text)],
+        )
     llm_response_raw = last_call_llm.get_tag(ADK_LLM_RESPONSE, "{}")
     llm_response = parse_json(llm_response_raw)
-
     content_dict = llm_response.get("content", {})
     if not content_dict:
         raise ValueError(f"call_llm span {last_call_llm.span_id}: no content in llm_response")
-
-    parts_dicts = content_dict.get("parts", [])
-    # Final response should have text parts, not function_call parts
-    text_parts = [p for p in parts_dicts if "text" in p]
-    if text_parts:
-        return genai_types.Content(
-            role="model",
-            parts=[genai_types.Part(text=p["text"]) for p in text_parts],
-        )
-
-    # If the last call_llm only has function_call parts, that's unexpected
-    # for a final response — the agent may have been cut short.
     logger.warning(
         "call_llm span %s: last llm_response has no text parts, may not be the actual final response",
         last_call_llm.span_id,
diff --git a/src/agentevals/extraction.py b/src/agentevals/extraction.py
@@ -100,7 +100,7 @@ def extract_agent_response_from_attrs(attrs: dict[str, Any]) -> str | None:
     if messages_raw:
         messages = parse_json_attr(messages_raw, "gen_ai.output.messages")
         if isinstance(messages, list):
-            for msg in messages:
+            for msg in reversed(messages):
                 if isinstance(msg, dict) and msg.get("role") in ASSISTANT_ROLES:
                     text = extract_text_from_message(msg)
                     if text:
diff --git a/src/agentevals/genai_converter.py b/src/agentevals/genai_converter.py
@@ -14,7 +14,14 @@
 from google.genai import types as genai_types
 
 from .converter import ConversionResult
-from .extraction import GenAIExtractor, is_invocation_span, is_llm_span, parse_tool_response_content
+from .extraction import (
+    GenAIExtractor,
+    extract_agent_response_from_attrs,
+    extract_user_text_from_attrs,
+    is_invocation_span,
+    is_llm_span,
+    parse_tool_response_content,
+)
 from .loader.base import Span, Trace
 from .trace_attrs import (
     OTEL_GENAI_INPUT_MESSAGES,
@@ -307,50 +314,21 @@ def _turn_to_invocation(turn: _ConversationTurn) -> Invocation:
 
 
 def _extract_user_text(llm_span: Span) -> str:
-    messages_raw = llm_span.get_tag(OTEL_GENAI_INPUT_MESSAGES, "[]")
-    messages = parse_json_attr(messages_raw, "gen_ai.input.messages")
-
-    if not isinstance(messages, list):
-        messages = []
-
-    for msg in reversed(messages):
-        if not isinstance(msg, dict):
-            continue
-        if msg.get("role") in USER_ROLES:
-            text = extract_text_from_message(msg)
-            if text:
-                logger.debug(f"Found user message: {text[:100]}")
-                return text
-
-    logger.warning(f"No user message found in {len(messages)} messages")
-    raise ValueError(f"LLM span {llm_span.span_id}: no user message found in gen_ai.input.messages")
+    text = extract_user_text_from_attrs(llm_span.tags)
+    if text:
+        return text
+    raise ValueError(
+        f"LLM span {llm_span.span_id}: no user message found (checked gen_ai.input.messages and ADK llm_request)"
+    )
 
 
 def _extract_assistant_text(llm_span: Span) -> str:
-    messages_raw = llm_span.get_tag(OTEL_GENAI_OUTPUT_MESSAGES, "[]")
-    messages = parse_json_attr(messages_raw, "gen_ai.output.messages")
-
-    if not isinstance(messages, list):
-        messages = []
-
-    logger.debug(f"Extracting final response from {len(messages)} output messages")
-    for i, msg in enumerate(messages):
-        if isinstance(msg, dict):
-            logger.debug(
-                f"  Message {i}: role={msg.get('role')}, content_len={len(msg.get('content', ''))}, has_tool_calls={bool(msg.get('tool_calls'))}"
-            )
-
-    for msg in reversed(messages):
-        if not isinstance(msg, dict):
-            continue
-        if msg.get("role") in ASSISTANT_ROLES:
-            text = extract_text_from_message(msg)
-            if text:
-                logger.debug(f"Found assistant message with text: {text[:100]}")
-                return text
-
+    text = extract_agent_response_from_attrs(llm_span.tags)
+    if text:
+        return text
     logger.warning(
-        f"LLM span {llm_span.span_id}: no assistant message with content in gen_ai.output.messages ({len(messages)} messages)"
+        "LLM span %s: no assistant message with content in span attributes",
+        llm_span.span_id,
     )
     return ""
 
diff --git a/tests/test_extraction.py b/tests/test_extraction.py
@@ -194,6 +194,17 @@ def test_adk_no_text_parts(self):
         attrs = {ADK_LLM_RESPONSE: json.dumps({"content": {"parts": [{"function_call": {"name": "tool"}}]}})}
         assert extract_agent_response_from_attrs(attrs) is None
 
+    def test_genai_prefers_last_assistant(self):
+        attrs = {
+            OTEL_GENAI_OUTPUT_MESSAGES: json.dumps(
+                [
+                    {"role": "assistant", "content": "First response"},
+                    {"role": "assistant", "content": "Second response"},
+                ]
+            )
+        }
+        assert extract_agent_response_from_attrs(attrs) == "Second response"
+
 
 # ---------------------------------------------------------------------------
 # extract_token_usage_from_attrs