feat(eval): enqueue call trace ids onto evaluation redis queue

narsimhaReddyJuspay · claude · narsimhaReddyJuspay · commit 82a88d8d4faa · 2026-06-17T00:04:59.000+05:30
Push the bare OTEL trace_id to a Redis list at call end (atomic SETNX+RPUSH via Lua) for the evaluation worker to drain later.

Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/app/ai/voice/agents/breeze_buddy/handlers/internal/end_conversation.py b/app/ai/voice/agents/breeze_buddy/handlers/internal/end_conversation.py
@@ -7,7 +7,11 @@
 from app.ai.voice.agents.breeze_buddy.callbacks import (
     service_callback,
 )
+from app.ai.voice.agents.breeze_buddy.observability.evaluations import (
+    enqueue_trace_for_evaluation,
+)
 from app.ai.voice.agents.breeze_buddy.observability.tracing_setup import (
+    get_trace_id,
     update_span_with_evaluation_data,
 )
 from app.ai.voice.agents.breeze_buddy.template.context import TemplateContext
@@ -259,6 +263,12 @@ async def end_conversation(context: TemplateContext, args, transition_to=None):
         # so context.lead.outcome reflects the final persisted value.
         update_span_with_evaluation_data(context)
 
+        # Enqueue the trace for internal LLM-as-judge evaluation.
+        # Best-effort + idempotent per trace_id; never breaks call teardown.
+        evaluation_trace_id = get_trace_id(context.root_span)
+        if evaluation_trace_id:
+            await enqueue_trace_for_evaluation(evaluation_trace_id)
+
         # Execute end_conversation_callbacks
         if context.end_conversation_callbacks:
             logger.info(
diff --git a/app/ai/voice/agents/breeze_buddy/observability/evaluations/__init__.py b/app/ai/voice/agents/breeze_buddy/observability/evaluations/__init__.py
@@ -0,0 +1,21 @@
+"""Internal LLM-as-judge evaluation subsystem.
+
+Phase 1 ships only the producer: ``enqueue_trace_for_evaluation``, called
+from the call-end handler with a bare trace_id. The worker (runner,
+llm_client, actions), the DB three-layer, the Pydantic types, and the REST
+API land in later phases.
+"""
+
+from .queue import (
+    ENQUEUED_KEY_PREFIX,
+    ENQUEUED_TTL_SECONDS,
+    TRACE_QUEUE_KEY,
+    enqueue_trace_for_evaluation,
+)
+
+__all__ = [
+    "ENQUEUED_KEY_PREFIX",
+    "ENQUEUED_TTL_SECONDS",
+    "TRACE_QUEUE_KEY",
+    "enqueue_trace_for_evaluation",
+]
diff --git a/app/ai/voice/agents/breeze_buddy/observability/evaluations/queue.py b/app/ai/voice/agents/breeze_buddy/observability/evaluations/queue.py
@@ -0,0 +1,78 @@
+"""Producer for the internal evaluation trace queue.
+
+When a call ends, the OTEL/Langfuse trace_id is pushed onto a Redis list so
+the evaluation worker (later phase) can drain it and fetch the full trace
+from Langfuse. ONLY the trace_id is stored here — every other field (tags,
+call_sid, transcription, payload, ...) is read back from Langfuse by the
+worker, keeping the queue minimal.
+
+This module owns only the producer half of the key layout. The consumer
+half (inflight ZSET + atomic pop + self-sweep) ships with the worker in a
+later phase and imports the constants below:
+
+  evaluation:trace_queue      LIST   trace_ids awaiting the worker
+  evaluation:enqueued:{id}    STRING (SETNX) dedup-at-enqueue marker
+
+The dedup marker and the queue push run inside a single Redis Lua script so
+they are atomic: a failure between them can never leave a marker that
+suppresses retries for the TTL window. This mirrors how the dispatch system
+uses ``run_script`` for atomic multi-key ops, and is safe on the single-node
+Redis that prod runs.
+"""
+
+from app.core.logger import logger
+from app.services.redis.client import get_redis_service
+
+TRACE_QUEUE_KEY = "evaluation:trace_queue"
+ENQUEUED_KEY_PREFIX = "evaluation:enqueued:"
+# Lifetime of the per-trace dedup marker (one week). While the marker is
+# alive, enqueueing the same trace_id again is a no-op, so a retried
+# call-end within that window won't requeue an already-queued trace.
+ENQUEUED_TTL_SECONDS = 7 * 24 * 3600
+
+# Atomically set the dedup marker AND enqueue the trace_id, so a crash or
+# Redis failure between the two can never leave a marker that suppresses
+# retries. Returns 1 if newly enqueued, 0 if the marker already existed.
+#   KEYS[1] = evaluation:trace_queue (LIST)
+#   KEYS[2] = evaluation:enqueued:{trace_id} (STRING marker)
+#   ARGV[1] = trace_id   ARGV[2] = marker TTL (seconds)
+_ENQUEUE_SCRIPT = """
+if redis.call('SET', KEYS[2], '1', 'NX', 'EX', ARGV[2]) then
+    redis.call('RPUSH', KEYS[1], ARGV[1])
+    return 1
+end
+return 0
+"""
+
+
+async def enqueue_trace_for_evaluation(trace_id: str) -> bool:
+    """Push a trace_id onto the evaluation queue, once per trace.
+
+    Dedups via a SETNX marker inside the same atomic Lua script that RPUSHes
+    the trace_id, so the marker and the queue entry can never diverge.
+    Best-effort: any failure (Redis down, script error) is logged with the
+    trace_id and swallowed so it can never break call teardown. Returns True
+    only when newly enqueued; False on empty id, duplicate, or failure.
+    """
+    if not trace_id:
+        return False
+
+    try:
+        redis = await get_redis_service()
+        marker = f"{ENQUEUED_KEY_PREFIX}{trace_id}"
+        result = await redis.run_script(
+            _ENQUEUE_SCRIPT,
+            keys=[TRACE_QUEUE_KEY, marker],
+            args=[trace_id, ENQUEUED_TTL_SECONDS],
+        )
+        if result == 1:
+            logger.info(f"Enqueued trace {trace_id} for evaluation")
+            return True
+        if result == 0:
+            logger.debug(f"Trace {trace_id} already enqueued for evaluation; skipping")
+            return False
+        # run_script swallows Redis errors and returns None on failure.
+        logger.error(f"Failed to enqueue trace {trace_id}: script returned {result!r}")
+        return False
+    except Exception as e:  # noqa: BLE001
+        logger.error(f"Failed to enqueue trace {trace_id} for evaluation: {e}")
+        return False
diff --git a/app/ai/voice/agents/breeze_buddy/observability/tracing_setup.py b/app/ai/voice/agents/breeze_buddy/observability/tracing_setup.py
@@ -8,6 +8,7 @@
 from opentelemetry.sdk.resources import SERVICE_NAME, Resource
 from opentelemetry.sdk.trace import TracerProvider
 from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from opentelemetry.trace import format_trace_id
 
 from app.ai.voice.agents.breeze_buddy.template.context import TemplateContext
 from app.core.config.static import (
@@ -240,3 +241,20 @@ def update_span_with_evaluation_data(context: TemplateContext) -> None:
 
     except Exception as e:
         logger.error(f"Error updating span with evaluation data: {e}")
+
+
+def get_trace_id(span: Optional[trace.Span]) -> Optional[str]:
+    """Return the 32-char hex OTEL trace_id for a span, or None.
+
+    The OTEL trace_id maps 1:1 to the Langfuse trace.id (the OTLP exporter
+    ships this exact id). Returns None when tracing is disabled, the span is
+    missing, or its context is invalid (all-zero id), so callers short-circuit.
+    """
+    if not span or not ENABLE_BREEZE_BUDDY_TRACING:
+        return None
+    try:
+        ctx = span.get_span_context()
+        return format_trace_id(ctx.trace_id) if ctx.is_valid else None
+    except Exception as e:  # noqa: BLE001
+        logger.error(f"Error extracting trace_id from span: {e}")
+        return None