feat(memory): voice write-path integration + dynamic backend selection

amreetkhuntia · amreetkhuntia · commit e477e2861b87 · 2026-06-08T17:40:18.000+05:30
Wire the persistent-memory library into the running app:

end_conversation.py (voice write-path):
- Hoist memory imports to module top (resolve_customer_key, MemoryService,
  BUDDY_MEMORY_BACKEND as resolve_memory_backend,
  BUDDY_MEMORY_ENABLED as is_memory_enabled)
- Enqueue extraction after DB write: best-effort try/except, never blocks
- Backend selection: template MemoryConfig.backend override wins, else
  Redis/DevCycle dynamic default via resolve_memory_backend()

dynamic.py:
- Add BUDDY_MEMORY_BACKEND() async function — re-reads BUDDY_MEMORY_BACKEND
  from Redis/DevCycle on every call, falls back to static env default.
  Ops can switch pgvector <-> supermemory at runtime without a redeploy.

main.py:
- Import drain_memory_queue from memory.worker
- Register memory_extraction_drain BackgroundTaskScheduler task, gated on
  BUDDY_MEMORY_ENABLED (static kill-switch, off by default)
diff --git a/app/ai/voice/agents/breeze_buddy/agent/__init__.py b/app/ai/voice/agents/breeze_buddy/agent/__init__.py
@@ -3,7 +3,7 @@
 import asyncio
 import time
 from datetime import datetime, timezone
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional, cast
 
 from fastapi import WebSocket
 from opentelemetry import trace
@@ -61,6 +61,9 @@
     prepare_and_store_initial_greeting,
 )
 from app.ai.voice.agents.breeze_buddy.mcp import get_mcp_global_functions
+from app.ai.voice.agents.breeze_buddy.memory.backends import get_memory_backend
+from app.ai.voice.agents.breeze_buddy.memory.identity import resolve_customer_key
+from app.ai.voice.agents.breeze_buddy.memory.service import MemoryService
 from app.ai.voice.agents.breeze_buddy.observability.tracing_setup import (
     create_root_span,
 )
@@ -90,6 +93,10 @@
     close_websocket_safely,
 )
 from app.ai.voice.agents.breeze_buddy.utils.warm_transfer import set_transfer_flag
+from app.core.config.dynamic import (
+    BUDDY_MEMORY_BACKEND as resolve_memory_backend,
+    BUDDY_MEMORY_ENABLED as is_memory_enabled,
+)
 from app.core.config.static import ENABLE_BREEZE_BUDDY_TRACING
 from app.core.logger import logger
 from app.core.logger.context import (
@@ -870,6 +877,51 @@ async def _handle_client_connected(self) -> None:
         context = TemplateContext(self)
         context.record_node_entry(initial_node_name)
 
+        # ── Memory read: inject user profile into LLM context ────────────────
+        # Runs after node config is finalised, before flow_manager.initialize()
+        # so the <user_memory> block lands in the initial LLM context RESET.
+        # Gates: global kill-switch AND per-template opt-in (MemoryConfig.enabled).
+        # Best-effort — any failure logs a warning and the call proceeds normally.
+        _mem_cfg = getattr(self.configurations, "memory", None)
+        if await is_memory_enabled() and _mem_cfg and _mem_cfg.enabled and self.lead:
+            try:
+                _payload = self.lead.payload or {}
+                _resolved = await resolve_customer_key(
+                    reseller_id=self.lead.reseller_id or "",
+                    merchant_id=self.lead.merchant_id or "",
+                    payload=_payload,
+                )
+                if _resolved:
+                    _customer_key, _key_type = _resolved
+                    _backend_name = _mem_cfg.backend or await resolve_memory_backend()
+                    _memory_block = await MemoryService(
+                        backend=get_memory_backend(_backend_name)
+                    ).get_profile_block(
+                        reseller_id=self.lead.reseller_id or "",
+                        merchant_id=self.lead.merchant_id or "",
+                        customer_key=_customer_key,
+                        key_type=_key_type,
+                        max_facts=_mem_cfg.max_facts,
+                    )
+                    if _memory_block:
+                        _role_msgs = list(
+                            cast(Dict[str, Any], initial_node_config).get(
+                                "role_messages", []
+                            )
+                        )
+                        _role_msgs.append({"role": "system", "content": _memory_block})
+                        cast(Dict[str, Any], initial_node_config)[
+                            "role_messages"
+                        ] = _role_msgs
+                        logger.info(
+                            f"[memory] injected profile for lead {self.lead.id}: "
+                            f"key={_customer_key!r} chars={len(_memory_block)}"
+                        )
+            except Exception as _mem_err:
+                logger.warning(
+                    f"[memory] read-path failed for lead {self.lead.id}: {_mem_err}"
+                )
+
         await self.flow_manager.initialize(initial_node_config)
         logger.info(
             f"FlowManager initialized at node: {initial_node_name}"
diff --git a/app/ai/voice/agents/breeze_buddy/handlers/internal/end_conversation.py b/app/ai/voice/agents/breeze_buddy/handlers/internal/end_conversation.py
@@ -7,6 +7,8 @@
 from app.ai.voice.agents.breeze_buddy.callbacks import (
     service_callback,
 )
+from app.ai.voice.agents.breeze_buddy.memory.identity import resolve_customer_key
+from app.ai.voice.agents.breeze_buddy.memory.service import MemoryService
 from app.ai.voice.agents.breeze_buddy.observability.tracing_setup import (
     update_span_with_evaluation_data,
 )
@@ -15,6 +17,10 @@
     publish_hold_transfer_result,
     summarize_transcription,
 )
+from app.core.config.dynamic import (
+    BUDDY_MEMORY_BACKEND as resolve_memory_backend,
+    BUDDY_MEMORY_ENABLED as is_memory_enabled,
+)
 from app.core.logger import logger
 from app.core.logger.context import clear_log_context
 from app.database.accessor.breeze_buddy.chat_session import (
@@ -259,6 +265,48 @@ async def end_conversation(context: TemplateContext, args, transition_to=None):
         # so context.lead.outcome reflects the final persisted value.
         update_span_with_evaluation_data(context)
 
+        # ── Memory extraction enqueue ─────────────────────────────────────
+        # Best-effort, must never block end_conversation.
+        # The drain worker re-reads the transcript from DB (already saved above).
+        # Gates: global kill-switch AND per-template opt-in (MemoryConfig.enabled).
+        mem_cfg = getattr(getattr(context.bot, "configurations", None), "memory", None)
+        if await is_memory_enabled() and mem_cfg and mem_cfg.enabled and context.lead:
+            try:
+                resolved = await resolve_customer_key(
+                    reseller_id=context.lead.reseller_id or "",
+                    merchant_id=context.lead.merchant_id or "",
+                    payload=payload,
+                )
+                if resolved:
+                    customer_key, key_type = resolved
+                    phone_raw = payload.get("customer_mobile_number") or payload.get(
+                        "phone"
+                    )
+                    explicit_cid = (
+                        payload.get("customer_id") if key_type == "phone" else None
+                    )
+                    # Backend: per-template override wins, else Redis/DevCycle dynamic default.
+                    backend_name = mem_cfg.backend or await resolve_memory_backend()
+                    await MemoryService().enqueue_extraction(
+                        kind="voice_lead",
+                        record_id=str(context.lead.id),
+                        customer_key=customer_key,
+                        key_type=key_type,
+                        reseller_id=context.lead.reseller_id or "",
+                        merchant_id=context.lead.merchant_id or "",
+                        source_channel="voice",
+                        phone=str(phone_raw) if phone_raw else None,
+                        explicit_customer_id=(
+                            str(explicit_cid) if explicit_cid else None
+                        ),
+                        backend=backend_name,
+                        extraction_prompt=mem_cfg.extraction_prompt,
+                    )
+            except Exception as mem_err:
+                logger.warning(
+                    f"[memory] enqueue failed for lead {context.lead.id}: {mem_err}"
+                )
+
         # Execute end_conversation_callbacks
         if context.end_conversation_callbacks:
             logger.info(
diff --git a/app/ai/voice/agents/breeze_buddy/memory/backends/base.py b/app/ai/voice/agents/breeze_buddy/memory/backends/base.py
@@ -66,12 +66,16 @@ async def ingest(
         identity: MemoryIdentity,
         transcript: List[Dict[str, Any]],
         source_channel: str,
+        extraction_prompt: Optional[str] = None,
     ) -> None:
         """Persist durable memory from a conversation transcript.
 
         The backend owns extraction: pgvector runs the LLM consolidation +
         embedding + dedup itself; supermemory hands the transcript off and
         lets the service extract.
+
+        `extraction_prompt` overrides the default LLM system prompt when set
+        (pgvector only; supermemory ignores it as extraction is server-side).
         """
 
     @abstractmethod
diff --git a/app/ai/voice/agents/breeze_buddy/memory/backends/pgvector/backend.py b/app/ai/voice/agents/breeze_buddy/memory/backends/pgvector/backend.py
@@ -65,6 +65,7 @@ async def ingest(
         identity: MemoryIdentity,
         transcript: List[Dict[str, Any]],
         source_channel: str,
+        extraction_prompt: Optional[str] = None,
     ) -> None:
         """Extract durable facts from the transcript and upsert them."""
         if not transcript:
@@ -78,7 +79,11 @@ async def ingest(
             logger.error(f"[memory.pgvector] fetch existing facts failed: {e}")
             existing = []
 
-        ops = await consolidate(existing_facts=existing, transcript=transcript)
+        ops = await consolidate(
+            existing_facts=existing,
+            transcript=transcript,
+            extraction_prompt=extraction_prompt,
+        )
         if not ops:
             return
 
@@ -200,9 +205,30 @@ async def _apply_op(
 
         elif verb == "UPDATE":
             old_fact_text = (op.get("supersedes_fact") or "").strip()
+
+            # 1. Exact match (free, no embedding call)
             old_mem = next(
                 (m for m in existing if m.fact.strip() == old_fact_text), None
             )
+
+            # 2. Embedding similarity fallback — catches LLM paraphrases of the
+            #    stored fact (the LLM rarely reproduces exact stored text).
+            if not old_mem and old_fact_text and existing:
+                old_embedding = await embed_single(old_fact_text)
+                if old_embedding:
+                    best_sim, best_mem = 0.0, None
+                    for m in existing:
+                        if m.embedding:
+                            sim = _cosine_similarity(old_embedding, m.embedding)
+                            if sim > best_sim:
+                                best_sim, best_mem = sim, m
+                    if best_mem and best_sim >= 0.80:
+                        old_mem = best_mem
+                        logger.debug(
+                            f"[memory.pgvector] UPDATE fuzzy-matched supersedes_fact "
+                            f"sim={best_sim:.3f} old={old_mem.fact!r}"
+                        )
+
             if old_mem:
                 await supersede_memory(str(old_mem.id))
             embedding = await embed_single(fact)
diff --git a/app/ai/voice/agents/breeze_buddy/memory/backends/pgvector/extract.py b/app/ai/voice/agents/breeze_buddy/memory/backends/pgvector/extract.py
@@ -25,17 +25,25 @@
 
 Output a JSON list of memory operations. Each operation has:
   "op": "ADD" | "UPDATE" | "DELETE"
-  "fact": short sentence (one durable personalization/preference/attribute/outcome)
-  "category": one of ["preference","attribute","outcome","context"] (optional)
-  "structured": optional dict with machine-readable fields (optional)
-  "supersedes_fact": (UPDATE/DELETE only) the exact text of the KNOWN_FACT being replaced/removed
+  "fact": short sentence (one durable fact about the customer)
+  "category": one of ["preference", "attribute", "outcome", "context"]
+  "structured": optional dict with machine-readable fields, e.g. {"name": "Amreet"}
+  "supersedes_fact": (UPDATE/DELETE only) the closest matching text from KNOWN_FACTS
 
 Rules:
-- Only capture durable facts worth remembering across future conversations.
-- Ignore small talk, greetings, PII (passwords, full card numbers, OTPs).
-- If a new fact contradicts a known fact, emit UPDATE (not ADD).
-- If a known fact is confirmed still true, emit nothing (no op).
-- If no new facts are worth storing, return an empty list [].
+- Only capture facts durable enough to be useful in a future conversation.
+- ALWAYS capture: the customer's name or preferred form of address, any explicit
+  corrections they make to previously stated information, travel preferences,
+  stated outcomes, and personal attributes they volunteer.
+- A customer stating their name is NOT a greeting to ignore — it is a high-value
+  attribute. Capture it with category "attribute" and structured {"name": "<value>"}.
+- Ignore: passwords, full card numbers, OTPs, bank account numbers, and one-time
+  transactional details with no future value. Do NOT treat a customer's name or
+  identity as PII to ignore.
+- If a new fact contradicts or corrects a KNOWN_FACT, emit UPDATE (not ADD).
+  Set supersedes_fact to the closest matching text from KNOWN_FACTS.
+- If a known fact is confirmed still true, emit nothing.
+- If no facts are worth storing, return [].
 - Keep each fact concise (one sentence).
 
 Return ONLY valid JSON — a list of operation objects, no markdown fences."""
@@ -77,9 +85,12 @@ def _find_duplicate(
 async def consolidate(
     existing_facts: List[UserMemory],
     transcript: List[Dict[str, Any]],
+    extraction_prompt: Optional[str] = None,
 ) -> List[Dict[str, Any]]:
     """Run the LLM extraction and return raw op dicts.
 
+    Uses `extraction_prompt` if provided (template-level override), otherwise
+    falls back to the built-in `_SYSTEM_PROMPT`.
     The worker applies these ops against the DB (insert/supersede).
     Returns [] on failure (safe to ignore).
     """
@@ -89,6 +100,8 @@ async def consolidate(
     try:
         llm = await _resolve_azure(None)
 
+        system_prompt = extraction_prompt or _SYSTEM_PROMPT
+
         known_lines = (
             "\n".join(f"- [{m.category or 'fact'}] {m.fact}" for m in existing_facts)
             if existing_facts
@@ -102,7 +115,7 @@ async def consolidate(
 
         params = OpenAILLMInvocationParams(  # type: ignore[call-overload]
             messages=[  # type: ignore[arg-type]
-                {"role": "system", "content": _SYSTEM_PROMPT},
+                {"role": "system", "content": system_prompt},
                 {
                     "role": "user",
                     "content": (
diff --git a/app/ai/voice/agents/breeze_buddy/memory/backends/supermemory/backend.py b/app/ai/voice/agents/breeze_buddy/memory/backends/supermemory/backend.py
@@ -82,7 +82,9 @@ async def ingest(
         identity: MemoryIdentity,
         transcript: List[dict],
         source_channel: str,
+        extraction_prompt: Optional[str] = None,
     ) -> None:
+        # extraction_prompt is ignored — supermemory extraction is server-side.
         if not transcript:
             return
         try:
diff --git a/app/ai/voice/agents/breeze_buddy/memory/service.py b/app/ai/voice/agents/breeze_buddy/memory/service.py
@@ -76,6 +76,7 @@ async def enqueue_extraction(
         phone: Optional[str] = None,
         explicit_customer_id: Optional[str] = None,
         backend: Optional[str] = None,
+        extraction_prompt: Optional[str] = None,
     ) -> None:
         """Push an extraction job onto the Redis queue.
 
@@ -111,6 +112,7 @@ async def enqueue_extraction(
                     "phone": phone,
                     "explicit_customer_id": explicit_customer_id,
                     "backend": backend,
+                    "extraction_prompt": extraction_prompt,
                 }
             )
             await client.rpush(_QUEUE_KEY, payload)  # type: ignore[union-attr]
diff --git a/app/ai/voice/agents/breeze_buddy/memory/worker.py b/app/ai/voice/agents/breeze_buddy/memory/worker.py
@@ -67,6 +67,7 @@ async def _process_item(item: Dict[str, Any]) -> None:
     phone: Optional[str] = item.get("phone") or None
     explicit_customer_id: Optional[str] = item.get("explicit_customer_id") or None
     backend_name: Optional[str] = item.get("backend") or None
+    extraction_prompt: Optional[str] = item.get("extraction_prompt") or None
 
     if not (kind and record_id and customer_key and reseller_id and merchant_id):
         logger.warning(f"[memory.worker] incomplete item, skipping: {item}")
@@ -94,7 +95,7 @@ async def _process_item(item: Dict[str, Any]) -> None:
     if phone and explicit_customer_id and key_type == "phone":
         identity = await backend.merge_identity(identity)
 
-    await backend.ingest(identity, transcript, source_channel)
+    await backend.ingest(identity, transcript, source_channel, extraction_prompt)
 
 
 async def _fetch_transcript(kind: str, record_id: str) -> List[Dict[str, Any]]:
diff --git a/app/ai/voice/agents/breeze_buddy/template/types.py b/app/ai/voice/agents/breeze_buddy/template/types.py
@@ -1271,6 +1271,23 @@ class MemoryConfig(BaseModel):
             "When None, falls back to the global BUDDY_MEMORY_BACKEND env."
         ),
     )
+    max_facts: int = Field(
+        20,
+        description=(
+            "Maximum number of memory facts injected into the LLM context at "
+            "call start. Increase for richer recall; decrease to keep the "
+            "context window tight. Default 20."
+        ),
+    )
+    extraction_prompt: Optional[str] = Field(
+        None,
+        description=(
+            "Override the default LLM extraction prompt used by the pgvector "
+            "backend when consolidating facts from a conversation. When None, "
+            "the built-in prompt is used. Has no effect on the supermemory "
+            "backend (extraction is server-side)."
+        ),
+    )
 
 
 class ConfigurationModel(BaseModel):
diff --git a/app/core/config/dynamic.py b/app/core/config/dynamic.py
@@ -1,5 +1,9 @@
 import json
 
+from app.core.config.static import (
+    BUDDY_MEMORY_BACKEND as _STATIC_MEMORY_BACKEND,
+    BUDDY_MEMORY_ENABLED as _STATIC_MEMORY_ENABLED,
+)
 from app.core.logger import logger
 from app.services.live_config.store import get_config
 
@@ -46,6 +50,28 @@ async def ENABLE_BACKGROUND_TASKS() -> bool:
     return await get_config("ENABLE_BACKGROUND_TASKS", "false", bool)
 
 
+async def BUDDY_MEMORY_ENABLED() -> bool:
+    """Global persistent-memory kill-switch, Redis/DevCycle-overridable.
+
+    Defaults to the static BUDDY_MEMORY_ENABLED env (off unless explicitly
+    set). Ops can enable or disable memory across all calls at runtime by
+    flipping this key in Redis/DevCycle without a pod restart.
+    Note: the drain-worker registration in main.py and the pgvector codec
+    gate in database/__init__.py still read the static value at startup.
+    """
+    return await get_config("BUDDY_MEMORY_ENABLED", _STATIC_MEMORY_ENABLED, bool)
+
+
+async def BUDDY_MEMORY_BACKEND() -> str:
+    """Active persistent-memory backend ("pgvector" | "supermemory").
+
+    Redis/DevCycle-overridable so ops can switch backends at runtime without a
+    redeploy; defaults to the static BUDDY_MEMORY_BACKEND env. A template's
+    MemoryConfig.backend still takes precedence over this at the call-site.
+    """
+    return await get_config("BUDDY_MEMORY_BACKEND", _STATIC_MEMORY_BACKEND, str)
+
+
 # ----------------------------------------------------------------------------
 # Dispatcher dials. Re-read on every invocation, so DevCycle / Redis changes
 # propagate without a pod restart. See docs/BACKLOG_DISPATCHER_REDESIGN.md
diff --git a/app/main.py b/app/main.py

Original file line number	Diff line number	Diff line change
`@@ -76,6 +76,7 @@ async def enqueue_extraction(`
`76`	`76`	`phone: Optional[str] = None,`
`77`	`77`	`explicit_customer_id: Optional[str] = None,`
`78`	`78`	`backend: Optional[str] = None,`
	`79`	`+ extraction_prompt: Optional[str] = None,`
`79`	`80`	`) -> None:`
`80`	`81`	`"""Push an extraction job onto the Redis queue.`
`81`	`82`
`@@ -111,6 +112,7 @@ async def enqueue_extraction(`
`111`	`112`	`"phone": phone,`
`112`	`113`	`"explicit_customer_id": explicit_customer_id,`
`113`	`114`	`"backend": backend,`
	`115`	`+ "extraction_prompt": extraction_prompt,`
`114`	`116`	`}`
`115`	`117`	`)`
`116`	`118`	`await client.rpush(_QUEUE_KEY, payload) # type: ignore[union-attr]`