test(#972): backfill observability regression coverage (#1086)

Cataldir · web-flow · commit 6b75f40de3d7 · 2026-05-09T14:25:31.000Z
Backfills focused regression tests originally scoped to PRs #947, #949, and #951 (all superseded by the consolidated #953 merge):

- OTEL_SERVICE_NAME propagation via configure_logging and FoundryTracer
- AZURE_TRACING_ENABLED guard when no App Insights exporter is configured
- Critical operational logging for fail-open memory, model invocation, circuit breaker state transitions, and event-handler error paths

diff --git a/apps/search-enrichment-agent/tests/test_event_handlers.py b/apps/search-enrichment-agent/tests/test_event_handlers.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import json
+import logging
 from unittest.mock import AsyncMock, MagicMock
 
 import pytest
@@ -20,6 +21,20 @@ def _clear_service_traces() -> None:
     tracer.clear()
 
 
+@pytest.fixture
+def event_logger(monkeypatch: pytest.MonkeyPatch) -> logging.Logger:
+    """Return a capturable logger for Event Hub handler assertions."""
+    logger = logging.getLogger("search_enrichment_event_handler_test")
+    logger.handlers.clear()
+    logger.setLevel(logging.INFO)
+    logger.propagate = True
+    monkeypatch.setattr(
+        "search_enrichment_agent.event_handlers.configure_logging",
+        lambda app_name: logger,
+    )
+    return logger
+
+
 def _eventhub_liveness_events() -> list[dict[str, object]]:
     return [
         event
@@ -36,7 +51,11 @@ async def test_build_event_handlers_includes_search_enrichment_jobs() -> None:
 
 
 @pytest.mark.asyncio
-async def test_search_enrichment_event_handler_processes_job_with_mocks() -> None:
+async def test_search_enrichment_event_handler_processes_job_with_mocks(
+    event_logger: logging.Logger,
+    caplog: pytest.LogCaptureFixture,
+) -> None:
+    caplog.set_level(logging.INFO, logger=event_logger.name)
     approved_truth = AsyncMock()
     approved_truth.get_approved_data = AsyncMock(
         return_value={
@@ -80,10 +99,18 @@ async def test_search_enrichment_event_handler_processes_job_with_mocks() -> Non
         "entity_id": "SKU-9",
         "status": "enriched",
     }
+    assert any(
+        "search_enrichment_event_processed entity_id=SKU-9 status=enriched" in record.getMessage()
+        for record in caplog.records
+    )
 
 
 @pytest.mark.asyncio
-async def test_search_enrichment_event_handler_traces_missing_entity() -> None:
+async def test_search_enrichment_event_handler_traces_missing_entity(
+    event_logger: logging.Logger,
+    caplog: pytest.LogCaptureFixture,
+) -> None:
+    caplog.set_level(logging.INFO, logger=event_logger.name)
     approved_truth = AsyncMock()
     approved_truth.get_approved_data = AsyncMock(return_value=None)
 
@@ -119,10 +146,18 @@ async def test_search_enrichment_event_handler_traces_missing_entity() -> None:
         "eventhub": "search-enrichment-jobs",
         "status": "skipped",
     }
+    assert any(
+        "search_enrichment_event_skipped_missing_entity" in record.getMessage()
+        for record in caplog.records
+    )
 
 
 @pytest.mark.asyncio
-async def test_search_enrichment_event_handler_traces_error_when_orchestrator_raises() -> None:
+async def test_search_enrichment_event_handler_traces_error_when_orchestrator_raises(
+    event_logger: logging.Logger,
+    caplog: pytest.LogCaptureFixture,
+) -> None:
+    caplog.set_level(logging.INFO, logger=event_logger.name)
     approved_truth = AsyncMock()
     approved_truth.get_approved_data = AsyncMock(side_effect=RuntimeError("eventhub boom"))
 
@@ -160,3 +195,8 @@ async def test_search_enrichment_event_handler_traces_error_when_orchestrator_ra
         "entity_id": "SKU-9",
         "status": "error",
     }
+    assert any(
+        "search_enrichment_event_processing_failed entity_id=SKU-9 error=eventhub boom"
+        in record.getMessage()
+        for record in caplog.records
+    )
diff --git a/lib/tests/test_agents_base.py b/lib/tests/test_agents_base.py
@@ -1,5 +1,7 @@
 """Tests for base agent functionality."""
 
+import asyncio
+import logging
 from unittest.mock import AsyncMock, Mock
 
 import pytest
@@ -182,6 +184,54 @@ async def test_invoke_model_with_routing(self, slm_target, llm_target):
         assert result is not None
         assert result.get("_target") == "test-slm"
 
+    @pytest.mark.asyncio
+    async def test_invoke_model_logs_provider_failure(self, caplog):
+        """Provider failures emit an error log before propagating."""
+
+        async def failing_invoker(**_kwargs):
+            raise RuntimeError("provider down")
+
+        slm = ModelTarget(name="slm", model="small", invoker=failing_invoker)
+        deps = AgentDependencies(slm=slm, llm=None, service_name="critical-log-test")
+        agent = SimpleTestAgent(config=deps)
+
+        with caplog.at_level(logging.ERROR, logger="holiday_peak_lib.agents.base_agent"):
+            with pytest.raises(RuntimeError, match="provider down"):
+                await agent.invoke_model(
+                    {"query": "hello"},
+                    [{"role": "user", "content": "hello"}],
+                )
+
+        assert any(
+            "agent_model_invocation_failed service=critical-log-test "
+            "model=small error=provider down" in record.getMessage()
+            for record in caplog.records
+        )
+
+    @pytest.mark.asyncio
+    async def test_invoke_model_logs_provider_timeout(self, caplog):
+        """Provider timeouts emit an error log and return the timeout fallback."""
+
+        async def timeout_invoker(**_kwargs):
+            raise asyncio.TimeoutError
+
+        slm = ModelTarget(name="slm", model="small", invoker=timeout_invoker)
+        deps = AgentDependencies(slm=slm, llm=None, service_name="critical-log-test")
+        agent = SimpleTestAgent(config=deps)
+
+        with caplog.at_level(logging.ERROR, logger="holiday_peak_lib.agents.base_agent"):
+            result = await agent.invoke_model(
+                {"query": "hello"},
+                [{"role": "user", "content": "hello"}],
+            )
+
+        assert result["error"] == "timeout"
+        assert any(
+            "agent_model_invocation_timeout service=critical-log-test model=small"
+            in record.getMessage()
+            for record in caplog.records
+        )
+
     @pytest.mark.asyncio
     async def test_foundry_governance_strips_system_prompt(self):
         """Test Foundry governance strips local system prompts from messages."""
diff --git a/lib/tests/test_app_factory.py b/lib/tests/test_app_factory.py
@@ -1,5 +1,6 @@
 """Tests for app_factory module."""
 
+import os
 from unittest.mock import AsyncMock, patch
 
 import pytest
@@ -953,3 +954,83 @@ async def handle(self, request):
         client = TestClient(app)
         response = client.post("/invoke", json={"test": "data"})
         assert response.json()["custom"] is True
+
+
+class TestAzureTracingGuard:
+    """Tests for the AZURE_TRACING_ENABLED env-var guard."""
+
+    def test_guard_sets_env_when_no_appinsights_and_no_tracing_var(
+        self, mock_hot_memory, mock_warm_memory, mock_cold_memory, monkeypatch
+    ):
+        """build_service_app disables Azure SDK tracing when no exporter is configured."""
+        monkeypatch.setenv("PROJECT_ENDPOINT", TEST_PROJECT_ENDPOINT)
+        monkeypatch.setenv("FOUNDRY_AGENT_ID_FAST", "agent-fast-123")
+        monkeypatch.setenv("MODEL_DEPLOYMENT_NAME_FAST", "gpt-4o-mini")
+        monkeypatch.setenv("FOUNDRY_TRACING_ENABLED", "false")
+        monkeypatch.delenv("APPLICATIONINSIGHTS_CONNECTION_STRING", raising=False)
+        monkeypatch.delenv("APPINSIGHTS_CONNECTION_STRING", raising=False)
+        monkeypatch.delenv("AZURE_TRACING_ENABLED", raising=False)
+
+        build_service_app(
+            service_name="guard-test",
+            agent_class=SampleServiceAgent,
+            hot_memory=mock_hot_memory,
+            warm_memory=mock_warm_memory,
+            cold_memory=mock_cold_memory,
+        )
+
+        assert os.environ.get("AZURE_TRACING_ENABLED") == "false"
+
+    def test_guard_does_not_override_explicit_tracing_enabled(
+        self, mock_hot_memory, mock_warm_memory, mock_cold_memory, monkeypatch
+    ):
+        """Explicit AZURE_TRACING_ENABLED values remain authoritative."""
+        monkeypatch.setenv("PROJECT_ENDPOINT", TEST_PROJECT_ENDPOINT)
+        monkeypatch.setenv("FOUNDRY_AGENT_ID_FAST", "agent-fast-123")
+        monkeypatch.setenv("MODEL_DEPLOYMENT_NAME_FAST", "gpt-4o-mini")
+        monkeypatch.setenv("FOUNDRY_TRACING_ENABLED", "false")
+        monkeypatch.delenv("APPLICATIONINSIGHTS_CONNECTION_STRING", raising=False)
+        monkeypatch.delenv("APPINSIGHTS_CONNECTION_STRING", raising=False)
+        monkeypatch.setenv("AZURE_TRACING_ENABLED", "true")
+
+        build_service_app(
+            service_name="guard-test-no-override",
+            agent_class=SampleServiceAgent,
+            hot_memory=mock_hot_memory,
+            warm_memory=mock_warm_memory,
+            cold_memory=mock_cold_memory,
+        )
+
+        assert os.environ.get("AZURE_TRACING_ENABLED") == "true"
+
+    def test_guard_does_not_activate_when_appinsights_configured(
+        self, mock_hot_memory, mock_warm_memory, mock_cold_memory, monkeypatch
+    ):
+        """The guard stays inactive when Application Insights is configured."""
+        import azure.monitor.opentelemetry as azure_monitor
+
+        monkeypatch.setattr(
+            azure_monitor,
+            "configure_azure_monitor",
+            lambda **_kwargs: None,
+        )
+        monkeypatch.setenv("PROJECT_ENDPOINT", TEST_PROJECT_ENDPOINT)
+        monkeypatch.setenv("FOUNDRY_AGENT_ID_FAST", "agent-fast-123")
+        monkeypatch.setenv("MODEL_DEPLOYMENT_NAME_FAST", "gpt-4o-mini")
+        monkeypatch.setenv("FOUNDRY_TRACING_ENABLED", "false")
+        monkeypatch.setenv(
+            "APPLICATIONINSIGHTS_CONNECTION_STRING",
+            "InstrumentationKey=00000000-0000-0000-0000-000000000000",
+        )
+        monkeypatch.delenv("APPINSIGHTS_CONNECTION_STRING", raising=False)
+        monkeypatch.delenv("AZURE_TRACING_ENABLED", raising=False)
+
+        build_service_app(
+            service_name="guard-test-appinsights",
+            agent_class=SampleServiceAgent,
+            hot_memory=mock_hot_memory,
+            warm_memory=mock_warm_memory,
+            cold_memory=mock_cold_memory,
+        )
+
+        assert os.environ.get("AZURE_TRACING_ENABLED") is None
diff --git a/lib/tests/test_circuit_breaker.py b/lib/tests/test_circuit_breaker.py
@@ -1,6 +1,7 @@
 """Unit tests for the async circuit breaker."""
 
 import asyncio
+import logging
 
 import pytest
 from holiday_peak_lib.utils.circuit_breaker import (
@@ -42,6 +43,27 @@ async def failing():
         assert cb.state == CircuitState.OPEN
         assert cb.failure_count == 3
 
+    async def test_failure_and_open_transition_are_logged(self, caplog):
+        cb = CircuitBreaker("inventory-api", failure_threshold=1, recovery_timeout=30.0)
+
+        async def failing():
+            raise RuntimeError("backend unavailable")
+
+        with caplog.at_level(logging.INFO, logger="holiday_peak_lib.utils.circuit_breaker"):
+            with pytest.raises(RuntimeError, match="backend unavailable"):
+                await cb.call(failing)
+
+        messages = [record.getMessage() for record in caplog.records]
+        assert any(
+            "circuit_breaker_failure name=inventory-api state=closed "
+            "failure_count=1 threshold=1 error=backend unavailable" in message
+            for message in messages
+        )
+        assert any(
+            "circuit_breaker name=inventory-api state=open failures=1" in message
+            for message in messages
+        )
+
     async def test_open_circuit_raises_without_fallback(self):
         cb = CircuitBreaker("test", failure_threshold=1, recovery_timeout=30.0)
 
diff --git a/lib/tests/test_session_manager.py b/lib/tests/test_session_manager.py
@@ -1,6 +1,7 @@
 """Tests for session_manager: smart session continuity logic."""
 
 import json
+import logging
 import time
 from unittest.mock import AsyncMock
 
@@ -172,8 +173,8 @@ async def test_keyword_divergence_returns_fresh(self):
         assert decision.continue_session is False
 
     @pytest.mark.asyncio
-    async def test_cosmos_read_failure_still_continues(self):
-        """If Cosmos is unavailable, still continue but without session state."""
+    async def test_cosmos_read_failure_still_continues(self, caplog):
+        """If Cosmos is unavailable, continue without state and log the failure."""
         hot = AsyncMock()
         warm = AsyncMock()
         summary = SessionSummary(
@@ -187,15 +188,26 @@ async def test_cosmos_read_failure_still_continues(self):
         )
         hot.get = AsyncMock(return_value=json.dumps(summary.__dict__))
         warm.read = AsyncMock(side_effect=Exception("Cosmos unavailable"))
-        decision = await evaluate_session_continuity(
-            hot,
-            warm,
-            {"query": "shipping eta tracking update"},
-            service="svc",
-            entity_id="ent1",
-        )
+
+        with caplog.at_level(
+            logging.WARNING,
+            logger="holiday_peak_lib.agents.memory.session_manager",
+        ):
+            decision = await evaluate_session_continuity(
+                hot,
+                warm,
+                {"query": "shipping eta tracking update"},
+                service="svc",
+                entity_id="ent1",
+            )
+
         assert decision.continue_session is True
         assert decision.foundry_session_state is None
+        assert any(
+            "session_continuity_cosmos_read_failed session_id=svc:ent1:123 "
+            "service=svc entity_id=ent1 error=Cosmos unavailable" in record.getMessage()
+            for record in caplog.records
+        )
 
 
 class TestBuildSessionSummary:
@@ -327,3 +339,29 @@ async def test_none_session_state_noop(self):
             summary_text="",
         )
         warm.upsert.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_cosmos_write_failure_logs_warning(self, caplog):
+        warm = AsyncMock()
+        warm.upsert = AsyncMock(side_effect=RuntimeError("Cosmos write unavailable"))
+
+        with caplog.at_level(
+            logging.WARNING,
+            logger="holiday_peak_lib.agents.memory.session_manager",
+        ):
+            await persist_full_session(
+                warm,
+                session_id="svc:ent1:123",
+                service="svc",
+                entity_id="ent1",
+                foundry_session_state={"session_id": "foundry-abc"},
+                messages=[{"role": "user", "content": "test"}],
+                summary_text="test summary",
+            )
+
+        warm.upsert.assert_awaited_once()
+        assert any(
+            "session_persistence_cosmos_write_failed session_id=svc:ent1:123 "
+            "service=svc entity_id=ent1 error=Cosmos write unavailable" in record.getMessage()
+            for record in caplog.records
+        )
diff --git a/lib/tests/test_telemetry.py b/lib/tests/test_telemetry.py