Skip to content

Commit dcb1556

Browse files
authored
fix: comprehensive logging for all operations, exceptions, and agent lifecycle (#952) (#953)
1 parent 422aafb commit dcb1556

10 files changed

Lines changed: 177 additions & 11 deletions

File tree

apps/search-enrichment-agent/src/search_enrichment_agent/event_handlers.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,13 @@ async def handle_search_enrichment_job(_partition_context, event) -> None: # no
9797
result.get("status"),
9898
result.get("strategy"),
9999
)
100-
except Exception:
100+
except Exception as exc:
101+
logger.error(
102+
"search_enrichment_event_processing_failed entity_id=%s error=%s",
103+
entity_id,
104+
exc,
105+
exc_info=True,
106+
)
101107
_trace_eventhub_liveness(
102108
outcome="error",
103109
status="error",

lib/src/holiday_peak_lib/adapters/base.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
"""
1919

2020
import asyncio
21+
import logging
2122
import random
2223
import time
2324
from abc import ABC, abstractmethod
@@ -28,6 +29,8 @@
2829
CircuitBreaker,
2930
CircuitBreakerOpenError,
3031
)
32+
33+
logger = logging.getLogger(__name__)
3134
from pydantic import BaseModel, ValidationError
3235

3336
ModelT = TypeVar("ModelT", bound=BaseModel)
@@ -176,7 +179,19 @@ def __init__(
176179

177180
# Public methods (resilient wrappers)
178181
async def connect(self, **kwargs: Any) -> None:
179-
await self._connect_impl(**kwargs)
182+
adapter_name = type(self).__name__
183+
logger.info("adapter_connect_start adapter=%s", adapter_name)
184+
try:
185+
await self._connect_impl(**kwargs)
186+
logger.info("adapter_connect_success adapter=%s", adapter_name)
187+
except Exception as exc:
188+
logger.error(
189+
"adapter_connect_failed adapter=%s error=%s",
190+
adapter_name,
191+
exc,
192+
exc_info=True,
193+
)
194+
raise
180195

181196
async def fetch(self, query: dict[str, Any]) -> Iterable[dict[str, Any]]:
182197
key = self._cache_key(query)
@@ -228,6 +243,7 @@ async def _delete_impl(self, identifier: str) -> bool:
228243
async def _call_with_resilience(self, func):
229244
await self._rate_limiter.acquire()
230245

246+
adapter_name = type(self).__name__
231247
last_error: Exception | None = None
232248
for attempt in range(1, self._retries + 2):
233249
try:
@@ -238,14 +254,31 @@ async def _timed_call() -> Any:
238254
result = await self._circuit_breaker.call(_timed_call)
239255
return result
240256
except CircuitBreakerOpenError as exc:
257+
logger.error(
258+
"adapter_circuit_breaker_open adapter=%s",
259+
adapter_name,
260+
)
241261
raise AdapterError("Circuit breaker open") from exc
242262
except Exception as exc: # noqa: BLE001
243263
last_error = exc
244264
if attempt > self._retries:
245265
break
266+
logger.warning(
267+
"adapter_retry adapter=%s attempt=%d/%d error=%s",
268+
adapter_name,
269+
attempt,
270+
self._retries,
271+
exc,
272+
)
246273
delay = min(self._max_delay, self._base_delay * (2 ** (attempt - 1)))
247274
delay *= 1 + random.random() * 0.25
248275
await asyncio.sleep(delay)
276+
logger.error(
277+
"adapter_operation_failed adapter=%s retries_exhausted=%d error=%s",
278+
adapter_name,
279+
self._retries,
280+
last_error,
281+
)
249282
raise AdapterError("Operation failed after retries") from last_error
250283

251284
def resilience_status(self) -> dict[str, Any]:

lib/src/holiday_peak_lib/agents/base_agent.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
11
"""Base agent abstraction with model selection and SDK integration points."""
22

33
import asyncio
4+
import logging
45
import os
56
from abc import ABC, abstractmethod
67
from dataclasses import dataclass
78
from time import perf_counter
89
from typing import Any, AsyncGenerator, Awaitable, Callable
910

11+
logger = logging.getLogger(__name__)
12+
1013
from agent_framework import BaseAgent
1114
from holiday_peak_lib.agents.complexity import assess_complexity
1215

@@ -380,10 +383,22 @@ async def __invoke_target(
380383
except asyncio.TimeoutError:
381384
outcome = "timeout"
382385
error_text = "Model invocation timed out"
386+
logger.error(
387+
"agent_model_invocation_timeout service=%s model=%s",
388+
getattr(self, "service_name", "unknown"),
389+
target.model,
390+
)
383391
raise
384392
except Exception as exc:
385393
outcome = "error"
386394
error_text = str(exc)
395+
logger.error(
396+
"agent_model_invocation_failed service=%s model=%s error=%s",
397+
getattr(self, "service_name", "unknown"),
398+
target.model,
399+
exc,
400+
exc_info=True,
401+
)
387402
raise
388403
finally:
389404
elapsed_ms = (perf_counter() - started) * 1000
@@ -641,3 +656,38 @@ async def invoke_model_stream(
641656
@abstractmethod
642657
async def handle(self, request: dict[str, Any]) -> dict[str, Any]:
643658
"""Handle an incoming request."""
659+
660+
def __init_subclass__(cls, **kwargs: Any) -> None:
661+
"""Auto-wrap concrete handle() implementations with entry/exit logging."""
662+
super().__init_subclass__(**kwargs)
663+
original_handle = cls.__dict__.get("handle")
664+
if original_handle is None:
665+
return
666+
667+
async def _logged_handle(self: Any, request: dict[str, Any]) -> dict[str, Any]:
668+
svc = getattr(self, "service_name", cls.__name__)
669+
logger.info("agent_handle_entry service=%s", svc)
670+
started = perf_counter()
671+
try:
672+
result = await original_handle(self, request)
673+
elapsed = (perf_counter() - started) * 1000
674+
logger.info(
675+
"agent_handle_success service=%s elapsed_ms=%.1f",
676+
svc,
677+
elapsed,
678+
)
679+
return result
680+
except Exception as exc:
681+
elapsed = (perf_counter() - started) * 1000
682+
logger.error(
683+
"agent_handle_failed service=%s elapsed_ms=%.1f error=%s",
684+
svc,
685+
elapsed,
686+
exc,
687+
exc_info=True,
688+
)
689+
raise
690+
691+
_logged_handle.__name__ = "handle"
692+
_logged_handle.__qualname__ = f"{cls.__qualname__}.handle"
693+
cls.handle = _logged_handle # type: ignore[method-assign]

lib/src/holiday_peak_lib/agents/memory/session_manager.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,14 @@
1919
from __future__ import annotations
2020

2121
import json
22+
import logging
2223
import re
2324
import time
2425
from dataclasses import asdict, dataclass, field
2526
from typing import Any
2627

28+
logger = logging.getLogger(__name__)
29+
2730
# ---------------------------------------------------------------------------
2831
# Data structures
2932
# ---------------------------------------------------------------------------
@@ -298,8 +301,15 @@ async def evaluate_session_continuity(
298301
)
299302
if cosmos_session and "foundry_session_state" in cosmos_session:
300303
foundry_state = cosmos_session["foundry_session_state"]
301-
except Exception: # noqa: BLE001 — fail-open on Cosmos read
302-
pass
304+
except Exception as exc: # noqa: BLE001 — fail-open on Cosmos read
305+
logger.warning(
306+
"session_continuity_cosmos_read_failed "
307+
"session_id=%s service=%s entity_id=%s error=%s",
308+
summary.session_id,
309+
service,
310+
entity_id,
311+
exc,
312+
)
303313

304314
return SessionDecision(
305315
continue_session=True,
@@ -416,8 +426,15 @@ async def persist_full_session(
416426
}
417427
try:
418428
await warm_memory.upsert(document)
419-
except Exception: # noqa: BLE001 — fail-open on Cosmos write
420-
pass
429+
except Exception as exc: # noqa: BLE001 — fail-open on Cosmos write
430+
logger.warning(
431+
"session_persistence_cosmos_write_failed "
432+
"session_id=%s service=%s entity_id=%s error=%s",
433+
session_id,
434+
service,
435+
entity_id,
436+
exc,
437+
)
421438

422439

423440
# ---------------------------------------------------------------------------

lib/src/holiday_peak_lib/app_factory.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,14 @@ def _sync_foundry_tracing_state() -> None:
251251

252252
tracer = get_foundry_tracer(service_name)
253253

254+
# Suppress azure-ai-projects SDK internal telemetry instrumentor when
255+
# no real OpenTelemetry exporter is configured. The SDK crashes with
256+
# AttributeError on NonRecordingSpan.attributes (GH-946).
257+
if not os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING") and not os.getenv(
258+
"AZURE_TRACING_ENABLED"
259+
):
260+
os.environ.setdefault("AZURE_TRACING_ENABLED", "false")
261+
254262
configured_foundry_roles = tuple(
255263
role for role, config in (("fast", slm_config), ("rich", llm_config)) if config is not None
256264
)

lib/src/holiday_peak_lib/mcp/server.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,15 @@
22

33
from __future__ import annotations
44

5+
import logging
56
from dataclasses import dataclass
67
from inspect import isawaitable
78
from typing import Any, Awaitable, Callable
89

910
from fastapi import APIRouter, FastAPI, HTTPException
1011
from holiday_peak_lib.self_healing import FailureSignal, SurfaceType
12+
13+
logger = logging.getLogger(__name__)
1114
from pydantic import BaseModel, ValidationError
1215

1316
ToolHandler = Callable[[dict[str, Any]], Awaitable[dict[str, Any]] | dict[str, Any]]
@@ -126,11 +129,21 @@ def _wrap_handler(
126129
output_model: type[BaseModel] | None,
127130
) -> Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]:
128131
async def validated_handler(payload: dict[str, Any]) -> dict[str, Any]:
132+
logger.info(
133+
"mcp_tool_invoked path=%s service=%s",
134+
path,
135+
self.app.title,
136+
)
129137
normalized_payload = payload
130138
if input_model is not None:
131139
try:
132140
validated_input = input_model.model_validate(payload)
133141
except ValidationError as exc:
142+
logger.warning(
143+
"mcp_tool_input_validation_failed path=%s error=%s",
144+
path,
145+
exc,
146+
)
134147
await self._emit_failure(
135148
path=path,
136149
status_code=422,
@@ -146,14 +159,29 @@ async def validated_handler(payload: dict[str, Any]) -> dict[str, Any]:
146159
) from exc
147160
normalized_payload = validated_input.model_dump(mode="json")
148161

149-
result = handler(normalized_payload)
150-
if isawaitable(result):
151-
result = await result
162+
try:
163+
result = handler(normalized_payload)
164+
if isawaitable(result):
165+
result = await result
166+
except Exception as exc:
167+
logger.error(
168+
"mcp_tool_execution_failed path=%s service=%s error=%s",
169+
path,
170+
self.app.title,
171+
exc,
172+
exc_info=True,
173+
)
174+
raise
152175

153176
if output_model is not None:
154177
try:
155178
validated_output = output_model.model_validate(result)
156179
except ValidationError as exc:
180+
logger.warning(
181+
"mcp_tool_output_validation_failed path=%s error=%s",
182+
path,
183+
exc,
184+
)
157185
await self._emit_failure(
158186
path=path,
159187
status_code=500,

lib/src/holiday_peak_lib/utils/circuit_breaker.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,10 +142,19 @@ async def call(
142142
if self._state != CircuitState.CLOSED:
143143
await self._transition(CircuitState.CLOSED)
144144
return result
145-
except Exception:
145+
except Exception as exc:
146146
async with self._lock:
147147
self._failure_count += 1
148148
self._last_failure_time = time.monotonic()
149+
logger.warning(
150+
"circuit_breaker_failure name=%s state=%s failure_count=%d "
151+
"threshold=%d error=%s",
152+
self.name,
153+
self._state.value,
154+
self._failure_count,
155+
self.failure_threshold,
156+
exc,
157+
)
149158
if self._state == CircuitState.HALF_OPEN or (
150159
self._state == CircuitState.CLOSED
151160
and self._failure_count >= self.failure_threshold

lib/src/holiday_peak_lib/utils/event_hub.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -795,7 +795,17 @@ def make_handler(eventhub_name: str):
795795

796796
async def _handler(partition_context, event): # noqa: ANN001
797797
if handler is not None:
798-
await handler(partition_context, event)
798+
try:
799+
await handler(partition_context, event)
800+
except Exception as exc:
801+
logger.error(
802+
"eventhub_handler_failed eventhub=%s service=%s error=%s",
803+
eventhub_name,
804+
service_name,
805+
exc,
806+
exc_info=True,
807+
)
808+
raise
799809
return
800810
payload = json.loads(event.body_as_str())
801811
_safe_log(

lib/src/holiday_peak_lib/utils/logging.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,9 @@ def configure_logging(
7373
if conn:
7474
from azure.monitor.opentelemetry import configure_azure_monitor
7575

76+
# Ensure OTEL_SERVICE_NAME is set so App Insights identifies the emitting service.
77+
os.environ.setdefault("OTEL_SERVICE_NAME", resolved_app)
78+
7679
try:
7780
configure_azure_monitor(connection_string=conn)
7881
base_logger.info("Azure Monitor logging enabled via configure_azure_monitor.")

lib/src/holiday_peak_lib/utils/telemetry.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,8 @@ def _initialize_foundry_instrumentation(self) -> None:
271271
pass
272272

273273
if self.connection_string and not _FOUNDRY_INSTRUMENTATION_STATE["azure_monitor"]:
274+
# Ensure OTEL_SERVICE_NAME is set so App Insights identifies the emitting service.
275+
os.environ.setdefault("OTEL_SERVICE_NAME", self.service_name)
274276
try:
275277
configure_azure_monitor(connection_string=self.connection_string)
276278
_FOUNDRY_INSTRUMENTATION_STATE["azure_monitor"] = True

0 commit comments

Comments
 (0)