Skip to content

Commit 76e1373

Browse files
feat(hermes): add orchestration and memory RCA synthetic scenarios (Tracer-Cloud#2183)
* feat(hermes): add orchestration and memory RCA synthetic scenarios
* fix(hermes): tighten orchestration and evidence validation
* fix(hermes): tighten routing decision validation
* fix(hermes): tighten synthetic evidence validation
* fix(hermes): clarify routing ignored scenario response
* fix(hermes): correct synthetic sha256 fixture
* fix(hermes): align alert annotation session keys
* fix(hermes): validate routing capability categories
1 parent a6732fb commit 76e1373

56 files changed

Lines changed: 1512 additions & 0 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

app/tools/HermesSessionEvidenceTool/__init__.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,106 @@ def get_hermes_session_topology(
237237
return cast(dict[str, Any], backend.get_session_topology(session_id=session_id))
238238

239239

240+
@tool(
    name="get_hermes_orchestration_state",
    source="hermes",
    description="Get Hermes orchestration role/topology execution state.",
    use_cases=[
        "Diagnose collapsed orchestration, isolated ACP sessions, and role execution drift",
    ],
    surfaces=("investigation",),
    input_schema={
        "type": "object",
        "properties": {"session_id": {"type": "string"}},
        "required": [],
    },
    is_available=_fixture_backend_only,
    extract_params=_extract_params,
)
def get_hermes_orchestration_state(
    session_id: str = "",
    hermes_backend: Any = None,
    **_kwargs: Any,
) -> dict[str, Any]:
    """Fetch the orchestration state for a Hermes session from the backend.

    Args:
        session_id: Session identifier forwarded to the backend; defaults to
            an empty string.
        hermes_backend: Backend candidate, resolved via ``_backend_or_error``.
        **_kwargs: Accepted and ignored.

    Returns:
        The result of ``backend.get_orchestration_state(session_id=...)``, or
        the dict produced by ``_backend_or_error`` when it returns one.
    """
    resolved = _backend_or_error(hermes_backend, "get_hermes_orchestration_state")
    if not isinstance(resolved, dict):
        state = resolved.get_orchestration_state(session_id=session_id)
        return cast(dict[str, Any], state)
    # A dict here is handed back untouched (presumably an error payload from
    # the helper -- confirm against _backend_or_error).
    return resolved
263+
264+
265+
@tool(
    name="get_hermes_routing_decisions",
    source="hermes",
    description="Get Hermes capability routing decisions and model selection outcomes.",
    use_cases=[
        "Diagnose ignored routing policies and default-model fallback behavior",
    ],
    surfaces=("investigation",),
    input_schema={
        "type": "object",
        "properties": {"session_id": {"type": "string"}},
        "required": [],
    },
    is_available=_fixture_backend_only,
    extract_params=_extract_params,
)
def get_hermes_routing_decisions(
    session_id: str = "",
    hermes_backend: Any = None,
    **_kwargs: Any,
) -> dict[str, Any]:
    """Fetch the routing decisions for a Hermes session from the backend.

    Args:
        session_id: Session identifier forwarded to the backend; defaults to
            an empty string.
        hermes_backend: Backend candidate, resolved via ``_backend_or_error``.
        **_kwargs: Accepted and ignored.

    Returns:
        The result of ``backend.get_routing_decisions(session_id=...)``, or
        the dict produced by ``_backend_or_error`` when it returns one.
    """
    resolved = _backend_or_error(hermes_backend, "get_hermes_routing_decisions")
    if not isinstance(resolved, dict):
        decisions = resolved.get_routing_decisions(session_id=session_id)
        return cast(dict[str, Any], decisions)
    # A dict here is handed back untouched (presumably an error payload from
    # the helper -- confirm against _backend_or_error).
    return resolved
288+
289+
290+
@tool(
    name="get_hermes_memory_state",
    source="hermes",
    description="Get Hermes memory backend health and parse/fallback state.",
    use_cases=[
        "Diagnose memory backend outages, corruption, and parse failures",
    ],
    surfaces=("investigation",),
    input_schema={
        "type": "object",
        "properties": {"session_id": {"type": "string"}},
        "required": [],
    },
    is_available=_fixture_backend_only,
    extract_params=_extract_params,
)
def get_hermes_memory_state(
    session_id: str = "",
    hermes_backend: Any = None,
    **_kwargs: Any,
) -> dict[str, Any]:
    """Fetch the memory state for a Hermes session from the backend.

    Args:
        session_id: Session identifier forwarded to the backend; defaults to
            an empty string.
        hermes_backend: Backend candidate, resolved via ``_backend_or_error``.
        **_kwargs: Accepted and ignored.

    Returns:
        The result of ``backend.get_memory_state(session_id=...)``, or the
        dict produced by ``_backend_or_error`` when it returns one.
    """
    resolved = _backend_or_error(hermes_backend, "get_hermes_memory_state")
    if not isinstance(resolved, dict):
        memory_state = resolved.get_memory_state(session_id=session_id)
        return cast(dict[str, Any], memory_state)
    # A dict here is handed back untouched (presumably an error payload from
    # the helper -- confirm against _backend_or_error).
    return resolved
313+
314+
315+
@tool(
    name="get_hermes_filesystem_state",
    source="hermes",
    description="Get Hermes filesystem persistence and corruption state.",
    use_cases=[
        "Diagnose corrupted memory snapshots and missing recovery backups",
    ],
    surfaces=("investigation",),
    input_schema={
        "type": "object",
        "properties": {"session_id": {"type": "string"}},
        "required": [],
    },
    is_available=_fixture_backend_only,
    extract_params=_extract_params,
)
def get_hermes_filesystem_state(
    session_id: str = "",
    hermes_backend: Any = None,
    **_kwargs: Any,
) -> dict[str, Any]:
    """Fetch the filesystem state for a Hermes session from the backend.

    Args:
        session_id: Session identifier forwarded to the backend; defaults to
            an empty string.
        hermes_backend: Backend candidate, resolved via ``_backend_or_error``.
        **_kwargs: Accepted and ignored.

    Returns:
        The result of ``backend.get_filesystem_state(session_id=...)``, or
        the dict produced by ``_backend_or_error`` when it returns one.
    """
    resolved = _backend_or_error(hermes_backend, "get_hermes_filesystem_state")
    if not isinstance(resolved, dict):
        fs_state = resolved.get_filesystem_state(session_id=session_id)
        return cast(dict[str, Any], fs_state)
    # A dict here is handed back untouched (presumably an error payload from
    # the helper -- confirm against _backend_or_error).
    return resolved
338+
339+
240340
__all__ = [
241341
"get_hermes_session_log",
242342
"get_hermes_provider_traffic",
@@ -246,4 +346,8 @@ def get_hermes_session_topology(
246346
"get_hermes_runtime_state",
247347
"get_hermes_cron_state",
248348
"get_hermes_session_topology",
349+
"get_hermes_orchestration_state",
350+
"get_hermes_routing_decisions",
351+
"get_hermes_memory_state",
352+
"get_hermes_filesystem_state",
249353
]

app/types/root_cause_categories.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,36 @@ class RootCauseCategory:
530530
GROUP_WORKLOAD,
531531
"System remains up but materially slower due to cache-thrash/inefficiency regressions.",
532532
),
533+
RootCauseCategory(
534+
"orchestration_missing",
535+
GROUP_CODE_AND_CONFIG,
536+
"Declared multi-agent orchestration/topology is missing or silently collapsed into a single-agent execution path.",
537+
),
538+
RootCauseCategory(
539+
"protocol_unsupported",
540+
GROUP_CODE_AND_CONFIG,
541+
"Requested agent communication protocol is unsupported by the runtime/client implementation.",
542+
),
543+
RootCauseCategory(
544+
"routing_ignored",
545+
GROUP_CODE_AND_CONFIG,
546+
"Capability-based model routing configuration exists but is ignored at runtime.",
547+
),
548+
RootCauseCategory(
549+
"memory_unavailable",
550+
GROUP_CODE_AND_CONFIG,
551+
"External memory backend is unreachable and the runtime silently falls back to degraded local memory.",
552+
),
553+
RootCauseCategory(
554+
"memory_corruption",
555+
GROUP_CODE_AND_CONFIG,
556+
"Persistent agent memory/filesystem state is corrupted without recovery or backup support.",
557+
),
558+
RootCauseCategory(
559+
"memory_parse_failure",
560+
GROUP_CODE_AND_CONFIG,
561+
"Memory tool failed due to strict JSON parsing incompatibility with model output.",
562+
),
533563
# ── Generic fallbacks (kept for backward compatibility) ────────────
534564
# These exist so legacy answer keys, eval pipelines, and prior LLM
535565
# outputs continue to validate. New diagnoses should always prefer a
@@ -598,6 +628,12 @@ class RootCauseCategory:
598628
"delivery_hang",
599629
"ghost_session",
600630
"performance_degradation",
631+
"orchestration_missing",
632+
"protocol_unsupported",
633+
"routing_ignored",
634+
"memory_unavailable",
635+
"memory_corruption",
636+
"memory_parse_failure",
601637
}
602638
)
603639

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{
2+
"title": "Hermes multi-agent workflow collapsed into single context",
3+
"state": "firing",
4+
"alert_source": "hermes",
5+
"commonLabels": {
6+
"alertname": "HermesOrchestrationFailure",
7+
"severity": "critical",
8+
"service": "hermes"
9+
},
10+
"commonAnnotations": {
11+
"summary": "Planner/worker/reviewer workflow exceeded context window",
12+
"description": "Hermes configured for multi-agent orchestration but all roles executed inside the same context window.",
13+
"context_sources": "hermes",
14+
"hermes_session_id": "sess-orch-020",
15+
"failure_mode": "orchestration_missing"
16+
}
17+
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
root_cause_category: orchestration_missing
2+
3+
required_keywords:
4+
- orchestration
5+
- planner
6+
- worker
7+
- reviewer
8+
- scheduler
9+
10+
forbidden_categories:
11+
- unknown
12+
- healthy
13+
- user_error
14+
15+
required_evidence_sources:
16+
- hermes_orchestration_state
17+
18+
optimal_trajectory:
19+
- get_hermes_orchestration_state
20+
- get_hermes_session_log
21+
- get_hermes_config
22+
23+
max_investigation_loops: 3
24+
25+
model_response: Hermes was configured for planner/worker/reviewer orchestration but no real scheduler separated the roles into independent execution contexts.
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
{
2+
"provider": "openai",
3+
"model": "gpt-5.4-mini",
4+
"region": "us-east-1",
5+
"providers": [
6+
{
7+
"name": "openai",
8+
"base_url": "https://api.openai.com",
9+
"auth_kind": "api_key"
10+
}
11+
],
12+
"transport": {
13+
"sse_max_line_bytes": 131072,
14+
"request_timeout_s": 120
15+
},
16+
"orchestration": {
17+
"enabled": true,
18+
"topology": "planner_worker_reviewer",
19+
"roles": [
20+
{
21+
"name": "planner",
22+
"model": "gpt-5.4-mini"
23+
},
24+
{
25+
"name": "worker",
26+
"model": "gpt-5.4-mini"
27+
},
28+
{
29+
"name": "reviewer",
30+
"model": "gpt-5.4-mini"
31+
}
32+
]
33+
}
34+
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
{
2+
"declared_roles": [
3+
{
4+
"name": "planner",
5+
"model": "gpt-5.4-mini",
6+
"system_prompt_excerpt": "Plan deployment strategy"
7+
},
8+
{
9+
"name": "worker",
10+
"model": "gpt-5.4-mini",
11+
"system_prompt_excerpt": "Execute deployment tasks"
12+
},
13+
{
14+
"name": "reviewer",
15+
"model": "gpt-5.4-mini",
16+
"system_prompt_excerpt": "Review deployment output"
17+
}
18+
],
19+
"declared_topology": "planner_worker_reviewer",
20+
"observed": {
21+
"actual_runs": [
22+
{
23+
"role": "default",
24+
"model": "gpt-5.4-mini",
25+
"input_tokens": 18240,
26+
"output_tokens": 4120,
27+
"context_window_used": 32768
28+
}
29+
],
30+
"actual_topology": "single_agent_loop"
31+
}
32+
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"pid": 5821,
3+
"started_at": "2026-05-18T09:00:00Z",
4+
"frozen_now_ts": "2026-05-18T09:12:18Z",
5+
"interrupt_queue_depth": 0,
6+
"last_progress_ts": "2026-05-18T09:12:17Z",
7+
"is_blocked": false,
8+
"blocking_call": null
9+
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
{
2+
"session_id": "sess-orch-020",
3+
"events": [
4+
{
5+
"ts": "2026-05-18T09:00:04Z",
6+
"kind": "message",
7+
"payload": {
8+
"role": "user",
9+
"content": "Deploy a Kubernetes service with rollout validation"
10+
}
11+
},
12+
{
13+
"ts": "2026-05-18T09:04:11Z",
14+
"kind": "warning",
15+
"payload": {
16+
"warning_class": "ContextWindowPressure",
17+
"message": "planner/worker/reviewer roles sharing same context window"
18+
}
19+
},
20+
{
21+
"ts": "2026-05-18T09:12:17Z",
22+
"kind": "error",
23+
"payload": {
24+
"error_class": "OrchestrationCollapsed",
25+
"error_message": "All configured roles executed in a single execution loop"
26+
}
27+
}
28+
]
29+
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
schema_version: '1.0'
2+
scenario_id: 020-multi-agent-orchestration-missing
3+
failure_mode: orchestration_missing
4+
severity: critical
5+
scenario_difficulty: 2
6+
7+
available_evidence:
8+
- hermes_session_log
9+
- hermes_runtime_state
10+
- hermes_config
11+
- hermes_orchestration_state
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"title": "Hermes A2A protocol negotiation failure",
3+
"summary": "Hermes failed while connecting to an external peer agent",
4+
"state": "firing",
5+
"severity": "critical",
6+
"alert_source": "hermes",
7+
"source": "hermes",
8+
"alert_key": "hermes-a2a-021",
9+
10+
"commonLabels": {
11+
"service": "hermes",
12+
"severity": "critical"
13+
},
14+
15+
"commonAnnotations": {
16+
"context_sources": "hermes",
17+
"session_id": "hermes-a2a-021",
18+
"failure_mode": "protocol_unsupported"
19+
}
20+
}

0 commit comments

Comments
 (0)