Sarah-Salah
diff --git a/app/tools/HermesSessionEvidenceTool/__init__.py b/app/tools/HermesSessionEvidenceTool/__init__.py
Lines changed: 104 additions & 0 deletions
diff --git a/app/types/root_cause_categories.py b/app/types/root_cause_categories.py
Lines changed: 36 additions & 0 deletions
diff --git a/tests/synthetic/hermes_rca/020-multi-agent-orchestration-missing/alert.json b/tests/synthetic/hermes_rca/020-multi-agent-orchestration-missing/alert.json
Lines changed: 17 additions & 0 deletions
diff --git a/tests/synthetic/hermes_rca/020-multi-agent-orchestration-missing/answer.yml b/tests/synthetic/hermes_rca/020-multi-agent-orchestration-missing/answer.yml
Lines changed: 25 additions & 0 deletions
diff --git a/tests/synthetic/hermes_rca/020-multi-agent-orchestration-missing/hermes_config.json b/tests/synthetic/hermes_rca/020-multi-agent-orchestration-missing/hermes_config.json
Lines changed: 34 additions & 0 deletions
diff --git a/tests/synthetic/hermes_rca/020-multi-agent-orchestration-missing/hermes_orchestration_state.json b/tests/synthetic/hermes_rca/020-multi-agent-orchestration-missing/hermes_orchestration_state.json
Lines changed: 32 additions & 0 deletions
diff --git a/tests/synthetic/hermes_rca/020-multi-agent-orchestration-missing/hermes_runtime_state.json b/tests/synthetic/hermes_rca/020-multi-agent-orchestration-missing/hermes_runtime_state.json
Lines changed: 9 additions & 0 deletions
diff --git a/tests/synthetic/hermes_rca/020-multi-agent-orchestration-missing/hermes_session_log.json b/tests/synthetic/hermes_rca/020-multi-agent-orchestration-missing/hermes_session_log.json
Lines changed: 29 additions & 0 deletions
diff --git a/tests/synthetic/hermes_rca/020-multi-agent-orchestration-missing/scenario.yml b/tests/synthetic/hermes_rca/020-multi-agent-orchestration-missing/scenario.yml
Lines changed: 11 additions & 0 deletions
diff --git a/tests/synthetic/hermes_rca/021-a2a-protocol-unsupported/alert.json b/tests/synthetic/hermes_rca/021-a2a-protocol-unsupported/alert.json
Lines changed: 20 additions & 0 deletions
@@ -237,6 +237,106 @@ def get_hermes_session_topology(
     return cast(dict[str, Any], backend.get_session_topology(session_id=session_id))
 
 
+@tool(
+    name="get_hermes_orchestration_state",
+    source="hermes",
+    description="Get Hermes orchestration role/topology execution state.",
+    use_cases=["Diagnose collapsed orchestration, isolated ACP sessions, and role execution drift"],
+    surfaces=("investigation",),
+    input_schema={
+        "type": "object",
+        "properties": {"session_id": {"type": "string"}},
+        "required": [],  # session_id is optional; the function signature defaults it to ""
+    },
+    is_available=_fixture_backend_only,  # NOTE(review): presumably limits the tool to fixture-backed runs - confirm in helper
+    extract_params=_extract_params,
+)
+def get_hermes_orchestration_state(
+    session_id: str = "",
+    hermes_backend: Any = None,
+    **_kwargs: Any,  # extra keyword arguments are accepted and never read
+) -> dict[str, Any]:
+    backend = _backend_or_error(hermes_backend, "get_hermes_orchestration_state")  # the tool name is handed to the helper
+    if isinstance(backend, dict):  # helper gave back a dict rather than a backend object (presumably an error payload - confirm)
+        return backend  # passed to the caller unchanged
+    return cast(dict[str, Any], backend.get_orchestration_state(session_id=session_id))  # delegate to the backend; cast only informs the type checker
+
+
+@tool(
+    name="get_hermes_routing_decisions",
+    source="hermes",
+    description="Get Hermes capability routing decisions and model selection outcomes.",
+    use_cases=["Diagnose ignored routing policies and default-model fallback behavior"],
+    surfaces=("investigation",),
+    input_schema={
+        "type": "object",
+        "properties": {"session_id": {"type": "string"}},
+        "required": [],  # session_id is optional; the function signature defaults it to ""
+    },
+    is_available=_fixture_backend_only,  # NOTE(review): presumably limits the tool to fixture-backed runs - confirm in helper
+    extract_params=_extract_params,
+)
+def get_hermes_routing_decisions(
+    session_id: str = "",
+    hermes_backend: Any = None,
+    **_kwargs: Any,  # extra keyword arguments are accepted and never read
+) -> dict[str, Any]:
+    backend = _backend_or_error(hermes_backend, "get_hermes_routing_decisions")  # the tool name is handed to the helper
+    if isinstance(backend, dict):  # helper gave back a dict rather than a backend object (presumably an error payload - confirm)
+        return backend  # passed to the caller unchanged
+    return cast(dict[str, Any], backend.get_routing_decisions(session_id=session_id))  # delegate to the backend; cast only informs the type checker
+
+
+@tool(
+    name="get_hermes_memory_state",
+    source="hermes",
+    description="Get Hermes memory backend health and parse/fallback state.",
+    use_cases=["Diagnose memory backend outages, corruption, and parse failures"],
+    surfaces=("investigation",),
+    input_schema={
+        "type": "object",
+        "properties": {"session_id": {"type": "string"}},
+        "required": [],  # session_id is optional; the function signature defaults it to ""
+    },
+    is_available=_fixture_backend_only,  # NOTE(review): presumably limits the tool to fixture-backed runs - confirm in helper
+    extract_params=_extract_params,
+)
+def get_hermes_memory_state(
+    session_id: str = "",
+    hermes_backend: Any = None,
+    **_kwargs: Any,  # extra keyword arguments are accepted and never read
+) -> dict[str, Any]:
+    backend = _backend_or_error(hermes_backend, "get_hermes_memory_state")  # the tool name is handed to the helper
+    if isinstance(backend, dict):  # helper gave back a dict rather than a backend object (presumably an error payload - confirm)
+        return backend  # passed to the caller unchanged
+    return cast(dict[str, Any], backend.get_memory_state(session_id=session_id))  # delegate to the backend; cast only informs the type checker
+
+
+@tool(
+    name="get_hermes_filesystem_state",
+    source="hermes",
+    description="Get Hermes filesystem persistence and corruption state.",
+    use_cases=["Diagnose corrupted memory snapshots and missing recovery backups"],
+    surfaces=("investigation",),
+    input_schema={
+        "type": "object",
+        "properties": {"session_id": {"type": "string"}},
+        "required": [],  # session_id is optional; the function signature defaults it to ""
+    },
+    is_available=_fixture_backend_only,  # NOTE(review): presumably limits the tool to fixture-backed runs - confirm in helper
+    extract_params=_extract_params,
+)
+def get_hermes_filesystem_state(
+    session_id: str = "",
+    hermes_backend: Any = None,
+    **_kwargs: Any,  # extra keyword arguments are accepted and never read
+) -> dict[str, Any]:
+    backend = _backend_or_error(hermes_backend, "get_hermes_filesystem_state")  # the tool name is handed to the helper
+    if isinstance(backend, dict):  # helper gave back a dict rather than a backend object (presumably an error payload - confirm)
+        return backend  # passed to the caller unchanged
+    return cast(dict[str, Any], backend.get_filesystem_state(session_id=session_id))  # delegate to the backend; cast only informs the type checker
+
+
 __all__ = [
     "get_hermes_session_log",
     "get_hermes_provider_traffic",
@@ -246,4 +346,8 @@ def get_hermes_session_topology(
     "get_hermes_runtime_state",
     "get_hermes_cron_state",
     "get_hermes_session_topology",
+    "get_hermes_orchestration_state",
+    "get_hermes_routing_decisions",
+    "get_hermes_memory_state",
+    "get_hermes_filesystem_state",
 ]
@@ -530,6 +530,36 @@ class RootCauseCategory:
         GROUP_WORKLOAD,
         "System remains up but materially slower due to cache-thrash/inefficiency regressions.",
     ),
+    RootCauseCategory(
+        "orchestration_missing",
+        GROUP_CODE_AND_CONFIG,
+        "Declared multi-agent orchestration/topology is missing or silently collapsed into a single-agent execution path.",
+    ),
+    RootCauseCategory(
+        "protocol_unsupported",
+        GROUP_CODE_AND_CONFIG,
+        "Requested agent communication protocol is unsupported by the runtime/client implementation.",
+    ),
+    RootCauseCategory(
+        "routing_ignored",
+        GROUP_CODE_AND_CONFIG,
+        "Capability-based model routing configuration exists but is ignored at runtime.",
+    ),
+    RootCauseCategory(
+        "memory_unavailable",
+        GROUP_CODE_AND_CONFIG,
+        "External memory backend is unreachable and the runtime silently falls back to degraded local memory.",
+    ),
+    RootCauseCategory(
+        "memory_corruption",
+        GROUP_CODE_AND_CONFIG,
+        "Persistent agent memory/filesystem state is corrupted without recovery or backup support.",
+    ),
+    RootCauseCategory(
+        "memory_parse_failure",
+        GROUP_CODE_AND_CONFIG,
+        "Memory tool failed due to strict JSON parsing incompatibility with model output.",
+    ),
     # ── Generic fallbacks (kept for backward compatibility) ────────────
     # These exist so legacy answer keys, eval pipelines, and prior LLM
     # outputs continue to validate. New diagnoses should always prefer a
@@ -598,6 +628,12 @@ class RootCauseCategory:
         "delivery_hang",
         "ghost_session",
         "performance_degradation",
+        "orchestration_missing",
+        "protocol_unsupported",
+        "routing_ignored",
+        "memory_unavailable",
+        "memory_corruption",
+        "memory_parse_failure",
     }
 )
 
 
@@ -0,0 +1,17 @@
+{
+  "title": "Hermes multi-agent workflow collapsed into single context",
+  "state": "firing",
+  "alert_source": "hermes",
+  "commonLabels": {
+    "alertname": "HermesOrchestrationFailure",
+    "severity": "critical",
+    "service": "hermes"
+  },
+  "commonAnnotations": {
+    "summary": "Planner/worker/reviewer workflow exceeded context window",
+    "description": "Hermes configured for multi-agent orchestration but all roles executed inside the same context window.",
+    "context_sources": "hermes",
+    "hermes_session_id": "sess-orch-020",
+    "failure_mode": "orchestration_missing"
+  }
+}
@@ -0,0 +1,25 @@
+root_cause_category: orchestration_missing
+
+required_keywords:
+  - orchestration
+  - planner
+  - worker
+  - reviewer
+  - scheduler
+
+forbidden_categories:
+  - unknown
+  - healthy
+  - user_error
+
+required_evidence_sources:
+  - hermes_orchestration_state
+
+optimal_trajectory:
+  - get_hermes_orchestration_state
+  - get_hermes_session_log
+  - get_hermes_config
+
+max_investigation_loops: 3
+
+model_response: Hermes was configured for planner/worker/reviewer orchestration but no real scheduler separated the roles into independent execution contexts.
@@ -0,0 +1,34 @@
+{
+  "provider": "openai",
+  "model": "gpt-5.4-mini",
+  "region": "us-east-1",
+  "providers": [
+    {
+      "name": "openai",
+      "base_url": "https://api.openai.com",
+      "auth_kind": "api_key"
+    }
+  ],
+  "transport": {
+    "sse_max_line_bytes": 131072,
+    "request_timeout_s": 120
+  },
+  "orchestration": {
+    "enabled": true,
+    "topology": "planner_worker_reviewer",
+    "roles": [
+      {
+        "name": "planner",
+        "model": "gpt-5.4-mini"
+      },
+      {
+        "name": "worker",
+        "model": "gpt-5.4-mini"
+      },
+      {
+        "name": "reviewer",
+        "model": "gpt-5.4-mini"
+      }
+    ]
+  }
+}
@@ -0,0 +1,32 @@
+{
+  "declared_roles": [
+    {
+      "name": "planner",
+      "model": "gpt-5.4-mini",
+      "system_prompt_excerpt": "Plan deployment strategy"
+    },
+    {
+      "name": "worker",
+      "model": "gpt-5.4-mini",
+      "system_prompt_excerpt": "Execute deployment tasks"
+    },
+    {
+      "name": "reviewer",
+      "model": "gpt-5.4-mini",
+      "system_prompt_excerpt": "Review deployment output"
+    }
+  ],
+  "declared_topology": "planner_worker_reviewer",
+  "observed": {
+    "actual_runs": [
+      {
+        "role": "default",
+        "model": "gpt-5.4-mini",
+        "input_tokens": 18240,
+        "output_tokens": 4120,
+        "context_window_used": 32768
+      }
+    ],
+    "actual_topology": "single_agent_loop"
+  }
+}
@@ -0,0 +1,9 @@
+{
+  "pid": 5821,
+  "started_at": "2026-05-18T09:00:00Z",
+  "frozen_now_ts": "2026-05-18T09:12:18Z",
+  "interrupt_queue_depth": 0,
+  "last_progress_ts": "2026-05-18T09:12:17Z",
+  "is_blocked": false,
+  "blocking_call": null
+}
@@ -0,0 +1,29 @@
+{
+  "session_id": "sess-orch-020",
+  "events": [
+    {
+      "ts": "2026-05-18T09:00:04Z",
+      "kind": "message",
+      "payload": {
+        "role": "user",
+        "content": "Deploy a Kubernetes service with rollout validation"
+      }
+    },
+    {
+      "ts": "2026-05-18T09:04:11Z",
+      "kind": "warning",
+      "payload": {
+        "warning_class": "ContextWindowPressure",
+        "message": "planner/worker/reviewer roles sharing same context window"
+      }
+    },
+    {
+      "ts": "2026-05-18T09:12:17Z",
+      "kind": "error",
+      "payload": {
+        "error_class": "OrchestrationCollapsed",
+        "error_message": "All configured roles executed in a single execution loop"
+      }
+    }
+  ]
+}
@@ -0,0 +1,11 @@
+schema_version: '1.0'
+scenario_id: 020-multi-agent-orchestration-missing
+failure_mode: orchestration_missing
+severity: critical
+scenario_difficulty: 2
+
+available_evidence:
+- hermes_session_log
+- hermes_runtime_state
+- hermes_config
+- hermes_orchestration_state
@@ -0,0 +1,20 @@
+{
+  "title": "Hermes A2A protocol negotiation failure",
+  "summary": "Hermes failed while connecting to an external peer agent",
+  "state": "firing",
+  "severity": "critical",
+  "alert_source": "hermes",
+  "source": "hermes",
+  "alert_key": "hermes-a2a-021",
+
+  "commonLabels": {
+    "service": "hermes",
+    "severity": "critical"
+  },
+
+  "commonAnnotations": {
+    "context_sources": "hermes",
+    "session_id": "hermes-a2a-021",
+    "failure_mode": "protocol_unsupported"
+  }
+}