Add triage agent test suite

TomasKorbar · TomasKorbar · commit 0cd910d413dd · 2025-11-21T10:12:51.000+01:00
Can be executed by `make run-triage-agent-e2e-tests`

Currently none of the tests passes reliably, but each of them
provides insight into the portion of the process that is not working.
diff --git a/Containerfile.c10s b/Containerfile.c10s
@@ -38,6 +38,7 @@ RUN dnf -y install --allowerasing \
       sed \
       gawk \
       rsync \
+      python3-tabulate \
     && dnf clean all
 
 RUN pip3 install --no-cache-dir \
@@ -46,7 +47,9 @@ RUN pip3 install --no-cache-dir \
       openinference-instrumentation-beeai \
       arize-phoenix-otel \
       redis \
-      specfile
+      specfile \
+      pytest \
+      pytest-asyncio
 
 # Create user
 RUN useradd -m -G wheel beeai
diff --git a/Makefile b/Makefile
@@ -27,8 +27,11 @@ run-triage-agent-standalone:
 		-e MOCK_JIRA=$(MOCK_JIRA) \
 		triage-agent
 
-
-
+# Run the triage-agent-e2e-tests compose service once (the container is
+# removed afterwards), with MOCK_JIRA set to "true".
+.PHONY: run-triage-agent-e2e-tests
+run-triage-agent-e2e-tests:
+	$(COMPOSE_AGENTS) run --rm \
+		-e MOCK_JIRA="true" \
+		triage-agent-e2e-tests
 
 .PHONY: run-rebase-agent-c9s-standalone
 run-rebase-agent-c9s-standalone:
diff --git a/agents/metrics_middleware.py b/agents/metrics_middleware.py
@@ -0,0 +1,45 @@
+from datetime import datetime
+
+
+from beeai_framework.context import (
+    RunContextStartEvent,
+    RunContextFinishEvent,
+    RunMiddlewareProtocol,
+    RunContext
+)
+from beeai_framework.emitter import EmitterOptions, EventMeta
+from beeai_framework.emitter.utils import create_internal_event_matcher
+
+
+class MetricsMiddleware(RunMiddlewareProtocol):
+    def __init__(self):
+        self.start_time: datetime | None = None
+        self.end_time: datetime | None = None
+        self.tool_calls: int = 0
+
+    def bind(self, ctx: RunContext) -> None:
+        ctx.emitter.on(
+            create_internal_event_matcher("start", ctx.instance),
+            self._on_run_context_start,
+            EmitterOptions(is_blocking=True, priority=1),
+        )
+        ctx.emitter.on(
+            create_internal_event_matcher("finish", ctx.instance),
+            self._on_run_context_finish,
+            EmitterOptions(is_blocking=True, priority=1),
+        )
+
+    async def _on_run_context_start(self, event: RunContextStartEvent, meta: EventMeta):
+        self.start_time = datetime.now()
+
+    async def _on_run_context_finish(self, event: RunContextFinishEvent, meta: EventMeta):
+        self.end_time = datetime.now()
+
+    @property
+    def duration(self) -> float:
+        if self.start_time and self.end_time:
+            return (self.end_time - self.start_time).total_seconds()
+        return 0
+
+    def get_metrics(self) -> dict:
+        return {"duration": self.duration}
diff --git a/agents/tasks.py b/agents/tasks.py
@@ -9,9 +9,9 @@
 
 from common.models import LogOutputSchema, CachedMRMetadata
 from common.utils import is_cs_branch
-from constants import BRANCH_PREFIX, JIRA_COMMENT_TEMPLATE
-from utils import check_subprocess, run_subprocess, run_tool, mcp_tools
-from tools.specfile import UpdateReleaseTool
+from agents.constants import BRANCH_PREFIX, JIRA_COMMENT_TEMPLATE
+from agents.utils import check_subprocess, run_subprocess, run_tool, mcp_tools
+from agents.tools.specfile import UpdateReleaseTool
 
 logger = logging.getLogger(__name__)
 
diff --git a/agents/tests/e2e/conftest.py b/agents/tests/e2e/conftest.py
@@ -0,0 +1,10 @@
+import pytest
+
+@pytest.hookimpl(wrapper=True)
+def pytest_terminal_summary(terminalreporter: pytest.TerminalReporter, exitstatus, config: pytest.Config):
+    yield
+    metrics = config.stash.get("metrics", None)
+
+    if metrics:
+        terminalreporter.write_sep("=", "Metrics")
+        terminalreporter.write_line(metrics, flush=True)
diff --git a/agents/tests/e2e/test_triage.py b/agents/tests/e2e/test_triage.py
@@ -0,0 +1,88 @@
+from tabulate import tabulate
+import pytest
+import os
+
+from agents.triage_agent import run_workflow, TriageState
+from agents.observability import setup_observability
+from common.models import TriageOutputSchema, Resolution, BackportData
+
+
+class TriageAgentTestCase:
+    def __init__(self, input, expected_output):
+        self.input = input
+        self.expected_output = expected_output
+        self.metrics: dict = None
+
+    async def run(self) -> TriageState:
+        return await run_workflow(self.input, False)
+
+
+test_cases=[
+    TriageAgentTestCase(input="RHEL-15216",
+                        expected_output=TriageOutputSchema(resolution=Resolution.BACKPORT,
+                                                           data=BackportData(package="dnsmasq",
+                                                           patch_urls=["http://thekelleys.org.uk/gitweb/?p=dnsmasq.git;a=patch;h=dd33e98da09c487a58b6cb6693b8628c0b234a3b"],
+                                                           justification="not-implemented",
+                                                           jira_issue="RHEL-15216",
+                                                           cve_id=None,
+                                                           fix_version="rhel-8.10"))
+    ),
+    TriageAgentTestCase(input="RHEL-112546",
+                        expected_output=TriageOutputSchema(resolution=Resolution.BACKPORT,
+                                                           data=BackportData(package="libtiff",
+                                                           patch_urls=["https://gitlab.com/libtiff/libtiff/-/commit/d1c0719e004fbb223c571d286c73911569d4dbb6.patch"],
+                                                           justification="not-implemented",
+                                                           jira_issue="RHEL-112546",
+                                                           cve_id="CVE-2025-9900",
+                                                           fix_version="rhel-9.6.z"))
+    ),
+    TriageAgentTestCase(input="RHEL-61943",
+                        expected_output=TriageOutputSchema(resolution=Resolution.BACKPORT,
+                                                           data=BackportData(package="dnsmasq",
+                                                           patch_urls=["http://thekelleys.org.uk/gitweb/?p=dnsmasq.git;a=patch;h=eb1fe15ca80b6bc43cd6bfdf309ec6c590aff811"],
+                                                           justification="not-implemented",
+                                                           jira_issue="RHEL-61943",
+                                                           cve_id=None,
+                                                           fix_version="rhel-8.10.z"))
+    ),
+    TriageAgentTestCase(input="RHEL-29712",
+                        expected_output=TriageOutputSchema(resolution=Resolution.BACKPORT,
+                                                           data=BackportData(package="bind",
+                                                           patch_urls=["https://gitlab.isc.org/isc-projects/bind9/-/commit/7e2f50c36958f8c98d54e6d131f088a4837ce269"],
+                                                           justification="not-implemented",
+                                                           jira_issue="RHEL-29712",
+                                                           cve_id=None,
+                                                           fix_version="rhel-8.10.z"))
+    ),
+]
+
+
+@pytest.fixture(scope="session", autouse=True)
+def observability_fixture():
+    return setup_observability(os.environ["COLLECTOR_ENDPOINT"])
+
+
+@pytest.fixture(scope="session", autouse=True)
+def mydata(request):
+    yield
+    collected_metrics = [[test_case.input] + list(test_case.metrics.values()) for test_case in test_cases]
+    request.config.stash["metrics"] = tabulate(collected_metrics, ["Issue", "Time"])
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "test_case",
+    test_cases,
+)
+async def test_triage_agent(test_case: TriageAgentTestCase):
+    def verify_result(real_output: TriageOutputSchema, expected_output: TriageOutputSchema):
+        assert real_output.resolution == expected_output.resolution
+        assert real_output.data.package == expected_output.data.package
+        assert real_output.data.patch_urls == expected_output.data.patch_urls
+        assert real_output.data.jira_issue == expected_output.data.jira_issue
+        assert real_output.data.cve_id == expected_output.data.cve_id
+        assert real_output.data.fix_version == expected_output.data.fix_version
+
+    finished_state = await test_case.run()
+    test_case.metrics = finished_state.metrics
+    verify_result(finished_state.triage_result, test_case.expected_output)
diff --git a/agents/tools/commands.py b/agents/tools/commands.py
@@ -8,7 +8,7 @@
 from beeai_framework.emitter import Emitter
 from beeai_framework.tools import JSONToolOutput, Tool, ToolError, ToolRunOptions
 
-from utils import run_subprocess
+from agents.utils import run_subprocess
 
 TIMEOUT = 10 * 60  # seconds
 ELLIPSIZED_LINES = 200
diff --git a/agents/tools/specfile.py b/agents/tools/specfile.py
@@ -15,7 +15,7 @@
 
 from common.constants import BREWHUB_URL
 from common.validators import NonEmptyString
-from utils import get_absolute_path
+from agents.utils import get_absolute_path
 
 
 class GetPackageInfoToolInput(BaseModel):
diff --git a/agents/triage_agent.py b/agents/triage_agent.py
@@ -21,7 +21,8 @@
 from beeai_framework.workflows import Workflow
 from beeai_framework.utils.strings import to_json
 
-import tasks
+import agents.tasks as tasks
+from agents.metrics_middleware import MetricsMiddleware
 from common.config import load_rhel_config
 from common.models import (
     Task,
@@ -35,12 +36,12 @@
 )
 from common.utils import redis_client, fix_await
 from common.constants import JiraLabels, RedisQueues
-from observability import setup_observability
-from tools.commands import RunShellCommandTool
-from tools.patch_validator import PatchValidatorTool
-from tools.version_mapper import VersionMapperTool
-from tools.upstream_search import UpstreamSearchTool
-from utils import get_agent_execution_config, get_chat_model, get_tool_call_checker_config, mcp_tools, run_tool
+from agents.observability import setup_observability
+from agents.tools.commands import RunShellCommandTool
+from agents.tools.patch_validator import PatchValidatorTool
+from agents.tools.version_mapper import VersionMapperTool
+from agents.tools.upstream_search import UpstreamSearchTool
+from agents.utils import get_agent_execution_config, get_chat_model, get_tool_call_checker_config, mcp_tools, run_tool
 
 logger = logging.getLogger(__name__)
 
@@ -291,9 +292,11 @@ class TriageState(BaseModel):
     cve_eligibility_result: CVEEligibilityResult | None = Field(default=None)
     triage_result: OutputSchema | None = Field(default=None)
     target_branch: str | None = Field(default=None)
+    metrics: dict | None = Field(default=None)
 
 
 async def run_workflow(jira_issue, dry_run):
+    current_metrics_middleware = MetricsMiddleware()
     async with mcp_tools(os.getenv("MCP_GATEWAY_URL")) as gateway_tools:
         triage_agent = RequirementAgent(
             name="TriageAgent",
@@ -317,7 +320,7 @@ async def run_workflow(jira_issue, dry_run):
                 ConditionalRequirement(PatchValidatorTool, only_after="get_jira_details"),
                 ConditionalRequirement("set_jira_fields", only_after="get_jira_details"),
             ],
-            middlewares=[GlobalTrajectoryMiddleware(pretty=True)],
+            middlewares=[current_metrics_middleware, GlobalTrajectoryMiddleware(pretty=True)],
             role="Red Hat Enterprise Linux developer",
             instructions=[
                 "Use the `think` tool to reason through complex decisions and document your approach.",
@@ -497,6 +500,7 @@ async def comment_in_jira(state):
         workflow.add_step("comment_in_jira", comment_in_jira)
 
         response = await workflow.run(TriageState(jira_issue=jira_issue))
+        response.state.metrics = current_metrics_middleware.get_metrics()
         return response.state
 
 
diff --git a/compose.yaml b/compose.yaml
@@ -135,6 +135,15 @@ services:
     command: ["python", "agents/triage_agent.py"]
     profiles: ["agents"]
 
+  # Runs the triage agent e2e tests (agents/tests/e2e/test_triage.py) with pytest.
+  triage-agent-e2e-tests:
+    <<: *beeai-agent-c10s
+    environment:
+      <<: *beeai-env
+    # asyncio_default_test_loop_scope=session is passed because of a litellm
+    # issue: https://github.com/BerriAI/litellm/issues/14521
+    command: ["pytest", "agents/tests/e2e/test_triage.py", "-o", "asyncio_default_test_loop_scope=session"]
+    profiles: ["agents"]
+
   backport-agent-c9s:
     <<: *beeai-agent-c9s
     command: ["python", "agents/backport_agent.py"]