From 3ed17bb5a414dabbe72c9fe6ee8f3f28a18a15a5 Mon Sep 17 00:00:00 2001 From: adonheis Date: Tue, 19 May 2026 15:17:08 -0400 Subject: [PATCH] refactor: extract shared load_golden() into evals/harness/fixtures.py The load_golden() helper was copy-pasted identically across 5 agent test files. Extract it into a shared module with an explicit fixtures_dir parameter; each consumer keeps a thin wrapper that preserves the existing call signature (an optional `category` argument), so no call sites need changes. Closes RHAIENG-5096 Co-Authored-By: Claude Opus 4.6 (1M context) --- .../mcp_agent/tests/behavioral/conftest.py | 12 ++++----- .../tests/behavioral/conftest.py | 11 +++----- .../agentic_rag/tests/behavioral/conftest.py | 12 ++++----- .../tests/behavioral/test_tool_usage.py | 12 +++------ .../tests/behavioral/conftest.py | 11 +++----- docs/adding-behavioral-tests.md | 14 +++++++++++ evals/harness/fixtures.py | 25 +++++++++++++++++++ 7 files changed, 61 insertions(+), 36 deletions(-) create mode 100644 evals/harness/fixtures.py diff --git a/agents/autogen/mcp_agent/tests/behavioral/conftest.py b/agents/autogen/mcp_agent/tests/behavioral/conftest.py index 3185e6de..6c203d88 100644 --- a/agents/autogen/mcp_agent/tests/behavioral/conftest.py +++ b/agents/autogen/mcp_agent/tests/behavioral/conftest.py @@ -13,6 +13,7 @@ import httpx import pytest import yaml +from harness.fixtures import load_golden as _load_golden_from from harness.runner import TaskConfig, TaskResult, run_task try: @@ -56,15 +57,12 @@ def eval_config() -> dict[str, Any]: return yaml.safe_load(f) +FIXTURES_DIR = Path(__file__).parent / "fixtures" + + def load_golden(category: str | None = None) -> list[dict[str, Any]]: """Load golden queries from the fixtures directory, optionally filtering by category.""" - path = Path(__file__).parent / "fixtures" / "golden_queries.yaml" - with open(path, encoding="utf-8") as f: - data = yaml.safe_load(f) - queries = data.get("queries", []) - if category: - queries = [q for q in queries
if q.get("category") == category] - return queries + return _load_golden_from(FIXTURES_DIR, category) @pytest.fixture diff --git a/agents/crewai/websearch_agent/tests/behavioral/conftest.py b/agents/crewai/websearch_agent/tests/behavioral/conftest.py index 232626df..e7056d9a 100644 --- a/agents/crewai/websearch_agent/tests/behavioral/conftest.py +++ b/agents/crewai/websearch_agent/tests/behavioral/conftest.py @@ -13,6 +13,7 @@ import httpx import pytest import yaml +from harness.fixtures import load_golden as _load_golden_from from harness.runner import TaskConfig, TaskResult, run_task try: @@ -58,16 +59,12 @@ def eval_config() -> dict[str, Any]: SEARCH_EVIDENCE = ["openshift ai"] +FIXTURES_DIR = Path(__file__).parent / "fixtures" + def load_golden(category: str | None = None) -> list[dict[str, Any]]: """Load golden queries from the fixtures directory, optionally filtering by category.""" - path = Path(__file__).parent / "fixtures" / "golden_queries.yaml" - with open(path, encoding="utf-8") as f: - data = yaml.safe_load(f) - queries = data.get("queries", []) - if category: - queries = [q for q in queries if q.get("category") == category] - return queries + return _load_golden_from(FIXTURES_DIR, category) @pytest.fixture diff --git a/agents/langgraph/agentic_rag/tests/behavioral/conftest.py b/agents/langgraph/agentic_rag/tests/behavioral/conftest.py index da39741e..f4ebd27a 100644 --- a/agents/langgraph/agentic_rag/tests/behavioral/conftest.py +++ b/agents/langgraph/agentic_rag/tests/behavioral/conftest.py @@ -13,6 +13,7 @@ import httpx import pytest import yaml +from harness.fixtures import load_golden as _load_golden_from from harness.runner import TaskConfig, TaskResult, run_task try: @@ -48,15 +49,12 @@ def _find_repo_root() -> Path: ) +FIXTURES_DIR = Path(__file__).parent / "fixtures" + + def load_golden(category: str | None = None) -> list[dict[str, Any]]: """Load golden queries from the fixtures directory, optionally filtering by category.""" - path = 
Path(__file__).parent / "fixtures" / "golden_queries.yaml" - with open(path, encoding="utf-8") as f: - data = yaml.safe_load(f) - queries = data.get("queries", []) - if category: - queries = [q for q in queries if q.get("category") == category] - return queries + return _load_golden_from(FIXTURES_DIR, category) @pytest.fixture diff --git a/agents/langgraph/react_agent/tests/behavioral/test_tool_usage.py b/agents/langgraph/react_agent/tests/behavioral/test_tool_usage.py index a8d5025e..2d26d003 100644 --- a/agents/langgraph/react_agent/tests/behavioral/test_tool_usage.py +++ b/agents/langgraph/react_agent/tests/behavioral/test_tool_usage.py @@ -20,7 +20,7 @@ from typing import Any import pytest -import yaml +from harness.fixtures import load_golden as _load_golden_from from harness.scorers.tool_sequence import ( score_hallucinated_tools, score_tool_call_validity, @@ -29,16 +29,12 @@ pytestmark = pytest.mark.langgraph_react +FIXTURES_DIR = Path(__file__).parent / "fixtures" + def _load_golden(category: str | None = None) -> list[dict[str, Any]]: """Load golden queries, optionally filtering by category.""" - path = Path(__file__).parent / "fixtures" / "golden_queries.yaml" - with open(path, encoding="utf-8") as f: - data = yaml.safe_load(f) - queries = data.get("queries", []) - if category: - queries = [q for q in queries if q.get("category") == category] - return queries + return _load_golden_from(FIXTURES_DIR, category) def _factual_queries() -> list[dict[str, Any]]: diff --git a/agents/vanilla_python/openai_responses_agent/tests/behavioral/conftest.py b/agents/vanilla_python/openai_responses_agent/tests/behavioral/conftest.py index 4fda1997..f78976d4 100644 --- a/agents/vanilla_python/openai_responses_agent/tests/behavioral/conftest.py +++ b/agents/vanilla_python/openai_responses_agent/tests/behavioral/conftest.py @@ -13,6 +13,7 @@ import httpx import pytest import yaml +from harness.fixtures import load_golden as _load_golden_from from harness.runner import 
TaskConfig, TaskResult, run_task try: @@ -59,16 +60,12 @@ def eval_config() -> dict[str, Any]: PRICE_EVIDENCE = ["price", "cost", "$", "dollar"] REVIEW_EVIDENCE = ["review", "rating", "star", "recommend"] +FIXTURES_DIR = Path(__file__).parent / "fixtures" + def load_golden(category: str | None = None) -> list[dict[str, Any]]: """Load golden queries from the fixtures directory, optionally filtering by category.""" - path = Path(__file__).parent / "fixtures" / "golden_queries.yaml" - with open(path, encoding="utf-8") as f: - data = yaml.safe_load(f) - queries = data.get("queries", []) - if category: - queries = [q for q in queries if q.get("category") == category] - return queries + return _load_golden_from(FIXTURES_DIR, category) @pytest.fixture diff --git a/docs/adding-behavioral-tests.md b/docs/adding-behavioral-tests.md index 1e06e3bd..947de1b8 100644 --- a/docs/adding-behavioral-tests.md +++ b/docs/adding-behavioral-tests.md @@ -38,6 +38,20 @@ The conftest defines fixtures specific to your agent. 
Because agent tests live u - `agent_thresholds` — pulls from the shared `eval_config` fixture - `run_eval` — overrides the root fixture to add MLflow trace enrichment +**`load_golden()` helper:** Import the shared loader from `harness.fixtures` and create a thin wrapper that binds `fixtures_dir` to `Path(__file__).parent / "fixtures"`: + +```python +from pathlib import Path +from typing import Any + +from harness.fixtures import load_golden as _load_golden_from + +FIXTURES_DIR = Path(__file__).parent / "fixtures" + +def load_golden(category: str | None = None) -> list[dict[str, Any]]: + return _load_golden_from(FIXTURES_DIR, category) +``` + See existing agent implementations for working examples: - `agents/langgraph/react_agent/tests/behavioral/conftest.py` diff --git a/evals/harness/fixtures.py b/evals/harness/fixtures.py new file mode 100644 index 00000000..5d9d7c13 --- /dev/null +++ b/evals/harness/fixtures.py @@ -0,0 +1,25 @@ +"""Golden-query loader for behavioral tests.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import yaml + + +def load_golden( + fixtures_dir: Path | str, + category: str | None = None, +) -> list[dict[str, Any]]: + """Load golden queries from *fixtures_dir*/golden_queries.yaml. + + Expected YAML shape: ``{"queries": [{"category": str, "query": str, ...}]}`` + """ + path = Path(fixtures_dir) / "golden_queries.yaml" + with open(path, encoding="utf-8") as f: + data = yaml.safe_load(f) or {} + queries = data.get("queries", []) + if category: + queries = [q for q in queries if q.get("category") == category] + return queries