refactor: extract shared load_golden() into evals/harness/fixtures.py

andrewdonheiser · claude · andrewdonheiser · commit 3ed17bb5a414 · 2026-05-20T09:50:34.000-04:00
The load_golden() helper was copy-pasted identically across 5 agent
test files. Extract it into a shared module with an explicit
fixtures_dir parameter; each consumer keeps a thin 2-line wrapper
that binds its own fixtures directory and preserves the existing
call signature (optional category), so no call sites need changes.

Closes RHAIENG-5096

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/agents/autogen/mcp_agent/tests/behavioral/conftest.py b/agents/autogen/mcp_agent/tests/behavioral/conftest.py
@@ -13,6 +13,7 @@
 import httpx
 import pytest
 import yaml
+from harness.fixtures import load_golden as _load_golden_from
 from harness.runner import TaskConfig, TaskResult, run_task
 
 try:
@@ -56,15 +57,12 @@ def eval_config() -> dict[str, Any]:
         return yaml.safe_load(f)
 
 
+FIXTURES_DIR = Path(__file__).parent / "fixtures"
+
+
 def load_golden(category: str | None = None) -> list[dict[str, Any]]:
     """Load golden queries from the fixtures directory, optionally filtering by category."""
-    path = Path(__file__).parent / "fixtures" / "golden_queries.yaml"
-    with open(path, encoding="utf-8") as f:
-        data = yaml.safe_load(f)
-    queries = data.get("queries", [])
-    if category:
-        queries = [q for q in queries if q.get("category") == category]
-    return queries
+    return _load_golden_from(FIXTURES_DIR, category)
 
 
 @pytest.fixture
diff --git a/agents/crewai/websearch_agent/tests/behavioral/conftest.py b/agents/crewai/websearch_agent/tests/behavioral/conftest.py
@@ -13,6 +13,7 @@
 import httpx
 import pytest
 import yaml
+from harness.fixtures import load_golden as _load_golden_from
 from harness.runner import TaskConfig, TaskResult, run_task
 
 try:
@@ -58,16 +59,12 @@ def eval_config() -> dict[str, Any]:
 
 SEARCH_EVIDENCE = ["openshift ai"]
 
+FIXTURES_DIR = Path(__file__).parent / "fixtures"
+
 
 def load_golden(category: str | None = None) -> list[dict[str, Any]]:
     """Load golden queries from the fixtures directory, optionally filtering by category."""
-    path = Path(__file__).parent / "fixtures" / "golden_queries.yaml"
-    with open(path, encoding="utf-8") as f:
-        data = yaml.safe_load(f)
-    queries = data.get("queries", [])
-    if category:
-        queries = [q for q in queries if q.get("category") == category]
-    return queries
+    return _load_golden_from(FIXTURES_DIR, category)
 
 
 @pytest.fixture
diff --git a/agents/langgraph/agentic_rag/tests/behavioral/conftest.py b/agents/langgraph/agentic_rag/tests/behavioral/conftest.py
@@ -13,6 +13,7 @@
 import httpx
 import pytest
 import yaml
+from harness.fixtures import load_golden as _load_golden_from
 from harness.runner import TaskConfig, TaskResult, run_task
 
 try:
@@ -48,15 +49,12 @@ def _find_repo_root() -> Path:
     )
 
 
+FIXTURES_DIR = Path(__file__).parent / "fixtures"
+
+
 def load_golden(category: str | None = None) -> list[dict[str, Any]]:
     """Load golden queries from the fixtures directory, optionally filtering by category."""
-    path = Path(__file__).parent / "fixtures" / "golden_queries.yaml"
-    with open(path, encoding="utf-8") as f:
-        data = yaml.safe_load(f)
-    queries = data.get("queries", [])
-    if category:
-        queries = [q for q in queries if q.get("category") == category]
-    return queries
+    return _load_golden_from(FIXTURES_DIR, category)
 
 
 @pytest.fixture
diff --git a/agents/langgraph/react_agent/tests/behavioral/test_tool_usage.py b/agents/langgraph/react_agent/tests/behavioral/test_tool_usage.py
@@ -20,7 +20,7 @@
 from typing import Any
 
 import pytest
-import yaml
+from harness.fixtures import load_golden as _load_golden_from
 from harness.scorers.tool_sequence import (
     score_hallucinated_tools,
     score_tool_call_validity,
@@ -29,16 +29,12 @@
 
 pytestmark = pytest.mark.langgraph_react
 
+FIXTURES_DIR = Path(__file__).parent / "fixtures"
+
 
 def _load_golden(category: str | None = None) -> list[dict[str, Any]]:
     """Load golden queries, optionally filtering by category."""
-    path = Path(__file__).parent / "fixtures" / "golden_queries.yaml"
-    with open(path, encoding="utf-8") as f:
-        data = yaml.safe_load(f)
-    queries = data.get("queries", [])
-    if category:
-        queries = [q for q in queries if q.get("category") == category]
-    return queries
+    return _load_golden_from(FIXTURES_DIR, category)
 
 
 def _factual_queries() -> list[dict[str, Any]]:
diff --git a/agents/vanilla_python/openai_responses_agent/tests/behavioral/conftest.py b/agents/vanilla_python/openai_responses_agent/tests/behavioral/conftest.py
@@ -13,6 +13,7 @@
 import httpx
 import pytest
 import yaml
+from harness.fixtures import load_golden as _load_golden_from
 from harness.runner import TaskConfig, TaskResult, run_task
 
 try:
@@ -59,16 +60,12 @@ def eval_config() -> dict[str, Any]:
 PRICE_EVIDENCE = ["price", "cost", "$", "dollar"]
 REVIEW_EVIDENCE = ["review", "rating", "star", "recommend"]
 
+FIXTURES_DIR = Path(__file__).parent / "fixtures"
+
 
 def load_golden(category: str | None = None) -> list[dict[str, Any]]:
     """Load golden queries from the fixtures directory, optionally filtering by category."""
-    path = Path(__file__).parent / "fixtures" / "golden_queries.yaml"
-    with open(path, encoding="utf-8") as f:
-        data = yaml.safe_load(f)
-    queries = data.get("queries", [])
-    if category:
-        queries = [q for q in queries if q.get("category") == category]
-    return queries
+    return _load_golden_from(FIXTURES_DIR, category)
 
 
 @pytest.fixture
diff --git a/docs/adding-behavioral-tests.md b/docs/adding-behavioral-tests.md
@@ -38,6 +38,20 @@ The conftest defines fixtures specific to your agent. Because agent tests live u
 - `agent_thresholds` — pulls from the shared `eval_config` fixture
 - `run_eval` — overrides the root fixture to add MLflow trace enrichment
 
+**`load_golden()` helper:** Import the shared loader from `harness.fixtures` and create a thin wrapper that binds `fixtures_dir` to `Path(__file__).parent / "fixtures"`:
+
+```python
+from pathlib import Path
+from typing import Any
+
+from harness.fixtures import load_golden as _load_golden_from
+
+FIXTURES_DIR = Path(__file__).parent / "fixtures"
+
+def load_golden(category: str | None = None) -> list[dict[str, Any]]:
+    return _load_golden_from(FIXTURES_DIR, category)
+```
+
 See existing agent implementations for working examples:
 
 - `agents/langgraph/react_agent/tests/behavioral/conftest.py`
diff --git a/evals/harness/fixtures.py b/evals/harness/fixtures.py
@@ -0,0 +1,25 @@
+"""Golden-query loader for behavioral tests."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+
+def load_golden(
+    fixtures_dir: Path | str,
+    category: str | None = None,
+) -> list[dict[str, Any]]:
+    """Load golden queries from *fixtures_dir*/golden_queries.yaml.
+
+    Expected YAML shape: ``{"queries": [{"category": str, "query": str, ...}]}``
+    """
+    path = Path(fixtures_dir) / "golden_queries.yaml"
+    with open(path, encoding="utf-8") as f:
+        data = yaml.safe_load(f) or {}
+    queries = data.get("queries", [])
+    if category:
+        queries = [q for q in queries if q.get("category") == category]
+    return queries