diff --git a/agents/autogen/mcp_agent/tests/behavioral/conftest.py b/agents/autogen/mcp_agent/tests/behavioral/conftest.py index 3185e6d..6c203d8 100644 --- a/agents/autogen/mcp_agent/tests/behavioral/conftest.py +++ b/agents/autogen/mcp_agent/tests/behavioral/conftest.py @@ -13,6 +13,7 @@ import httpx import pytest import yaml +from harness.fixtures import load_golden as _load_golden_from from harness.runner import TaskConfig, TaskResult, run_task try: @@ -56,15 +57,12 @@ def eval_config() -> dict[str, Any]: return yaml.safe_load(f) +FIXTURES_DIR = Path(__file__).parent / "fixtures" + + def load_golden(category: str | None = None) -> list[dict[str, Any]]: """Load golden queries from the fixtures directory, optionally filtering by category.""" - path = Path(__file__).parent / "fixtures" / "golden_queries.yaml" - with open(path, encoding="utf-8") as f: - data = yaml.safe_load(f) - queries = data.get("queries", []) - if category: - queries = [q for q in queries if q.get("category") == category] - return queries + return _load_golden_from(FIXTURES_DIR, category) @pytest.fixture diff --git a/agents/crewai/websearch_agent/tests/behavioral/conftest.py b/agents/crewai/websearch_agent/tests/behavioral/conftest.py index 232626d..e7056d9 100644 --- a/agents/crewai/websearch_agent/tests/behavioral/conftest.py +++ b/agents/crewai/websearch_agent/tests/behavioral/conftest.py @@ -13,6 +13,7 @@ import httpx import pytest import yaml +from harness.fixtures import load_golden as _load_golden_from from harness.runner import TaskConfig, TaskResult, run_task try: @@ -58,16 +59,12 @@ def eval_config() -> dict[str, Any]: SEARCH_EVIDENCE = ["openshift ai"] +FIXTURES_DIR = Path(__file__).parent / "fixtures" + def load_golden(category: str | None = None) -> list[dict[str, Any]]: """Load golden queries from the fixtures directory, optionally filtering by category.""" - path = Path(__file__).parent / "fixtures" / "golden_queries.yaml" - with open(path, encoding="utf-8") as f: - data = 
yaml.safe_load(f) - queries = data.get("queries", []) - if category: - queries = [q for q in queries if q.get("category") == category] - return queries + return _load_golden_from(FIXTURES_DIR, category) @pytest.fixture diff --git a/agents/langgraph/agentic_rag/tests/behavioral/conftest.py b/agents/langgraph/agentic_rag/tests/behavioral/conftest.py index da39741..f4ebd27 100644 --- a/agents/langgraph/agentic_rag/tests/behavioral/conftest.py +++ b/agents/langgraph/agentic_rag/tests/behavioral/conftest.py @@ -13,6 +13,7 @@ import httpx import pytest import yaml +from harness.fixtures import load_golden as _load_golden_from from harness.runner import TaskConfig, TaskResult, run_task try: @@ -48,15 +49,12 @@ def _find_repo_root() -> Path: ) +FIXTURES_DIR = Path(__file__).parent / "fixtures" + + def load_golden(category: str | None = None) -> list[dict[str, Any]]: """Load golden queries from the fixtures directory, optionally filtering by category.""" - path = Path(__file__).parent / "fixtures" / "golden_queries.yaml" - with open(path, encoding="utf-8") as f: - data = yaml.safe_load(f) - queries = data.get("queries", []) - if category: - queries = [q for q in queries if q.get("category") == category] - return queries + return _load_golden_from(FIXTURES_DIR, category) @pytest.fixture diff --git a/agents/langgraph/react_agent/tests/behavioral/test_tool_usage.py b/agents/langgraph/react_agent/tests/behavioral/test_tool_usage.py index a8d5025..2d26d00 100644 --- a/agents/langgraph/react_agent/tests/behavioral/test_tool_usage.py +++ b/agents/langgraph/react_agent/tests/behavioral/test_tool_usage.py @@ -20,7 +20,7 @@ from typing import Any import pytest -import yaml +from harness.fixtures import load_golden as _load_golden_from from harness.scorers.tool_sequence import ( score_hallucinated_tools, score_tool_call_validity, @@ -29,16 +29,12 @@ pytestmark = pytest.mark.langgraph_react +FIXTURES_DIR = Path(__file__).parent / "fixtures" + def _load_golden(category: str | None = 
None) -> list[dict[str, Any]]: """Load golden queries, optionally filtering by category.""" - path = Path(__file__).parent / "fixtures" / "golden_queries.yaml" - with open(path, encoding="utf-8") as f: - data = yaml.safe_load(f) - queries = data.get("queries", []) - if category: - queries = [q for q in queries if q.get("category") == category] - return queries + return _load_golden_from(FIXTURES_DIR, category) def _factual_queries() -> list[dict[str, Any]]: diff --git a/agents/vanilla_python/openai_responses_agent/tests/behavioral/conftest.py b/agents/vanilla_python/openai_responses_agent/tests/behavioral/conftest.py index 4fda199..f78976d 100644 --- a/agents/vanilla_python/openai_responses_agent/tests/behavioral/conftest.py +++ b/agents/vanilla_python/openai_responses_agent/tests/behavioral/conftest.py @@ -13,6 +13,7 @@ import httpx import pytest import yaml +from harness.fixtures import load_golden as _load_golden_from from harness.runner import TaskConfig, TaskResult, run_task try: @@ -59,16 +60,12 @@ def eval_config() -> dict[str, Any]: PRICE_EVIDENCE = ["price", "cost", "$", "dollar"] REVIEW_EVIDENCE = ["review", "rating", "star", "recommend"] +FIXTURES_DIR = Path(__file__).parent / "fixtures" + def load_golden(category: str | None = None) -> list[dict[str, Any]]: """Load golden queries from the fixtures directory, optionally filtering by category.""" - path = Path(__file__).parent / "fixtures" / "golden_queries.yaml" - with open(path, encoding="utf-8") as f: - data = yaml.safe_load(f) - queries = data.get("queries", []) - if category: - queries = [q for q in queries if q.get("category") == category] - return queries + return _load_golden_from(FIXTURES_DIR, category) @pytest.fixture diff --git a/docs/adding-behavioral-tests.md b/docs/adding-behavioral-tests.md index 1e06e3b..947de1b 100644 --- a/docs/adding-behavioral-tests.md +++ b/docs/adding-behavioral-tests.md @@ -38,6 +38,20 @@ The conftest defines fixtures specific to your agent. 
Because agent tests live u
 - `agent_thresholds` — pulls from the shared `eval_config` fixture
 - `run_eval` — overrides the root fixture to add MLflow trace enrichment
 
+**`load_golden()` helper:** Import the shared loader from `harness.fixtures` and create a thin wrapper that binds `fixtures_dir` to `Path(__file__).parent / "fixtures"`:
+
+```python
+from pathlib import Path
+from typing import Any
+
+from harness.fixtures import load_golden as _load_golden_from
+
+FIXTURES_DIR = Path(__file__).parent / "fixtures"
+
+def load_golden(category: str | None = None) -> list[dict[str, Any]]:
+    return _load_golden_from(FIXTURES_DIR, category)
+```
+
 See existing agent implementations for working examples:
 
 - `agents/langgraph/react_agent/tests/behavioral/conftest.py`
diff --git a/evals/harness/fixtures.py b/evals/harness/fixtures.py
new file mode 100644
index 0000000..5d9d7c1
--- /dev/null
+++ b/evals/harness/fixtures.py
@@ -0,0 +1,35 @@
+"""Golden-query loader for behavioral tests."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+
+def load_golden(
+    fixtures_dir: Path | str,
+    category: str | None = None,
+) -> list[dict[str, Any]]:
+    """Load golden queries from *fixtures_dir*/golden_queries.yaml.
+
+    Expected YAML shape: ``{"queries": [{"category": str, "query": str, ...}]}``
+
+    Args:
+        fixtures_dir: Directory that contains ``golden_queries.yaml``.
+        category: When truthy, keep only queries whose ``category`` equals it.
+
+    Returns:
+        The list of query mappings; empty when the file is empty or has no
+        (or a null) ``queries`` entry.
+    """
+    path = Path(fixtures_dir) / "golden_queries.yaml"
+    with open(path, encoding="utf-8") as f:
+        data = yaml.safe_load(f) or {}
+    # ``queries:`` with no value parses to None, so ``.get(key, [])`` is not
+    # enough: fall back to an empty list for a present-but-null key as well.
+    queries = data.get("queries") or []
+    if category:
+        queries = [q for q in queries if q.get("category") == category]
+    return queries