Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 5 additions & 7 deletions agents/autogen/mcp_agent/tests/behavioral/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import httpx
import pytest
import yaml
from harness.fixtures import load_golden as _load_golden_from
from harness.runner import TaskConfig, TaskResult, run_task

try:
Expand Down Expand Up @@ -56,15 +57,12 @@ def eval_config() -> dict[str, Any]:
return yaml.safe_load(f)


FIXTURES_DIR = Path(__file__).parent / "fixtures"


def load_golden(category: str | None = None) -> list[dict[str, Any]]:
"""Load golden queries from the fixtures directory, optionally filtering by category."""
path = Path(__file__).parent / "fixtures" / "golden_queries.yaml"
with open(path, encoding="utf-8") as f:
data = yaml.safe_load(f)
queries = data.get("queries", [])
if category:
queries = [q for q in queries if q.get("category") == category]
return queries
return _load_golden_from(FIXTURES_DIR, category)


@pytest.fixture
Expand Down
11 changes: 4 additions & 7 deletions agents/crewai/websearch_agent/tests/behavioral/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import httpx
import pytest
import yaml
from harness.fixtures import load_golden as _load_golden_from
from harness.runner import TaskConfig, TaskResult, run_task

try:
Expand Down Expand Up @@ -58,16 +59,12 @@ def eval_config() -> dict[str, Any]:

SEARCH_EVIDENCE = ["openshift ai"]

FIXTURES_DIR = Path(__file__).parent / "fixtures"


def load_golden(category: str | None = None) -> list[dict[str, Any]]:
"""Load golden queries from the fixtures directory, optionally filtering by category."""
path = Path(__file__).parent / "fixtures" / "golden_queries.yaml"
with open(path, encoding="utf-8") as f:
data = yaml.safe_load(f)
queries = data.get("queries", [])
if category:
queries = [q for q in queries if q.get("category") == category]
return queries
return _load_golden_from(FIXTURES_DIR, category)


@pytest.fixture
Expand Down
12 changes: 5 additions & 7 deletions agents/langgraph/agentic_rag/tests/behavioral/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import httpx
import pytest
import yaml
from harness.fixtures import load_golden as _load_golden_from
from harness.runner import TaskConfig, TaskResult, run_task

try:
Expand Down Expand Up @@ -48,15 +49,12 @@ def _find_repo_root() -> Path:
)


FIXTURES_DIR = Path(__file__).parent / "fixtures"


def load_golden(category: str | None = None) -> list[dict[str, Any]]:
"""Load golden queries from the fixtures directory, optionally filtering by category."""
path = Path(__file__).parent / "fixtures" / "golden_queries.yaml"
with open(path, encoding="utf-8") as f:
data = yaml.safe_load(f)
queries = data.get("queries", [])
if category:
queries = [q for q in queries if q.get("category") == category]
return queries
return _load_golden_from(FIXTURES_DIR, category)


@pytest.fixture
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from typing import Any

import pytest
import yaml
from harness.fixtures import load_golden as _load_golden_from
from harness.scorers.tool_sequence import (
score_hallucinated_tools,
score_tool_call_validity,
Expand All @@ -29,16 +29,12 @@

pytestmark = pytest.mark.langgraph_react

FIXTURES_DIR = Path(__file__).parent / "fixtures"


def _load_golden(category: str | None = None) -> list[dict[str, Any]]:
"""Load golden queries, optionally filtering by category."""
path = Path(__file__).parent / "fixtures" / "golden_queries.yaml"
with open(path, encoding="utf-8") as f:
data = yaml.safe_load(f)
queries = data.get("queries", [])
if category:
queries = [q for q in queries if q.get("category") == category]
return queries
return _load_golden_from(FIXTURES_DIR, category)


def _factual_queries() -> list[dict[str, Any]]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import httpx
import pytest
import yaml
from harness.fixtures import load_golden as _load_golden_from
from harness.runner import TaskConfig, TaskResult, run_task

try:
Expand Down Expand Up @@ -59,16 +60,12 @@ def eval_config() -> dict[str, Any]:
PRICE_EVIDENCE = ["price", "cost", "$", "dollar"]
REVIEW_EVIDENCE = ["review", "rating", "star", "recommend"]

FIXTURES_DIR = Path(__file__).parent / "fixtures"


def load_golden(category: str | None = None) -> list[dict[str, Any]]:
"""Load golden queries from the fixtures directory, optionally filtering by category."""
path = Path(__file__).parent / "fixtures" / "golden_queries.yaml"
with open(path, encoding="utf-8") as f:
data = yaml.safe_load(f)
queries = data.get("queries", [])
if category:
queries = [q for q in queries if q.get("category") == category]
return queries
return _load_golden_from(FIXTURES_DIR, category)


@pytest.fixture
Expand Down
14 changes: 14 additions & 0 deletions docs/adding-behavioral-tests.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,20 @@ The conftest defines fixtures specific to your agent. Because agent tests live u
- `agent_thresholds` — pulls from the shared `eval_config` fixture
- `run_eval` — overrides the root fixture to add MLflow trace enrichment

**`load_golden()` helper:** Import the shared loader from `harness.fixtures` and create a thin wrapper that binds `fixtures_dir` to `Path(__file__).parent / "fixtures"`:

```python
from pathlib import Path
from typing import Any

from harness.fixtures import load_golden as _load_golden_from

FIXTURES_DIR = Path(__file__).parent / "fixtures"

def load_golden(category: str | None = None) -> list[dict[str, Any]]:
    """Load the golden queries under FIXTURES_DIR, optionally filtered by *category*."""
    return _load_golden_from(FIXTURES_DIR, category=category)
```
Comment thread
coderabbitai[bot] marked this conversation as resolved.

See existing agent implementations for working examples:

- `agents/langgraph/react_agent/tests/behavioral/conftest.py`
Expand Down
25 changes: 25 additions & 0 deletions evals/harness/fixtures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""Golden-query loader for behavioral tests."""

from __future__ import annotations

from pathlib import Path
from typing import Any

import yaml
Comment thread
coderabbitai[bot] marked this conversation as resolved.


def load_golden(
    fixtures_dir: Path | str,
    category: str | None = None,
) -> list[dict[str, Any]]:
    """Load golden queries from *fixtures_dir*/golden_queries.yaml.

    Expected YAML shape: ``{"queries": [{"category": str, "query": str, ...}]}``

    Args:
        fixtures_dir: Directory that contains ``golden_queries.yaml``.
        category: When truthy, keep only the queries whose ``"category"``
            value equals it. ``None`` (or an empty string) disables filtering.

    Returns:
        The list of query mappings. An empty file, a missing ``queries`` key,
        or a ``queries`` key with a null value all yield an empty list.

    Raises:
        OSError: If the fixture file cannot be opened (e.g. it does not exist).
    """
    path = Path(fixtures_dir) / "golden_queries.yaml"
    with path.open(encoding="utf-8") as f:
        # An empty document loads as None; treat it as an empty mapping.
        data = yaml.safe_load(f) or {}
    # ``queries:`` with no value loads as None, which the ``.get`` default
    # would not catch; coerce it so callers always receive a list.
    queries = data.get("queries") or []
    if category:
        queries = [q for q in queries if q.get("category") == category]
    return queries
Loading