Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions skills/evaluate/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,12 @@ fallback instead of retrying the failing call.
seed_content: <original seed YAML, if available>
acceptance_criterion: <specific AC to check, optional>
artifact_type: "code" (or "docs", "config")
working_dir: <absolute project root, recommended>
trigger_consensus: false (true if user requests Stage 3)
```

`working_dir` controls both Stage 1 command execution and Stage 2 source-file visibility. Pass the absolute project root whenever available; if omitted, the MCP handler falls back to the registered brownfield default, seed project metadata, then the MCP server cwd.

4. Present results clearly:
- Show each stage's pass/fail status
- Highlight the final approval decision
Expand Down
138 changes: 114 additions & 24 deletions src/ouroboros/mcp/tools/evaluation_handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

from ouroboros.config import get_llm_backend_for_role, get_llm_model_for_role
from ouroboros.core.errors import ValidationError
from ouroboros.core.project_paths import resolve_path_against_base, resolve_seed_project_path
from ouroboros.core.seed import Seed
from ouroboros.core.types import Result
from ouroboros.mcp.errors import MCPServerError, MCPToolError
Expand Down Expand Up @@ -56,6 +57,80 @@
log = structlog.get_logger(__name__)


async def _default_brownfield_project_dir() -> Path | None:
    """Return the registered default brownfield project directory, if any.

    The lookup is best-effort: a failure while initializing, querying or
    closing the store, or while normalizing the stored path, is logged and
    reported as "no default" instead of propagating to the caller.

    Returns:
        The user-expanded, resolved path of the default repository when it
        is an existing directory; ``None`` otherwise.
    """
    from ouroboros.persistence.brownfield import BrownfieldStore

    store = BrownfieldStore()
    try:
        await store.initialize()
        default_repo = await store.get_default()
    except Exception as exc:  # noqa: BLE001 - fallback discovery must be best-effort
        log.warning("mcp.tool.evaluate.brownfield_default_lookup_failed", error=str(exc))
        return None
    finally:
        # An exception raised inside ``finally`` would replace the result of
        # the lookup above, so cleanup failures are logged, not raised.
        try:
            await store.close()
        except Exception as exc:  # noqa: BLE001 - cleanup is best-effort too
            log.warning("mcp.tool.evaluate.brownfield_store_close_failed", error=str(exc))

    if default_repo is None or not default_repo.path:
        return None

    try:
        resolved = Path(default_repo.path).expanduser().resolve()
    except (OSError, RuntimeError, ValueError) as exc:
        # expanduser()/resolve() can fail on an unresolvable home directory,
        # a symlink loop or a malformed path; treat that as "unusable".
        log.warning(
            "mcp.tool.evaluate.brownfield_default_unusable",
            path=str(default_repo.path),
            error=str(exc),
        )
        return None

    if not resolved.is_dir():
        log.warning(
            "mcp.tool.evaluate.brownfield_default_unusable",
            path=str(resolved),
        )
        return None
    return resolved


def _seed_project_dir(seed: Seed | None, *, stable_base: Path) -> Path | None:
    """Map the seed's declared project path onto a directory, if possible.

    The path comes from ``resolve_seed_project_path``. When it points at a
    regular file, the file's parent directory is returned. When it exists
    but is neither a file nor a directory, ``None`` is returned. In every
    other case (a directory, or a path that does not exist) the path is
    returned unchanged.

    Args:
        seed: Parsed seed, or ``None`` when no seed is available.
        stable_base: Base directory forwarded to ``resolve_seed_project_path``.

    Returns:
        The directory to use, or ``None`` when the seed yields no usable path.
    """
    candidate = resolve_seed_project_path(seed, stable_base=stable_base).path
    if candidate is None:
        return None

    if candidate.is_file():
        return candidate.parent

    # Something exists at the path, but it is not a directory.
    unusable = candidate.exists() and not candidate.is_dir()
    return None if unusable else candidate


async def _resolve_evaluate_working_dir(
    explicit_working_dir: str | None,
    seed: Seed | None,
) -> Path:
    """Resolve the project root that gates Stage 1 and Stage 2 evaluation.

    Precedence is explicit tool argument, registered brownfield default,
    seed-declared project directory, then the MCP server cwd. The last
    fallback preserves the historical behavior, but only after project-aware
    sources have been exhausted.

    Args:
        explicit_working_dir: ``working_dir`` tool argument; ``None`` or an
            empty string means "not provided".
        seed: Parsed seed, or ``None`` when no seed could be parsed.

    Returns:
        The directory chosen by the precedence above. A path is always
        returned because the resolved cwd is the final fallback.
    """
    stable_base = Path.cwd().resolve()
    if explicit_working_dir:
        resolved = resolve_path_against_base(explicit_working_dir, stable_base=stable_base)
        if resolved is not None:
            return resolved
        # The caller asked for a specific directory and is not getting it;
        # record that instead of falling back silently.
        log.warning(
            "mcp.tool.evaluate.explicit_working_dir_unusable",
            working_dir=explicit_working_dir,
        )

    brownfield_default = await _default_brownfield_project_dir()
    if brownfield_default is not None:
        if brownfield_default.is_dir():
            return brownfield_default.resolve()
        # Re-validated here so a stale or non-directory default still falls
        # through to the seed and cwd fallbacks.
        log.warning(
            "mcp.tool.evaluate.brownfield_default_unusable",
            path=str(brownfield_default),
        )

    seed_dir = _seed_project_dir(seed, stable_base=stable_base)
    if seed_dir is not None:
        return seed_dir

    return stable_base


def _evaluation_allowed_tools(runtime_backend: str | None) -> list[str]:
"""Return the policy-derived read-only tool envelope for evaluation."""
return allowed_runtime_builtin_tool_names(
Expand Down Expand Up @@ -340,7 +415,8 @@ def definition(self) -> MCPToolDefinition:
type=ToolInputType.STRING,
description=(
"Project root used to resolve Stage 1 mechanical verification "
"commands. Commands are read from .ouroboros/mechanical.toml; "
"commands and Stage 2 source-file visibility. Commands are "
"read from .ouroboros/mechanical.toml; "
"when the file is missing, the evaluator makes one AI detect "
"call that inspects manifests (package.json, pyproject.toml, "
"Cargo.toml, Makefile, ...) and authors the toml. Stage 1 "
Expand All @@ -363,8 +439,6 @@ async def handle(
Returns:
Result containing evaluation results or error.
"""
from pathlib import Path

from ouroboros.evaluation import (
EvaluationContext,
EvaluationPipeline,
Expand Down Expand Up @@ -424,14 +498,34 @@ async def handle(
trigger_consensus=trigger_consensus,
)

# Parse seed before dispatch so working_dir fallback is available for
# both plugin/subagent and in-process evaluation paths.
goal = ""
constraints: tuple[str, ...] = ()
seed_id = session_id # fallback
seed: Seed | None = None

if seed_content:
try:
seed_dict = yaml.safe_load(seed_content)
seed = Seed.from_dict(seed_dict)
goal = seed.goal
constraints = tuple(seed.constraints)
seed_id = seed.metadata.seed_id
except (yaml.YAMLError, ValidationError, PydanticValidationError) as e:
log.warning("mcp.tool.evaluate.seed_parse_warning", error=str(e))
# Continue without seed data - not fatal

working_dir = await _resolve_evaluate_working_dir(arguments.get("working_dir"), seed)

# --- Subagent dispatch: gate on runtime + opencode_mode ---
payload = build_evaluate_subagent(
session_id=session_id,
artifact=artifact,
artifact_type=artifact_type,
seed_content=seed_content,
acceptance_criterion=acceptance_criterion,
working_dir=arguments.get("working_dir"),
working_dir=str(working_dir),
trigger_consensus=trigger_consensus,
)
if should_dispatch_via_plugin(self.agent_runtime_backend, self.opencode_mode):
Expand All @@ -456,22 +550,6 @@ async def handle(
owns_event_store = False

try:
# Extract goal/constraints from seed if provided
goal = ""
constraints: tuple[str, ...] = ()
seed_id = session_id # fallback

if seed_content:
try:
seed_dict = yaml.safe_load(seed_content)
seed = Seed.from_dict(seed_dict)
goal = seed.goal
constraints = tuple(seed.constraints)
seed_id = seed.metadata.seed_id
except (yaml.YAMLError, ValidationError, PydanticValidationError) as e:
log.warning("mcp.tool.evaluate.seed_parse_warning", error=str(e))
# Continue without seed data - not fatal

# Try to enrich from session repository if event_store available
if not goal:
if store is None:
Expand Down Expand Up @@ -508,8 +586,6 @@ async def handle(
allowed_tools=_evaluation_allowed_tools(backend),
max_turns=20,
)
working_dir_str = arguments.get("working_dir")
working_dir = Path(working_dir_str).resolve() if working_dir_str else Path.cwd()
log.info(
"mcp.tool.evaluate.started",
session_id=session_id,
Expand Down Expand Up @@ -1786,13 +1862,27 @@ async def handle(
else:
ac_for_payload = None

seed: Seed | None = None
seed_content = arguments.get("seed_content")
if seed_content:
try:
seed_dict = yaml.safe_load(seed_content)
seed = Seed.from_dict(seed_dict)
except (yaml.YAMLError, ValidationError, PydanticValidationError) as e:
log.warning("mcp.tool.start_evaluate.seed_parse_warning", error=str(e))

working_dir = await _resolve_evaluate_working_dir(
arguments.get("working_dir"),
seed,
)

payload = build_evaluate_subagent(
session_id=session_id,
artifact=artifact,
artifact_type=arguments.get("artifact_type", "code"),
seed_content=arguments.get("seed_content"),
seed_content=seed_content,
acceptance_criterion=ac_for_payload,
working_dir=arguments.get("working_dir"),
working_dir=str(working_dir),
trigger_consensus=arguments.get("trigger_consensus", False),
)
return await dispatch_plugin_terminal(
Expand Down
153 changes: 152 additions & 1 deletion tests/unit/mcp/tools/test_evaluate_multi_ac.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

from __future__ import annotations

from pathlib import Path
from types import SimpleNamespace
from unittest.mock import AsyncMock, MagicMock, patch

import pytest
Expand All @@ -20,7 +22,7 @@
MechanicalResult,
SemanticResult,
)
from ouroboros.mcp.tools.evaluation_handlers import EvaluateHandler
from ouroboros.mcp.tools.evaluation_handlers import EvaluateHandler, _resolve_evaluate_working_dir
from ouroboros.mcp.types import ToolInputType


Expand Down Expand Up @@ -72,6 +74,155 @@ def _failing_eval(execution_id: str, *, reason: str) -> EvaluationResult:
)


class TestEvaluateWorkingDirResolution:
    """Working dir fallback keeps Stage 2 pointed at the project root."""

    # Dotted path of the brownfield lookup that every test replaces.
    _LOOKUP = "ouroboros.mcp.tools.evaluation_handlers._default_brownfield_project_dir"

    @classmethod
    def _stub_default(cls, value: Path | None):
        """Patch the brownfield default lookup so awaiting it yields ``value``."""
        return patch(cls._LOOKUP, new=AsyncMock(return_value=value))

    @staticmethod
    def _seed_for(project_dir: str) -> SimpleNamespace:
        """Build a minimal seed stand-in that declares only ``project_dir``."""
        metadata = SimpleNamespace(project_dir=project_dir, working_directory=None)
        return SimpleNamespace(metadata=metadata, brownfield_context=None)

    async def test_explicit_working_dir_wins(self, tmp_path: Path) -> None:
        """An explicit argument beats a registered brownfield default."""
        explicit = tmp_path / "project"
        explicit.mkdir()

        with self._stub_default(tmp_path / "default"):
            resolved = await _resolve_evaluate_working_dir(str(explicit), None)

        assert resolved == explicit.resolve()

    async def test_brownfield_default_used_before_cwd(self, tmp_path: Path, monkeypatch) -> None:
        """Without an explicit argument the brownfield default beats cwd."""
        cwd = tmp_path / "hermes"
        default = tmp_path / "repo"
        cwd.mkdir()
        default.mkdir()
        monkeypatch.chdir(cwd)

        with self._stub_default(default):
            resolved = await _resolve_evaluate_working_dir(None, None)

        assert resolved == default.resolve()

    async def test_seed_metadata_used_when_no_default(self, tmp_path: Path, monkeypatch) -> None:
        """The seed's project_dir is used when no default is registered."""
        cwd = tmp_path / "hermes"
        project = cwd / "project"
        project.mkdir(parents=True)
        monkeypatch.chdir(cwd)
        seed = self._seed_for("project")

        with self._stub_default(None):
            resolved = await _resolve_evaluate_working_dir(None, seed)

        assert resolved == project.resolve()

    async def test_brownfield_default_wins_over_seed_metadata(
        self, tmp_path: Path, monkeypatch
    ) -> None:
        """A usable brownfield default takes precedence over seed metadata."""
        cwd = tmp_path / "hermes"
        default = tmp_path / "repo-default"
        seed_project = cwd / "seed-project"
        cwd.mkdir()
        default.mkdir()
        seed_project.mkdir()
        monkeypatch.chdir(cwd)
        seed = self._seed_for("seed-project")

        with self._stub_default(default):
            resolved = await _resolve_evaluate_working_dir(None, seed)

        assert resolved == default.resolve()

    async def test_stale_brownfield_default_falls_back_to_seed(
        self, tmp_path: Path, monkeypatch
    ) -> None:
        """A default pointing at a missing path is skipped for the seed."""
        cwd = tmp_path / "hermes"
        stale_default = tmp_path / "missing-default"
        seed_project = cwd / "seed-project"
        cwd.mkdir()
        seed_project.mkdir()
        monkeypatch.chdir(cwd)
        seed = self._seed_for("seed-project")

        with self._stub_default(stale_default):
            resolved = await _resolve_evaluate_working_dir(None, seed)

        assert resolved == seed_project.resolve()

    async def test_non_directory_brownfield_default_falls_back_to_seed(
        self, tmp_path: Path, monkeypatch
    ) -> None:
        """A default pointing at a regular file is skipped for the seed."""
        cwd = tmp_path / "hermes"
        file_default = tmp_path / "default-file"
        seed_project = cwd / "seed-project"
        cwd.mkdir()
        file_default.write_text("not a directory")
        seed_project.mkdir()
        monkeypatch.chdir(cwd)
        seed = self._seed_for("seed-project")

        with self._stub_default(file_default):
            resolved = await _resolve_evaluate_working_dir(None, seed)

        assert resolved == seed_project.resolve()

    async def test_seed_metadata_escape_falls_back_to_cwd(
        self, tmp_path: Path, monkeypatch
    ) -> None:
        """A seed project_dir outside the cwd is rejected in favor of cwd."""
        cwd = tmp_path / "hermes"
        outside = tmp_path / "outside"
        cwd.mkdir()
        outside.mkdir()
        monkeypatch.chdir(cwd)
        seed = self._seed_for(str(outside))

        with self._stub_default(None):
            resolved = await _resolve_evaluate_working_dir(None, seed)

        assert resolved == cwd.resolve()

    async def test_cwd_fallback_last(self, tmp_path: Path, monkeypatch) -> None:
        """With no argument, default or seed, the cwd is returned."""
        monkeypatch.chdir(tmp_path)

        with self._stub_default(None):
            resolved = await _resolve_evaluate_working_dir(None, None)

        assert resolved == tmp_path.resolve()


class TestDefinitionAcceptsMultiAC:
"""The tool schema must advertise the new acceptance_criteria parameter."""

Expand Down
Loading