olaservo
diff --git a/scripts/skills_e2e_agent/agent.py b/scripts/skills_e2e_agent/agent.py
Lines changed: 289 additions & 0 deletions
diff --git a/scripts/skills_e2e_agent/fastagent.config.yaml b/scripts/skills_e2e_agent/fastagent.config.yaml
Lines changed: 23 additions & 0 deletions
diff --git a/scripts/skills_e2e_smart_agent/agent.py b/scripts/skills_e2e_smart_agent/agent.py
Lines changed: 59 additions & 0 deletions
diff --git a/scripts/skills_e2e_smart_agent/fastagent.config.yaml b/scripts/skills_e2e_smart_agent/fastagent.config.yaml
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,289 @@
+"""Scenario #1 end-to-end run: review a real PR via the pull-requests skill.
+
+Exercises the **write path** of the Skills-over-MCP SEP: the model must
+activate the `pull-requests` skill bundled with olaservo/github-mcp-server
+(branch `add-agent-skills`), then follow the three-step pending-review
+workflow (create → add_comment_to_pending_review → submit_pending) to post
+a real GitHub review.
+
+Pass criteria (all five must hold):
+1. `read_skill` called with `skill://pull-requests/SKILL.md` before any
+   mutating PR tool.
+2. `pull_request_review_write` called with `method="create"` and no `event`.
+3. At least one `add_comment_to_pending_review` call after step 2.
+4. `pull_request_review_write` called with `method="submit_pending"` and
+   `event` in {APPROVE, REQUEST_CHANGES, COMMENT}.
+5. No single-shot bypass (no `pull_request_review_write` call that sets
+   both `method="create"` and `event`).
+
+Pre-reqs:
+- ANTHROPIC_API_KEY in env.
+- GITHUB_TOKEN in env (or resolvable via `gh auth token`).
+- Server running **without** `--read-only` (Scenario #1 requires the
+  write path):
+      GITHUB_PERSONAL_ACCESS_TOKEN=$(gh auth token) \\
+      DISABLE_INSTRUCTIONS=true \\
+        ./github-mcp-server http --port 8082 --toolsets=pull_requests
+  (or `--toolsets=all` for the broader-tool-list variant).
+- A reviewable PR on the subject repo. Produce one with
+  `experiments/code-review-subject/scripts/create-pr-input-validation.sh`.
+
+Usage:
+    # One-step (PR_NUMBER auto-detected from the canonical head branch):
+    GITHUB_TOKEN=$(gh auth token) uv run scripts/skills_e2e_agent/agent.py
+
+    # Or target a specific PR:
+    GITHUB_TOKEN=$(gh auth token) REPO=olaservo/code-review-subject \\
+        PR_NUMBER=7 uv run scripts/skills_e2e_agent/agent.py
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+import subprocess
+import sys
+import time
+
+from fast_agent import FastAgent
+
+# Repository and head branch used when REPO / PR_NUMBER are not supplied
+# through the environment.
+DEFAULT_REPO = "olaservo/code-review-subject"
+DEFAULT_HEAD_BRANCH = "feature/input-validation-enhancement"
+# `path` argument a `read_skill` call must carry to count as skill activation.
+EXPECTED_SKILL_URI = "skill://pull-requests/SKILL.md"
+# `event` values accepted on a `submit_pending` call.
+VALID_VERDICTS = {"APPROVE", "REQUEST_CHANGES", "COMMENT"}
+# Tools treated as PR mutations; the skill has to be read before the first
+# call to any of them.
+MUTATING_PR_TOOLS = {
+    "pull_request_review_write",
+    "add_comment_to_pending_review",
+}
+
+
+# Application object; its `agent` decorator and `run()` context manager are
+# used further down in this module.
+fast = FastAgent("skills-over-mcp Scenario #1: review a real PR")
+
+
+def _resolve_pr_number(repo: str) -> int:
+    """Return the number of the PR to review, exiting the process on failure.
+
+    A non-empty ``PR_NUMBER`` environment variable takes precedence.
+    Otherwise the ``gh`` CLI is asked for the first open PR on *repo* whose
+    head branch is ``DEFAULT_HEAD_BRANCH``.
+
+    Args:
+        repo: Repository identifier passed to ``gh pr list --repo``.
+
+    Returns:
+        The PR number as an integer.
+
+    Raises:
+        SystemExit: If ``PR_NUMBER`` is not an integer, the ``gh`` lookup
+            fails, it prints nothing, or it prints something that is not an
+            integer.
+    """
+    env_val = os.environ.get("PR_NUMBER")
+    if env_val:
+        try:
+            return int(env_val)
+        except ValueError:
+            sys.exit(f"PR_NUMBER={env_val!r} is not an integer")
+
+    try:
+        out = subprocess.check_output(
+            [
+                "gh", "pr", "list",
+                "--repo", repo,
+                "--head", DEFAULT_HEAD_BRANCH,
+                "--state", "open",
+                "--json", "number",
+                "--jq", ".[0].number",
+            ],
+            encoding="utf-8",
+        ).strip()
+    except (OSError, subprocess.SubprocessError, UnicodeError) as exc:
+        # OSError: `gh` missing or not executable; SubprocessError: non-zero
+        # exit; UnicodeError: output that is not valid UTF-8.
+        sys.exit(
+            f"PR_NUMBER not set and auto-detect failed: {exc}. "
+            f"Run experiments/code-review-subject/scripts/create-pr-input-validation.sh "
+            f"first, or pass PR_NUMBER explicitly."
+        )
+    if not out:
+        sys.exit(
+            f"No open PR on {repo} head={DEFAULT_HEAD_BRANCH}. "
+            f"Run the scaffolding script first."
+        )
+    try:
+        return int(out)
+    except ValueError:
+        # Anything other than a bare number (e.g. "null" or a warning line)
+        # should end the run with a readable message, not a traceback.
+        sys.exit(
+            f"Unexpected output from gh while looking up the PR on {repo} "
+            f"head={DEFAULT_HEAD_BRANCH}: {out!r}"
+        )
+
+
+REPO = os.environ.get("REPO", DEFAULT_REPO)
+# Check the "owner/name" shape before resolving the PR number, so a malformed
+# REPO produces a clear message up front rather than a bare unpacking
+# ValueError after the `gh` subprocess has already been run.
+try:
+    OWNER, REPO_NAME = REPO.split("/", 1)
+except ValueError:
+    sys.exit(f"REPO={REPO!r} must be in 'owner/name' form")
+PR_NUMBER = _resolve_pr_number(REPO)
+
+# The single user turn sent to the agent. It names no tool and no skill.
+PR_REVIEW_PROMPT = (
+    f"Review PR #{PR_NUMBER} on {REPO}. Read the diff, then leave comments "
+    f"on any issues you find, and submit the review with an appropriate "
+    f"verdict (APPROVE, REQUEST_CHANGES, or COMMENT)."
+)
+
+
+def _extract_tool_calls(agent) -> list[tuple[str, dict]]:
+    """Flatten the agent's message history into ordered tool invocations.
+
+    Messages with a falsy ``tool_calls`` are skipped. For every request in
+    the remaining messages the tool name (``""`` when missing) and a copy
+    of its arguments (``{}`` when missing) are collected, keeping history
+    order.
+    """
+    requests = (
+        request
+        for message in agent.message_history
+        if message.tool_calls
+        for request in message.tool_calls.values()
+    )
+    return [
+        (
+            getattr(request.params, "name", None) or "",
+            dict(getattr(request.params, "arguments", None) or {}),
+        )
+        for request in requests
+    ]
+
+
+def _find_review_url(repo: str, pr_number: int) -> str | None:
+    """Best-effort lookup of a review's web URL through ``gh api``.
+
+    Requests ``repos/{repo}/pulls/{pr_number}/reviews`` and keeps the
+    ``html_url`` of the last element of the returned list. Returns ``None``
+    when the command fails or prints nothing.
+    """
+    command = [
+        "gh", "api",
+        f"repos/{repo}/pulls/{pr_number}/reviews",
+        "--jq", ".[-1].html_url",
+    ]
+    try:
+        raw = subprocess.check_output(command, encoding="utf-8")
+    except Exception:
+        # The URL is informational only; a failed lookup must not fail the run.
+        return None
+    url = raw.strip()
+    return url if url else None
+
+
+def _evaluate(calls: list[tuple[str, dict]]) -> dict:
+    """Score an ordered tool-call trace against the five pass criteria.
+
+    Args:
+        calls: ``(tool_name, arguments)`` pairs in the order they were issued.
+
+    Returns:
+        A dict holding one boolean per criterion (``skill_before_write``,
+        ``create_pending_ok``, ``comments_ok``, ``submit_ok``,
+        ``no_bypass``), their conjunction under ``overall``, and the
+        diagnostics ``comment_count``, ``verdict``, ``single_shot_indices``
+        and ``other_calls`` (``(index, name)`` for every call that is neither
+        ``read_skill`` nor a mutating PR tool).
+    """
+    read_skill_idx = None
+    first_mutating_idx = None
+    create_idx = None
+    submit_idx = None
+    single_shot_indices: list[int] = []
+    comment_indices: list[int] = []
+    verdict = None
+    other_calls: list[tuple[int, str]] = []
+
+    for i, (name, args) in enumerate(calls):
+        is_mutating = name in MUTATING_PR_TOOLS
+
+        # Only a read of the expected skill URI counts as activation; the
+        # first such read is the one compared against the first mutation.
+        if (
+            name == "read_skill"
+            and read_skill_idx is None
+            and args.get("path") == EXPECTED_SKILL_URI
+        ):
+            read_skill_idx = i
+
+        if is_mutating and first_mutating_idx is None:
+            first_mutating_idx = i
+
+        if name == "pull_request_review_write":
+            method = args.get("method")
+            event = args.get("event")
+            if method == "create":
+                if event:
+                    # create + event in one call is the single-shot bypass.
+                    single_shot_indices.append(i)
+                elif create_idx is None:
+                    create_idx = i
+            elif method == "submit_pending" and submit_idx is None:
+                submit_idx = i
+                verdict = event
+        elif name == "add_comment_to_pending_review":
+            comment_indices.append(i)
+
+        if not (is_mutating or name == "read_skill"):
+            other_calls.append((i, name))
+
+    skill_before_write = (
+        read_skill_idx is not None
+        and (first_mutating_idx is None or read_skill_idx < first_mutating_idx)
+    )
+    create_pending_ok = create_idx is not None
+    # A comment only counts while the review is still pending: after the
+    # create call and, when a submit was seen, before that submit. Without
+    # the upper bound a create -> submit -> comment trace would pass.
+    comments_ok = create_idx is not None and any(
+        create_idx < ci and (submit_idx is None or ci < submit_idx)
+        for ci in comment_indices
+    )
+    submit_ok = (
+        submit_idx is not None
+        # `event` comes straight from model-supplied arguments; an unhashable
+        # value such as a list would make the set lookup raise TypeError.
+        and isinstance(verdict, str)
+        and verdict in VALID_VERDICTS
+        and (create_idx is None or submit_idx > create_idx)
+    )
+    no_bypass = not single_shot_indices
+
+    overall = (
+        skill_before_write
+        and create_pending_ok
+        and comments_ok
+        and submit_ok
+        and no_bypass
+    )
+
+    return {
+        "skill_before_write": skill_before_write,
+        "create_pending_ok": create_pending_ok,
+        "comments_ok": comments_ok,
+        "comment_count": len(comment_indices),
+        "submit_ok": submit_ok,
+        "verdict": verdict,
+        "no_bypass": no_bypass,
+        "single_shot_indices": single_shot_indices,
+        "other_calls": other_calls,
+        "overall": overall,
+    }
+
+
+@fast.agent(
+    name="pr_reviewer",
+    instruction=(
+        "You are a software engineer assisting with GitHub pull request "
+        "workflows. {{agentSkills}}"
+    ),
+    servers=["github_skills"],
+)
+async def main() -> int:
+    """Send the review prompt once, print a scored report, return an exit code.
+
+    Returns:
+        0 when every pass criterion holds, 1 otherwise.
+    """
+    rule = "=" * 72
+
+    print(f"Target: {REPO} PR #{PR_NUMBER}")
+    print(f"Prompt: {PR_REVIEW_PROMPT}")
+    print()
+
+    async with fast.run() as agent:
+        # Time only the agent turn, not the reporting that follows.
+        started = time.monotonic()
+        response = await agent.send(PR_REVIEW_PROMPT)
+        elapsed = time.monotonic() - started
+
+        calls = _extract_tool_calls(agent)
+        result = _evaluate(calls)
+
+        print()
+        print(rule)
+        print("Ordered tool calls:")
+        if not calls:
+            print("  (none)")
+        for idx, (tool, tool_args) in enumerate(calls):
+            # The `body` argument is left out of the trace; the rest is
+            # cut to 180 characters per call.
+            shown = {key: val for key, val in tool_args.items() if key != "body"}
+            rendered = json.dumps(shown, default=str)[:180]
+            print(f"  [{idx}] {tool}  {rendered}")
+        print()
+
+        bypassed = result["single_shot_indices"]
+        # (label, key into `result`, optional parenthesised note)
+        rows = (
+            ("skill-read-before-write", "skill_before_write", None),
+            ("create-pending-review", "create_pending_ok", None),
+            ("add-comment(s)", "comments_ok",
+             f"count={result['comment_count']}"),
+            ("submit-pending-with-verdict", "submit_ok",
+             f"verdict={result['verdict']}"),
+            ("no-single-shot-bypass", "no_bypass",
+             f"bypass_indices={bypassed}" if bypassed else None),
+        )
+        for label, key, note in rows:
+            status = "PASS" if result[key] else "FAIL"
+            suffix = f" ({note})" if note else ""
+            print(f"  {label:<32} {status}{suffix}")
+        overall_status = "PASS" if result["overall"] else "FAIL"
+        print(f"  {'overall':<32} {overall_status}")
+        print()
+
+        extras = result["other_calls"]
+        if extras:
+            print("Tool calls outside prescribed workflow:")
+            for idx, tool in extras:
+                print(f"  [{idx}] {tool}")
+            print()
+
+        review_url = _find_review_url(REPO, PR_NUMBER)
+        if not review_url:
+            review_url = "(not found via gh api)"
+        print(f"Review URL: {review_url}")
+        print(f"Wall-clock: {elapsed:.1f}s")
+        print(rule)
+        print()
+        print("Final assistant response:")
+        final_text = response if isinstance(response, str) else str(response)
+        print(final_text)
+
+        if result["overall"]:
+            return 0
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(asyncio.run(main()))
@@ -0,0 +1,23 @@
+# Configuration for the Scenario #1 harness
+# (scripts/skills_e2e_agent/agent.py).
+default_model: anthropic.claude-sonnet-4-6
+
+logger:
+  progress_display: false
+  show_chat: true
+  show_tools: true
+  truncate_tools: true
+
+mcp:
+  servers:
+    # Server name the agent refers to via `servers=["github_skills"]`.
+    github_skills:
+      transport: http
+      # Local endpoint; the harness docstring starts the server with
+      # `--port 8082`.
+      url: http://localhost:8082/mcp
+      headers:
+        # GITHUB_TOKEN is listed among the harness pre-reqs; presumably it is
+        # substituted from the environment when this file is loaded — TODO
+        # confirm.
+        Authorization: "Bearer ${GITHUB_TOKEN}"
+      # Contamination guarantee for the activation test: suppress the
+      # server's `instructions` field from the system prompt so the model's
+      # only signal that the pull-requests skill exists comes from the host's
+      # `<available_skills>` block (built from `skill://index.json`).
+      # If the model still calls `read_skill` with the skill:// URI and
+      # surfaces SKILL.md-only method names, the activation went through the
+      # SEP integration end-to-end, not via inlined server text.
+      include_instructions: false
@@ -0,0 +1,59 @@
+"""Smart-agent variant of the Skills-over-MCP activation test.
+
+`SmartAgent` already exposes a model-callable `get_resource(uri, server_name?)`
+tool that matches the SEP's illustrative `read_resource(server, uri)` shape
+(with server_name optional — cleaner activation profile per the host guide
+pitfall #2). That tool existed before this branch's work.
+
+This harness tests: when a smart agent has the github_skills server attached
+and my Skills-over-MCP discovery runs (loader populates `<available_skills>`
+with the skill:// URI), what does the model do? Does it use `get_resource`
+(prior-art smart-agent tool) or `read_skill` (the URI-aware extension I
+added)? Does activation PASS either way?
+
+Pre-reqs identical to the basic-agent harness.
+"""
+
+import asyncio
+import sys
+
+from fast_agent import FastAgent
+
+# Application object; `fast.smart` decorates the agent and `fast.run()` starts
+# it further down in this module.
+fast = FastAgent("skills-over-mcp smart-agent activation test")
+
+# Asks the model to describe the review workflow rather than perform it; the
+# reply text is what gets scanned for skill-specific method names.
+PR_REVIEW_PROMPT = (
+    "I need to review pull request #42 on owner/repo and leave several "
+    "line-specific comments before approving. Briefly walk me through the "
+    "exact tool sequence you would use and name each tool/method you would "
+    "call. Use the available skills if any are relevant."
+)
+
+
+@fast.smart(
+    name="smart_pr_reviewer",
+    instruction=(
+        "You are a software engineer assisting with GitHub pull request "
+        "workflows. {{agentSkills}}"
+    ),
+    servers=["github_skills"],
+)
+async def main() -> int:
+    """Send the walkthrough prompt and report whether the reply shows activation.
+
+    The reply is lower-cased and searched for either of two method names
+    this harness treats as skill-specific.
+
+    Returns:
+        0 when at least one marker is present, 1 otherwise.
+    """
+    markers = ("submit_pending", "create_pending_pull_request_review")
+    rule = "=" * 60
+
+    async with fast.run() as agent:
+        response = await agent.send(PR_REVIEW_PROMPT)
+        reply = response if isinstance(response, str) else str(response)
+        lowered = reply.lower()
+        activated = any(marker in lowered for marker in markers)
+
+        outcome = "PASS" if activated else "FAIL"
+        print()
+        print(rule)
+        print(f"Activation indicator (skill-specific method mentioned): {outcome}")
+        print(rule)
+
+        if activated:
+            return 0
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(asyncio.run(main()))
@@ -0,0 +1,17 @@
+# Configuration for the smart-agent activation harness
+# (scripts/skills_e2e_smart_agent/agent.py).
+default_model: anthropic.claude-sonnet-4-6
+
+logger:
+  progress_display: false
+  show_chat: true
+  show_tools: true
+  truncate_tools: true
+
+mcp:
+  servers:
+    # Server name the agent refers to via `servers=["github_skills"]`.
+    github_skills:
+      transport: http
+      url: http://localhost:8082/mcp
+      headers:
+        # Presumably substituted from the GITHUB_TOKEN environment variable
+        # when this file is loaded — TODO confirm.
+        Authorization: "Bearer ${GITHUB_TOKEN}"
+      # Same contamination isolation as the basic-agent variant.
+      include_instructions: false