|
| 1 | +"""Scenario #1 end-to-end run: review a real PR via the pull-requests skill. |
| 2 | +
|
| 3 | +Exercises the **write path** of the Skills-over-MCP SEP: the model must |
| 4 | +activate the `pull-requests` skill bundled with olaservo/github-mcp-server |
| 5 | +(branch `add-agent-skills`), then follow the three-step pending-review |
| 6 | +workflow (create → add_comment_to_pending_review → submit_pending) to post |
| 7 | +a real GitHub review. |
| 8 | +
|
| 9 | +Pass criteria (all five must hold): |
| 10 | +1. `read_skill` called with `skill://pull-requests/SKILL.md` before any |
| 11 | + mutating PR tool. |
| 12 | +2. `pull_request_review_write` called with `method="create"` and no `event`. |
| 13 | +3. At least one `add_comment_to_pending_review` call after step 2. |
| 14 | +4. `pull_request_review_write` called with `method="submit_pending"` and |
| 15 | + `event` in {APPROVE, REQUEST_CHANGES, COMMENT}. |
| 16 | +5. No single-shot bypass (no `pull_request_review_write` call that sets |
| 17 | + both `method="create"` and `event`). |
| 18 | +
|
| 19 | +Pre-reqs: |
| 20 | +- ANTHROPIC_API_KEY in env. |
| 21 | +- GITHUB_TOKEN in env (or resolvable via `gh auth token`). |
| 22 | +- Server running **without** `--read-only` (Scenario #1 requires the |
| 23 | + write path): |
| 24 | + GITHUB_PERSONAL_ACCESS_TOKEN=$(gh auth token) \\ |
| 25 | + DISABLE_INSTRUCTIONS=true \\ |
| 26 | + ./github-mcp-server http --port 8082 --toolsets=pull_requests |
| 27 | + (or `--toolsets=all` for the broader-tool-list variant). |
| 28 | +- A reviewable PR on the subject repo. Produce one with |
| 29 | + `experiments/code-review-subject/scripts/create-pr-input-validation.sh`. |
| 30 | +
|
| 31 | +Usage: |
| 32 | + # One-step (PR_NUMBER auto-detected from the canonical head branch): |
| 33 | + GITHUB_TOKEN=$(gh auth token) uv run scripts/skills_e2e_agent/agent.py |
| 34 | +
|
| 35 | + # Or target a specific PR: |
| 36 | + GITHUB_TOKEN=$(gh auth token) REPO=olaservo/code-review-subject \\ |
| 37 | + PR_NUMBER=7 uv run scripts/skills_e2e_agent/agent.py |
| 38 | +""" |
| 39 | + |
| 40 | +from __future__ import annotations |
| 41 | + |
| 42 | +import asyncio |
| 43 | +import json |
| 44 | +import os |
| 45 | +import subprocess |
| 46 | +import sys |
| 47 | +import time |
| 48 | + |
| 49 | +from fast_agent import FastAgent |
| 50 | + |
| 51 | +DEFAULT_REPO = "olaservo/code-review-subject" |
| 52 | +DEFAULT_HEAD_BRANCH = "feature/input-validation-enhancement" |
| 53 | +EXPECTED_SKILL_URI = "skill://pull-requests/SKILL.md" |
| 54 | +VALID_VERDICTS = {"APPROVE", "REQUEST_CHANGES", "COMMENT"} |
| 55 | +MUTATING_PR_TOOLS = { |
| 56 | + "pull_request_review_write", |
| 57 | + "add_comment_to_pending_review", |
| 58 | +} |
| 59 | + |
| 60 | + |
| 61 | +fast = FastAgent("skills-over-mcp Scenario #1: review a real PR") |
| 62 | + |
| 63 | + |
| 64 | +def _resolve_pr_number(repo: str) -> int: |
| 65 | + env_val = os.environ.get("PR_NUMBER") |
| 66 | + if env_val: |
| 67 | + try: |
| 68 | + return int(env_val) |
| 69 | + except ValueError: |
| 70 | + sys.exit(f"PR_NUMBER={env_val!r} is not an integer") |
| 71 | + |
| 72 | + try: |
| 73 | + out = subprocess.check_output( |
| 74 | + [ |
| 75 | + "gh", "pr", "list", |
| 76 | + "--repo", repo, |
| 77 | + "--head", DEFAULT_HEAD_BRANCH, |
| 78 | + "--state", "open", |
| 79 | + "--json", "number", |
| 80 | + "--jq", ".[0].number", |
| 81 | + ], |
| 82 | + encoding="utf-8", |
| 83 | + ).strip() |
| 84 | + except Exception as exc: |
| 85 | + sys.exit( |
| 86 | + f"PR_NUMBER not set and auto-detect failed: {exc}. " |
| 87 | + f"Run experiments/code-review-subject/scripts/create-pr-input-validation.sh " |
| 88 | + f"first, or pass PR_NUMBER explicitly." |
| 89 | + ) |
| 90 | + if not out: |
| 91 | + sys.exit( |
| 92 | + f"No open PR on {repo} head={DEFAULT_HEAD_BRANCH}. " |
| 93 | + f"Run the scaffolding script first." |
| 94 | + ) |
| 95 | + return int(out) |
| 96 | + |
| 97 | + |
| 98 | +REPO = os.environ.get("REPO", DEFAULT_REPO) |
| 99 | +PR_NUMBER = _resolve_pr_number(REPO) |
| 100 | +OWNER, REPO_NAME = REPO.split("/", 1) |
| 101 | + |
| 102 | +PR_REVIEW_PROMPT = ( |
| 103 | + f"Review PR #{PR_NUMBER} on {REPO}. Read the diff, then leave comments " |
| 104 | + f"on any issues you find, and submit the review with an appropriate " |
| 105 | + f"verdict (APPROVE, REQUEST_CHANGES, or COMMENT)." |
| 106 | +) |
| 107 | + |
| 108 | + |
| 109 | +def _extract_tool_calls(agent) -> list[tuple[str, dict]]: |
| 110 | + """Walk agent.message_history in order; return [(tool_name, args), ...].""" |
| 111 | + calls: list[tuple[str, dict]] = [] |
| 112 | + for msg in agent.message_history: |
| 113 | + if not msg.tool_calls: |
| 114 | + continue |
| 115 | + for req in msg.tool_calls.values(): |
| 116 | + params = req.params |
| 117 | + name = getattr(params, "name", None) or "" |
| 118 | + args = getattr(params, "arguments", None) or {} |
| 119 | + calls.append((name, dict(args))) |
| 120 | + return calls |
| 121 | + |
| 122 | + |
| 123 | +def _find_review_url(repo: str, pr_number: int) -> str | None: |
| 124 | + try: |
| 125 | + out = subprocess.check_output( |
| 126 | + [ |
| 127 | + "gh", "api", |
| 128 | + f"repos/{repo}/pulls/{pr_number}/reviews", |
| 129 | + "--jq", ".[-1].html_url", |
| 130 | + ], |
| 131 | + encoding="utf-8", |
| 132 | + ).strip() |
| 133 | + except Exception: |
| 134 | + return None |
| 135 | + return out or None |
| 136 | + |
| 137 | + |
| 138 | +def _evaluate(calls: list[tuple[str, dict]]) -> dict: |
| 139 | + """Compute the five pass-criteria booleans plus diagnostics.""" |
| 140 | + read_skill_idx = None |
| 141 | + first_mutating_idx = None |
| 142 | + create_idx = None |
| 143 | + submit_idx = None |
| 144 | + single_shot_indices: list[int] = [] |
| 145 | + comment_indices: list[int] = [] |
| 146 | + verdict = None |
| 147 | + other_calls: list[tuple[int, str]] = [] |
| 148 | + |
| 149 | + for i, (name, args) in enumerate(calls): |
| 150 | + is_expected = ( |
| 151 | + name == "read_skill" |
| 152 | + or name in MUTATING_PR_TOOLS |
| 153 | + ) |
| 154 | + |
| 155 | + if name == "read_skill" and args.get("path") == EXPECTED_SKILL_URI: |
| 156 | + if read_skill_idx is None: |
| 157 | + read_skill_idx = i |
| 158 | + |
| 159 | + if name in MUTATING_PR_TOOLS and first_mutating_idx is None: |
| 160 | + first_mutating_idx = i |
| 161 | + |
| 162 | + if name == "pull_request_review_write": |
| 163 | + method = args.get("method") |
| 164 | + event = args.get("event") |
| 165 | + if method == "create": |
| 166 | + if event: |
| 167 | + single_shot_indices.append(i) |
| 168 | + elif create_idx is None: |
| 169 | + create_idx = i |
| 170 | + elif method == "submit_pending": |
| 171 | + if submit_idx is None: |
| 172 | + submit_idx = i |
| 173 | + verdict = event |
| 174 | + |
| 175 | + if name == "add_comment_to_pending_review": |
| 176 | + comment_indices.append(i) |
| 177 | + |
| 178 | + if not is_expected: |
| 179 | + other_calls.append((i, name)) |
| 180 | + |
| 181 | + skill_before_write = ( |
| 182 | + read_skill_idx is not None |
| 183 | + and (first_mutating_idx is None or read_skill_idx < first_mutating_idx) |
| 184 | + ) |
| 185 | + create_pending_ok = create_idx is not None |
| 186 | + comments_ok = ( |
| 187 | + create_idx is not None |
| 188 | + and any(ci > create_idx for ci in comment_indices) |
| 189 | + ) |
| 190 | + submit_ok = ( |
| 191 | + submit_idx is not None |
| 192 | + and verdict in VALID_VERDICTS |
| 193 | + and (create_idx is None or submit_idx > create_idx) |
| 194 | + ) |
| 195 | + no_bypass = not single_shot_indices |
| 196 | + |
| 197 | + overall = ( |
| 198 | + skill_before_write |
| 199 | + and create_pending_ok |
| 200 | + and comments_ok |
| 201 | + and submit_ok |
| 202 | + and no_bypass |
| 203 | + ) |
| 204 | + |
| 205 | + return { |
| 206 | + "skill_before_write": skill_before_write, |
| 207 | + "create_pending_ok": create_pending_ok, |
| 208 | + "comments_ok": comments_ok, |
| 209 | + "comment_count": len(comment_indices), |
| 210 | + "submit_ok": submit_ok, |
| 211 | + "verdict": verdict, |
| 212 | + "no_bypass": no_bypass, |
| 213 | + "single_shot_indices": single_shot_indices, |
| 214 | + "other_calls": other_calls, |
| 215 | + "overall": overall, |
| 216 | + } |
| 217 | + |
| 218 | + |
| 219 | +@fast.agent( |
| 220 | + name="pr_reviewer", |
| 221 | + instruction=( |
| 222 | + "You are a software engineer assisting with GitHub pull request " |
| 223 | + "workflows. {{agentSkills}}" |
| 224 | + ), |
| 225 | + servers=["github_skills"], |
| 226 | +) |
| 227 | +async def main() -> int: |
| 228 | + print(f"Target: {REPO} PR #{PR_NUMBER}") |
| 229 | + print(f"Prompt: {PR_REVIEW_PROMPT}") |
| 230 | + print() |
| 231 | + |
| 232 | + async with fast.run() as agent: |
| 233 | + start = time.monotonic() |
| 234 | + response = await agent.send(PR_REVIEW_PROMPT) |
| 235 | + elapsed = time.monotonic() - start |
| 236 | + |
| 237 | + calls = _extract_tool_calls(agent) |
| 238 | + result = _evaluate(calls) |
| 239 | + |
| 240 | + print() |
| 241 | + print("=" * 72) |
| 242 | + print("Ordered tool calls:") |
| 243 | + if not calls: |
| 244 | + print(" (none)") |
| 245 | + for i, (name, args) in enumerate(calls): |
| 246 | + compact = {k: args[k] for k in args if k not in {"body"}} |
| 247 | + print(f" [{i}] {name} {json.dumps(compact, default=str)[:180]}") |
| 248 | + print() |
| 249 | + |
| 250 | + banner = [ |
| 251 | + ("skill-read-before-write", |
| 252 | + result["skill_before_write"], None), |
| 253 | + ("create-pending-review", |
| 254 | + result["create_pending_ok"], None), |
| 255 | + ("add-comment(s)", |
| 256 | + result["comments_ok"], f"count={result['comment_count']}"), |
| 257 | + ("submit-pending-with-verdict", |
| 258 | + result["submit_ok"], f"verdict={result['verdict']}"), |
| 259 | + ("no-single-shot-bypass", |
| 260 | + result["no_bypass"], |
| 261 | + f"bypass_indices={result['single_shot_indices']}" |
| 262 | + if result["single_shot_indices"] else None), |
| 263 | + ] |
| 264 | + for label, ok, note in banner: |
| 265 | + status = "PASS" if ok else "FAIL" |
| 266 | + extra = f" ({note})" if note else "" |
| 267 | + print(f" {label:<32} {status}{extra}") |
| 268 | + print(f" {'overall':<32} {'PASS' if result['overall'] else 'FAIL'}") |
| 269 | + print() |
| 270 | + |
| 271 | + if result["other_calls"]: |
| 272 | + print("Tool calls outside prescribed workflow:") |
| 273 | + for i, name in result["other_calls"]: |
| 274 | + print(f" [{i}] {name}") |
| 275 | + print() |
| 276 | + |
| 277 | + review_url = _find_review_url(REPO, PR_NUMBER) |
| 278 | + print(f"Review URL: {review_url or '(not found via gh api)'}") |
| 279 | + print(f"Wall-clock: {elapsed:.1f}s") |
| 280 | + print("=" * 72) |
| 281 | + print() |
| 282 | + print("Final assistant response:") |
| 283 | + print(response if isinstance(response, str) else str(response)) |
| 284 | + |
| 285 | + return 0 if result["overall"] else 1 |
| 286 | + |
| 287 | + |
| 288 | +if __name__ == "__main__": |
| 289 | + sys.exit(asyncio.run(main())) |
0 commit comments