Skip to content

Commit dd33c37

Browse files
committed
Route URI-backed skills through read_skill; polish and add Scenario #1 e2e
Force read_skill when any manifest is URI-backed so activation routes through the SEP rather than inlined server text. Consolidate SKILL.md URI helpers, make SkillReader scheme-agnostic, migrate logging to structured data= kwargs, fix a misleading MCP-vs-MCP collision warning, pin include_instructions:false in the activation test, add a smart-agent variant documenting the tool-redundancy finding, and rewrite the e2e harness to run Scenario #1 against a real PR. Squashes 1ffdef6, 0193369, e4955b4, 10d08e4, 8766815, 2ea41dd, cfaeeee, c404a3a, 48f951d, d8946ec.
1 parent 268d564 commit dd33c37

13 files changed

Lines changed: 781 additions & 74 deletions

File tree

scripts/skills_e2e_agent/agent.py

Lines changed: 289 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,289 @@
1+
"""Scenario #1 end-to-end run: review a real PR via the pull-requests skill.
2+
3+
Exercises the **write path** of the Skills-over-MCP SEP: the model must
4+
activate the `pull-requests` skill bundled with olaservo/github-mcp-server
5+
(branch `add-agent-skills`), then follow the three-step pending-review
6+
workflow (create → add_comment_to_pending_review → submit_pending) to post
7+
a real GitHub review.
8+
9+
Pass criteria (all five must hold):
10+
1. `read_skill` called with `skill://pull-requests/SKILL.md` before any
11+
mutating PR tool.
12+
2. `pull_request_review_write` called with `method="create"` and no `event`.
13+
3. At least one `add_comment_to_pending_review` call after step 2.
14+
4. `pull_request_review_write` called with `method="submit_pending"` and
15+
`event` in {APPROVE, REQUEST_CHANGES, COMMENT}.
16+
5. No single-shot bypass (no `pull_request_review_write` call that sets
17+
both `method="create"` and `event`).
18+
19+
Pre-reqs:
20+
- ANTHROPIC_API_KEY in env.
21+
- GITHUB_TOKEN in env (or resolvable via `gh auth token`).
22+
- Server running **without** `--read-only` (Scenario #1 requires the
23+
write path):
24+
GITHUB_PERSONAL_ACCESS_TOKEN=$(gh auth token) \\
25+
DISABLE_INSTRUCTIONS=true \\
26+
./github-mcp-server http --port 8082 --toolsets=pull_requests
27+
(or `--toolsets=all` for the broader-tool-list variant).
28+
- A reviewable PR on the subject repo. Produce one with
29+
`experiments/code-review-subject/scripts/create-pr-input-validation.sh`.
30+
31+
Usage:
32+
# One-step (PR_NUMBER auto-detected from the canonical head branch):
33+
GITHUB_TOKEN=$(gh auth token) uv run scripts/skills_e2e_agent/agent.py
34+
35+
# Or target a specific PR:
36+
GITHUB_TOKEN=$(gh auth token) REPO=olaservo/code-review-subject \\
37+
PR_NUMBER=7 uv run scripts/skills_e2e_agent/agent.py
38+
"""
39+
40+
from __future__ import annotations
41+
42+
import asyncio
43+
import json
44+
import os
45+
import subprocess
46+
import sys
47+
import time
48+
49+
from fast_agent import FastAgent
50+
51+
DEFAULT_REPO = "olaservo/code-review-subject"
52+
DEFAULT_HEAD_BRANCH = "feature/input-validation-enhancement"
53+
EXPECTED_SKILL_URI = "skill://pull-requests/SKILL.md"
54+
VALID_VERDICTS = {"APPROVE", "REQUEST_CHANGES", "COMMENT"}
55+
MUTATING_PR_TOOLS = {
56+
"pull_request_review_write",
57+
"add_comment_to_pending_review",
58+
}
59+
60+
61+
fast = FastAgent("skills-over-mcp Scenario #1: review a real PR")
62+
63+
64+
def _resolve_pr_number(repo: str) -> int:
65+
env_val = os.environ.get("PR_NUMBER")
66+
if env_val:
67+
try:
68+
return int(env_val)
69+
except ValueError:
70+
sys.exit(f"PR_NUMBER={env_val!r} is not an integer")
71+
72+
try:
73+
out = subprocess.check_output(
74+
[
75+
"gh", "pr", "list",
76+
"--repo", repo,
77+
"--head", DEFAULT_HEAD_BRANCH,
78+
"--state", "open",
79+
"--json", "number",
80+
"--jq", ".[0].number",
81+
],
82+
encoding="utf-8",
83+
).strip()
84+
except Exception as exc:
85+
sys.exit(
86+
f"PR_NUMBER not set and auto-detect failed: {exc}. "
87+
f"Run experiments/code-review-subject/scripts/create-pr-input-validation.sh "
88+
f"first, or pass PR_NUMBER explicitly."
89+
)
90+
if not out:
91+
sys.exit(
92+
f"No open PR on {repo} head={DEFAULT_HEAD_BRANCH}. "
93+
f"Run the scaffolding script first."
94+
)
95+
return int(out)
96+
97+
98+
REPO = os.environ.get("REPO", DEFAULT_REPO)
99+
PR_NUMBER = _resolve_pr_number(REPO)
100+
OWNER, REPO_NAME = REPO.split("/", 1)
101+
102+
PR_REVIEW_PROMPT = (
103+
f"Review PR #{PR_NUMBER} on {REPO}. Read the diff, then leave comments "
104+
f"on any issues you find, and submit the review with an appropriate "
105+
f"verdict (APPROVE, REQUEST_CHANGES, or COMMENT)."
106+
)
107+
108+
109+
def _extract_tool_calls(agent) -> list[tuple[str, dict]]:
110+
"""Walk agent.message_history in order; return [(tool_name, args), ...]."""
111+
calls: list[tuple[str, dict]] = []
112+
for msg in agent.message_history:
113+
if not msg.tool_calls:
114+
continue
115+
for req in msg.tool_calls.values():
116+
params = req.params
117+
name = getattr(params, "name", None) or ""
118+
args = getattr(params, "arguments", None) or {}
119+
calls.append((name, dict(args)))
120+
return calls
121+
122+
123+
def _find_review_url(repo: str, pr_number: int) -> str | None:
124+
try:
125+
out = subprocess.check_output(
126+
[
127+
"gh", "api",
128+
f"repos/{repo}/pulls/{pr_number}/reviews",
129+
"--jq", ".[-1].html_url",
130+
],
131+
encoding="utf-8",
132+
).strip()
133+
except Exception:
134+
return None
135+
return out or None
136+
137+
138+
def _evaluate(calls: list[tuple[str, dict]]) -> dict:
139+
"""Compute the five pass-criteria booleans plus diagnostics."""
140+
read_skill_idx = None
141+
first_mutating_idx = None
142+
create_idx = None
143+
submit_idx = None
144+
single_shot_indices: list[int] = []
145+
comment_indices: list[int] = []
146+
verdict = None
147+
other_calls: list[tuple[int, str]] = []
148+
149+
for i, (name, args) in enumerate(calls):
150+
is_expected = (
151+
name == "read_skill"
152+
or name in MUTATING_PR_TOOLS
153+
)
154+
155+
if name == "read_skill" and args.get("path") == EXPECTED_SKILL_URI:
156+
if read_skill_idx is None:
157+
read_skill_idx = i
158+
159+
if name in MUTATING_PR_TOOLS and first_mutating_idx is None:
160+
first_mutating_idx = i
161+
162+
if name == "pull_request_review_write":
163+
method = args.get("method")
164+
event = args.get("event")
165+
if method == "create":
166+
if event:
167+
single_shot_indices.append(i)
168+
elif create_idx is None:
169+
create_idx = i
170+
elif method == "submit_pending":
171+
if submit_idx is None:
172+
submit_idx = i
173+
verdict = event
174+
175+
if name == "add_comment_to_pending_review":
176+
comment_indices.append(i)
177+
178+
if not is_expected:
179+
other_calls.append((i, name))
180+
181+
skill_before_write = (
182+
read_skill_idx is not None
183+
and (first_mutating_idx is None or read_skill_idx < first_mutating_idx)
184+
)
185+
create_pending_ok = create_idx is not None
186+
comments_ok = (
187+
create_idx is not None
188+
and any(ci > create_idx for ci in comment_indices)
189+
)
190+
submit_ok = (
191+
submit_idx is not None
192+
and verdict in VALID_VERDICTS
193+
and (create_idx is None or submit_idx > create_idx)
194+
)
195+
no_bypass = not single_shot_indices
196+
197+
overall = (
198+
skill_before_write
199+
and create_pending_ok
200+
and comments_ok
201+
and submit_ok
202+
and no_bypass
203+
)
204+
205+
return {
206+
"skill_before_write": skill_before_write,
207+
"create_pending_ok": create_pending_ok,
208+
"comments_ok": comments_ok,
209+
"comment_count": len(comment_indices),
210+
"submit_ok": submit_ok,
211+
"verdict": verdict,
212+
"no_bypass": no_bypass,
213+
"single_shot_indices": single_shot_indices,
214+
"other_calls": other_calls,
215+
"overall": overall,
216+
}
217+
218+
219+
@fast.agent(
220+
name="pr_reviewer",
221+
instruction=(
222+
"You are a software engineer assisting with GitHub pull request "
223+
"workflows. {{agentSkills}}"
224+
),
225+
servers=["github_skills"],
226+
)
227+
async def main() -> int:
228+
print(f"Target: {REPO} PR #{PR_NUMBER}")
229+
print(f"Prompt: {PR_REVIEW_PROMPT}")
230+
print()
231+
232+
async with fast.run() as agent:
233+
start = time.monotonic()
234+
response = await agent.send(PR_REVIEW_PROMPT)
235+
elapsed = time.monotonic() - start
236+
237+
calls = _extract_tool_calls(agent)
238+
result = _evaluate(calls)
239+
240+
print()
241+
print("=" * 72)
242+
print("Ordered tool calls:")
243+
if not calls:
244+
print(" (none)")
245+
for i, (name, args) in enumerate(calls):
246+
compact = {k: args[k] for k in args if k not in {"body"}}
247+
print(f" [{i}] {name} {json.dumps(compact, default=str)[:180]}")
248+
print()
249+
250+
banner = [
251+
("skill-read-before-write",
252+
result["skill_before_write"], None),
253+
("create-pending-review",
254+
result["create_pending_ok"], None),
255+
("add-comment(s)",
256+
result["comments_ok"], f"count={result['comment_count']}"),
257+
("submit-pending-with-verdict",
258+
result["submit_ok"], f"verdict={result['verdict']}"),
259+
("no-single-shot-bypass",
260+
result["no_bypass"],
261+
f"bypass_indices={result['single_shot_indices']}"
262+
if result["single_shot_indices"] else None),
263+
]
264+
for label, ok, note in banner:
265+
status = "PASS" if ok else "FAIL"
266+
extra = f" ({note})" if note else ""
267+
print(f" {label:<32} {status}{extra}")
268+
print(f" {'overall':<32} {'PASS' if result['overall'] else 'FAIL'}")
269+
print()
270+
271+
if result["other_calls"]:
272+
print("Tool calls outside prescribed workflow:")
273+
for i, name in result["other_calls"]:
274+
print(f" [{i}] {name}")
275+
print()
276+
277+
review_url = _find_review_url(REPO, PR_NUMBER)
278+
print(f"Review URL: {review_url or '(not found via gh api)'}")
279+
print(f"Wall-clock: {elapsed:.1f}s")
280+
print("=" * 72)
281+
print()
282+
print("Final assistant response:")
283+
print(response if isinstance(response, str) else str(response))
284+
285+
return 0 if result["overall"] else 1
286+
287+
288+
if __name__ == "__main__":
289+
sys.exit(asyncio.run(main()))
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
default_model: anthropic.claude-sonnet-4-6
2+
3+
logger:
4+
progress_display: false
5+
show_chat: true
6+
show_tools: true
7+
truncate_tools: true
8+
9+
mcp:
10+
servers:
11+
github_skills:
12+
transport: http
13+
url: http://localhost:8082/mcp
14+
headers:
15+
Authorization: "Bearer ${GITHUB_TOKEN}"
16+
# Contamination guarantee for the activation test: suppress the
17+
# server's `instructions` field from the system prompt so the model's
18+
# only signal that the pull-requests skill exists comes from the host's
19+
# `<available_skills>` block (built from `skill://index.json`).
20+
# If the model still calls `read_skill` with the skill:// URI and
21+
# surfaces SKILL.md-only method names, the activation went through the
22+
# SEP integration end-to-end, not via inlined server text.
23+
include_instructions: false
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
"""Smart-agent variant of the Skills-over-MCP activation test.
2+
3+
`SmartAgent` already exposes a model-callable `get_resource(uri, server_name?)`
4+
tool that matches the SEP's illustrative `read_resource(server, uri)` shape
5+
(with server_name optional — cleaner activation profile per the host guide
6+
pitfall #2). That tool existed before this branch's work.
7+
8+
This harness tests: when a smart agent has the github_skills server attached
9+
and my Skills-over-MCP discovery runs (loader populates `<available_skills>`
10+
with the skill:// URI), what does the model do? Does it use `get_resource`
11+
(prior-art smart-agent tool) or `read_skill` (the URI-aware extension I
12+
added)? Does activation PASS either way?
13+
14+
Pre-reqs identical to the basic-agent harness.
15+
"""
16+
17+
import asyncio
18+
import sys
19+
20+
from fast_agent import FastAgent
21+
22+
fast = FastAgent("skills-over-mcp smart-agent activation test")
23+
24+
PR_REVIEW_PROMPT = (
25+
"I need to review pull request #42 on owner/repo and leave several "
26+
"line-specific comments before approving. Briefly walk me through the "
27+
"exact tool sequence you would use and name each tool/method you would "
28+
"call. Use the available skills if any are relevant."
29+
)
30+
31+
32+
@fast.smart(
33+
name="smart_pr_reviewer",
34+
instruction=(
35+
"You are a software engineer assisting with GitHub pull request "
36+
"workflows. {{agentSkills}}"
37+
),
38+
servers=["github_skills"],
39+
)
40+
async def main() -> int:
41+
async with fast.run() as agent:
42+
response = await agent.send(PR_REVIEW_PROMPT)
43+
text = response.lower() if isinstance(response, str) else str(response).lower()
44+
activated = (
45+
"submit_pending".lower() in text
46+
or "create_pending_pull_request_review" in text
47+
)
48+
print()
49+
print("=" * 60)
50+
print(
51+
f"Activation indicator (skill-specific method mentioned): "
52+
f"{'PASS' if activated else 'FAIL'}"
53+
)
54+
print("=" * 60)
55+
return 0 if activated else 1
56+
57+
58+
if __name__ == "__main__":
59+
sys.exit(asyncio.run(main()))
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
default_model: anthropic.claude-sonnet-4-6
2+
3+
logger:
4+
progress_display: false
5+
show_chat: true
6+
show_tools: true
7+
truncate_tools: true
8+
9+
mcp:
10+
servers:
11+
github_skills:
12+
transport: http
13+
url: http://localhost:8082/mcp
14+
headers:
15+
Authorization: "Bearer ${GITHUB_TOKEN}"
16+
# Same contamination isolation as the basic-agent variant.
17+
include_instructions: false

0 commit comments

Comments
 (0)