From b6d71656afef8f0ddd7255358059b2eeff6f6e43 Mon Sep 17 00:00:00 2001
From: Teerapat-Vatpitak <teerapatv.c@gmail.com>
Date: Sat, 16 May 2026 18:00:22 +0700
Subject: [PATCH 1/8] refactor(cli): modularize cmd_run scenario builder

cmd_run.rs grew past the 300 prod-LoC convention while the in-flight
refactor unified single-tool, weighted-pattern and legacy tool_call
configs through one build_scenario factory and added M5-M7 kind
dispatch.

Split into private builder/params/patterns submodules. The public
surface (run_from_config + the re-exported parse_dur_str shared with
deadlock-probe and cross) is unchanged and now guarded by a cmd_run
integration test. CHANGELOG and the custom-scenario doc are synced to
the new layout.

ci-checks green: fmt, clippy -D warnings, build --locked, 260+ tests,
doc -D warnings.
---
 AGENTS.md                                     |  30 +-
 CHANGELOG.md                                  | 120 ++++----
 README.md                                     |   2 +-
 crates/mcp-loadtest-cli/Cargo.toml            |   1 +
 crates/mcp-loadtest-cli/src/cmd_run.rs        | 130 ++-------
 .../mcp-loadtest-cli/src/cmd_run/builder.rs   | 259 ++++++++++++++++++
 crates/mcp-loadtest-cli/src/cmd_run/params.rs | 218 +++++++++++++++
 .../mcp-loadtest-cli/src/cmd_run/patterns.rs  | 195 +++++++++++++
 crates/mcp-loadtest-cli/src/main.rs           |  14 +-
 crates/mcp-loadtest-cli/tests/cmd_run.rs      |  45 +++
 crates/mcp-loadtest/src/config/example.rs     |   7 +-
 crates/mcp-loadtest/src/config/validate.rs    |   5 +-
 crates/mcp-loadtest/src/metrics/mod.rs        |   2 +-
 crates/mcp-loadtest/src/scenario/fuzzer.rs    |  14 +-
 crates/mcp-loadtest/src/scenario/pattern.rs   | 133 ++++++++-
 crates/mcp-loadtest/src/scenario/soak.rs      |   2 +-
 crates/mcp-loadtest/src/scenario/sustained.rs |   4 +-
 docs/examples/custom-scenario.md              |   5 +-
 18 files changed, 979 insertions(+), 207 deletions(-)
 create mode 100644 crates/mcp-loadtest-cli/src/cmd_run/builder.rs
 create mode 100644 crates/mcp-loadtest-cli/src/cmd_run/params.rs
 create mode 100644 crates/mcp-loadtest-cli/src/cmd_run/patterns.rs
 create mode 100644 crates/mcp-loadtest-cli/tests/cmd_run.rs
diff --git a/AGENTS.md b/AGENTS.md
index daa974a..4d7f428 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -166,18 +166,18 @@ Started 2026-05-11. Final sprint before user-review hand-off. **Repo stays priva
 
 | Agent | Scope | Files OWNED | Status |
 |---|---|---|---|
-| **U** | Protocol fuzzer scenario — random/malformed JSON-RPC payloads | `src/scenario/fuzzer.rs` (new); `src/analysis/fuzz_report.rs` (new); `tests/fuzzer.rs` (new) | open |
-| **V** | Coverage tracking + per-tool SLO assertions | `src/analysis/coverage.rs` (new); add `coverage: Option<CoverageReport>` to `Report`; extend `Run::execute`; `tests/coverage.rs` (new) | open |
+| **U** | Protocol fuzzer scenario — random/malformed JSON-RPC payloads | `src/scenario/fuzzer.rs` (new); `src/analysis/fuzz_report.rs` (new); `tests/fuzzer.rs` (new) | done (2026-05-11) |
+| **V** | Coverage tracking + per-tool SLO assertions | `src/analysis/coverage.rs` (new); add `coverage: Option<CoverageReport>` to `Report`; extend `Run::execute`; `tests/coverage.rs` (new) | done (2026-05-11) |
 | **W** | `mcp-loadtest serve --mcp` self-hosted MCP server | `src/serve/mod.rs` + `src/serve/tools.rs` (new); add `Serve` Cmd; `tests/serve_smoke.rs` (new) | done (2026-05-11) |
-| **X** | README rewrite + cookbook | `README.md`; `docs/examples/{ci-integration,custom-scenario,debugging-deadlocks}.md` (new) | open |
+| **X** | README rewrite + cookbook | `README.md`; `docs/examples/{ci-integration,custom-scenario,debugging-deadlocks}.md` (new) | done (2026-05-11) |
 
 ### M7 sprint exit criteria
 
-- [ ] ci-checks green; `cargo test --workspace` 100% pass
-- [ ] `mcp-loadtest fuzz --server "..."` against `mock-broken.py` reports interesting findings (panic / hang / parse error)
-- [ ] Coverage shows "echo" exercised in scenarios_basic
-- [ ] `mcp-loadtest serve --mcp` boots; `tools/list` returns `deadlock_probe`/`sustained_load`/`compare_runs` (minimum)
-- [ ] README leads with the deadlock demo + has 3 cookbook examples
+- [x] ci-checks green; `cargo test --workspace` 100% pass (Windows PowerShell script, 2026-05-11)
+- [ ] Fuzzer scenario via `mcp-loadtest run --config <fuzzer.toml>` against `mock-broken.py` reports interesting findings (dedicated `fuzz` subcommand deferred)
+- [x] Coverage shows "echo" exercised in coverage/scenario tests
+- [x] `mcp-loadtest serve --mcp` boots; `tools/list` returns `deadlock_probe`/`sustained_load`/`compare_runs` (minimum)
+- [x] README leads with the deadlock demo + has 3 cookbook examples
 - [ ] /security-review + /release-checks pass
 
 After M7: STOP. Hand off to user for review BEFORE flipping repo public.
@@ -195,7 +195,7 @@ Started 2026-05-11. Target: 1 week.
 | **Q** | Real-time TUI dashboard via ratatui + crossterm | `feat/m6-tui` | `src/tui/mod.rs` (uncomment + enrich); `src/tui/dashboard.rs` (new); `tests/tui_smoke.rs` (new) | `metrics::Recorder` (clone), `report/*` | done (2026-05-11) |
 | **R** | `race_detector` analyzer + `RaceCheck` scenario | `feat/m6-race` | `src/analysis/race_detector.rs`; `src/scenario/race_check.rs` (new); `tests/race_check.rs` (new) | session, scenario trait | done (2026-05-11) |
 | **S** | Cross-server CLI subcommand `mcp-loadtest cross` | `feat/m6-cross` | `crates/mcp-loadtest-cli/src/cmd_cross.rs` (new); add `Cross` Cmd variant in main.rs | run, scenario, report (read) | done (2026-05-11) |
-| **T** | ProcessSampler enrichment (fd + threads) + Soak leak-regression | `feat/m6-process-leak` | `src/metrics/process.rs` (extend); `src/report/mod.rs` ONLY to add `peak_fd: u64`, `peak_threads: u64` to ProcessStats (locked-API expansion); refactor `Soak` to use the new fields | sysinfo API | open |
+| **T** | ProcessSampler enrichment (fd + threads) + Soak leak-regression | `feat/m6-process-leak` | `src/metrics/process.rs` (extend); `src/report/mod.rs` ONLY to add `peak_fd: u64`, `peak_threads: u64` to ProcessStats (locked-API expansion); refactor `Soak` to use the new fields | sysinfo API | done (2026-05-11) |
 
 ### M6 conflicts to expect
 
@@ -220,9 +220,9 @@ Started 2026-05-11. Target: 1 week. Mostly post-hoc analysis on top of the M3 `R
 
 | Agent | Scope | Branch | Files OWNED | Read-only deps | Status |
 |---|---|---|---|---|---|
-| **M** | `Ramp` scenario + `BreakingPointDetector` | `feat/m5-breaking-point` | `src/analysis/breaking_point.rs` (new); `src/scenario/ramp.rs` (new); `tests/breaking_point.rs` (new) | `analysis/mod.rs` (declare module), `scenario/mod.rs` (register), `metrics/*`, locked `Report` | open |
-| **N** | Performance grading (A-F per latency / concurrency / error) | `feat/m5-grading` | `src/analysis/grading.rs` (new); `tests/grading.rs` (new) | `report/*` (read), `metrics/*` | open |
-| **O** | Pattern engine — multi-step + weighted random + think-time | `feat/m5-patterns` | `src/scenario/pattern.rs` (new); refactor `src/scenario/sustained.rs` to drive Patterns; `tests/patterns.rs` (new) | `Session::call_tool`, locked `Scenario` trait | open |
+| **M** | `Ramp` scenario + `BreakingPointDetector` | `feat/m5-breaking-point` | `src/analysis/breaking_point.rs` (new); `src/scenario/ramp.rs` (new); `tests/breaking_point.rs` (new) | `analysis/mod.rs` (declare module), `scenario/mod.rs` (register), `metrics/*`, locked `Report` | done (2026-05-11) |
+| **N** | Performance grading (A-F per latency / concurrency / error) | `feat/m5-grading` | `src/analysis/grading.rs` (new); `tests/grading.rs` (new) | `report/*` (read), `metrics/*` | done (2026-05-11) |
+| **O** | Pattern engine — multi-step + weighted random + think-time | `feat/m5-patterns` | `src/scenario/pattern.rs` (new); refactor `src/scenario/sustained.rs` to drive Patterns; `tests/patterns.rs` (new) | `Session::call_tool`, locked `Scenario` trait | done (2026-05-11) |
 | **P** | `compare` subcommand + soak scenario polish | `feat/m5-compare-soak` | `src/scenario/soak.rs` (new); `crates/mcp-loadtest-cli/src/cmd_compare.rs` (new); add `mcp-loadtest compare baseline.json current.json` to CLI | `report/json.rs` (read) for the on-disk shape | done |
 
 ### M5 conflicts / notes
@@ -255,9 +255,9 @@ Pre-M4 (integration agent, done): toolchain bumped 1.85→stable; `Transport` tr
 
 | Agent | Scope | Branch | Files OWNED | Read-only deps | Status |
 |---|---|---|---|---|---|
-| **J** | HTTP transport (Streamable HTTP simple variant) | `feat/m4-http` | `src/protocol/transport/http.rs`; `tests/http_transport.rs` (new) | `protocol/transport/mod.rs` (locked Transport trait), `session.rs` | open |
-| **K** | SSE transport (event-stream subscribe + POST send) | `feat/m4-sse` | `src/protocol/transport/sse.rs`; `tests/sse_transport.rs` (new) | `protocol/transport/mod.rs`, `session.rs` | open |
-| **L** | Python HTTP+SSE mock fixtures | `feat/m4-fixtures-http` | `tests/fixtures/mock-http-server.py`; `tests/fixtures/mock-sse-server.py`; `tests/fixtures/_http_common.py` (allowed) | existing fixture conventions | open |
+| **J** | HTTP transport (Streamable HTTP simple variant) | `feat/m4-http` | `src/protocol/transport/http.rs`; `tests/http_transport.rs` (new) | `protocol/transport/mod.rs` (locked Transport trait), `session.rs` | done (2026-05-11) |
+| **K** | SSE transport (event-stream subscribe + POST send) | `feat/m4-sse` | `src/protocol/transport/sse.rs`; `tests/sse_transport.rs` (new) | `protocol/transport/mod.rs`, `session.rs` | done (2026-05-11) |
+| **L** | Python HTTP+SSE mock fixtures | `feat/m4-fixtures-http` | `tests/fixtures/mock-http-server.py`; `tests/fixtures/mock-sse-server.py`; `tests/fixtures/_http_common.py` (allowed) | existing fixture conventions | done (2026-05-11) |
 
 Agent I (stdio extraction + Transport trait) done by integration agent pre-sprint; left as 3 agents instead of 4 to reduce coordination risk.
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
index bf3992f..549b13d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,75 +8,77 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 ### Added
+
 - Project scaffolding: workspace layout, Cargo config, CLAUDE.md hierarchy, slash commands, CI workflow.
 - Design document covering motivation, types, algorithms, test matrix, milestones (DESIGN.md, 20 sections).
 - Project-structure document for AI-assisted development workflow (PROJECT-STRUCTURE.md).
 - M1 protocol stack:
-  - `protocol::jsonrpc` — JSON-RPC 2.0 message types (`OutgoingRequest`, `OutgoingNotification`, `ResponseEnvelope`, `ResponsePayload`, `ErrorObject`).
-  - `protocol::mcp` — MCP types (`InitializeParams`/`Result`, `Tool`, `ListToolsResult`, `CallToolParams`, `CallToolResult`, `Content`).
+    - `protocol::jsonrpc` — JSON-RPC 2.0 message types (`OutgoingRequest`, `OutgoingNotification`, `ResponseEnvelope`, `ResponsePayload`, `ErrorObject`).
+    - `protocol::mcp` — MCP types (`InitializeParams`/`Result`, `Tool`, `ListToolsResult`, `CallToolParams`, `CallToolResult`, `Content`).
 - M1 `Session` — stdio MCP session that spawns a child, runs `initialize` + `notifications/initialized`, exposes `list_tools` / `call_tool` / `shutdown`. Synchronous request/response only; concurrent in-flight requests deferred to M2.
 - Python test fixtures: `_common.py` (framing helpers) + `mock-normal.py` (echoes args).
 - End-to-end integration test `happy_path.rs` covering spawn → initialize → tools/list → tools/call → shutdown against `mock-normal.py`.
 - M2 scenarios + metrics core (delivered via 4-agent parallel sprint):
-  - `scenario::Scenario` trait + `RunContext` + `ScenarioOutcome` (interface contract pinned in AGENTS.md).
-  - `scenario::sustained::Sustained` — concurrent-load workload (M2 sequential against single Session; multi-Session pool is M3).
-  - `scenario::deadlock_probe::DeadlockProbe` — Vibe-Trading-bug-class detector. Wraps each `tools/call` with `hang_detect`; bails on first deadlock to avoid flooding a wedged session.
-  - `scenario::cold_start::ColdStart` — placeholder (M3 will activate once `RunContext` gains a session-spawning factory).
-  - `hang_detector::hang_detect` — two-phase `tokio::select!` watchdog (DESIGN.md §15.1) classifying each call as Ok / Slow / Deadlock / Err.
-  - `metrics::Recorder` — Arc-shared, lock-free outcome counters + 16-shard `hdrhistogram` for per-call latencies (microsecond resolution, 1µs..=1h range).
-  - `ScenarioMetrics`, `LatencyStats` (p50/p95/p99/p999/mean/min/max/count), `ThroughputStats`, `OutcomeCounts`.
+    - `scenario::Scenario` trait + `RunContext` + `ScenarioOutcome` (interface contract pinned in AGENTS.md).
+    - `scenario::sustained::Sustained` — concurrent-load workload (M2 sequential against single Session; multi-Session pool is M3).
+    - `scenario::deadlock_probe::DeadlockProbe` — Vibe-Trading-bug-class detector. Wraps each `tools/call` with `hang_detect`; bails on first deadlock to avoid flooding a wedged session.
+    - `scenario::cold_start::ColdStart` — placeholder (M3 will activate once `RunContext` gains a session-spawning factory).
+    - `hang_detector::hang_detect` — two-phase `tokio::select!` watchdog (DESIGN.md §15.1) classifying each call as Ok / Slow / Deadlock / Err.
+    - `metrics::Recorder` — Arc-shared, lock-free outcome counters + 16-shard `hdrhistogram` for per-call latencies (microsecond resolution, 1µs..=1h range).
+    - `ScenarioMetrics`, `LatencyStats` (p50/p95/p99/p999/mean/min/max/count), `ThroughputStats`, `OutcomeCounts`.
 - M2 fixtures: `mock-broken.py` (canonical deadlock pattern), `mock-slow.py` (2s tool latency), `mock-crash.py` (~1% mid-call exit).
 - M2 integration tests: `scenarios_basic.rs` (sustained + cold_start placeholder + cancellation), `deadlock.rs` (`mock_normal_no_deadlock` + `mock_broken_detects_deadlock` — the killer test that catches the Vibe-Trading bug class in <7s).
 - M3 reports + first internal release (delivered via 4-agent parallel sprint):
-  - `config::Config` + `ServerConfig` + `ScenarioConfig` + `ThresholdsConfig` + `OutputConfig` — TOML schema with humantime durations, semantic validation (`Config::from_toml_str`/`from_file`), and `example_config()` printer.
-  - `report::Report` + `ProcessStats` + `ProcessSample` + `ServerInfo` + `ThresholdViolation` + `ReportError` + `Reporter` trait.
-  - `report::markdown::MarkdownReporter` — DESIGN §17.3 template (status badge, summary, latency table, errors, process line, threshold violations, trace).
-  - `report::json::JsonReporter` — DESIGN §17.2 schema via a `ReportView` wrapper (durations as `_ms`, ISO 8601 timestamps); `Report` stays unmodified.
-  - `report::terminal::TerminalReporter` — ANSI-colored compact summary; respects `NO_COLOR` / `CLICOLOR` / non-tty automatically.
-  - `metrics::process::ProcessSampler` — sysinfo 0.32 backed periodic RSS+CPU sampler, two-phase CPU baseline, cancellation-aware, best-effort on dead PIDs.
-  - `run::Run` + `RunError` + `Run::execute()` — full orchestrator: ulid run-id, run dir creation, `Session` spawn, scenario drive, metrics snapshot, threshold evaluation, bounded shutdown.
-  - 79 tests passing across lib + 6 integration test files.
+    - `config::Config` + `ServerConfig` + `ScenarioConfig` + `ThresholdsConfig` + `OutputConfig` — TOML schema with humantime durations, semantic validation (`Config::from_toml_str`/`from_file`), and `example_config()` printer.
+    - `report::Report` + `ProcessStats` + `ProcessSample` + `ServerInfo` + `ThresholdViolation` + `ReportError` + `Reporter` trait.
+    - `report::markdown::MarkdownReporter` — DESIGN §17.3 template (status badge, summary, latency table, errors, process line, threshold violations, trace).
+    - `report::json::JsonReporter` — DESIGN §17.2 schema via a `ReportView` wrapper (durations as `_ms`, ISO 8601 timestamps); `Report` stays unmodified.
+    - `report::terminal::TerminalReporter` — ANSI-colored compact summary; respects `NO_COLOR` / `CLICOLOR` / non-tty automatically.
+    - `metrics::process::ProcessSampler` — sysinfo 0.32 backed periodic RSS+CPU sampler, two-phase CPU baseline, cancellation-aware, best-effort on dead PIDs.
+    - `run::Run` + `RunError` + `Run::execute()` — full orchestrator: ulid run-id, run dir creation, `Session` spawn, scenario drive, metrics snapshot, threshold evaluation, bounded shutdown.
+    - 79 tests passing across lib + 6 integration test files.
 - M3 CLI: `mcp-loadtest example-config | run --config <path> | deadlock-probe --server "..." | list-scenarios`. Run + DeadlockProbe write `report.md` and `metrics.json` under `runs/<ulid>/`; `deadlock-probe` exits non-zero when `deadlock_count > 0`.
 - M3 Vibe-Trading regression test: clones `HKUDS/Vibe-Trading@71220c7` (parent of PR #85) into `target/vibe-trading-fixture/`, runs `DeadlockProbe`, asserts deadlock detected. `#[ignore]`d by default; run with `cargo test --test vibe_trading_regression -- --ignored --nocapture`.
 - M4 transport parity (3-agent parallel sprint):
-  - `protocol::transport::Transport` async trait + `TransportError` (Io/Http/Closed/Timeout/Other).
-  - `StdioTransport` extracted from `Session` (legacy single-call path stays via `Session::spawn`).
-  - `HttpTransport` (Streamable HTTP simple JSON variant; SSE-response detection routes to a clear M5 deferral).
-  - `SseTransport` with background reader task + endpoint handshake + id-correlation buffer.
-  - Python fixtures `mock-http-server.py` + `mock-sse-server.py` (stdlib http.server only).
-  - `ServerConfig.url` + transport-aware `Run::execute` dispatch.
-  - Toolchain bumped 1.85 → stable to satisfy icu transitive MSRV pulled in by `url`.
+    - `protocol::transport::Transport` async trait + `TransportError` (Io/Http/Closed/Timeout/Other).
+    - `StdioTransport` extracted from `Session` (legacy single-call path stays via `Session::spawn`).
+    - `HttpTransport` (Streamable HTTP simple JSON variant; SSE-response detection routes to a clear M5 deferral).
+    - `SseTransport` with background reader task + endpoint handshake + id-correlation buffer.
+    - Python fixtures `mock-http-server.py` + `mock-sse-server.py` (stdlib http.server only).
+    - `ServerConfig.url` + transport-aware `Run::execute` dispatch.
+    - Toolchain bumped 1.85 → stable to satisfy icu transitive MSRV pulled in by `url`.
 - M5 analysis parity (4-agent parallel sprint):
-  - `analysis::breaking_point` (BreakingPointDetector w/ first-violator semantics on per-step deltas).
-  - `analysis::grading` (Grade A-F per latency/concurrency/error, worst-of-three rollup).
-  - `scenario::ramp` (linear-stepped concurrency; integrates with breaking_point).
-  - `scenario::pattern` (multi-step + weighted-random + think-time + ErrorBehavior).
-  - `Sustained` refactor to drive Patterns; `run_patterns` free function for multi-pattern callers.
-  - `scenario::soak` (periodic snapshots + leak signal via mean-latency regression).
-  - `mcp-loadtest compare baseline.json current.json` CLI subcommand (markdown/JSON diff).
+    - `analysis::breaking_point` (BreakingPointDetector w/ first-violator semantics on per-step deltas).
+    - `analysis::grading` (Grade A-F per latency/concurrency/error, worst-of-three rollup).
+    - `scenario::ramp` (linear-stepped concurrency; integrates with breaking_point).
+    - `scenario::pattern` (multi-step + weighted-random + think-time + ErrorBehavior).
+    - `Sustained` refactor to drive Patterns; `run_patterns` free function for multi-pattern callers.
+    - `scenario::soak` (periodic snapshots + leak signal via mean-latency regression).
+    - `mcp-loadtest compare baseline.json current.json` CLI subcommand (markdown/JSON diff).
 - M6 differentiators v1 (4-agent parallel sprint):
-  - `tui::dashboard` (ratatui + crossterm live polling; quits on q/Esc, propagates cancel).
-  - `analysis::race_detector` + `scenario::race_check` (key-sorted JSON canonicalization, divergence reporting).
-  - `mcp-loadtest cross --server "..." --server "..."` (side-by-side multi-server comparison with grading).
-  - `ProcessStats` enriched with `peak_fd`/`final_fd`/`peak_threads`/`final_threads` (best-effort: Linux /proc/fd; macOS/Windows degrade to 0).
-  - `Soak` linear-regression-based leak signal helper (`detect_leak`).
+    - `tui::dashboard` (ratatui + crossterm live polling; quits on q/Esc, propagates cancel).
+    - `analysis::race_detector` + `scenario::race_check` (key-sorted JSON canonicalization, divergence reporting).
+    - `mcp-loadtest cross --server "..." --server "..."` (side-by-side multi-server comparison with grading).
+    - `ProcessStats` enriched with `peak_fd`/`final_fd`/`peak_threads`/`final_threads` (best-effort: Linux /proc/fd; macOS/Windows degrade to 0).
+    - `Soak` linear-regression-based leak signal helper (`detect_leak`).
 - M7 differentiators v2 + v0.1.0 polish (4-agent parallel sprint):
-  - `scenario::fuzzer` (FuzzPayload enum: UnknownMethod / NumericMethod / GiantPayload / ControlChars / Nested / NullParams / StringParams; raw-byte variants documented + skipped pending Transport::raw_send hook).
-  - `analysis::fuzz_report` (FuzzClass classification + has_critical signal).
-  - `analysis::coverage` (CoverageReport: registered vs exercised tools + `coverage_pct`).
-  - `ToolSlo` per-tool latency budget assertions in `ThresholdsConfig`.
-  - `Recorder::record_tool` + `snapshot_per_tool` (per-tool counters; existing `record`/`snapshot` aggregate untouched for back-compat).
-  - `mcp-loadtest serve --mcp` self-hosted MCP server (DESIGN §21.2 differentiator) — exposes `deadlock_probe`/`sustained_load`/`compare_runs` as MCP tools so AI agents drive load tests directly via stdio JSON-RPC.
-  - README rewritten to lead with the deadlock demo + competitive positioning vs. reaatech.
-  - `docs/examples/{ci-integration,custom-scenario,debugging-deadlocks}.md` cookbook.
+    - `scenario::fuzzer` (FuzzPayload enum: UnknownMethod / NumericMethod / GiantPayload / ControlChars / Nested / NullParams / StringParams; raw-byte variants documented + skipped pending Transport::raw_send hook).
+    - `analysis::fuzz_report` (FuzzClass classification + has_critical signal).
+    - `analysis::coverage` (CoverageReport: registered vs exercised tools + `coverage_pct`).
+    - `ToolSlo` per-tool latency budget assertions in `ThresholdsConfig`.
+    - `Recorder::record_tool` + `snapshot_per_tool` (per-tool counters; existing `record`/`snapshot` aggregate untouched for back-compat).
+    - `mcp-loadtest serve --mcp` self-hosted MCP server (DESIGN §21.2 differentiator) — exposes `deadlock_probe`/`sustained_load`/`compare_runs` as MCP tools so AI agents drive load tests directly via stdio JSON-RPC.
+    - README rewritten to lead with the deadlock demo + competitive positioning vs. reaatech.
+    - `docs/examples/{ci-integration,custom-scenario,debugging-deadlocks}.md` cookbook.
 - Post-M7 competitive-gap close (3-agent parallel sprint, post-review):
-  - `scenario::spike` — sudden-burst concurrency pattern (baseline → spike window → cooldown). Closes the reaatech parity gap.
-  - `report::html::HtmlReporter` — self-contained `report.html` with inline SVG histogram, escaped HTML, no external CDN or JS. Closes the IBM/spbiju enterprise-report gap. Wired into the CLI as the `"html"` output format.
-  - `protocol::transport::ws::WsTransport` — WebSocket transport via tokio-tungstenite (rustls + webpki-roots); 16 MB per-frame OOM cap mirroring stdio. Activates the `"ws"` scheme that was previously parser-accepted but rejected at runtime.
-  - SECURITY.md — security disclosure policy at repo root (in-scope vs out-of-scope surfaces, reporting flow, recent hardening notes).
+    - `scenario::spike` — sudden-burst concurrency pattern (baseline → spike window → cooldown). Closes the reaatech parity gap.
+    - `report::html::HtmlReporter` — self-contained `report.html` with inline SVG histogram, escaped HTML, no external CDN or JS. Closes the IBM/spbiju enterprise-report gap. Wired into the CLI as the `"html"` output format.
+    - `protocol::transport::ws::WsTransport` — WebSocket transport via tokio-tungstenite (rustls + webpki-roots); 16 MB per-frame OOM cap mirroring stdio. Activates the `"ws"` scheme that was previously parser-accepted but rejected at runtime.
+    - SECURITY.md — security disclosure policy at repo root (in-scope vs out-of-scope surfaces, reporting flow, recent hardening notes).
 - 12 new tests bring the suite to ~255 passing (was 243): spike happy path + 5 HTML reporter (escaping, chart, violation styling) + 2 WS (echo roundtrip + scheme rejection) + 1 config parse-spike-kind + scenario name/schema asserts.
 
 ### Refactor / cleanup
+
 - `report::common` extracted (post-M4 `/simplify` pass): `fmt_duration`, `fmt_count`, `format_server_command`, `describe_failure`, `format_iso8601_utc` shared between markdown + terminal reporters (-69 LoC net).
 - `ThresholdViolation.metric: String` → `kind: ThresholdKind` enum (post-M3 QF-5); JSON wire format preserved via `#[serde(rename = "metric")]` + per-variant snake_case rename.
 - `Report::passed()` now treats `deadlock_count > 0` as a hard failure (post-M3 QF-1).
@@ -89,42 +91,50 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - ADR 0004 — strategic decision to compete head-on (Path A) over contributing to reaatech (Path B) or repositioning (Path C).
 
 ### Fixed
+
 - `Soak::leak_threshold_mb_per_sec` renamed to `latency_drift_ms_per_sec` (the units were always ms/sec; the old name lied).
 - Fuzzer: skipped raw-transport iterations no longer bump `total_calls` or pollute `CallOutcome::Cancelled` (they never hit the wire).
 - Fuzzer: server-accepted malformed payloads now record `CallOutcome::Malformed` and bump `error_count` so threshold evaluators surface them.
 - Run: `memory_growth_mb` now compares `peak − final` instead of bare peak, so a steady-state high-RSS process no longer false-positives.
 
 ### Performance
+
 - Sustained / ramp / soak scenarios: `tokio::task::yield_now()` instead of `sleep(ZERO)` to avoid registering no-op reactor timers.
 - Metrics: per-tool `BTreeMap` moved behind `RwLock`; fast path is now a read lock.
 - `cmd_cross`: cross-server runs use `futures::future::join_all` (was sequential, now N-way parallel).
 - Fuzzer: `LazyLock` for `GIANT_PAYLOAD` + `NESTED_PAYLOAD` so multi-MB / 100-deep payloads build once per process.
 - Release profile: `panic = "abort"` shaves ~600 KB off the stripped binary (5.7 MB → 5.1 MB on x86_64-pc-windows-gnu).
 - Hot-path zero-copy refactor (Phase 1 pre-public audit):
-  - `OutgoingRequest` / `OutgoingNotification` now borrow `method: &'a str` and `params: &'a P` (generic, `P: ?Sized + Serialize`). Eliminates the intermediate `serde_json::to_value()` round-trip on every tool call.
-  - `CallToolParams { name: &'a str, arguments: &'a Value }`.
-  - `Session::call_tool(&str, &Value)` — was `(&str, Value)`. Scenarios drop `.clone()` and pass `&self.args`; deep-clone of the JSON args tree per iteration is gone.
+    - `OutgoingRequest` / `OutgoingNotification` now borrow `method: &'a str` and `params: &'a P` (generic, `P: ?Sized + Serialize`). Eliminates the intermediate `serde_json::to_value()` round-trip on every tool call.
+    - `CallToolParams { name: &'a str, arguments: &'a Value }`.
+    - `Session::call_tool(&str, &Value)` — was `(&str, Value)`. Scenarios drop `.clone()` and pass `&self.args`; deep-clone of the JSON args tree per iteration is gone.
 - Transports: `pending: VecDeque<String>` (id-mismatch buffer) is now capped at `MAX_PENDING_FRAMES = 256` in both `sse` and `ws`. Overflow surfaces `TransportError::Other` instead of growing without bound.
 
 ### Tests / benches
+
 - Criterion microbenchmarks added under `crates/mcp-loadtest/benches/`: `record`, `histogram`, `session_loopback` (Transport-trait loopback, no I/O), `hang_detect`. Run with `cargo bench -p mcp-loadtest`. Closes the DESIGN.md §19 perf-claim gap.
 - Phase 3 coverage gaps closed (pre-publish audit):
-  - `tests/ramp.rs` — first integration test for the `ramp` scenario (was unit-only).
-  - `tests/spike.rs` — added `spike_against_crashing_server_survives_without_hang` (failure-mode coverage; uses `mock-crash.py`; assertions are crash-stochastic-aware so the test isn't flaky).
-  - `src/protocol/transport/ws.rs` — 3 new failure-mode tests: server-closes-mid-call, cancel-during-request, oversized-frame-rejected.
-  - `tests/reporter_snapshots.rs` — 5 new tests: substring landmarks for html + terminal (insta-snapshot parity with markdown + json was skipped because both reporters have too much structural variance for stable snapshots); empty-metrics renders without panic for html, terminal, and json (catches divide-by-zero in throughput math).
+    - `tests/ramp.rs` — first integration test for the `ramp` scenario (was unit-only).
+    - `tests/spike.rs` — added `spike_against_crashing_server_survives_without_hang` (failure-mode coverage; uses `mock-crash.py`; assertions are crash-stochastic-aware so the test isn't flaky).
+    - `src/protocol/transport/ws.rs` — 3 new failure-mode tests: server-closes-mid-call, cancel-during-request, oversized-frame-rejected.
+    - `tests/reporter_snapshots.rs` — 5 new tests: substring landmarks for html + terminal (insta-snapshot parity with markdown + json was skipped because both reporters have too much structural variance for stable snapshots); empty-metrics renders without panic for html, terminal, and json (catches divide-by-zero in throughput math).
 - Test suite: 260 passing (was 250 pre-Phase-3); 0 flakes in a 3-run check; 1 `#[ignore]` left (Vibe-Trading regression — requires external checkout).
 - `cold_start` scenario remains an intentional placeholder in v0.1.0; real handshake-time histogram measurement is queued for v0.2 (DESIGN §8). The existing `cold_start_is_an_inert_placeholder` test pins the placeholder contract so the v0.2 work is forced to update assertions.
 
 ### Changed
+
 - `ServerConfig::stdio(command, args)` constructor + `split_server_command()` free fn relocated to `config` module (kills 4 + 3 hand-rolled literals across the codebase).
 - `classify_error` / `is_terminal_error` deduped into `scenario/mod.rs` (kills 3 byte-identical copies across pattern / ramp / soak).
 - Regression thresholds `P99_REGRESSION_PCT` + `ERROR_RATE_REGRESSION_PP` lifted into `analysis::regression` and shared between `cmd_compare` and `serve/tools::compare_runs`.
+- Unified scenario builder: a single `build_scenario` factory now drives every config — `sustained` accepts weighted `patterns` / legacy `tool_call` arrays (via `PatternScenario`) alongside the single-`tool` path, and all M5–M7 kinds (`ramp` / `soak` / `spike` / `race_check` / `fuzzer` / `pattern`) dispatch through it.
+- `cmd_run` split into private `builder` / `params` / `patterns` submodules (each under the 300 prod-LoC convention); public surface unchanged (`run_from_config` + the re-exported `parse_dur_str` `main.rs` shares with `deadlock-probe` / `cross`).
 
 ### Deprecated
+
 - `DEFAULT_LEAK_THRESHOLD_MB_PER_SEC` → use `DEFAULT_LATENCY_DRIFT_MS_PER_SEC`. The old constant remains as an alias for one release and will be removed in v0.2.0.
 
 ### Notes
+
 - ✅ The M8 file-split pass completed in the pre-publish review. All source files have production code (excluding `#[cfg(test)] mod tests`) under the 300-line convention. See `POST_PUBLISH_ISSUES.md` for the per-wave summary of what split where.
 - `serve` and `tui` modules will move behind cargo feature flags in a future release to keep the default build slim.
 - HTTP / SSE transport host-allowlist for SSRF defense is deferred. Currently mitigated by `Policy::none()` on redirects; the allowlist is operator-facing config and will land alongside the broader transport-hardening pass.
diff --git a/README.md b/README.md
index 89010ee..705f863 100644
--- a/README.md
+++ b/README.md
@@ -199,7 +199,7 @@ Available on crates.io once v0.1.0 publishes.
 
 ## Status
 
-Pre-public release. M1–M7 complete; running `cargo nextest run` shows ~255 tests passing on Windows + Linux. The killer demo (deadlock_probe catches the Vibe-Trading PR #85 bug on the unpatched commit) is in `crates/mcp-loadtest/tests/vibe_trading_regression.rs`. Repo is private during a security/code review pass; v0.1.0 tag + crates.io publish are the next steps.
+Pre-public release. M1–M7 implementation is in place; `pwsh scripts/ci-checks.ps1` is green on Windows (fmt, clippy, build, test, doc; `cargo-deny` skipped when absent) with 260+ tests. The killer demo (deadlock_probe catches the Vibe-Trading PR #85 bug on the unpatched commit) is in `crates/mcp-loadtest/tests/vibe_trading_regression.rs`. Repo is private during a security/code review pass; v0.1.0 tag + crates.io publish are the next steps.
 
 ## Development
 
diff --git a/crates/mcp-loadtest-cli/Cargo.toml b/crates/mcp-loadtest-cli/Cargo.toml
index b8146cc..5c9f482 100644
--- a/crates/mcp-loadtest-cli/Cargo.toml
+++ b/crates/mcp-loadtest-cli/Cargo.toml
@@ -25,6 +25,7 @@ path = "src/lib.rs"
 [[bin]]
 name = "mcp-loadtest"
 path = "src/main.rs"
+doc = false
 
 [dependencies]
 # CLI binary needs both `serve` (for `mcp-loadtest serve --mcp`) and `tui`
diff --git a/crates/mcp-loadtest-cli/src/cmd_run.rs b/crates/mcp-loadtest-cli/src/cmd_run.rs
index 8b60f25..0fc32ac 100644
--- a/crates/mcp-loadtest-cli/src/cmd_run.rs
+++ b/crates/mcp-loadtest-cli/src/cmd_run.rs
@@ -1,24 +1,32 @@
 //! `mcp-loadtest run --config <path>` — load a TOML config, build the
 //! requested scenario, drive a `Run`, and emit reports.
+//!
+//! Split into focused submodules (kept private; the public surface is the
+//! [`run_from_config`] entry plus the re-exported [`parse_dur_str`] helper
+//! `main.rs` shares with the `deadlock-probe` / `cross` subcommands):
+//! - `builder` — scenario `kind` → `Box<dyn Scenario>` dispatch
+//! - `params` — generic TOML param plucking + duration parsing
+//! - `patterns` — weighted multi-step pattern-config parsing
 
 use std::path::Path;
-use std::time::Duration;
 
-use anyhow::{Context, Result, anyhow};
-use serde_json::{Value, json};
+use anyhow::{Context, Result};
 
 use mcp_loadtest::config::Config;
 use mcp_loadtest::run::Run;
-use mcp_loadtest::scenario::Scenario;
-use mcp_loadtest::scenario::cold_start::ColdStart;
-use mcp_loadtest::scenario::deadlock_probe::DeadlockProbe;
-use mcp_loadtest::scenario::spike::Spike;
-use mcp_loadtest::scenario::sustained::Sustained;
 
 use crate::emit::emit_reports;
 
+mod builder;
+mod params;
+mod patterns;
+
+use builder::build_scenario;
+pub use params::parse_dur_str;
+
 /// Top-level entry for `Cmd::Run`. Loads `path`, builds the scenario, executes
-/// the run, emits reports, and surfaces any threshold violations as an error.
+/// the run, emits reports, and surfaces any threshold violations as an error
+/// (non-zero exit code — the CI gating contract; see DESIGN.md §15.4).
 pub async fn run_from_config(path: &Path) -> Result<()> {
     let config =
         Config::from_file(path).with_context(|| format!("loading config {}", path.display()))?;
@@ -38,107 +46,3 @@ pub async fn run_from_config(path: &Path) -> Result<()> {
     }
     Ok(())
 }
-
-/// Dispatch on the scenario `kind` string from a TOML config, plucking
-/// per-kind parameters out of the (untyped) `params` blob.
-pub(crate) fn build_scenario(kind: &str, params: &Value) -> Result<Box<dyn Scenario>> {
-    match kind {
-        "sustained" => {
-            let concurrent = params
-                .get("concurrent")
-                .and_then(Value::as_u64)
-                .unwrap_or(10) as u32;
-            let duration = parse_dur_field(params.get("duration"), Duration::from_secs(60))?;
-            let tool = required_str(params, "tool")?;
-            let args = params.get("args").cloned().unwrap_or(json!({}));
-            Ok(Box::new(Sustained {
-                concurrent,
-                duration,
-                tool,
-                args,
-            }))
-        }
-        "deadlock_probe" => {
-            let concurrent = params
-                .get("concurrent")
-                .and_then(Value::as_u64)
-                .unwrap_or(20) as u32;
-            let hang_threshold =
-                parse_dur_field(params.get("hang_threshold"), Duration::from_secs(5))?;
-            let grace_period =
-                parse_dur_field(params.get("grace_period"), Duration::from_secs(10))?;
-            let tool = required_str(params, "tool")?;
-            let args = params.get("args").cloned().unwrap_or(json!({}));
-            Ok(Box::new(DeadlockProbe {
-                concurrent,
-                hang_threshold,
-                grace_period,
-                tool,
-                args,
-            }))
-        }
-        "cold_start" => {
-            let iterations = params
-                .get("iterations")
-                .and_then(Value::as_u64)
-                .unwrap_or(5) as u32;
-            let warmup = params
-                .get("warmup")
-                .and_then(Value::as_bool)
-                .unwrap_or(true);
-            Ok(Box::new(ColdStart { iterations, warmup }))
-        }
-        "spike" => {
-            let baseline_concurrent = params
-                .get("baseline_concurrent")
-                .and_then(Value::as_u64)
-                .unwrap_or(5) as u32;
-            let spike_concurrent = params
-                .get("spike_concurrent")
-                .and_then(Value::as_u64)
-                .unwrap_or(50) as u32;
-            let warmup = parse_dur_field(params.get("warmup"), Duration::from_secs(30))?;
-            let spike_duration =
-                parse_dur_field(params.get("spike_duration"), Duration::from_secs(30))?;
-            let cooldown = parse_dur_field(params.get("cooldown"), Duration::from_secs(30))?;
-            let tool = required_str(params, "tool")?;
-            let args = params.get("args").cloned().unwrap_or(json!({}));
-            Ok(Box::new(Spike {
-                baseline_concurrent,
-                spike_concurrent,
-                warmup,
-                spike_duration,
-                cooldown,
-                tool,
-                args,
-            }))
-        }
-        other => Err(anyhow!("unknown scenario kind: {other}")),
-    }
-}
-
-/// Extract a required string field from a `params` blob, erroring with a
-/// human-readable message if missing or the wrong shape.
-pub(crate) fn required_str(params: &Value, key: &str) -> Result<String> {
-    params
-        .get(key)
-        .and_then(Value::as_str)
-        .map(str::to_string)
-        .ok_or_else(|| anyhow!("scenario.{key} (string) is required"))
-}
-
-/// Parse a `params.<field>` slot as a humantime duration string, falling
-/// back to `default` if the slot is absent and erroring on wrong types.
-pub(crate) fn parse_dur_field(v: Option<&Value>, default: Duration) -> Result<Duration> {
-    match v {
-        Some(Value::String(s)) => parse_dur_str(s),
-        None => Ok(default),
-        _ => Err(anyhow!("duration must be a string like '60s'")),
-    }
-}
-
-/// Parse a CLI / TOML duration string (`"60s"`, `"500ms"`, ...) into a
-/// `std::time::Duration`. Wraps `humantime::parse_duration` with context.
-pub fn parse_dur_str(s: &str) -> Result<Duration> {
-    humantime::parse_duration(s).with_context(|| format!("parsing duration {s:?}"))
-}
diff --git a/crates/mcp-loadtest-cli/src/cmd_run/builder.rs b/crates/mcp-loadtest-cli/src/cmd_run/builder.rs
new file mode 100644
index 0000000..dbc32f3
--- /dev/null
+++ b/crates/mcp-loadtest-cli/src/cmd_run/builder.rs
@@ -0,0 +1,259 @@
+//! Scenario `kind` → `Box<dyn Scenario>` dispatch.
+//!
+//! Pure function over the (untyped) TOML `params` blob — the single place
+//! where a config's `scenario.kind` string is mapped to a concrete scenario,
+//! plucking per-kind parameters via [`super::params`] / [`super::patterns`].
+
+use std::time::Duration;
+
+use anyhow::{Result, anyhow};
+use serde_json::{Value, json};
+
+use mcp_loadtest::scenario::Scenario;
+use mcp_loadtest::scenario::cold_start::ColdStart;
+use mcp_loadtest::scenario::deadlock_probe::DeadlockProbe;
+use mcp_loadtest::scenario::fuzzer::Fuzzer;
+use mcp_loadtest::scenario::pattern::PatternScenario;
+use mcp_loadtest::scenario::race_check::RaceCheck;
+use mcp_loadtest::scenario::ramp::Ramp;
+use mcp_loadtest::scenario::soak::Soak;
+use mcp_loadtest::scenario::spike::Spike;
+use mcp_loadtest::scenario::sustained::Sustained;
+
+use super::params::{
+    f64_field, has_pattern_config, parse_breaking_point, parse_dur_field, parse_fuzz_payloads,
+    required_dur_alias, required_str, required_u32_alias, u32_field, u64_field,
+};
+use super::patterns::parse_patterns;
+
+/// Dispatch on the scenario `kind` string from a TOML config, plucking
+/// per-kind parameters out of the (untyped) `params` blob.
+pub(crate) fn build_scenario(kind: &str, params: &Value) -> Result<Box<dyn Scenario>> {
+    match kind {
+        "sustained" => {
+            let concurrent = u32_field(params, "concurrent", 10)?;
+            let duration = parse_dur_field(params.get("duration"), Duration::from_secs(60))?;
+            if has_pattern_config(params) {
+                let patterns = parse_patterns(params)?;
+                return Ok(Box::new(PatternScenario::sustained(
+                    concurrent, duration, patterns,
+                )));
+            }
+            let tool = required_str(params, "tool")?;
+            let args = params.get("args").cloned().unwrap_or(json!({}));
+            Ok(Box::new(Sustained {
+                concurrent,
+                duration,
+                tool,
+                args,
+            }))
+        }
+        "deadlock_probe" => {
+            let concurrent = u32_field(params, "concurrent", 20)?;
+            let hang_threshold =
+                parse_dur_field(params.get("hang_threshold"), Duration::from_secs(5))?;
+            let grace_period =
+                parse_dur_field(params.get("grace_period"), Duration::from_secs(10))?;
+            let tool = required_str(params, "tool")?;
+            let args = params.get("args").cloned().unwrap_or(json!({}));
+            Ok(Box::new(DeadlockProbe {
+                concurrent,
+                hang_threshold,
+                grace_period,
+                tool,
+                args,
+            }))
+        }
+        "cold_start" => {
+            let iterations = u32_field(params, "iterations", 5)?;
+            let warmup = params
+                .get("warmup")
+                .and_then(Value::as_bool)
+                .unwrap_or(true);
+            Ok(Box::new(ColdStart { iterations, warmup }))
+        }
+        "spike" => {
+            let baseline_concurrent = u32_field(params, "baseline_concurrent", 5)?;
+            let spike_concurrent = u32_field(params, "spike_concurrent", 50)?;
+            let warmup = parse_dur_field(params.get("warmup"), Duration::from_secs(30))?;
+            let spike_duration =
+                parse_dur_field(params.get("spike_duration"), Duration::from_secs(30))?;
+            let cooldown = parse_dur_field(params.get("cooldown"), Duration::from_secs(30))?;
+            let tool = required_str(params, "tool")?;
+            let args = params.get("args").cloned().unwrap_or(json!({}));
+            Ok(Box::new(Spike {
+                baseline_concurrent,
+                spike_concurrent,
+                warmup,
+                spike_duration,
+                cooldown,
+                tool,
+                args,
+            }))
+        }
+        "ramp" => {
+            let from_concurrent =
+                required_u32_alias(params, &["from_concurrent", "ramp_from", "from"])?;
+            let to_concurrent = required_u32_alias(params, &["to_concurrent", "ramp_to", "to"])?;
+            let step_duration =
+                required_dur_alias(params, &["step_duration", "step", "ramp_step"])?;
+            let step_increment = u32_field(params, "step_increment", 1)?;
+            let tool = required_str(params, "tool")?;
+            let args = params.get("args").cloned().unwrap_or(json!({}));
+            let breaking_point = parse_breaking_point(params, step_duration)?;
+            Ok(Box::new(Ramp {
+                from_concurrent,
+                to_concurrent,
+                step_duration,
+                step_increment,
+                tool,
+                args,
+                breaking_point,
+            }))
+        }
+        "soak" => {
+            let defaults = Soak::default();
+            let concurrent = u32_field(params, "concurrent", defaults.concurrent)?;
+            let duration = parse_dur_field(params.get("duration"), defaults.duration)?;
+            let tool = required_str(params, "tool")?;
+            let args = params.get("args").cloned().unwrap_or(json!({}));
+            let sample_interval =
+                parse_dur_field(params.get("sample_interval"), defaults.sample_interval)?;
+            let latency_drift_ms_per_sec = f64_field(
+                params,
+                "latency_drift_ms_per_sec",
+                defaults.latency_drift_ms_per_sec,
+            )?;
+            Ok(Box::new(Soak {
+                concurrent,
+                duration,
+                tool,
+                args,
+                sample_interval,
+                latency_drift_ms_per_sec,
+            }))
+        }
+        "race_check" => {
+            let concurrent = u32_field(params, "concurrent", 10)?;
+            let tool = required_str(params, "tool")?;
+            let args = params.get("args").cloned().unwrap_or(json!({}));
+            Ok(Box::new(RaceCheck {
+                concurrent,
+                tool,
+                args,
+            }))
+        }
+        "fuzzer" => {
+            let iterations = u32_field(params, "iterations", Fuzzer::default().iterations)?;
+            let seed = u64_field(params, "seed", Fuzzer::default().seed)?;
+            let payloads = parse_fuzz_payloads(params)?;
+            Ok(Box::new(Fuzzer {
+                iterations,
+                seed,
+                payloads,
+            }))
+        }
+        "pattern" => {
+            let concurrent = u32_field(params, "concurrent", 10)?;
+            let duration = parse_dur_field(params.get("duration"), Duration::from_secs(60))?;
+            let patterns = parse_patterns(params)?;
+            Ok(Box::new(PatternScenario::new(
+                concurrent, duration, patterns,
+            )))
+        }
+        other => Err(anyhow!("unknown scenario kind: {other}")),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use mcp_loadtest::config::Config;
+
+    #[test]
+    fn build_scenario_supports_every_validated_kind() {
+        let cases = [
+            (
+                "sustained",
+                json!({"tool": "echo", "duration": "1s"}),
+                "sustained",
+            ),
+            (
+                "deadlock_probe",
+                json!({"tool": "echo", "hang_threshold": "10ms", "grace_period": "10ms"}),
+                "deadlock_probe",
+            ),
+            ("cold_start", json!({"iterations": 1}), "cold_start"),
+            (
+                "spike",
+                json!({"tool": "echo", "warmup": "1ms", "spike_duration": "1ms", "cooldown": "1ms"}),
+                "spike",
+            ),
+            (
+                "ramp",
+                json!({"from_concurrent": 1, "to_concurrent": 2, "step_duration": "1ms", "tool": "echo"}),
+                "ramp",
+            ),
+            (
+                "soak",
+                json!({"tool": "echo", "duration": "1ms", "sample_interval": "1ms"}),
+                "soak",
+            ),
+            ("race_check", json!({"tool": "echo"}), "race_check"),
+            ("fuzzer", json!({"iterations": 1}), "fuzzer"),
+            (
+                "pattern",
+                json!({
+                    "duration": "1s",
+                    "patterns": [{
+                        "name": "read",
+                        "steps": [{ "tool": "echo", "args": { "x": 1 } }]
+                    }]
+                }),
+                "pattern",
+            ),
+        ];
+
+        for (kind, params, expected_name) in cases {
+            let scenario = build_scenario(kind, &params)
+                .unwrap_or_else(|err| panic!("building {kind} failed: {err:#}"));
+            assert_eq!(scenario.name(), expected_name);
+        }
+    }
+
+    #[test]
+    fn sustained_accepts_legacy_tool_call_array() {
+        let scenario = build_scenario(
+            "sustained",
+            &json!({
+                "duration": "1s",
+                "tool_call": [
+                    { "name": "echo", "args": { "x": 1 }, "weight": 1.0 },
+                    { "name": "echo", "args": { "x": 2 }, "weight": 0.5 }
+                ]
+            }),
+        )
+        .expect("legacy tool_call sustained config should build");
+
+        assert_eq!(scenario.name(), "sustained");
+    }
+
+    #[test]
+    fn example_config_builds_runnable_scenario() {
+        let cfg = Config::from_toml_str(&mcp_loadtest::config::example_config())
+            .expect("example config should parse");
+        let scenario = build_scenario(&cfg.scenario.kind, &cfg.scenario.params)
+            .expect("example config scenario should build");
+
+        assert_eq!(scenario.name(), "sustained");
+    }
+
+    #[test]
+    fn fuzzer_rejects_unknown_payload_label() {
+        let err = match build_scenario("fuzzer", &json!({"payloads": ["NotARealPayload"]})) {
+            Ok(_) => panic!("unknown payload must error"),
+            Err(err) => err,
+        };
+        assert!(err.to_string().contains("unknown fuzzer payload"));
+    }
+}
diff --git a/crates/mcp-loadtest-cli/src/cmd_run/params.rs b/crates/mcp-loadtest-cli/src/cmd_run/params.rs
new file mode 100644
index 0000000..88766d2
--- /dev/null
+++ b/crates/mcp-loadtest-cli/src/cmd_run/params.rs
@@ -0,0 +1,218 @@
+//! Generic TOML-param plucking + duration parsing for the scenario builder.
+//!
+//! These are the type-coercion helpers shared by [`super::builder`] and
+//! [`super::patterns`]: pull a typed value out of the untyped `params` blob
+//! produced by the TOML config, erroring with a human-readable
+//! `scenario.<field>` message on the wrong shape.
+
+use std::time::Duration;
+
+use anyhow::{Context, Result, anyhow};
+use serde_json::Value;
+
+use mcp_loadtest::analysis::breaking_point::BreakingPointConfig;
+use mcp_loadtest::scenario::fuzzer::FuzzPayload;
+
+/// Extract a required string field from a `params` blob, erroring with a
+/// human-readable message if missing or the wrong shape.
+pub(crate) fn required_str(params: &Value, key: &str) -> Result<String> {
+    params
+        .get(key)
+        .and_then(Value::as_str)
+        .map(str::to_string)
+        .ok_or_else(|| anyhow!("scenario.{key} (string) is required"))
+}
+
+/// True when the config carries any of the pattern-style keys (`patterns`,
+/// `tool_call`, `tool_calls`) — i.e. it should drive the weighted-pattern
+/// engine instead of the single-`tool` path.
+pub(crate) fn has_pattern_config(params: &Value) -> bool {
+    params.get("patterns").is_some()
+        || params.get("tool_call").is_some()
+        || params.get("tool_calls").is_some()
+}
+
+/// Read an optional `u32` field, falling back to `default` when absent.
+pub(crate) fn u32_field(params: &Value, key: &str, default: u32) -> Result<u32> {
+    match params.get(key) {
+        Some(v) => value_as_u32(v, key),
+        None => Ok(default),
+    }
+}
+
+/// Read a required `u32` field, accepting any of `keys` as aliases.
+pub(crate) fn required_u32_alias(params: &Value, keys: &[&str]) -> Result<u32> {
+    for key in keys {
+        if let Some(v) = params.get(*key) {
+            return value_as_u32(v, key);
+        }
+    }
+    Err(anyhow!("scenario.{} (integer) is required", keys[0]))
+}
+
+fn value_as_u32(v: &Value, key: &str) -> Result<u32> {
+    let n = v
+        .as_u64()
+        .ok_or_else(|| anyhow!("scenario.{key} must be a non-negative integer"))?;
+    u32::try_from(n).with_context(|| format!("scenario.{key} exceeds u32::MAX"))
+}
+
+/// Read an optional `u64` field, falling back to `default` when absent.
+pub(crate) fn u64_field(params: &Value, key: &str, default: u64) -> Result<u64> {
+    match params.get(key) {
+        Some(v) => v
+            .as_u64()
+            .ok_or_else(|| anyhow!("scenario.{key} must be a non-negative integer")),
+        None => Ok(default),
+    }
+}
+
+/// Read an optional `f64` field, falling back to `default` when absent.
+pub(crate) fn f64_field(params: &Value, key: &str, default: f64) -> Result<f64> {
+    match params.get(key) {
+        Some(v) => v
+            .as_f64()
+            .ok_or_else(|| anyhow!("scenario.{key} must be a number")),
+        None => Ok(default),
+    }
+}
+
+/// Read a required duration field, accepting any of `keys` as aliases.
+pub(crate) fn required_dur_alias(params: &Value, keys: &[&str]) -> Result<Duration> {
+    for key in keys {
+        if let Some(v) = params.get(*key) {
+            return parse_dur_field(Some(v), Duration::ZERO)
+                .with_context(|| format!("parsing scenario.{key}"));
+        }
+    }
+    Err(anyhow!(
+        "scenario.{} (duration string like '5s') is required",
+        keys[0]
+    ))
+}
+
+/// Parse the optional `breaking_point` sub-object inside a `ramp` config.
+/// Absent → `None` (no breaking-point detection). `window_secs` defaults to
+/// the ramp's `step_duration` so each step is judged over its own window.
+pub(crate) fn parse_breaking_point(
+    params: &Value,
+    step_duration: Duration,
+) -> Result<Option<BreakingPointConfig>> {
+    let Some(v) = params.get("breaking_point") else {
+        return Ok(None);
+    };
+    let obj = v
+        .as_object()
+        .ok_or_else(|| anyhow!("scenario.breaking_point must be an object"))?;
+    let max_p99_latency = parse_dur_field(
+        obj.get("max_p99_latency")
+            .or_else(|| obj.get("p99_latency")),
+        Duration::MAX,
+    )
+    .context("parsing scenario.breaking_point.max_p99_latency")?;
+    let max_error_rate = match obj.get("max_error_rate").or_else(|| obj.get("error_rate")) {
+        Some(v) => v
+            .as_f64()
+            .ok_or_else(|| anyhow!("scenario.breaking_point.max_error_rate must be a number"))?,
+        None => f64::INFINITY,
+    };
+    let window_secs = match obj.get("window_secs") {
+        Some(v) => v
+            .as_f64()
+            .ok_or_else(|| anyhow!("scenario.breaking_point.window_secs must be a number"))?,
+        None => step_duration.as_secs_f64(),
+    };
+    Ok(Some(BreakingPointConfig {
+        max_p99_latency,
+        max_error_rate,
+        window_secs,
+    }))
+}
+
+/// Parse the optional `payloads` array of a `fuzzer` config into the
+/// corresponding [`FuzzPayload`] variants (case-insensitive label match).
+pub(crate) fn parse_fuzz_payloads(params: &Value) -> Result<Vec<FuzzPayload>> {
+    let Some(v) = params.get("payloads") else {
+        return Ok(Vec::new());
+    };
+    let arr = v
+        .as_array()
+        .ok_or_else(|| anyhow!("scenario.payloads must be an array of strings"))?;
+    let all = FuzzPayload::all();
+    let mut out = Vec::with_capacity(arr.len());
+    for item in arr {
+        let label = item
+            .as_str()
+            .ok_or_else(|| anyhow!("scenario.payloads entries must be strings"))?;
+        let payload = all
+            .iter()
+            .copied()
+            .find(|p| p.label() == label || p.label().eq_ignore_ascii_case(label))
+            .ok_or_else(|| anyhow!("unknown fuzzer payload label: {label}"))?;
+        out.push(payload);
+    }
+    Ok(out)
+}
+
+/// Parse a `params.<field>` slot as a humantime duration string, falling
+/// back to `default` if the slot is absent and erroring on wrong types.
+pub(crate) fn parse_dur_field(v: Option<&Value>, default: Duration) -> Result<Duration> {
+    match v {
+        Some(Value::String(s)) => parse_dur_str(s),
+        None => Ok(default),
+        _ => Err(anyhow!("duration must be a string like '60s'")),
+    }
+}
+
+/// Parse a CLI / TOML duration string (`"60s"`, `"500ms"`, ...) into a
+/// [`std::time::Duration`]. Wraps `humantime::parse_duration` with context.
+pub fn parse_dur_str(s: &str) -> Result<Duration> {
+    humantime::parse_duration(s).with_context(|| format!("parsing duration {s:?}"))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use serde_json::json;
+
+    #[test]
+    fn required_str_errors_when_missing() {
+        let err = required_str(&json!({}), "tool").unwrap_err();
+        assert!(err.to_string().contains("scenario.tool"));
+    }
+
+    #[test]
+    fn u32_field_falls_back_to_default() {
+        assert_eq!(u32_field(&json!({}), "concurrent", 7).unwrap(), 7);
+        assert_eq!(
+            u32_field(&json!({"concurrent": 3}), "concurrent", 7).unwrap(),
+            3
+        );
+    }
+
+    #[test]
+    fn value_as_u32_rejects_overflow_and_negatives() {
+        assert!(value_as_u32(&json!(-1), "x").is_err());
+        assert!(value_as_u32(&json!(u64::from(u32::MAX) + 1), "x").is_err());
+    }
+
+    #[test]
+    fn parse_dur_field_defaults_and_type_checks() {
+        assert_eq!(
+            parse_dur_field(None, Duration::from_secs(5)).unwrap(),
+            Duration::from_secs(5)
+        );
+        assert_eq!(
+            parse_dur_field(Some(&json!("250ms")), Duration::ZERO).unwrap(),
+            Duration::from_millis(250)
+        );
+        assert!(parse_dur_field(Some(&json!(5)), Duration::ZERO).is_err());
+    }
+
+    #[test]
+    fn required_dur_alias_accepts_any_alias() {
+        let p = json!({"ramp_step": "1s"});
+        let d = required_dur_alias(&p, &["step_duration", "step", "ramp_step"]).unwrap();
+        assert_eq!(d, Duration::from_secs(1));
+    }
+}
diff --git a/crates/mcp-loadtest-cli/src/cmd_run/patterns.rs b/crates/mcp-loadtest-cli/src/cmd_run/patterns.rs
new file mode 100644
index 0000000..f116673
--- /dev/null
+++ b/crates/mcp-loadtest-cli/src/cmd_run/patterns.rs
@@ -0,0 +1,195 @@
+//! Weighted multi-step pattern-config parsing.
+//!
+//! Turns the `patterns` / `tool_call` / single-`tool` config shapes into the
+//! [`Pattern`] model the pattern engine drives. Legacy `tool_call` arrays are
+//! still accepted so older configs keep working (see DESIGN.md §11).
+
+use std::time::Duration;
+
+use anyhow::{Context, Result, anyhow};
+use serde_json::{Value, json};
+
+use mcp_loadtest::scenario::pattern::{ErrorBehavior, Pattern, PatternStep};
+
+use super::params::parse_dur_field;
+
+/// Resolve the pattern set from a config blob: explicit `patterns`, legacy
+/// `tool_call[s]`, or a single `tool` + `args` collapsed into one pattern.
+pub(crate) fn parse_patterns(params: &Value) -> Result<Vec<Pattern>> {
+    if let Some(v) = params.get("patterns") {
+        return parse_pattern_array(v);
+    }
+    if let Some(v) = params.get("tool_call").or_else(|| params.get("tool_calls")) {
+        return parse_tool_call_array(v);
+    }
+    if let Some(tool) = params.get("tool").and_then(Value::as_str) {
+        let args = params.get("args").cloned().unwrap_or(json!({}));
+        return Ok(vec![Pattern::single_call(tool, args)]);
+    }
+    Err(anyhow!(
+        "scenario.patterns is required (or provide scenario.tool + scenario.args for a single-step pattern)"
+    ))
+}
+
+fn parse_pattern_array(v: &Value) -> Result<Vec<Pattern>> {
+    let arr = v
+        .as_array()
+        .ok_or_else(|| anyhow!("scenario.patterns must be an array"))?;
+    if arr.is_empty() {
+        return Err(anyhow!("scenario.patterns must not be empty"));
+    }
+
+    let mut patterns = Vec::with_capacity(arr.len());
+    for (idx, item) in arr.iter().enumerate() {
+        let obj = item
+            .as_object()
+            .ok_or_else(|| anyhow!("scenario.patterns[{idx}] must be an object"))?;
+        let name = obj
+            .get("name")
+            .and_then(Value::as_str)
+            .map(str::to_string)
+            .unwrap_or_else(|| format!("pattern-{}", idx + 1));
+        let weight = match obj.get("weight") {
+            Some(v) => v
+                .as_f64()
+                .ok_or_else(|| anyhow!("scenario.patterns[{idx}].weight must be a number"))?,
+            None => 1.0,
+        };
+        let think_time = parse_dur_field(obj.get("think_time"), Duration::ZERO)
+            .with_context(|| format!("parsing scenario.patterns[{idx}].think_time"))?;
+        let on_step_error = parse_error_behavior(obj.get("on_step_error"), idx)?;
+        let steps = match obj.get("steps") {
+            Some(steps) => parse_steps(steps, &format!("scenario.patterns[{idx}].steps"))?,
+            None => {
+                if obj.get("tool").is_some() {
+                    vec![parse_step(
+                        item,
+                        &format!("scenario.patterns[{idx}]"),
+                        "tool",
+                    )?]
+                } else {
+                    return Err(anyhow!("scenario.patterns[{idx}].steps is required"));
+                }
+            }
+        };
+        patterns.push(Pattern {
+            name,
+            weight,
+            think_time,
+            on_step_error,
+            steps,
+        });
+    }
+    Ok(patterns)
+}
+
+fn parse_tool_call_array(v: &Value) -> Result<Vec<Pattern>> {
+    let arr = v
+        .as_array()
+        .ok_or_else(|| anyhow!("scenario.tool_call must be an array"))?;
+    if arr.is_empty() {
+        return Err(anyhow!("scenario.tool_call must not be empty"));
+    }
+
+    let mut patterns = Vec::with_capacity(arr.len());
+    for (idx, item) in arr.iter().enumerate() {
+        let step = parse_step(item, &format!("scenario.tool_call[{idx}]"), "name")?;
+        let weight = item.get("weight").and_then(Value::as_f64).unwrap_or(1.0);
+        patterns.push(Pattern {
+            name: format!("tool:{}", step.tool),
+            weight,
+            think_time: Duration::ZERO,
+            on_step_error: ErrorBehavior::Continue,
+            steps: vec![step],
+        });
+    }
+    Ok(patterns)
+}
+
+fn parse_steps(v: &Value, path: &str) -> Result<Vec<PatternStep>> {
+    let arr = v
+        .as_array()
+        .ok_or_else(|| anyhow!("{path} must be an array"))?;
+    if arr.is_empty() {
+        return Err(anyhow!("{path} must not be empty"));
+    }
+    arr.iter()
+        .enumerate()
+        .map(|(idx, item)| parse_step(item, &format!("{path}[{idx}]"), "tool"))
+        .collect()
+}
+
+fn parse_step(v: &Value, path: &str, preferred_tool_key: &str) -> Result<PatternStep> {
+    let obj = v
+        .as_object()
+        .ok_or_else(|| anyhow!("{path} must be an object"))?;
+    let fallback_key = if preferred_tool_key == "tool" {
+        "name"
+    } else {
+        "tool"
+    };
+    let tool = obj
+        .get(preferred_tool_key)
+        .or_else(|| obj.get(fallback_key))
+        .and_then(Value::as_str)
+        .map(str::to_string)
+        .ok_or_else(|| anyhow!("{path}.{preferred_tool_key} (string) is required"))?;
+    let args = obj.get("args").cloned().unwrap_or(json!({}));
+    Ok(PatternStep { tool, args })
+}
+
+fn parse_error_behavior(v: Option<&Value>, idx: usize) -> Result<ErrorBehavior> {
+    match v {
+        None => Ok(ErrorBehavior::Continue),
+        Some(Value::String(s)) if s == "continue" => Ok(ErrorBehavior::Continue),
+        Some(Value::String(s)) if s == "abort" => Ok(ErrorBehavior::Abort),
+        Some(Value::String(s)) => Err(anyhow!(
+            "scenario.patterns[{idx}].on_step_error must be `continue` or `abort`, got `{s}`"
+        )),
+        Some(_) => Err(anyhow!(
+            "scenario.patterns[{idx}].on_step_error must be a string"
+        )),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn single_tool_collapses_to_one_pattern() {
+        let p = parse_patterns(&json!({ "tool": "echo", "args": { "x": 1 } })).unwrap();
+        assert_eq!(p.len(), 1);
+        assert_eq!(p[0].steps.len(), 1);
+        assert_eq!(p[0].steps[0].tool, "echo");
+    }
+
+    #[test]
+    fn legacy_tool_call_array_maps_each_entry() {
+        let p = parse_patterns(&json!({
+            "tool_call": [
+                { "name": "a", "weight": 2.0 },
+                { "name": "b" }
+            ]
+        }))
+        .unwrap();
+        assert_eq!(p.len(), 2);
+        assert_eq!(p[0].weight, 2.0);
+        assert_eq!(p[1].name, "tool:b");
+    }
+
+    #[test]
+    fn empty_patterns_array_is_rejected() {
+        assert!(parse_patterns(&json!({ "patterns": [] })).is_err());
+    }
+
+    #[test]
+    fn invalid_on_step_error_is_rejected() {
+        let err = parse_patterns(&json!({
+            "patterns": [{ "name": "p", "on_step_error": "explode",
+                           "steps": [{ "tool": "echo" }] }]
+        }))
+        .unwrap_err();
+        assert!(err.to_string().contains("on_step_error"));
+    }
+}
diff --git a/crates/mcp-loadtest-cli/src/main.rs b/crates/mcp-loadtest-cli/src/main.rs
index dc2507a..1b9bdf5 100644
--- a/crates/mcp-loadtest-cli/src/main.rs
+++ b/crates/mcp-loadtest-cli/src/main.rs
@@ -128,9 +128,19 @@ async fn main() -> Result<()> {
             Ok(())
         }
         Cmd::ListScenarios => {
-            println!("sustained        — concurrent-load workload");
+            println!(
+                "sustained        — steady workload; also accepts weighted tool_call/pattern configs"
+            );
             println!("deadlock_probe   — Vibe-Trading-bug-class detector");
-            println!("cold_start       — startup-perf benchmark (placeholder, activates in M3+)");
+            println!(
+                "cold_start       — startup-perf placeholder (needs session factory for real cold-start timing)"
+            );
+            println!("spike            — baseline → burst → cooldown workload");
+            println!("ramp             — stepped concurrency + optional breaking-point detection");
+            println!("soak             — long steady run with periodic samples and drift notes");
+            println!("race_check       — identical calls + response-divergence detection");
+            println!("fuzzer           — malformed payload probe");
+            println!("pattern          — weighted multi-step tool-call sequences");
             Ok(())
         }
         Cmd::Run { config } => cmd_run::run_from_config(&config).await,
diff --git a/crates/mcp-loadtest-cli/tests/cmd_run.rs b/crates/mcp-loadtest-cli/tests/cmd_run.rs
new file mode 100644
index 0000000..270c9f3
--- /dev/null
+++ b/crates/mcp-loadtest-cli/tests/cmd_run.rs
@@ -0,0 +1,45 @@
+//! Integration tests for the `run` subcommand's public surface.
+//!
+//! The `cmd_run` refactor split the module into private submodules
+//! (`builder` / `params` / `patterns`). This guards the two public items
+//! `main.rs` depends on across the bin↔lib boundary: the re-exported
+//! [`cmd_run::parse_dur_str`] helper (shared with `deadlock-probe` / `cross`)
+//! and the [`cmd_run::run_from_config`] entry point.
+
+use std::path::Path;
+use std::time::Duration;
+
+use mcp_loadtest_cli::cmd_run;
+
+#[test]
+fn parse_dur_str_reexport_is_reachable_and_parses() {
+    assert_eq!(
+        cmd_run::parse_dur_str("250ms").expect("250ms parses"),
+        Duration::from_millis(250)
+    );
+    assert_eq!(
+        cmd_run::parse_dur_str("2s").expect("2s parses"),
+        Duration::from_secs(2)
+    );
+    assert!(
+        cmd_run::parse_dur_str("not-a-duration").is_err(),
+        "garbage duration should error"
+    );
+}
+
+#[test]
+fn run_from_config_errors_on_missing_file() {
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()
+        .expect("build current-thread runtime");
+
+    let result = rt.block_on(cmd_run::run_from_config(Path::new(
+        "definitely/does/not/exist.toml",
+    )));
+
+    assert!(
+        result.is_err(),
+        "a missing config path must surface an error, got: {result:?}"
+    );
+}
diff --git a/crates/mcp-loadtest/src/config/example.rs b/crates/mcp-loadtest/src/config/example.rs
index 43c7955..87a6b32 100644
--- a/crates/mcp-loadtest/src/config/example.rs
+++ b/crates/mcp-loadtest/src/config/example.rs
@@ -16,7 +16,7 @@ pub fn example_config() -> String {
 command = "python"
 # CLI args passed to `command`. Empty list = none.
 args = ["-m", "my_mcp"]
-# Transport: "stdio" (M1+, supported), "sse" / "ws" (M4, parse-only in M3).
+# Transport: "stdio", "http", "sse", or "ws".
 transport = "stdio"
 # Time budget for the `initialize` round-trip. humantime: "10s", "500ms", "1m".
 startup_timeout = "10s"
@@ -28,7 +28,8 @@ startup_timeout = "10s"
 LOG_LEVEL = "warn"
 
 [scenario]
-# Scenario kind: one of "sustained", "deadlock_probe", "cold_start".
+# Scenario kind: one of "sustained", "deadlock_probe", "cold_start",
+# "spike", "ramp", "soak", "race_check", "fuzzer", or "pattern".
 type = "sustained"
 # How long the scenario drives traffic.
 duration = "60s"
@@ -71,7 +72,7 @@ memory_growth_mb = 50.0
 [output]
 # Where per-run dirs are created. Each run gets its own timestamped subdir.
 report_dir = "./runs"
-# Output formats to emit. Any subset of "terminal", "markdown", "json".
+# Output formats to emit. Any subset of "terminal", "markdown", "json", "html".
 formats = ["terminal", "markdown", "json"]
 "#
     .to_string()
diff --git a/crates/mcp-loadtest/src/config/validate.rs b/crates/mcp-loadtest/src/config/validate.rs
index ecd9658..f54aaea 100644
--- a/crates/mcp-loadtest/src/config/validate.rs
+++ b/crates/mcp-loadtest/src/config/validate.rs
@@ -4,10 +4,7 @@
 
 use super::{Config, ConfigError};
 
-/// Transports recognized by the parser + runtime. `stdio` (M1+), `http` and
-/// `sse` (M4); `ws` reserved for a future milestone — parses cleanly so
-/// config files can be drafted ahead of time, but `Run` rejects it at
-/// construction.
+/// Transports recognized by the parser + runtime.
 pub(super) const KNOWN_TRANSPORTS: &[&str] = &["stdio", "http", "sse", "ws"];
 
 /// Scenario kinds the parser knows about. Mirrors §8 of DESIGN.md and the
diff --git a/crates/mcp-loadtest/src/metrics/mod.rs b/crates/mcp-loadtest/src/metrics/mod.rs
index a5dd387..5868470 100644
--- a/crates/mcp-loadtest/src/metrics/mod.rs
+++ b/crates/mcp-loadtest/src/metrics/mod.rs
@@ -10,7 +10,7 @@
 //! `record` < 50µs (DESIGN.md §19); typical cost is dominated by hdrhistogram
 //! bucket math (a few hundred ns).
 //!
-//! Module layout: [`types`] (public value types), [`histogram`] (sharded
+//! Module layout: `types` (public value types), [`histogram`] (sharded
 //! wrapper), [`process`] (RSS/CPU sampling), [`throughput`] (atomic counters).
 
 pub mod histogram;
diff --git a/crates/mcp-loadtest/src/scenario/fuzzer.rs b/crates/mcp-loadtest/src/scenario/fuzzer.rs
index 5942c39..701bea5 100644
--- a/crates/mcp-loadtest/src/scenario/fuzzer.rs
+++ b/crates/mcp-loadtest/src/scenario/fuzzer.rs
@@ -31,11 +31,13 @@
 //! [`crate::analysis::fuzz_report`]):
 //!
 //! - `Ok(_)` → [`FuzzClass::Accepted`] (server tolerated the input — interesting)
-//! - [`SessionError::Server`] with code in `-32600..=-32603`
+//! - [`crate::session::SessionError::Server`] with code in `-32600..=-32603`
 //!   → [`FuzzClass::ProtocolError`]
-//! - [`SessionError::Server`] with other codes → [`FuzzClass::ServerError`]
-//! - [`SessionError::Json`] / [`SessionError::IdMismatch`] → [`FuzzClass::ParseError`]
-//! - [`SessionError::Transport`] / [`SessionError::Io`] → [`FuzzClass::Disconnected`]
+//! - [`crate::session::SessionError::Server`] with other codes → [`FuzzClass::ServerError`]
+//! - [`crate::session::SessionError::Json`] / [`crate::session::SessionError::IdMismatch`]
+//!   → [`FuzzClass::ParseError`]
+//! - [`crate::session::SessionError::Transport`] / [`crate::session::SessionError::Io`]
+//!   → [`FuzzClass::Disconnected`]
 //! - [`HangOutcome::Deadlock`] from [`hang_detect`] → [`FuzzClass::Deadlock`]
 //!
 //! `ProtocolError` is the **expected, healthy** outcome for most malformed
@@ -44,9 +46,9 @@
 //!
 //! ## Module layout
 //!
-//! - [`payloads`] — the [`FuzzPayload`] enum + its impls + the giant/nested
+//! - `payloads` — the [`FuzzPayload`] enum + its impls + the giant/nested
 //!   payload `LazyLock` constants.
-//! - [`classify`] — response classification + report-note rendering helpers.
+//! - `classify` — response classification + report-note rendering helpers.
 //! - this file — the [`Fuzzer`] struct and [`Scenario`] impl.
 
 mod classify;
diff --git a/crates/mcp-loadtest/src/scenario/pattern.rs b/crates/mcp-loadtest/src/scenario/pattern.rs
index b24e2b7..97f0012 100644
--- a/crates/mcp-loadtest/src/scenario/pattern.rs
+++ b/crates/mcp-loadtest/src/scenario/pattern.rs
@@ -19,14 +19,15 @@
 
 use std::time::Duration;
 
+use async_trait::async_trait;
 use rand::Rng;
 use rand::seq::IndexedRandom;
-use serde_json::Value;
+use serde_json::{Value, json};
 use tokio::time::sleep;
 
 use crate::Session;
 use crate::metrics::CallOutcome;
-use crate::scenario::RunContext;
+use crate::scenario::{RunContext, Scenario, ScenarioOutcome};
 
 /// What to do when a pattern step errors out.
 #[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
@@ -71,6 +72,117 @@ pub struct Pattern {
     pub steps: Vec<PatternStep>,
 }
 
+/// Scenario wrapper around the pattern engine.
+///
+/// `Pattern` itself is just a reusable sequence of tool calls. This wrapper
+/// gives TOML/CLI users a first-class `scenario.type = "pattern"` entry point
+/// and also lets `sustained` configs with multiple weighted tool calls reuse
+/// the same executor without changing the legacy [`Sustained`] struct.
+///
+/// [`Sustained`]: crate::scenario::sustained::Sustained
+pub struct PatternScenario {
+    /// Name surfaced in reports. Use `"pattern"` for explicit pattern runs
+    /// and `"sustained"` when adapting legacy sustained configs with
+    /// `[[scenario.tool_call]]` / `patterns`.
+    pub scenario_name: &'static str,
+    /// Declared concurrency target. See `sustained` module docs: currently
+    /// informational because a single `Session` serializes calls.
+    pub concurrent: u32,
+    /// Total time to keep selecting and driving patterns.
+    pub duration: Duration,
+    /// Weighted pattern set.
+    pub patterns: Vec<Pattern>,
+}
+
+impl PatternScenario {
+    /// Construct an explicit `pattern` scenario.
+    pub fn new(concurrent: u32, duration: Duration, patterns: Vec<Pattern>) -> Self {
+        Self {
+            scenario_name: "pattern",
+            concurrent,
+            duration,
+            patterns,
+        }
+    }
+
+    /// Adapt a sustained workload that uses weighted patterns/tool calls while
+    /// keeping report output labelled as `sustained`.
+    pub fn sustained(concurrent: u32, duration: Duration, patterns: Vec<Pattern>) -> Self {
+        Self {
+            scenario_name: "sustained",
+            concurrent,
+            duration,
+            patterns,
+        }
+    }
+}
+
+#[async_trait]
+impl Scenario for PatternScenario {
+    async fn drive(&self, session: &mut Session, ctx: &RunContext) -> ScenarioOutcome {
+        crate::scenario::sustained::run_patterns(
+            self.concurrent,
+            self.duration,
+            &self.patterns,
+            session,
+            ctx,
+        )
+        .await
+    }
+
+    fn config_schema(&self) -> Value {
+        json!({
+            "type": "object",
+            "title": "Pattern",
+            "description": "Weighted random multi-step tool-call patterns.",
+            "properties": {
+                "concurrent": {
+                    "type": "integer",
+                    "minimum": 1,
+                    "default": 10
+                },
+                "duration": {
+                    "type": "string",
+                    "default": "60s"
+                },
+                "patterns": {
+                    "type": "array",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "name": { "type": "string" },
+                            "weight": { "type": "number", "default": 1.0 },
+                            "think_time": { "type": "string", "default": "0ms" },
+                            "on_step_error": {
+                                "type": "string",
+                                "enum": ["continue", "abort"],
+                                "default": "continue"
+                            },
+                            "steps": {
+                                "type": "array",
+                                "items": {
+                                    "type": "object",
+                                    "properties": {
+                                        "tool": { "type": "string" },
+                                        "args": { "type": "object" }
+                                    },
+                                    "required": ["tool"]
+                                }
+                            }
+                        },
+                        "required": ["steps"]
+                    }
+                }
+            },
+            "required": ["patterns"]
+        })
+    }
+
+    fn name(&self) -> &'static str {
+        self.scenario_name
+    }
+}
+
 impl Pattern {
     /// Convenience: a single-step pattern with weight 1.0 and no think-time.
     /// Useful for the legacy single-tool / single-args [`Sustained`] config
@@ -218,6 +330,23 @@ mod tests {
         assert_eq!(p.name, "single:echo");
     }
 
+    #[test]
+    fn pattern_scenario_name_can_preserve_sustained_label() {
+        let s = PatternScenario::sustained(
+            1,
+            Duration::from_secs(1),
+            vec![Pattern::single_call("echo", json!({}))],
+        );
+        assert_eq!(s.name(), "sustained");
+
+        let explicit = PatternScenario::new(
+            1,
+            Duration::from_secs(1),
+            vec![Pattern::single_call("echo", json!({}))],
+        );
+        assert_eq!(explicit.name(), "pattern");
+    }
+
     #[test]
     fn pick_returns_none_on_empty_slice() {
         let mut rng = StdRng::seed_from_u64(0);
diff --git a/crates/mcp-loadtest/src/scenario/soak.rs b/crates/mcp-loadtest/src/scenario/soak.rs
index bd89579..e292fc7 100644
--- a/crates/mcp-loadtest/src/scenario/soak.rs
+++ b/crates/mcp-loadtest/src/scenario/soak.rs
@@ -77,7 +77,7 @@ pub struct Soak {
     pub sample_interval: Duration,
     /// Mean-latency drift (milliseconds-per-second) above which the soak
     /// flags the run as drifting. Defaults to
-    /// [`DEFAULT_LATENCY_DRIFT_MS_PER_SEC`].
+    /// `DEFAULT_LATENCY_DRIFT_MS_PER_SEC`.
     ///
     /// Note: this is **not** an RSS-based leak signal — that lives in the
     /// run orchestrator's RSS regression on `ProcessStats::samples`. This
diff --git a/crates/mcp-loadtest/src/scenario/sustained.rs b/crates/mcp-loadtest/src/scenario/sustained.rs
index 56d22b9..fe2f479 100644
--- a/crates/mcp-loadtest/src/scenario/sustained.rs
+++ b/crates/mcp-loadtest/src/scenario/sustained.rs
@@ -10,7 +10,7 @@
 //! one-element `Pattern::single_call` list. The legacy public surface is
 //! unchanged: `Sustained { concurrent, duration, tool, args }` still
 //! constructs and runs identically. To configure multiple weighted patterns,
-//! use [`run_patterns`] (a free function that drives the loop against a
+//! use `run_patterns` (a free function that drives the loop against a
 //! provided pattern list and shares the [`Sustained`]'s `concurrent` +
 //! `duration` knobs).
 //!
@@ -52,7 +52,7 @@ use crate::scenario::{RunContext, Scenario, ScenarioOutcome};
 /// `ctx.cancel_token` fires.
 ///
 /// To run weighted multi-step patterns instead of a single tool, see
-/// [`run_patterns`].
+/// `run_patterns`.
 pub struct Sustained {
     /// Declared concurrency target. See module docs — this is informational
     /// in M2; the loop is single-threaded against one [`Session`].
diff --git a/docs/examples/custom-scenario.md b/docs/examples/custom-scenario.md
index c87ea68..f56199c 100644
--- a/docs/examples/custom-scenario.md
+++ b/docs/examples/custom-scenario.md
@@ -204,8 +204,9 @@ pub mod cache_warmup;   // ← new
 
 ## Step 3. Wire it into the CLI
 
-`crates/mcp-loadtest-cli/src/main.rs` has a `build_scenario(kind, params)`
-dispatch. Add a branch:
+`crates/mcp-loadtest-cli/src/cmd_run/builder.rs` has a
+`build_scenario(kind, params)` dispatch (param helpers like `required_str` /
+`parse_dur_field` live in the sibling `cmd_run/params.rs`). Add a branch:
 
 ```rust
 "cache_warmup" => {

From ba0faaea270ffea2072de8f316446a38da2a4d71 Mon Sep 17 00:00:00 2001
From: Teerapat-Vatpitak <teerapatv.c@gmail.com>
Date: Sat, 16 May 2026 18:15:50 +0700
Subject: [PATCH 2/8] feat(compare): configurable regression thresholds

ADR 0009 deferred operator-configurable thresholds until a user asked;
this lands them. p99 / error-rate / deadlock-zero-tolerance were
hard-coded constants shared by the compare CLI and the compare_runs
MCP tool, so a user wanting a different CI budget had to fork.

Add analysis::regression::RegressionThresholds whose Default
reproduces the historical 10% / 0.5pp / zero-tolerance policy exactly,
so existing gates are unaffected unless they opt in. Surface overrides
where operators work: compare gains --max-p99-regression-pct /
--max-error-rate-regression-pp / --allow-deadlock-increase; the
compare_runs tool gains the matching optional args. CLI and MCP stay
in lockstep by construction (same struct).

ADR 0009 updated (open question resolved); CHANGELOG noted.
ci-checks green: fmt, clippy -D warnings, --locked tests, doc.
---
 CHANGELOG.md                                  |  1 +
 crates/mcp-loadtest-cli/src/cmd_compare.rs    | 96 ++++++++++++++-----
 .../mcp-loadtest-cli/src/cmd_compare/diff.rs  | 45 +++++----
 .../mcp-loadtest-cli/src/cmd_compare/types.rs |  4 +-
 crates/mcp-loadtest-cli/src/main.rs           | 19 +++-
 crates/mcp-loadtest-cli/tests/cmd_compare.rs  | 89 +++++++++++++++--
 .../mcp-loadtest/src/analysis/regression.rs   | 43 +++++++++
 .../src/serve/tools/compare_runs.rs           | 65 +++++++++++--
 .../adr/0009-regression-threshold-defaults.md | 36 +++++--
 9 files changed, 335 insertions(+), 63 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 549b13d..5bbf2ae 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Configurable regression thresholds (ADR 0009 follow-up): `compare` gains `--max-p99-regression-pct` / `--max-error-rate-regression-pp` / `--allow-deadlock-increase`, and the `compare_runs` MCP tool gains the matching optional args. Backed by a shared `analysis::regression::RegressionThresholds` whose `Default` reproduces the historical 10% p99 / 0.5pp / deadlock-zero-tolerance policy, so existing CI gates are unaffected unless they opt in.
 - Project scaffolding: workspace layout, Cargo config, CLAUDE.md hierarchy, slash commands, CI workflow.
 - Design document covering motivation, types, algorithms, test matrix, milestones (DESIGN.md, 20 sections).
 - Project-structure document for AI-assisted development workflow (PROJECT-STRUCTURE.md).
diff --git a/crates/mcp-loadtest-cli/src/cmd_compare.rs b/crates/mcp-loadtest-cli/src/cmd_compare.rs
index f78d379..aa709bb 100644
--- a/crates/mcp-loadtest-cli/src/cmd_compare.rs
+++ b/crates/mcp-loadtest-cli/src/cmd_compare.rs
@@ -7,13 +7,16 @@
 //! ## Regression rules
 //!
 //! A metric is flagged as a regression when **any** of these hold:
-//! - latency p99 grew by more than [`P99_REGRESSION_PCT`] percent;
-//! - error rate (errors / total) grew by more than [`ERROR_RATE_REGRESSION_PP`]
-//!   percentage points;
-//! - deadlock count went up at all (any new deadlock is a regression).
+//! - latency p99 grew by more than `thresholds.p99_pct` percent;
+//! - error rate (errors / total) grew by more than
+//!   `thresholds.error_rate_pp` percentage points;
+//! - deadlock count went up at all, when `deadlock_zero_tolerance` is set.
 //!
-//! Improvements (latency dropped > 10%, errors fewer, deadlocks fewer) get
-//! the up arrow; everything else is informational.
+//! [`RegressionThresholds::default`] reproduces the historical policy
+//! (10% p99 / 0.5pp error rate / deadlock zero-tolerance); the `compare`
+//! CLI flags and the `compare_runs` MCP tool args override it (ADR 0009).
+//! Improvements (the same deltas in the other direction) get the up arrow;
+//! everything else is informational.
 //!
 //! ## Why we re-deserialize the JSON shape locally
 //!
@@ -36,7 +39,8 @@ mod types;
 pub use diff::build_report;
 pub use types::{
     ComparableErrors, ComparableLatency, ComparableReport, ComparableThroughput, CompareReport,
-    Direction, ERROR_RATE_REGRESSION_PP, MetricDiff, P99_REGRESSION_PCT, ScenarioView,
+    Direction, ERROR_RATE_REGRESSION_PP, MetricDiff, P99_REGRESSION_PCT, RegressionThresholds,
+    ScenarioView,
 };
 
 /// Output format selector.
@@ -63,12 +67,18 @@ impl CompareFormat {
 
 // ---- entry point --------------------------------------------------------
 
-/// Run the `compare` subcommand. Reads two JSON files, builds a diff, and
-/// renders it in the requested format to stdout.
-pub fn run(baseline: &Path, current: &Path, format: CompareFormat) -> Result<String> {
+/// Run the `compare` subcommand. Reads two JSON files, builds a diff using
+/// `thresholds` (pass [`RegressionThresholds::default`] for the historical
+/// policy), and renders it in the requested format to stdout.
+pub fn run(
+    baseline: &Path,
+    current: &Path,
+    format: CompareFormat,
+    thresholds: &RegressionThresholds,
+) -> Result<String> {
     let base = read_report(baseline)?;
     let cur = read_report(current)?;
-    let report = build_report(&base, &cur);
+    let report = build_report(&base, &cur, thresholds);
     let rendered = match format {
         CompareFormat::Markdown => render::render_markdown(&report, &base, &cur),
         CompareFormat::Json => {
@@ -138,28 +148,43 @@ mod tests {
 
     #[test]
     fn classify_p99_flags_regression_above_threshold() {
-        // 100ms → 115ms = 15% growth, above 10% threshold.
-        assert_eq!(classify_p99(100.0, 115.0), Direction::Regressed);
+        // 100ms → 115ms = 15% growth, above the 10% default threshold.
+        assert_eq!(
+            classify_p99(100.0, 115.0, P99_REGRESSION_PCT),
+            Direction::Regressed
+        );
     }
 
     #[test]
     fn classify_p99_neutral_below_threshold() {
         // 100ms → 105ms = 5% growth, below threshold.
-        assert_eq!(classify_p99(100.0, 105.0), Direction::Neutral);
+        assert_eq!(
+            classify_p99(100.0, 105.0, P99_REGRESSION_PCT),
+            Direction::Neutral
+        );
     }
 
     #[test]
     fn classify_p99_flags_improvement() {
         // 100ms → 80ms = 20% drop.
-        assert_eq!(classify_p99(100.0, 80.0), Direction::Improved);
+        assert_eq!(
+            classify_p99(100.0, 80.0, P99_REGRESSION_PCT),
+            Direction::Improved
+        );
     }
 
     #[test]
     fn classify_error_rate_pp_threshold() {
         // 0.1pp jump — neutral.
-        assert_eq!(classify_error_rate(0.0, 0.1), Direction::Neutral);
+        assert_eq!(
+            classify_error_rate(0.0, 0.1, ERROR_RATE_REGRESSION_PP),
+            Direction::Neutral
+        );
         // 1.0pp jump — regression.
-        assert_eq!(classify_error_rate(0.0, 1.0), Direction::Regressed);
+        assert_eq!(
+            classify_error_rate(0.0, 1.0, ERROR_RATE_REGRESSION_PP),
+            Direction::Regressed
+        );
     }
 
     #[test]
@@ -169,7 +194,7 @@ mod tests {
         cur.run_id = "01CUR".into();
         cur.latency_ms.p99 = 200.0; // 100% growth — regression.
 
-        let cmp = build_report(&base, &cur);
+        let cmp = build_report(&base, &cur, &RegressionThresholds::default());
         assert!(cmp.has_regression);
         assert!(
             cmp.regressions.iter().any(|m| m.metric == "latency_p99_ms"),
@@ -184,7 +209,7 @@ mod tests {
         let mut cur = baseline_report();
         cur.deadlock_count = 1;
 
-        let cmp = build_report(&base, &cur);
+        let cmp = build_report(&base, &cur, &RegressionThresholds::default());
         assert!(cmp.has_regression);
         assert!(cmp.regressions.iter().any(|m| m.metric == "deadlock_count"));
     }
@@ -196,7 +221,7 @@ mod tests {
         cur.errors.total = 50; // 5% errors over 1000 requests = +5pp regression.
         cur.throughput.successful_requests = 950;
 
-        let cmp = build_report(&base, &cur);
+        let cmp = build_report(&base, &cur, &RegressionThresholds::default());
         assert!(cmp.has_regression);
         assert!(cmp.regressions.iter().any(|m| m.metric == "error_rate_pct"));
     }
@@ -205,18 +230,45 @@ mod tests {
     fn build_report_no_regression_when_clean() {
         let base = baseline_report();
         let cur = baseline_report();
-        let cmp = build_report(&base, &cur);
+        let cmp = build_report(&base, &cur, &RegressionThresholds::default());
         assert!(!cmp.has_regression);
         assert!(cmp.regressions.is_empty());
     }
 
+    #[test]
+    fn custom_thresholds_change_the_pass_fail_bucket() {
+        // Same inputs, different policy → different verdict (B1 contract).
+        let base = baseline_report();
+
+        // (a) p99 +15%: regression under the default 10%, clean under 20%.
+        let mut cur = baseline_report();
+        cur.latency_ms.p99 = 115.0;
+        assert!(build_report(&base, &cur, &RegressionThresholds::default()).has_regression);
+        let lax_p99 = RegressionThresholds {
+            p99_pct: 20.0,
+            ..RegressionThresholds::default()
+        };
+        assert!(!build_report(&base, &cur, &lax_p99).has_regression);
+
+        // (b) deadlock uptick: regression by default, ignored when
+        //     zero-tolerance is turned off.
+        let mut dl = baseline_report();
+        dl.deadlock_count = 3;
+        assert!(build_report(&base, &dl, &RegressionThresholds::default()).has_regression);
+        let allow_dl = RegressionThresholds {
+            deadlock_zero_tolerance: false,
+            ..RegressionThresholds::default()
+        };
+        assert!(!build_report(&base, &dl, &allow_dl).has_regression);
+    }
+
     #[test]
     fn render_markdown_mentions_regression() {
         let base = baseline_report();
         let mut cur = baseline_report();
         cur.run_id = "01CUR".into();
         cur.latency_ms.p99 = 250.0;
-        let cmp = build_report(&base, &cur);
+        let cmp = build_report(&base, &cur, &RegressionThresholds::default());
         let md = render_markdown(&cmp, &base, &cur);
         assert!(md.contains("REGRESSION"));
         assert!(md.contains("latency_p99_ms"));
diff --git a/crates/mcp-loadtest-cli/src/cmd_compare/diff.rs b/crates/mcp-loadtest-cli/src/cmd_compare/diff.rs
index bd59352..0e874b6 100644
--- a/crates/mcp-loadtest-cli/src/cmd_compare/diff.rs
+++ b/crates/mcp-loadtest-cli/src/cmd_compare/diff.rs
@@ -2,18 +2,23 @@
 //! two [`ComparableReport`]s and classify per-metric regressions against
 //! the canonical thresholds.
 
-use super::types::{
-    ComparableReport, CompareReport, Direction, ERROR_RATE_REGRESSION_PP, MetricDiff,
-    P99_REGRESSION_PCT,
-};
+use super::types::{ComparableReport, CompareReport, Direction, MetricDiff, RegressionThresholds};
 
 /// Build the structured diff from two reports. Pure function, easily tested.
-pub fn build_report(base: &ComparableReport, cur: &ComparableReport) -> CompareReport {
+///
+/// `thresholds` controls which metric deltas count as regressions; pass
+/// [`RegressionThresholds::default`] for the historical 10% p99 / 0.5pp
+/// error-rate / deadlock-zero-tolerance policy.
+pub fn build_report(
+    base: &ComparableReport,
+    cur: &ComparableReport,
+    thresholds: &RegressionThresholds,
+) -> CompareReport {
     let mut metrics = Vec::new();
 
     // --- Latency p99 (ms) ------------------------------------------------
     let p99_change = cur.latency_ms.p99 - base.latency_ms.p99;
-    let p99_dir = classify_p99(base.latency_ms.p99, cur.latency_ms.p99);
+    let p99_dir = classify_p99(base.latency_ms.p99, cur.latency_ms.p99, thresholds.p99_pct);
     metrics.push(MetricDiff {
         metric: "latency_p99_ms".into(),
         baseline: format!("{:.2}", base.latency_ms.p99),
@@ -80,7 +85,7 @@ pub fn build_report(base: &ComparableReport, cur: &ComparableReport) -> CompareR
     let base_rate = error_rate_pct(base);
     let cur_rate = error_rate_pct(cur);
     let err_change = cur_rate - base_rate;
-    let err_dir = classify_error_rate(base_rate, cur_rate);
+    let err_dir = classify_error_rate(base_rate, cur_rate, thresholds.error_rate_pp);
     metrics.push(MetricDiff {
         metric: "error_rate_pct".into(),
         baseline: format!("{:.2}", base_rate),
@@ -92,7 +97,14 @@ pub fn build_report(base: &ComparableReport, cur: &ComparableReport) -> CompareR
     // --- Deadlock count — any uptick is a regression --------------------
     let dl_change = cur.deadlock_count as f64 - base.deadlock_count as f64;
     let dl_dir = if cur.deadlock_count > base.deadlock_count {
-        Direction::Regressed
+        // An uptick only gates the build when zero-tolerance is on (default).
+        // With it off, the diff still shows the change but it stays out of
+        // the `regressions` filter below.
+        if thresholds.deadlock_zero_tolerance {
+            Direction::Regressed
+        } else {
+            Direction::Neutral
+        }
     } else if cur.deadlock_count < base.deadlock_count {
         Direction::Improved
     } else {
@@ -150,8 +162,8 @@ pub fn build_report(base: &ComparableReport, cur: &ComparableReport) -> CompareR
     }
 }
 
-/// Returns Regressed if p99 grew > P99_REGRESSION_PCT%.
-pub(super) fn classify_p99(base: f64, cur: f64) -> Direction {
+/// Returns Regressed if p99 grew by more than `p99_pct` percent.
+pub(super) fn classify_p99(base: f64, cur: f64, p99_pct: f64) -> Direction {
     if base <= 0.0 {
         // Baseline had no p99 sample (zero-measurement run); any non-zero
         // current p99 is "more data, not necessarily worse".
@@ -161,9 +173,9 @@ pub(super) fn classify_p99(base: f64, cur: f64) -> Direction {
         return Direction::Neutral;
     }
     let pct_change = (cur - base) / base * 100.0;
-    if pct_change > P99_REGRESSION_PCT {
+    if pct_change > p99_pct {
         Direction::Regressed
-    } else if pct_change < -P99_REGRESSION_PCT {
+    } else if pct_change < -p99_pct {
         Direction::Improved
     } else {
         Direction::Neutral
@@ -178,12 +190,13 @@ fn error_rate_pct(r: &ComparableReport) -> f64 {
     r.errors.total as f64 / r.throughput.total_requests as f64 * 100.0
 }
 
-/// Returns Regressed if error rate grew > ERROR_RATE_REGRESSION_PP percentage points.
-pub(super) fn classify_error_rate(base_pct: f64, cur_pct: f64) -> Direction {
+/// Returns Regressed if error rate grew by more than `error_rate_pp`
+/// percentage points.
+pub(super) fn classify_error_rate(base_pct: f64, cur_pct: f64, error_rate_pp: f64) -> Direction {
     let delta = cur_pct - base_pct;
-    if delta > ERROR_RATE_REGRESSION_PP {
+    if delta > error_rate_pp {
         Direction::Regressed
-    } else if delta < -ERROR_RATE_REGRESSION_PP {
+    } else if delta < -error_rate_pp {
         Direction::Improved
     } else {
         Direction::Neutral
diff --git a/crates/mcp-loadtest-cli/src/cmd_compare/types.rs b/crates/mcp-loadtest-cli/src/cmd_compare/types.rs
index 2bd42f7..91d9dff 100644
--- a/crates/mcp-loadtest-cli/src/cmd_compare/types.rs
+++ b/crates/mcp-loadtest-cli/src/cmd_compare/types.rs
@@ -9,7 +9,9 @@ use serde::{Deserialize, Serialize};
 // Re-export the canonical regression thresholds from the core library so the
 // `serve` tool handler and `cmd_compare` agree by construction. The `pub`
 // surface stays stable for downstream callers that imported these symbols.
-pub use mcp_loadtest::analysis::regression::{ERROR_RATE_REGRESSION_PP, P99_REGRESSION_PCT};
+pub use mcp_loadtest::analysis::regression::{
+    ERROR_RATE_REGRESSION_PP, P99_REGRESSION_PCT, RegressionThresholds,
+};
 
 /// Symbol used in markdown output for a regressed metric.
 pub(super) const ARROW_REGRESSION: &str = "🔻";
diff --git a/crates/mcp-loadtest-cli/src/main.rs b/crates/mcp-loadtest-cli/src/main.rs
index 1b9bdf5..f1703ba 100644
--- a/crates/mcp-loadtest-cli/src/main.rs
+++ b/crates/mcp-loadtest-cli/src/main.rs
@@ -46,6 +46,15 @@ enum Cmd {
         /// Output format: `markdown` (default, human-readable) or `json` (CI-friendly).
         #[arg(long, default_value = "markdown")]
         format: String,
+        /// p99 latency growth (percent) that flags a regression.
+        #[arg(long, default_value_t = cmd_compare::P99_REGRESSION_PCT)]
+        max_p99_regression_pct: f64,
+        /// Error-rate growth (percentage points) that flags a regression.
+        #[arg(long, default_value_t = cmd_compare::ERROR_RATE_REGRESSION_PP)]
+        max_error_rate_regression_pp: f64,
+        /// Don't flag a regression when the deadlock count increases.
+        #[arg(long, default_value_t = false)]
+        allow_deadlock_increase: bool,
     },
 
     /// Quick deadlock probe — convenience wrapper around `DeadlockProbe`.
@@ -148,9 +157,17 @@ async fn main() -> Result<()> {
             baseline,
             current,
             format,
+            max_p99_regression_pct,
+            max_error_rate_regression_pp,
+            allow_deadlock_increase,
         } => {
             let fmt = cmd_compare::CompareFormat::parse(&format)?;
-            let rendered = cmd_compare::run(&baseline, &current, fmt)?;
+            let thresholds = cmd_compare::RegressionThresholds {
+                p99_pct: max_p99_regression_pct,
+                error_rate_pp: max_error_rate_regression_pp,
+                deadlock_zero_tolerance: !allow_deadlock_increase,
+            };
+            let rendered = cmd_compare::run(&baseline, &current, fmt, &thresholds)?;
             // Stream the rendered diff straight to stdout so it can be
             // piped into a file or downstream CI tooling.
             print!("{rendered}");
diff --git a/crates/mcp-loadtest-cli/tests/cmd_compare.rs b/crates/mcp-loadtest-cli/tests/cmd_compare.rs
index 22adc0a..ba60815 100644
--- a/crates/mcp-loadtest-cli/tests/cmd_compare.rs
+++ b/crates/mcp-loadtest-cli/tests/cmd_compare.rs
@@ -6,7 +6,7 @@
 
 use std::fs;
 
-use mcp_loadtest_cli::cmd_compare::{self, CompareFormat};
+use mcp_loadtest_cli::cmd_compare::{self, CompareFormat, RegressionThresholds};
 use serde_json::json;
 use tempfile::tempdir;
 
@@ -78,8 +78,13 @@ fn compare_flags_p99_regression() {
     .expect("write baseline");
     fs::write(&current_path, synthetic_metrics_json("01CUR", 250.0, 0, 0)).expect("write current");
 
-    let md = cmd_compare::run(&baseline_path, &current_path, CompareFormat::Markdown)
-        .expect("compare run");
+    let md = cmd_compare::run(
+        &baseline_path,
+        &current_path,
+        CompareFormat::Markdown,
+        &RegressionThresholds::default(),
+    )
+    .expect("compare run");
 
     assert!(
         md.contains("REGRESSION"),
@@ -107,8 +112,13 @@ fn compare_no_regression_on_identical_reports() {
     .expect("write baseline");
     fs::write(&current_path, synthetic_metrics_json("01CUR", 100.0, 0, 0)).expect("write current");
 
-    let md = cmd_compare::run(&baseline_path, &current_path, CompareFormat::Markdown)
-        .expect("compare run");
+    let md = cmd_compare::run(
+        &baseline_path,
+        &current_path,
+        CompareFormat::Markdown,
+        &RegressionThresholds::default(),
+    )
+    .expect("compare run");
 
     assert!(
         md.contains("no regressions"),
@@ -129,8 +139,13 @@ fn compare_flags_deadlock_uptick() {
     .expect("write baseline");
     fs::write(&current_path, synthetic_metrics_json("01CUR", 100.0, 0, 1)).expect("write current");
 
-    let md = cmd_compare::run(&baseline_path, &current_path, CompareFormat::Markdown)
-        .expect("compare run");
+    let md = cmd_compare::run(
+        &baseline_path,
+        &current_path,
+        CompareFormat::Markdown,
+        &RegressionThresholds::default(),
+    )
+    .expect("compare run");
     assert!(md.contains("REGRESSION"), "expected regression on deadlock");
     assert!(md.contains("deadlock_count"));
 }
@@ -148,8 +163,13 @@ fn compare_json_output_structure() {
     .expect("write baseline");
     fs::write(&current_path, synthetic_metrics_json("01CUR", 250.0, 0, 0)).expect("write current");
 
-    let json_out =
-        cmd_compare::run(&baseline_path, &current_path, CompareFormat::Json).expect("compare run");
+    let json_out = cmd_compare::run(
+        &baseline_path,
+        &current_path,
+        CompareFormat::Json,
+        &RegressionThresholds::default(),
+    )
+    .expect("compare run");
 
     let v: serde_json::Value = serde_json::from_str(&json_out).expect("parse json output");
     assert_eq!(v["baseline_run_id"], "01BASE");
@@ -173,9 +193,58 @@ fn compare_handles_missing_file() {
     let baseline_path = dir.path().join("missing.json");
     let current_path = dir.path().join("also-missing.json");
 
-    let result = cmd_compare::run(&baseline_path, &current_path, CompareFormat::Markdown);
+    let result = cmd_compare::run(
+        &baseline_path,
+        &current_path,
+        CompareFormat::Markdown,
+        &RegressionThresholds::default(),
+    );
     assert!(
         result.is_err(),
         "expected error for missing files, got: {result:?}"
     );
 }
+
+#[test]
+fn custom_thresholds_flip_the_regression_verdict() {
+    // Identical inputs, different policy → different gate result, observed
+    // through the public `run` JSON output (the CI-facing contract).
+    let dir = tempdir().expect("tempdir");
+    let baseline_path = dir.path().join("baseline.json");
+    let current_path = dir.path().join("current.json");
+
+    // Baseline p99 = 100ms, current p99 = 115ms → +15% growth.
+    fs::write(
+        &baseline_path,
+        synthetic_metrics_json("01BASE", 100.0, 0, 0),
+    )
+    .expect("write baseline");
+    fs::write(&current_path, synthetic_metrics_json("01CUR", 115.0, 0, 0)).expect("write current");
+
+    let parse = |s: &str| -> serde_json::Value {
+        serde_json::from_str(s).expect("parse compare json output")
+    };
+
+    // Default 10% policy → +15% is a regression.
+    let strict = cmd_compare::run(
+        &baseline_path,
+        &current_path,
+        CompareFormat::Json,
+        &RegressionThresholds::default(),
+    )
+    .expect("compare run (strict)");
+    assert_eq!(parse(&strict)["has_regression"], true);
+
+    // Loosen p99 budget to 20% → same +15% delta is now within budget.
+    let lax = cmd_compare::run(
+        &baseline_path,
+        &current_path,
+        CompareFormat::Json,
+        &RegressionThresholds {
+            p99_pct: 20.0,
+            ..RegressionThresholds::default()
+        },
+    )
+    .expect("compare run (lax)");
+    assert_eq!(parse(&lax)["has_regression"], false);
+}
diff --git a/crates/mcp-loadtest/src/analysis/regression.rs b/crates/mcp-loadtest/src/analysis/regression.rs
index 9ed137d..d4a8e71 100644
--- a/crates/mcp-loadtest/src/analysis/regression.rs
+++ b/crates/mcp-loadtest/src/analysis/regression.rs
@@ -12,3 +12,46 @@ pub const P99_REGRESSION_PCT: f64 = 10.0;
 /// Error-rate growth, in percentage points, that flips a comparison into the
 /// "regressed" bucket.
 pub const ERROR_RATE_REGRESSION_PP: f64 = 0.5;
+
+/// Operator-tunable regression policy shared by the `compare` subcommand and
+/// the `compare_runs` MCP tool.
+///
+/// [`RegressionThresholds::default`] reproduces the historical hard-coded
+/// behaviour exactly (`P99_REGRESSION_PCT`, `ERROR_RATE_REGRESSION_PP`,
+/// deadlock zero-tolerance), so existing callers and CI gates are unaffected
+/// unless they opt in to overrides. This resolves the "expose the constants
+/// as a config struct" open question in ADR 0009.
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub struct RegressionThresholds {
+    /// p99 latency growth, in percent, that counts as a regression.
+    pub p99_pct: f64,
+    /// Error-rate growth, in percentage points, that counts as a regression.
+    pub error_rate_pp: f64,
+    /// When `true` (the default), any increase in deadlock count is a
+    /// regression. Set `false` to stop gating on deadlock upticks (the diff
+    /// still reports the change, it just no longer flips `has_regression`).
+    pub deadlock_zero_tolerance: bool,
+}
+
+impl Default for RegressionThresholds {
+    fn default() -> Self {
+        Self {
+            p99_pct: P99_REGRESSION_PCT,
+            error_rate_pp: ERROR_RATE_REGRESSION_PP,
+            deadlock_zero_tolerance: true,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn default_matches_the_historical_constants() {
+        let t = RegressionThresholds::default();
+        assert_eq!(t.p99_pct, P99_REGRESSION_PCT);
+        assert_eq!(t.error_rate_pp, ERROR_RATE_REGRESSION_PP);
+        assert!(t.deadlock_zero_tolerance);
+    }
+}
diff --git a/crates/mcp-loadtest/src/serve/tools/compare_runs.rs b/crates/mcp-loadtest/src/serve/tools/compare_runs.rs
index d1becb5..907cc2b 100644
--- a/crates/mcp-loadtest/src/serve/tools/compare_runs.rs
+++ b/crates/mcp-loadtest/src/serve/tools/compare_runs.rs
@@ -12,7 +12,7 @@ use std::path::PathBuf;
 
 use serde_json::{Value, json};
 
-use crate::analysis::regression::{ERROR_RATE_REGRESSION_PP, P99_REGRESSION_PCT};
+use crate::analysis::regression::RegressionThresholds;
 
 use super::{ToolError, required_str};
 
@@ -32,6 +32,18 @@ pub(super) fn compare_runs_def() -> Value {
                 "current": {
                     "type": "string",
                     "description": "Path to the current metrics.json."
+                },
+                "max_p99_regression_pct": {
+                    "type": "number",
+                    "description": "Optional. p99 latency growth (percent) that flags a regression. Default 10."
+                },
+                "max_error_rate_regression_pp": {
+                    "type": "number",
+                    "description": "Optional. Error-rate growth (percentage points) that flags a regression. Default 0.5."
+                },
+                "allow_deadlock_increase": {
+                    "type": "boolean",
+                    "description": "Optional. When true, an increase in deadlock count is not treated as a regression. Default false."
                 }
             },
             "required": ["baseline", "current"]
@@ -82,13 +94,14 @@ pub(super) async fn compare_runs(args: &Value) -> Result<Value, ToolError> {
     let base_err_rate = error_rate_pct(base_errs, base_total);
     let cur_err_rate = error_rate_pct(cur_errs, cur_total);
 
-    // Regression rules — thresholds are sourced from
-    // `analysis::regression` so the CLI's `compare` subcommand and this
-    // handler stay in lockstep.
+    // Regression rules — same `RegressionThresholds` model the CLI's
+    // `compare` subcommand uses, so the two stay in lockstep. Optional tool
+    // args override the defaults (ADR 0009).
+    let thr = regression_overrides(args);
     let p99_pct = pct_change(base_p99, cur_p99);
-    let p99_regressed = p99_pct > P99_REGRESSION_PCT;
-    let err_regressed = (cur_err_rate - base_err_rate) > ERROR_RATE_REGRESSION_PP;
-    let dl_regressed = cur_dl > base_dl;
+    let p99_regressed = p99_pct > thr.p99_pct;
+    let err_regressed = (cur_err_rate - base_err_rate) > thr.error_rate_pp;
+    let dl_regressed = thr.deadlock_zero_tolerance && cur_dl > base_dl;
 
     let mut regressions: Vec<&str> = Vec::new();
     if p99_regressed {
@@ -208,6 +221,26 @@ fn json_path_u64(v: &Value, path: &[&str]) -> u64 {
     cur.as_u64().unwrap_or(0)
 }
 
+/// Build the regression policy from optional tool args, falling back to
+/// [`RegressionThresholds::default`] for anything not supplied.
+fn regression_overrides(args: &Value) -> RegressionThresholds {
+    let d = RegressionThresholds::default();
+    RegressionThresholds {
+        p99_pct: args
+            .get("max_p99_regression_pct")
+            .and_then(Value::as_f64)
+            .unwrap_or(d.p99_pct),
+        error_rate_pp: args
+            .get("max_error_rate_regression_pp")
+            .and_then(Value::as_f64)
+            .unwrap_or(d.error_rate_pp),
+        deadlock_zero_tolerance: !args
+            .get("allow_deadlock_increase")
+            .and_then(Value::as_bool)
+            .unwrap_or(false),
+    }
+}
+
 fn pct_change(base: f64, cur: f64) -> f64 {
     if base <= 0.0 {
         return 0.0;
@@ -269,4 +302,22 @@ mod tests {
         let canonical = result.expect("absolute metrics.json must be accepted");
         assert!(canonical.ends_with("metrics.json"));
     }
+
+    #[test]
+    fn regression_overrides_default_when_args_absent() {
+        let t = regression_overrides(&json!({}));
+        assert_eq!(t, RegressionThresholds::default());
+    }
+
+    #[test]
+    fn regression_overrides_parses_supplied_args() {
+        let t = regression_overrides(&json!({
+            "max_p99_regression_pct": 25.0,
+            "max_error_rate_regression_pp": 2.0,
+            "allow_deadlock_increase": true
+        }));
+        assert_eq!(t.p99_pct, 25.0);
+        assert_eq!(t.error_rate_pp, 2.0);
+        assert!(!t.deadlock_zero_tolerance);
+    }
 }
diff --git a/docs/adr/0009-regression-threshold-defaults.md b/docs/adr/0009-regression-threshold-defaults.md
index 291bcb9..b075bbd 100644
--- a/docs/adr/0009-regression-threshold-defaults.md
+++ b/docs/adr/0009-regression-threshold-defaults.md
@@ -33,22 +33,46 @@ Both `cmd_compare.rs` (CLI) and `serve/tools.rs` (MCP) import from `analysis::re
 
 ## Alternatives considered
 
-| Option | Why rejected |
-|---|---|
-| **Operator-configurable thresholds via CLI flag / MCP tool arg** | Useful but premature — we don't yet know which knobs operators actually want to turn. Adding them later in `analysis::regression` is straightforward. |
-| **Per-tool overrides (e.g. tighter threshold for stdio vs HTTP)** | Same reasoning — defer until empirical evidence shows the defaults are wrong for a category. |
-| **Tighter defaults (5% / 0.2pp)** | False-positive rate too high at the sample sizes typical for `mcp-loadtest` runs. |
+| Option                                                            | Why rejected                                                                                                                                          |
+| ----------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
+| **Operator-configurable thresholds via CLI flag / MCP tool arg**  | Useful but premature — we don't yet know which knobs operators actually want to turn. Adding them later in `analysis::regression` is straightforward. |
+| **Per-tool overrides (e.g. tighter threshold for stdio vs HTTP)** | Same reasoning — defer until empirical evidence shows the defaults are wrong for a category.                                                          |
+| **Tighter defaults (5% / 0.2pp)**                                 | False-positive rate too high at the sample sizes typical for `mcp-loadtest` runs.                                                                     |
 
 ## Consequences
 
 **Positive:**
+
 - CLI and MCP consumers stay in sync by construction. Cargo's module system enforces it.
 - Future tuning lands in one place; reviewers know exactly which constants control the policy.
 - The numbers are pragmatic defaults, not gospel — documented here so they can be challenged later with evidence rather than re-derived from scratch.
 
 **Negative:**
+
 - A user who wants different thresholds today has to fork or pre-process the metrics themselves. Workaround until configurable thresholds land.
 
 **Open:**
-- Whether to expose the constants as a `RegressionConfig` struct that `compare` and `compare_runs` both accept. Defer until the first user asks for it.
+
+- ~~Whether to expose the constants as a `RegressionConfig` struct that `compare` and `compare_runs` both accept.~~ **Resolved 2026-05-16 — see Update below.**
 - Whether deadlock-count rule belongs in the same module or in a dedicated `analysis::deadlock`. Defer; current single-module home is fine at this scale.
+
+## Update (2026-05-16): thresholds are now operator-configurable
+
+The "expose as a config struct" open question is resolved. `analysis::regression` now also exports:
+
+```rust
+pub struct RegressionThresholds {
+    pub p99_pct: f64,                 // default P99_REGRESSION_PCT (10.0)
+    pub error_rate_pp: f64,           // default ERROR_RATE_REGRESSION_PP (0.5)
+    pub deadlock_zero_tolerance: bool, // default true
+}
+```
+
+`RegressionThresholds::default()` reproduces the historical hard-coded policy **exactly**, so existing CI gates are unaffected unless they opt in. The two constants stay as the documented defaults (and the `Default` impl's single source of truth).
+
+Overrides are surfaced where operators actually work:
+
+- **CLI:** `mcp-loadtest compare --max-p99-regression-pct <pct> --max-error-rate-regression-pp <pp> [--allow-deadlock-increase]`.
+- **MCP tool:** optional `compare_runs` args `max_p99_regression_pct` / `max_error_rate_regression_pp` / `allow_deadlock_increase`.
+
+Status stays **Accepted** — the defaults and rationale above are unchanged; this only adds an opt-in override path that the original "Alternatives considered" row deferred until a user asked for it. A user asked.

From 10443aacd66a1b45ec9f53ea6da50cb640efeb50 Mon Sep 17 00:00:00 2001
From: Teerapat-Vatpitak <teerapatv.c@gmail.com>
Date: Sat, 16 May 2026 19:35:44 +0700
Subject: [PATCH 3/8] feat(validation): opt-in strict MCP schema checks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The protocol-aware-assertions differentiator wanted the tester to catch
when a tools/call's args don't match the server's advertised
inputSchema, but ADR 0005 deliberately chose lenient/forward-compat
parsing and the repo's supply-chain discipline rules out an
unauditable JSON Schema crate.

Add `[validation] strict = true` (default false → behaviour unchanged):
- protocol::schema, a dependency-free subset validator
  (type/properties/required/enum/items) that skips unmodeled keywords,
  so it never rejects on forward-compat grounds.
- classify_schema_violation: args violations Fail (gate as
  ProtocolError), result violations Warn (result-side deferred to
  v0.2), none Ignore.
- Session caches the tools/list registry the run already fetches (no
  extra round-trip, ADR 0006 hot path untouched when off) and rejects
  bad args before send via SessionError::SchemaViolation, mapped to
  CallOutcome::ProtocolError.

ADR 0010 added; CHANGELOG noted. New mock-schema.py fixture +
strict_validation.rs prove bad args gate and compliant args don't.
ci-checks green: fmt, clippy -D warnings, --locked tests, doc.
---
 CHANGELOG.md                                  |   1 +
 crates/mcp-loadtest-cli/src/cmd_cross.rs      |   1 +
 crates/mcp-loadtest-cli/src/cmd_deadlock.rs   |   1 +
 crates/mcp-loadtest/src/config.rs             |  18 ++
 crates/mcp-loadtest/src/protocol/mod.rs       |   1 +
 crates/mcp-loadtest/src/protocol/schema.rs    | 305 ++++++++++++++++++
 crates/mcp-loadtest/src/run.rs                |  14 +-
 crates/mcp-loadtest/src/run/thresholds.rs     |   1 +
 .../src/scenario/fuzzer/classify.rs           |  10 +
 crates/mcp-loadtest/src/scenario/mod.rs       |  14 +
 .../src/serve/tools/deadlock_probe.rs         |   1 +
 .../src/serve/tools/sustained_load.rs         |   1 +
 crates/mcp-loadtest/src/session.rs            |  61 ++++
 .../tests/fixtures/mock-schema.py             |  62 ++++
 .../mcp-loadtest/tests/strict_validation.rs   | 128 ++++++++
 docs/adr/0010-strict-schema-validation.md     |  64 ++++
 docs/adr/README.md                            |  27 +-
 17 files changed, 698 insertions(+), 12 deletions(-)
 create mode 100644 crates/mcp-loadtest/src/protocol/schema.rs
 create mode 100644 crates/mcp-loadtest/tests/fixtures/mock-schema.py
 create mode 100644 crates/mcp-loadtest/tests/strict_validation.rs
 create mode 100644 docs/adr/0010-strict-schema-validation.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5bbf2ae..6410ab3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Opt-in strict MCP schema validation (ADR 0010): `[validation] strict = true` validates each `tools/call`'s arguments against the server's advertised `inputSchema` before sending. A dependency-free subset validator (`protocol::schema`) covers `type`/`properties`/`required`/`enum`/`items` and skips unmodeled keywords (forward-compatible, ADR 0005). Arg violations are classified as `CallOutcome::ProtocolError` so they gate a run; default (off) behaviour is byte-for-byte unchanged. Result-side validation is deferred to v0.2.
 - Configurable regression thresholds (ADR 0009 follow-up): `compare` gains `--max-p99-regression-pct` / `--max-error-rate-regression-pp` / `--allow-deadlock-increase`, and the `compare_runs` MCP tool gains the matching optional args. Backed by a shared `analysis::regression::RegressionThresholds` whose `Default` reproduces the historical 10% p99 / 0.5pp / deadlock-zero-tolerance policy, so existing CI gates are unaffected unless they opt in.
 - Project scaffolding: workspace layout, Cargo config, CLAUDE.md hierarchy, slash commands, CI workflow.
 - Design document covering motivation, types, algorithms, test matrix, milestones (DESIGN.md, 20 sections).
diff --git a/crates/mcp-loadtest-cli/src/cmd_cross.rs b/crates/mcp-loadtest-cli/src/cmd_cross.rs
index f4bcd1e..54dab0e 100644
--- a/crates/mcp-loadtest-cli/src/cmd_cross.rs
+++ b/crates/mcp-loadtest-cli/src/cmd_cross.rs
@@ -177,6 +177,7 @@ async fn run_one(server: &str, args: &CrossArgs, args_value: &Value) -> Result<R
             // individual `mcp-loadtest run` invocations.
             formats: Vec::new(),
         },
+        validation: mcp_loadtest::config::ValidationConfig::default(),
     };
 
     let run = Run::new(config, scenario, args.output_dir.clone());
diff --git a/crates/mcp-loadtest-cli/src/cmd_deadlock.rs b/crates/mcp-loadtest-cli/src/cmd_deadlock.rs
index 25564d8..21983cb 100644
--- a/crates/mcp-loadtest-cli/src/cmd_deadlock.rs
+++ b/crates/mcp-loadtest-cli/src/cmd_deadlock.rs
@@ -53,6 +53,7 @@ pub async fn run_deadlock_probe(
             report_dir: output_dir.to_path_buf(),
             formats: vec!["terminal".into(), "markdown".into(), "json".into()],
         },
+        validation: mcp_loadtest::config::ValidationConfig::default(),
     };
 
     let run = Run::new(config, Box::new(scenario), output_dir.to_path_buf());
diff --git a/crates/mcp-loadtest/src/config.rs b/crates/mcp-loadtest/src/config.rs
index 8493bf9..80add4d 100644
--- a/crates/mcp-loadtest/src/config.rs
+++ b/crates/mcp-loadtest/src/config.rs
@@ -32,6 +32,24 @@ pub struct Config {
     /// Output controls (run dir, formats).
     #[serde(default)]
     pub output: OutputConfig,
+    /// Protocol-validation controls. Off by default (forward-compatible,
+    /// ADR 0005); set `[validation] strict = true` to opt in.
+    #[serde(default)]
+    pub validation: ValidationConfig,
+}
+
+/// Opt-in protocol-validation block.
+///
+/// When `strict` is false (the default) the load tester behaves exactly as
+/// before — unknown/mismatched schema shapes are tolerated. When true,
+/// tool-call args/results are checked against the server's advertised
+/// `inputSchema` and mismatches are classified via
+/// [`crate::protocol::schema::classify_schema_violation`].
+#[derive(Debug, Default, Clone, Serialize, Deserialize)]
+pub struct ValidationConfig {
+    /// Enable strict MCP schema validation. Default `false`.
+    #[serde(default)]
+    pub strict: bool,
 }
 
 /// Server invocation block. See DESIGN.md §7.
diff --git a/crates/mcp-loadtest/src/protocol/mod.rs b/crates/mcp-loadtest/src/protocol/mod.rs
index c3f4946..1857c8b 100644
--- a/crates/mcp-loadtest/src/protocol/mod.rs
+++ b/crates/mcp-loadtest/src/protocol/mod.rs
@@ -4,4 +4,5 @@
 
 pub mod jsonrpc;
 pub mod mcp;
+pub mod schema;
 pub mod transport;
diff --git a/crates/mcp-loadtest/src/protocol/schema.rs b/crates/mcp-loadtest/src/protocol/schema.rs
new file mode 100644
index 0000000..65c6f40
--- /dev/null
+++ b/crates/mcp-loadtest/src/protocol/schema.rs
@@ -0,0 +1,305 @@
+//! Dependency-free JSON Schema **subset** validator for MCP tool
+//! `inputSchema`s, plus the opt-in policy that decides what a mismatch
+//! *means* to the load test.
+//!
+//! # Why a subset, and why no `jsonschema` crate
+//!
+//! MCP `inputSchema`s in the wild use a small, stable slice of JSON Schema:
+//! `type`, `properties`, `required`, `enum`, `items`, and nesting. Pulling a
+//! full validator crate would add a transitive dependency tree we can't
+//! `cargo deny`-audit in every dev environment, for coverage we don't need.
+//! See ADR 0010.
+//!
+//! # Forward-compatibility (ADR 0005)
+//!
+//! [`validate`] **never rejects on a keyword it does not model** — unknown
+//! keywords are skipped, not failed. Strictness is therefore opt-in *policy*
+//! ([`classify_schema_violation`]), not a hard protocol change: the default
+//! build behaves exactly as before unless an operator turns it on.
+
+use serde_json::Value;
+
+/// One schema mismatch found while validating an instance against a schema.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct SchemaViolation {
+    /// Dotted location of the offending value, e.g. `args.timeout`.
+    pub path: String,
+    /// Human-readable description of what was wrong.
+    pub message: String,
+}
+
+/// Where a validation happened — tool-call **arguments** (what we send) or
+/// the tool-call **result** (what the server sent back). The policy can
+/// treat the two sides differently.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ValidationSite {
+    /// Arguments we are about to send in a `tools/call`.
+    ToolCallArgs,
+    /// The `result` payload a `tools/call` returned.
+    ToolCallResult,
+}
+
+/// What the load tester should DO about a set of violations.
+///
+/// This is the gate-shaping knob: [`SchemaPolicy::Fail`] makes a mismatch
+/// count against the run (it maps to `CallOutcome::ProtocolError`), while
+/// [`SchemaPolicy::Warn`]/[`SchemaPolicy::Ignore`] keep the run green.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum SchemaPolicy {
+    /// Mismatch is a protocol error — gate the run on it.
+    Fail,
+    /// Record/log the mismatch but do not gate.
+    Warn,
+    /// Ignore the mismatch entirely.
+    Ignore,
+}
+
+/// Validate `instance` against the JSON Schema `schema`, returning every
+/// mismatch found. Empty result = valid under the supported subset.
+///
+/// Supported keywords: `type` (object/array/string/number/integer/boolean/
+/// null), `properties`, `required`, `enum`, `items`. Any other keyword is
+/// skipped (forward-compatible — see module docs). An empty/`true`-ish
+/// schema validates everything.
+pub fn validate(schema: &Value, instance: &Value) -> Vec<SchemaViolation> {
+    let mut out = Vec::new();
+    validate_at(schema, instance, "", &mut out);
+    out
+}
+
+fn validate_at(schema: &Value, instance: &Value, path: &str, out: &mut Vec<SchemaViolation>) {
+    let Some(obj) = schema.as_object() else {
+        // Non-object schema (e.g. `true`) — nothing to enforce.
+        return;
+    };
+
+    if let Some(ty) = obj.get("type").and_then(Value::as_str)
+        && !type_matches(ty, instance)
+    {
+        out.push(SchemaViolation {
+            path: loc(path),
+            message: format!("expected type `{ty}`, got `{}`", json_type(instance)),
+        });
+        // Type is wrong; deeper checks would just produce noise.
+        return;
+    }
+
+    if let Some(enum_vals) = obj.get("enum").and_then(Value::as_array)
+        && !enum_vals.iter().any(|v| v == instance)
+    {
+        out.push(SchemaViolation {
+            path: loc(path),
+            message: format!("value not in enum {enum_vals:?}"),
+        });
+    }
+
+    if let Some(props) = obj.get("properties").and_then(Value::as_object)
+        && let Some(inst_obj) = instance.as_object()
+    {
+        for (key, sub_schema) in props {
+            if let Some(sub_inst) = inst_obj.get(key) {
+                let child = if path.is_empty() {
+                    key.clone()
+                } else {
+                    format!("{path}.{key}")
+                };
+                validate_at(sub_schema, sub_inst, &child, out);
+            }
+        }
+    }
+
+    if let Some(required) = obj.get("required").and_then(Value::as_array)
+        && let Some(inst_obj) = instance.as_object()
+    {
+        for req in required.iter().filter_map(Value::as_str) {
+            if !inst_obj.contains_key(req) {
+                out.push(SchemaViolation {
+                    path: loc(path),
+                    message: format!("missing required property `{req}`"),
+                });
+            }
+        }
+    }
+
+    if let Some(items) = obj.get("items")
+        && let Some(arr) = instance.as_array()
+    {
+        for (i, item) in arr.iter().enumerate() {
+            validate_at(items, item, &format!("{path}[{i}]"), out);
+        }
+    }
+}
+
+fn type_matches(ty: &str, v: &Value) -> bool {
+    match ty {
+        "object" => v.is_object(),
+        "array" => v.is_array(),
+        "string" => v.is_string(),
+        "boolean" => v.is_boolean(),
+        "null" => v.is_null(),
+        "number" => v.is_number(),
+        // JSON has no integer type; accept whole numbers.
+        "integer" => v.is_i64() || v.is_u64() || v.as_f64().is_some_and(|f| f.fract() == 0.0),
+        // Unknown type keyword → don't enforce (forward-compatible).
+        _ => true,
+    }
+}
+
+fn json_type(v: &Value) -> &'static str {
+    match v {
+        Value::Null => "null",
+        Value::Bool(_) => "boolean",
+        Value::Number(_) => "number",
+        Value::String(_) => "string",
+        Value::Array(_) => "array",
+        Value::Object(_) => "object",
+    }
+}
+
+fn loc(path: &str) -> String {
+    if path.is_empty() {
+        "<root>".to_string()
+    } else {
+        path.to_string()
+    }
+}
+
+/// Decide what a set of [`validate`] violations *means* for the run.
+///
+/// Kept a separate, tiny function on purpose: it encodes the gate policy
+/// (a product decision), not mechanical validation, and it sits on top of
+/// the deliberate forward-compat stance (ADR 0005). The policy (ADR 0010):
+///
+/// - **Args** ([`ValidationSite::ToolCallArgs`]) → [`SchemaPolicy::Fail`].
+///   Args are what we send; strict mode is explicitly opted in and
+///   [`validate`] only flags real breaks of the server's *own* advertised
+///   contract (never unknown/extra fields), so a violation is an actionable
+///   CI failure rather than forward-compat noise.
+/// - **Result** ([`ValidationSite::ToolCallResult`]) → [`SchemaPolicy::Warn`].
+///   Result-side checking is deferred to v0.2; servers legitimately grow
+///   results over time, so it must stay non-gating observability.
+/// - No violations → [`SchemaPolicy::Ignore`].
+pub fn classify_schema_violation(
+    site: ValidationSite,
+    violations: &[SchemaViolation],
+) -> SchemaPolicy {
+    if violations.is_empty() {
+        return SchemaPolicy::Ignore;
+    }
+    match site {
+        // Args are what *we* send. Strict mode is explicitly opted in, and
+        // `validate` only ever flags genuine contract breaks against the
+        // server's *own* advertised `inputSchema` (missing `required`,
+        // wrong `type`, bad `enum`, bad array item). It never flags unknown
+        // or extra keywords/properties — so failing here does not conflict
+        // with the forward-compat stance (ADR 0005); that concern is about
+        // unknown fields and result evolution, neither of which this path
+        // touches. A request that violates the tool's declared input
+        // contract is an actionable CI failure, not noise.
+        ValidationSite::ToolCallArgs => SchemaPolicy::Fail,
+        // Result-side validation is not wired until v0.2 (ADR 0010, Open).
+        // When it lands, servers legitimately grow result payloads over
+        // time, so the result side must default to non-gating
+        // observability rather than a hard gate. Keeping this arm correct
+        // now makes the policy forward-looking without affecting v0.1.0
+        // (where `ToolCallResult` is never passed).
+        ValidationSite::ToolCallResult => SchemaPolicy::Warn,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use serde_json::json;
+
+    fn schema() -> Value {
+        json!({
+            "type": "object",
+            "properties": {
+                "name": { "type": "string" },
+                "count": { "type": "integer" },
+                "mode": { "type": "string", "enum": ["fast", "slow"] },
+                "tags": { "type": "array", "items": { "type": "string" } }
+            },
+            "required": ["name"]
+        })
+    }
+
+    #[test]
+    fn valid_instance_has_no_violations() {
+        let v = validate(
+            &schema(),
+            &json!({ "name": "x", "count": 3, "mode": "fast", "tags": ["a", "b"] }),
+        );
+        assert!(v.is_empty(), "unexpected: {v:?}");
+    }
+
+    #[test]
+    fn missing_required_is_reported() {
+        let v = validate(&schema(), &json!({ "count": 1 }));
+        assert!(
+            v.iter()
+                .any(|x| x.message.contains("required property `name`"))
+        );
+    }
+
+    #[test]
+    fn wrong_type_is_reported_with_path() {
+        let v = validate(&schema(), &json!({ "name": 7 }));
+        assert_eq!(v.len(), 1);
+        assert_eq!(v[0].path, "name");
+        assert!(v[0].message.contains("expected type `string`"));
+    }
+
+    #[test]
+    fn enum_and_array_item_types_are_checked() {
+        let v = validate(
+            &schema(),
+            &json!({ "name": "x", "mode": "turbo", "tags": ["ok", 9] }),
+        );
+        assert!(v.iter().any(|x| x.message.contains("enum")));
+        assert!(v.iter().any(|x| x.path == "tags[1]"));
+    }
+
+    #[test]
+    fn unknown_keywords_are_not_rejected() {
+        // `format`/`minimum` are unmodeled — must not produce violations.
+        let s = json!({ "type": "string", "format": "email", "minLength": 3 });
+        assert!(validate(&s, &json!("hi")).is_empty());
+    }
+
+    fn one_violation() -> Vec<SchemaViolation> {
+        vec![SchemaViolation {
+            path: "x".into(),
+            message: "expected type `string`".into(),
+        }]
+    }
+
+    #[test]
+    fn policy_fails_on_arg_violations() {
+        assert_eq!(
+            classify_schema_violation(ValidationSite::ToolCallArgs, &one_violation()),
+            SchemaPolicy::Fail
+        );
+    }
+
+    #[test]
+    fn policy_warns_on_result_violations() {
+        assert_eq!(
+            classify_schema_violation(ValidationSite::ToolCallResult, &one_violation()),
+            SchemaPolicy::Warn
+        );
+    }
+
+    #[test]
+    fn policy_ignores_when_no_violations() {
+        assert_eq!(
+            classify_schema_violation(ValidationSite::ToolCallArgs, &[]),
+            SchemaPolicy::Ignore
+        );
+        assert_eq!(
+            classify_schema_violation(ValidationSite::ToolCallResult, &[]),
+            SchemaPolicy::Ignore
+        );
+    }
+}
diff --git a/crates/mcp-loadtest/src/run.rs b/crates/mcp-loadtest/src/run.rs
index 3c95a8b..eb5fb23 100644
--- a/crates/mcp-loadtest/src/run.rs
+++ b/crates/mcp-loadtest/src/run.rs
@@ -115,7 +115,19 @@ impl Run {
         //     just leaves the registered set empty, so coverage is reported
         //     vacuously rather than failing the whole run.
         let registered_tools: Vec<String> = match session.list_tools().await {
-            Ok(tools) => tools.into_iter().map(|t| t.name).collect(),
+            Ok(tools) => {
+                // Opt-in strict args validation reuses this same registry —
+                // no extra `tools/list` round-trip (ADR 0010 / 0006).
+                if config.validation.strict {
+                    session.set_strict_tool_schemas(
+                        tools
+                            .iter()
+                            .map(|t| (t.name.clone(), t.input_schema.clone()))
+                            .collect(),
+                    );
+                }
+                tools.into_iter().map(|t| t.name).collect()
+            }
             Err(err) => {
                 tracing::warn!(error = %err, "tools/list failed; coverage will be empty");
                 Vec::new()
diff --git a/crates/mcp-loadtest/src/run/thresholds.rs b/crates/mcp-loadtest/src/run/thresholds.rs
index 1cb40c4..764bc5d 100644
--- a/crates/mcp-loadtest/src/run/thresholds.rs
+++ b/crates/mcp-loadtest/src/run/thresholds.rs
@@ -218,6 +218,7 @@ mod tests {
             },
             thresholds,
             output: Default::default(),
+            validation: Default::default(),
         }
     }
 
diff --git a/crates/mcp-loadtest/src/scenario/fuzzer/classify.rs b/crates/mcp-loadtest/src/scenario/fuzzer/classify.rs
index c0b9f71..8768194 100644
--- a/crates/mcp-loadtest/src/scenario/fuzzer/classify.rs
+++ b/crates/mcp-loadtest/src/scenario/fuzzer/classify.rs
@@ -43,6 +43,16 @@ pub(super) fn classify_err(err: &SessionError) -> (FuzzClass, Option<i64>, Strin
             None,
             format!("startup timeout: {err}"),
         ),
+        // Only reachable if strict validation is enabled alongside the
+        // fuzzer (unusual — the fuzzer's whole point is to send payloads
+        // that violate the schema). Our own validator caught it before the
+        // server saw it; surface it as a protocol issue with an explicit
+        // "caught client-side" note so it's never misread as a server bug.
+        SessionError::SchemaViolation { .. } => (
+            FuzzClass::ProtocolError,
+            None,
+            format!("rejected client-side by strict validation: {err}"),
+        ),
     }
 }
 
diff --git a/crates/mcp-loadtest/src/scenario/mod.rs b/crates/mcp-loadtest/src/scenario/mod.rs
index 3342f88..7825f3c 100644
--- a/crates/mcp-loadtest/src/scenario/mod.rs
+++ b/crates/mcp-loadtest/src/scenario/mod.rs
@@ -111,6 +111,9 @@ pub(crate) fn classify_error(err: &SessionError) -> CallOutcome {
         E::Transport(T::Closed) | E::Transport(T::Io(_)) => CallOutcome::Disconnected,
         E::Transport(T::Timeout(_)) | E::StartupTimeout(_) => CallOutcome::Timeout,
         E::Transport(T::Http(_)) | E::Transport(T::Other(_)) => CallOutcome::ServerError,
+        // Strict-validation reject: the call never reached the server, the
+        // session is healthy — it's a protocol-level mismatch.
+        E::SchemaViolation { .. } => CallOutcome::ProtocolError,
     }
 }
 
@@ -181,6 +184,13 @@ mod tests {
             classify_error(&SessionError::StartupTimeout(Duration::from_secs(10))),
             CallOutcome::Timeout,
         );
+        assert_eq!(
+            classify_error(&SessionError::SchemaViolation {
+                tool: "echo".to_string(),
+                summary: "args.x: expected type `string`".to_string(),
+            }),
+            CallOutcome::ProtocolError,
+        );
 
         // Every TransportError variant tunneled through SessionError::Transport.
         assert_eq!(
@@ -241,5 +251,9 @@ mod tests {
         assert!(!is_terminal_error(&SessionError::Transport(
             TransportError::Other("weird".to_string()),
         )));
+        assert!(!is_terminal_error(&SessionError::SchemaViolation {
+            tool: "echo".to_string(),
+            summary: "args.x: expected type `string`".to_string(),
+        }));
     }
 }
diff --git a/crates/mcp-loadtest/src/serve/tools/deadlock_probe.rs b/crates/mcp-loadtest/src/serve/tools/deadlock_probe.rs
index eff3f56..ef664b7 100644
--- a/crates/mcp-loadtest/src/serve/tools/deadlock_probe.rs
+++ b/crates/mcp-loadtest/src/serve/tools/deadlock_probe.rs
@@ -93,6 +93,7 @@ pub(super) async fn deadlock_probe(args: &Value) -> Result<Value, ToolError> {
             report_dir: PathBuf::from("./runs"),
             formats: vec![],
         },
+        validation: crate::config::ValidationConfig::default(),
     };
 
     let run = Run::new(config, Box::new(scenario), PathBuf::from("./runs"));
diff --git a/crates/mcp-loadtest/src/serve/tools/sustained_load.rs b/crates/mcp-loadtest/src/serve/tools/sustained_load.rs
index ab282f5..aa86c60 100644
--- a/crates/mcp-loadtest/src/serve/tools/sustained_load.rs
+++ b/crates/mcp-loadtest/src/serve/tools/sustained_load.rs
@@ -80,6 +80,7 @@ pub(super) async fn sustained_load(args: &Value) -> Result<Value, ToolError> {
             report_dir: PathBuf::from("./runs"),
             formats: vec![],
         },
+        validation: crate::config::ValidationConfig::default(),
     };
 
     let run = Run::new(config, Box::new(scenario), PathBuf::from("./runs"));
diff --git a/crates/mcp-loadtest/src/session.rs b/crates/mcp-loadtest/src/session.rs
index dfb35a9..dfaae83 100644
--- a/crates/mcp-loadtest/src/session.rs
+++ b/crates/mcp-loadtest/src/session.rs
@@ -8,6 +8,7 @@
 //! M1 scope: synchronous request/response only. Server-initiated
 //! notifications and concurrent in-flight requests are deferred to M5+.
 
+use std::collections::HashMap;
 use std::ffi::OsStr;
 use std::time::Duration;
 
@@ -24,6 +25,7 @@ use crate::protocol::mcp::{
     CallToolParams, CallToolResult, ClientInfo, InitializeParams, InitializeResult,
     ListToolsResult, PROTOCOL_VERSION, Tool,
 };
+use crate::protocol::schema::{self, SchemaPolicy, ValidationSite};
 use crate::protocol::transport::stdio::StdioTransport;
 use crate::protocol::transport::{Transport, TransportError};
 
@@ -56,6 +58,17 @@ pub enum SessionError {
     /// `initialize` did not complete within the configured budget.
     #[error("server did not respond to initialize within {0:?}")]
     StartupTimeout(Duration),
+    /// Strict validation rejected this call before it was sent: the
+    /// arguments didn't match the tool's advertised `inputSchema` and the
+    /// policy classified it as [`SchemaPolicy::Fail`]. Only produced when
+    /// `[validation] strict = true`. Maps to `CallOutcome::ProtocolError`.
+    #[error("schema violation for tool `{tool}`: {summary}")]
+    SchemaViolation {
+        /// Tool whose args failed validation.
+        tool: String,
+        /// Human-readable summary of the violations (first few).
+        summary: String,
+    },
 }
 
 /// A single MCP session over an opaque [`Transport`].
@@ -64,6 +77,13 @@ pub struct Session {
     next_id: u64,
     /// Reported by the server during `initialize` — kept for diagnostics.
     pub server_protocol_version: String,
+    /// Strict args-validation registry: tool name → advertised
+    /// `inputSchema`. `None` (the default) means strict validation is off
+    /// and `call_tool` is byte-for-byte its pre-existing self — a single
+    /// `Option` check on the hot path, no allocation, ADR 0006 preserved.
+    /// `Some` is populated once at run start from the `tools/list` the run
+    /// already fetches (see [`Session::set_strict_tool_schemas`]).
+    tool_schemas: Option<HashMap<String, Value>>,
 }
 
 impl Session {
@@ -79,6 +99,7 @@ impl Session {
             transport: Box::new(transport),
             next_id: 1,
             server_protocol_version: String::new(),
+            tool_schemas: None,
         };
         match tokio::time::timeout(DEFAULT_STARTUP_TIMEOUT, session.initialize()).await {
             Ok(result) => result?,
@@ -128,6 +149,16 @@ impl Session {
         Ok(result.tools)
     }
 
+    /// Turn on strict args validation for subsequent [`Session::call_tool`]
+    /// calls. `schemas` maps each tool's name to its advertised
+    /// `inputSchema`. Call once at run start (the caller already has the
+    /// `tools/list` result in hand). A tool absent from `schemas` is never
+    /// validated — a server that doesn't advertise a schema is not failed
+    /// on that ground (forward-compatible, ADR 0005).
+    pub fn set_strict_tool_schemas(&mut self, schemas: HashMap<String, Value>) {
+        self.tool_schemas = Some(schemas);
+    }
+
     /// Call `tools/call` with the given tool name and arguments.
     ///
     /// Both `name` and `arguments` are borrowed — callers driving in a hot
@@ -138,6 +169,36 @@ impl Session {
         name: &str,
         arguments: &Value,
     ) -> Result<CallToolResult, SessionError> {
+        // Opt-in strict args validation. When `tool_schemas` is `None`
+        // (default) this is a single branch and the hot path is unchanged.
+        if let Some(schemas) = &self.tool_schemas
+            && let Some(input_schema) = schemas.get(name)
+        {
+            let violations = schema::validate(input_schema, arguments);
+            if !violations.is_empty() {
+                match schema::classify_schema_violation(ValidationSite::ToolCallArgs, &violations) {
+                    SchemaPolicy::Fail => {
+                        let summary = violations
+                            .iter()
+                            .take(3)
+                            .map(|v| format!("{}: {}", v.path, v.message))
+                            .collect::<Vec<_>>()
+                            .join("; ");
+                        return Err(SessionError::SchemaViolation {
+                            tool: name.to_string(),
+                            summary,
+                        });
+                    }
+                    SchemaPolicy::Warn => tracing::warn!(
+                        tool = name,
+                        violations = violations.len(),
+                        "strict schema validation: args mismatch (policy: warn)"
+                    ),
+                    SchemaPolicy::Ignore => {}
+                }
+            }
+        }
+
         let params = CallToolParams { name, arguments };
         self.request("tools/call", &params).await
     }
diff --git a/crates/mcp-loadtest/tests/fixtures/mock-schema.py b/crates/mcp-loadtest/tests/fixtures/mock-schema.py
new file mode 100644
index 0000000..2f7d9b0
--- /dev/null
+++ b/crates/mcp-loadtest/tests/fixtures/mock-schema.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+"""mock-schema: like mock-normal, but `echo` advertises a strict inputSchema
+(`msg` is a required string). Used to exercise opt-in strict args validation:
+calls whose args don't match this schema must be rejected client-side before
+they reach this server."""
+
+import json
+import pathlib
+import sys
+
+sys.path.insert(0, str(pathlib.Path(__file__).parent))
+
+from _common import (  # noqa: E402  (path manipulation above is intentional)
+    read_frame,
+    respond_error,
+    respond_initialize,
+    respond_ok,
+    respond_tools_list,
+)
+
+TOOLS = [
+    {
+        "name": "echo",
+        "description": "Echo args back. Requires a string `msg`.",
+        "inputSchema": {
+            "type": "object",
+            "properties": {"msg": {"type": "string"}},
+            "required": ["msg"],
+        },
+    }
+]
+
+
+def main():
+    while True:
+        msg = read_frame()
+        if msg is None:
+            return  # stdin closed
+        method = msg.get("method")
+        msg_id = msg.get("id")
+        is_notification = msg_id is None
+
+        if method == "initialize":
+            respond_initialize(msg_id)
+        elif method == "notifications/initialized":
+            pass  # one-way notification, no response
+        elif method == "tools/list":
+            respond_tools_list(msg_id, TOOLS)
+        elif method == "tools/call":
+            args = msg.get("params", {}).get("arguments", {})
+            respond_ok(
+                msg_id,
+                {
+                    "content": [{"type": "text", "text": json.dumps(args)}],
+                },
+            )
+        elif not is_notification:
+            respond_error(msg_id, -32601, f"method not found: {method}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/crates/mcp-loadtest/tests/strict_validation.rs b/crates/mcp-loadtest/tests/strict_validation.rs
new file mode 100644
index 0000000..d0759d8
--- /dev/null
+++ b/crates/mcp-loadtest/tests/strict_validation.rs
@@ -0,0 +1,128 @@
+//! End-to-end coverage for opt-in strict args validation (ADR 0010).
+//!
+//! Drives the real `Sustained` scenario against `mock-schema.py` (whose
+//! `echo` tool advertises `{required: ["msg"], msg: string}`) with strict
+//! validation enabled exactly the way `Run::execute` enables it, and asserts
+//! the production policy: bad args are rejected client-side as
+//! `ProtocolError`; compliant args pass untouched.
+
+mod helpers;
+
+use std::collections::HashMap;
+use std::time::{Duration, Instant};
+
+use mcp_loadtest::Session;
+use mcp_loadtest::metrics::Recorder;
+use mcp_loadtest::scenario::sustained::Sustained;
+use mcp_loadtest::scenario::{RunContext, Scenario};
+use serde_json::json;
+use tokio_util::sync::CancellationToken;
+
+fn make_ctx() -> (RunContext, Recorder) {
+    let metrics = Recorder::new();
+    let ctx = RunContext {
+        run_start: Instant::now(),
+        cancel_token: CancellationToken::new(),
+        metrics: metrics.clone(),
+        hang_threshold: Duration::from_secs(5),
+        grace_period: Duration::from_secs(10),
+    };
+    (ctx, metrics)
+}
+
+/// Mirror `Run::execute`: pull `tools/list`, hand the name→inputSchema map to
+/// the session so subsequent `call_tool`s validate args.
+async fn enable_strict(session: &mut Session) {
+    let tools = session.list_tools().await.expect("tools/list failed");
+    let schemas: HashMap<String, serde_json::Value> = tools
+        .iter()
+        .map(|t| (t.name.clone(), t.input_schema.clone()))
+        .collect();
+    session.set_strict_tool_schemas(schemas);
+}
+
+#[tokio::test]
+async fn strict_rejects_args_violating_advertised_schema() {
+    let mock = helpers::fixture_path("mock-schema.py");
+    let py = helpers::python();
+
+    let mut session = Session::spawn(&py, [mock.as_os_str()])
+        .await
+        .expect("spawn failed");
+    enable_strict(&mut session).await;
+
+    // `echo` requires a string `msg`; send an int instead → schema violation.
+    let scenario = Sustained {
+        concurrent: 1,
+        duration: Duration::from_millis(500),
+        tool: "echo".to_string(),
+        args: json!({ "msg": 123 }),
+    };
+
+    let (ctx, metrics) = make_ctx();
+    let outcome = scenario.drive(&mut session, &ctx).await;
+
+    assert!(
+        outcome.total_calls > 0,
+        "expected calls to be attempted; got {outcome:?}"
+    );
+    assert_eq!(
+        outcome.successful_calls, 0,
+        "no call should succeed — every one violates the schema; got {outcome:?}"
+    );
+
+    let snap = metrics.snapshot();
+    assert_eq!(
+        snap.outcomes.protocol_error, outcome.total_calls,
+        "every rejected call must be a ProtocolError; got {:?}",
+        snap.outcomes
+    );
+    assert_eq!(
+        snap.outcomes.success, 0,
+        "no success expected; got {:?}",
+        snap.outcomes
+    );
+
+    tokio::time::timeout(Duration::from_secs(5), session.shutdown())
+        .await
+        .expect("shutdown timed out")
+        .expect("shutdown errored");
+}
+
+#[tokio::test]
+async fn strict_allows_schema_compliant_args() {
+    let mock = helpers::fixture_path("mock-schema.py");
+    let py = helpers::python();
+
+    let mut session = Session::spawn(&py, [mock.as_os_str()])
+        .await
+        .expect("spawn failed");
+    enable_strict(&mut session).await;
+
+    // Compliant args: `msg` present and a string.
+    let scenario = Sustained {
+        concurrent: 1,
+        duration: Duration::from_millis(500),
+        tool: "echo".to_string(),
+        args: json!({ "msg": "hello" }),
+    };
+
+    let (ctx, metrics) = make_ctx();
+    let outcome = scenario.drive(&mut session, &ctx).await;
+
+    assert!(outcome.total_calls > 0, "expected calls; got {outcome:?}");
+    assert_eq!(
+        outcome.successful_calls, outcome.total_calls,
+        "strict mode must not gate schema-compliant calls; got {outcome:?}"
+    );
+    assert_eq!(
+        metrics.snapshot().outcomes.protocol_error,
+        0,
+        "compliant args must not produce ProtocolError"
+    );
+
+    tokio::time::timeout(Duration::from_secs(5), session.shutdown())
+        .await
+        .expect("shutdown timed out")
+        .expect("shutdown errored");
+}
diff --git a/docs/adr/0010-strict-schema-validation.md b/docs/adr/0010-strict-schema-validation.md
new file mode 100644
index 0000000..88f37f2
--- /dev/null
+++ b/docs/adr/0010-strict-schema-validation.md
@@ -0,0 +1,64 @@
+# 10. Opt-in strict MCP schema validation
+
+Date: 2026-05-16
+Status: Accepted
+
+## Context
+
+The protocol-aware-assertions differentiator (ADR 0004) wants the load
+tester to catch MCP-semantic problems — a server whose `tools/call` result
+doesn't match its advertised `inputSchema`, or a scenario sending args that
+violate it. But ADR 0005 deliberately chose **lenient, forward-compatible**
+parsing: unknown fields are tolerated so a newer server doesn't break an
+older tester. Flipping to strict-by-default would reverse that contract and
+would false-positive on servers that legitimately add fields over time.
+
+We also need a JSON Schema validator. A full crate (`jsonschema`,
+Draft 2020-12) pulls a transitive tree we cannot `cargo deny`-audit in every
+dev environment (cargo-deny isn't installed locally; supply-chain review is
+required in CI per `deny.toml`), and conflicts with the slim-binary goal
+(ADR 0008). MCP `inputSchema`s in practice use only a small, stable slice of
+JSON Schema.
+
+## Decision
+
+1. **Strict validation is opt-in.** `[validation] strict = true` in the TOML
+   config (default `false`). With it off, behaviour is byte-for-byte what it
+   was before — ADR 0005's forward-compat contract is preserved; strictness
+   is additive policy, not a protocol change.
+2. **Dependency-free subset validator** in `protocol::schema`: handles
+   `type` / `properties` / `required` / `enum` / `items` and nesting. Any
+   keyword it does not model is **skipped, never failed** — the validator is
+   itself forward-compatible.
+3. **The gate decision is isolated** in
+   `classify_schema_violation(site, violations) -> SchemaPolicy`. Mechanical
+   validation ("does this match the subset?") is separate from the product
+   decision ("does a mismatch here fail the run?"). `Fail` maps to
+   `CallOutcome::ProtocolError` (gates the run); `Warn` / `Ignore` do not.
+
+## Alternatives considered
+
+| Option                                  | Why rejected                                                                                                                                                                        |
+| --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `jsonschema` crate (full Draft 2020-12) | Unauditable transitive deps in local dev; binary-size cost (ADR 0008); MCP only uses a small subset — full coverage is unused weight.                                               |
+| Strict-by-default                       | Reverses ADR 0005; servers that add result fields over time would trip false regressions.                                                                                           |
+| Validate args only, not results         | Throws away the differentiator — asserting on server _behaviour_ is the point. Keep both sides; let the policy weight them differently.                                             |
+| Bake the policy into the validator      | Conflates a CI/product trade-off with mechanical validation, making the decision invisible and hard to tune. Keeping it a tiny standalone function makes it reviewable and ownable. |
+
+## Consequences
+
+**Positive:**
+
+- Delivers the protocol-aware-assertions differentiator without breaking the deliberate forward-compat stance.
+- No new dependencies; fully verifiable in every dev environment.
+- The entire policy is one small function — easy to review, test, and tune.
+
+**Negative:**
+
+- The subset validator won't catch exotic JSON Schema constraints (`pattern`, `minimum`, `oneOf`, …). Acceptable: MCP `inputSchema`s don't use them, and unmodeled keywords are skipped rather than mishandled.
+- Strict mode's value depends on `classify_schema_violation` being implemented with a deliberate policy — the placeholder ships as the safe "never gate" default so an unfinished policy can't silently fail runs.
+
+**Open:**
+
+- Enforcement wiring point: args validated pre-send, result validated post-receive — the exact hook in the scenario/session layer is the remaining integration step and is intentionally landed after the policy is decided.
+- Per-tool strictness overrides: deferred until a user asks (mirrors the ADR 0009 posture).
diff --git a/docs/adr/README.md b/docs/adr/README.md
index d42a267..7692552 100644
--- a/docs/adr/README.md
+++ b/docs/adr/README.md
@@ -4,17 +4,18 @@ Append-only log of significant decisions. New ADRs are added with the next numbe
 
 ## Index
 
-| # | Title | Status |
-|---|---|---|
-| [0001](0001-language-rust.md) | Language: Rust | Accepted |
-| [0002](0002-runtime-tokio.md) | Async runtime: tokio | Accepted |
-| [0003](0003-histogram-hdr.md) | Latency histogram: hdrhistogram | Accepted |
-| [0004](0004-compete-with-reaatech.md) | Strategy: head-on competition with reaatech/mcp-load-test | Accepted |
-| [0005](0005-serve-mcp-mode.md) | Serve mode: expose load tester as MCP server over stdio | Accepted |
-| [0006](0006-zero-copy-protocol-types.md) | Zero-copy protocol types on the request hot path | Accepted |
-| [0007](0007-transport-security-posture.md) | Transport security posture: redirects, frame caps, path validation | Accepted |
-| [0008](0008-release-profile-panic-abort.md) | Release profile: `panic = "abort"` | Accepted |
-| [0009](0009-regression-threshold-defaults.md) | Regression threshold defaults: 10% p99 / 0.5pp error rate | Accepted |
+| #                                             | Title                                                              | Status   |
+| --------------------------------------------- | ------------------------------------------------------------------ | -------- |
+| [0001](0001-language-rust.md)                 | Language: Rust                                                     | Accepted |
+| [0002](0002-runtime-tokio.md)                 | Async runtime: tokio                                               | Accepted |
+| [0003](0003-histogram-hdr.md)                 | Latency histogram: hdrhistogram                                    | Accepted |
+| [0004](0004-compete-with-reaatech.md)         | Strategy: head-on competition with reaatech/mcp-load-test          | Accepted |
+| [0005](0005-serve-mcp-mode.md)                | Serve mode: expose load tester as MCP server over stdio            | Accepted |
+| [0006](0006-zero-copy-protocol-types.md)      | Zero-copy protocol types on the request hot path                   | Accepted |
+| [0007](0007-transport-security-posture.md)    | Transport security posture: redirects, frame caps, path validation | Accepted |
+| [0008](0008-release-profile-panic-abort.md)   | Release profile: `panic = "abort"`                                 | Accepted |
+| [0009](0009-regression-threshold-defaults.md) | Regression threshold defaults: 10% p99 / 0.5pp error rate          | Accepted |
+| [0010](0010-strict-schema-validation.md)      | Opt-in strict MCP schema validation                                | Accepted |
 
 ## When to write a new ADR
 
@@ -32,14 +33,18 @@ Date: YYYY-MM-DD
 Status: Proposed | Accepted | Superseded by [NNNN](NNNN-...)
 
 ## Context
+
 What's the situation? What's the question we're answering?
 
 ## Decision
+
 The choice we made.
 
 ## Alternatives considered
+
 What else we looked at and why we didn't pick them.
 
 ## Consequences
+
 What this commits us to. What it makes easy / hard. Open questions.
 ```

From c9089a52be142f7dc7932cfb68c9957a7712df81 Mon Sep 17 00:00:00 2001
From: Teerapat-Vatpitak <teerapatv.c@gmail.com>
Date: Sat, 16 May 2026 19:38:33 +0700
Subject: [PATCH 4/8] docs: advertise CI gating + strict-validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

B1/B2 shipped real differentiators (configurable regression
thresholds, opt-in protocol-aware schema validation) but the README
and DESIGN matrix didn't surface them, so a reader couldn't tell the
tool is a CI regression gate, not just a profiler.

Add a "CI gating & protocol-aware assertions" section to the README
(threshold gating, configurable compare, strict mode, link to the
ci-integration cookbook) and two rows each to the README and
DESIGN.md §10.5 competitive matrices. Docs only — no code change;
doc -D warnings green.
---
 DESIGN.md | 368 ++++++++++++++++++++++++++++++------------------------
 README.md | 121 +++++++++++-------
 2 files changed, 277 insertions(+), 212 deletions(-)

diff --git a/DESIGN.md b/DESIGN.md
index 537c1f5..4393a98 100644
--- a/DESIGN.md
+++ b/DESIGN.md
@@ -19,6 +19,7 @@ The Model Context Protocol ecosystem is exploding — new MCP servers ship weekl
 ### Canonical motivating case
 
 The author hit a deadlock in [HKUDS/Vibe-Trading](https://github.com/HKUDS/Vibe-Trading) where:
+
 - `initialize` → ✅ worked
 - `tools/list` → ✅ worked
 - `tools/call` → ❌ hung forever
@@ -38,6 +39,7 @@ There are excellent tools for HTTP load testing (k6, vegeta, wrk2), gRPC (ghz),
 ## 2. Goals & Non-Goals
 
 ### Goals
+
 - Detect deadlocks, hangs, livelocks under realistic concurrent load
 - Measure latency (p50/p95/p99), throughput, error rate
 - Work against any MCP server regardless of language / transport (stdio, HTTP, SSE, WebSocket)
@@ -47,6 +49,7 @@ There are excellent tools for HTTP load testing (k6, vegeta, wrk2), gRPC (ghz),
 - Zero-config quick-start: `mcp-loadtest probe -s "python -m my_mcp"` should just work
 
 ### Non-Goals
+
 - **Not a replacement for unit tests.** Different problem.
 - **Not a tool for testing MCP clients.** Client-side bugs are a separate domain.
 - **Not validating tool output correctness.** We test protocol-level behavior. If your tool returns wrong data, that's not what we catch.
@@ -60,12 +63,14 @@ There are excellent tools for HTTP load testing (k6, vegeta, wrk2), gRPC (ghz),
 ### MCP protocol (relevant subset)
 
 JSON-RPC 2.0 framing over one of four transports:
+
 - **stdio** — line-delimited JSON over child process stdin/stdout (most common, all examples in this doc focus here)
 - **HTTP** — Streamable HTTP (simple JSON variant); request via POST, simple JSON response
 - **HTTP+SSE** — request via POST, server pushes events via SSE channel
 - **WebSocket** — bidirectional frames
 
 Lifecycle (stdio):
+
 ```
 client → server   {"method":"initialize", "params":{...}}
 client ← server   {"result":{"protocolVersion":...,"capabilities":{...}}}
@@ -78,14 +83,14 @@ client ← server   {"result":{"content":[{...}]}}
 
 ### Bug classes we target
 
-| Class | Example | Why hard to catch in unit tests |
-|---|---|---|
-| Lazy-init deadlock | Vibe-Trading PR #85 | Bug only surfaces with full subprocess + protocol handshake |
-| Concurrent tool-call race | tools/call before tools/list completes | Need real concurrency; mocked async ≠ real async |
-| Resource exhaustion | 1000 concurrent calls → fd / mem leak | Need sustained load |
-| Slow-tool head-of-line | One slow tool blocks queue | Need mixed workload |
-| Reconnect / mid-call kill | Connection drops between request and response | Hard to simulate without tooling |
-| Notification ordering | Server sends `notifications/cancelled` mid-call | Need sequence-aware client |
+| Class                     | Example                                         | Why hard to catch in unit tests                             |
+| ------------------------- | ----------------------------------------------- | ----------------------------------------------------------- |
+| Lazy-init deadlock        | Vibe-Trading PR #85                             | Bug only surfaces with full subprocess + protocol handshake |
+| Concurrent tool-call race | tools/call before tools/list completes          | Need real concurrency; mocked async ≠ real async            |
+| Resource exhaustion       | 1000 concurrent calls → fd / mem leak           | Need sustained load                                         |
+| Slow-tool head-of-line    | One slow tool blocks queue                      | Need mixed workload                                         |
+| Reconnect / mid-call kill | Connection drops between request and response   | Hard to simulate without tooling                            |
+| Notification ordering     | Server sends `notifications/cancelled` mid-call | Need sequence-aware client                                  |
 
 ---
 
@@ -172,18 +177,18 @@ src/
 
 ### Key crate dependencies
 
-| Crate | Why |
-|---|---|
-| tokio | async runtime |
-| serde / serde_json | JSON-RPC payloads |
-| clap | CLI |
-| toml | config |
-| hdrhistogram | percentile latency |
-| sysinfo | RSS/CPU per pid (cross-platform) |
-| indicatif | terminal progress |
-| tracing | structured logging |
-| thiserror / anyhow | errors |
-| tokio-util | LinesCodec for stdio framing |
+| Crate              | Why                              |
+| ------------------ | -------------------------------- |
+| tokio              | async runtime                    |
+| serde / serde_json | JSON-RPC payloads                |
+| clap               | CLI                              |
+| toml               | config                           |
+| hdrhistogram       | percentile latency               |
+| sysinfo            | RSS/CPU per pid (cross-platform) |
+| indicatif          | terminal progress                |
+| tracing            | structured logging               |
+| thiserror / anyhow | errors                           |
+| tokio-util         | LinesCodec for stdio framing     |
 
 No proc-macro magic. No "framework" — just composable structs.
 
@@ -225,6 +230,7 @@ async fn no_deadlock_under_concurrent_calls() {
 ```
 
 Design choices:
+
 - **Builder pattern** — predictable, no struct-init explosion
 - **`execute()` returns `Result<Report>`** — lets tests pattern-match on metrics
 - **Report exposes raw histograms** — can drive custom assertions
@@ -262,6 +268,7 @@ mcp-loadtest example-config > bench.toml
 ```
 
 Output structure for each run:
+
 ```
 runs/2026-05-10T14-30-00/
 ├── config.toml          # exact config used
@@ -325,23 +332,25 @@ formats = ["markdown", "json", "terminal"]
 
 ## 8. Built-in scenarios
 
-| Scenario | Description | Detects |
-|---|---|---|
-| `cold_start` | Spawn → initialize → first tool call. Repeat N times. (M2 placeholder — needs session factory; tracked for follow-up.) | regression in startup time, init-time deadlocks |
-| `sustained` | Constant load against one session for a fixed duration. Drives the multi-step weighted-random `pattern` engine internally. | baseline p99 latency, throughput, sustained error rate |
-| `spike` | Baseline → sharp burst at peak concurrency for a fixed window → cooldown back to baseline. Models Black-Friday-style traffic spikes. | queue overflow, recovery behavior, fairness under burst |
-| `ramp` | Step concurrency from `from` to `to` by `step_increment`, optionally feeding the per-step metrics into [`analysis::breaking_point`]. | finds break-point — concurrency where p99 explodes |
-| `soak` | Long-duration steady load with periodic snapshots; pairs with `analysis::regression` for latency-drift and (via `ProcessSampler`) RSS-slope leak signals. | memory leaks, latency drift, throughput collapse over hours |
-| `pattern` | Multi-step weighted-random tool-call sequences with per-pattern `think_time` and `ErrorBehavior`. Building block used directly by `sustained`. | realistic mixed workloads (explore-then-act, read-then-write) |
-| `deadlock_probe` | initialize → tools/list → fire N `tools/call` to same tool wrapped in `hang_detect`. Bails on first deadlock to avoid flooding a wedged session. | the **Vibe-Trading bug class** specifically |
-| `race_check` | Issue N identical `tools/call` and run the responses through `analysis::race_detector` (key-sorted JSON canonicalization). | non-determinism / divergent responses to identical inputs |
-| `fuzzer` | Cycle through enumerated malformed-but-plausible payloads (unknown method, numeric method, giant payload, control chars, deep-nested, null/string params); classify each via `analysis::fuzz_report`. | parser bugs, type-confusion in method dispatch |
+| Scenario         | Description                                                                                                                                                                                           | Detects                                                       |
+| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------- |
+| `cold_start`     | Spawn → initialize → first tool call. Repeat N times. (M2 placeholder — needs session factory; tracked for follow-up.)                                                                                | regression in startup time, init-time deadlocks               |
+| `sustained`      | Constant load against one session for a fixed duration. Drives the multi-step weighted-random `pattern` engine internally.                                                                            | baseline p99 latency, throughput, sustained error rate        |
+| `spike`          | Baseline → sharp burst at peak concurrency for a fixed window → cooldown back to baseline. Models Black-Friday-style traffic spikes.                                                                  | queue overflow, recovery behavior, fairness under burst       |
+| `ramp`           | Step concurrency from `from` to `to` by `step_increment`, optionally feeding the per-step metrics into [`analysis::breaking_point`].                                                                  | finds break-point — concurrency where p99 explodes            |
+| `soak`           | Long-duration steady load with periodic snapshots; pairs with `analysis::regression` for latency-drift and (via `ProcessSampler`) RSS-slope leak signals.                                             | memory leaks, latency drift, throughput collapse over hours   |
+| `pattern`        | Multi-step weighted-random tool-call sequences with per-pattern `think_time` and `ErrorBehavior`. Building block used directly by `sustained`.                                                        | realistic mixed workloads (explore-then-act, read-then-write) |
+| `deadlock_probe` | initialize → tools/list → fire N `tools/call` to same tool wrapped in `hang_detect`. Bails on first deadlock to avoid flooding a wedged session.                                                      | the **Vibe-Trading bug class** specifically                   |
+| `race_check`     | Issue N identical `tools/call` and run the responses through `analysis::race_detector` (key-sorted JSON canonicalization).                                                                            | non-determinism / divergent responses to identical inputs     |
+| `fuzzer`         | Cycle through enumerated malformed-but-plausible payloads (unknown method, numeric method, giant payload, control chars, deep-nested, null/string params); classify each via `analysis::fuzz_report`. | parser bugs, type-confusion in method dispatch                |
 
 Deferred to v0.2:
+
 - `slow_mix` — 80% calls to a fast tool, 20% to a deliberately-slow tool (head-of-line blocking, fairness). Approximable today by configuring a multi-step `pattern` with weighted tools.
 - `reconnect` — drop session mid-call (close stdin), spawn new session, retry (resilience, leftover state, zombies). Needs the session pool that lands in M8+.
 
 Each scenario is an `impl Scenario` with two methods:
+
 ```rust
 trait Scenario {
     async fn drive(&self, session: SessionPool, ctx: RunContext) -> ScenarioOutcome;
@@ -357,27 +366,29 @@ trait Scenario {
 
 Mock MCP servers in `tests/fixtures/`. Each is a tiny Python script (chosen for ubiquity, not Rust, to make the test environment realistic).
 
-| Mock | Behavior | Tests |
-|---|---|---|
-| `mock-normal.py` | Echoes args, responds in 1ms | happy-path metrics shape |
-| `mock-slow.py` | Tool sleeps 2s | latency histogram correctness |
-| `mock-broken.py` | Hangs on first tools/call (replicates Vibe-Trading bug) | `deadlock_probe` correctly classifies |
-| `mock-crash.py` | Panics on 1% of calls | error-rate accuracy |
-| `mock-leak.py` | Allocates 10 KB/call, never frees | `leak` scenario detects |
-| `mock-error.py` | Returns JSON-RPC errors per spec | error classification |
-| `mock-slow-init.py` | Takes 5s to respond to `initialize` | `cold_start` measures correctly |
-| `mock-malformed.py` | Returns invalid JSON occasionally | parser robustness |
+| Mock                | Behavior                                                | Tests                                 |
+| ------------------- | ------------------------------------------------------- | ------------------------------------- |
+| `mock-normal.py`    | Echoes args, responds in 1ms                            | happy-path metrics shape              |
+| `mock-slow.py`      | Tool sleeps 2s                                          | latency histogram correctness         |
+| `mock-broken.py`    | Hangs on first tools/call (replicates Vibe-Trading bug) | `deadlock_probe` correctly classifies |
+| `mock-crash.py`     | Panics on 1% of calls                                   | error-rate accuracy                   |
+| `mock-leak.py`      | Allocates 10 KB/call, never frees                       | `leak` scenario detects               |
+| `mock-error.py`     | Returns JSON-RPC errors per spec                        | error classification                  |
+| `mock-slow-init.py` | Takes 5s to respond to `initialize`                     | `cold_start` measures correctly       |
+| `mock-malformed.py` | Returns invalid JSON occasionally                       | parser robustness                     |
 
 Test invariant: for each (scenario × mock) pair, the report's machine-readable summary contains expected fields with expected ranges. This is the bulk of integration tests.
 
 ### Layer B — does it catch real bugs?
 
 Snapshot test against a known-buggy commit of Vibe-Trading:
+
 - Pin to commit `~PR-85` (just before the fix)
 - Run `deadlock_probe` scenario
 - Assert: report flags ≥1 deadlock, identifies `tools/call` as the offending request
 
 Re-run against post-fix commit:
+
 - Same scenario, expect 0 deadlocks
 
 This is the killer demo. It goes in the README.
@@ -390,26 +401,27 @@ CI matrix: ubuntu-latest, macos-latest, windows-latest × stable Rust × Python
 
 ## 10. Milestones (revised 2026-05-10 — head-on competition with reaatech/mcp-load-test)
 
-Original 3-week plan replaced after discovering [reaatech/mcp-load-test](https://github.com/reaatech/mcp-load-test) ships v0.1 functionality already (see §10.5 for parity matrix). v0.1.0 of mcp-loadtest must reach feature parity *and* surface our differentiators before re-publishing.
+Original 3-week plan replaced after discovering [reaatech/mcp-load-test](https://github.com/reaatech/mcp-load-test) ships v0.1 functionality already (see §10.5 for parity matrix). v0.1.0 of mcp-loadtest must reach feature parity _and_ surface our differentiators before re-publishing.
 
 Repo was private through the M1-M7 development phase. The new public repo URL will be added once v0.1.0 is published to crates.io.
 
 M1 through M7 are all shipped. Post-M7 work (spike scenario, HTML reporter, WebSocket transport, hot-path zero-copy refactor, criterion benches) is captured under `[Unreleased]` in CHANGELOG rather than as a new milestone — the work is small + cohesive enough that bundling it into v0.1.0 makes more sense than coining "M8" for it. The "Week N" column is dropped because milestones are no longer time-boxed — they're released.
 
-| M | Theme | Key deliverables |
-|---|---|---|
-| **M1** ✓ | stdio Session | `Session::spawn` → handshake → `list_tools`/`call_tool`/`shutdown`; mock-normal.py; happy-path integration test |
-| **M2** ✓ | Scenarios + metrics core | `Scenario` trait; `cold_start` + `sustained` + `deadlock_probe` impls; `hang_detector` (§15.1); hdrhistogram metrics; mocks `mock-broken`/`mock-slow`/`mock-crash` + tests |
-| **M3** ✓ | Reports + first internal release | TOML config; markdown / JSON / console reporters; sysinfo-based process sampling; **regression test against real Vibe-Trading commit ~PR-85** |
-| **M4** ✓ | Transport parity | HTTP transport (StreamableHTTP); SSE transport; HTTP/SSE fixtures; transport-aware concurrency profiles |
-| **M5** ✓ | Analysis parity | `breaking_point` detection; performance grading (A-F per latency/concurrency/error); realistic patterns (explore-then-act, read-then-write, multi-step) with weighted random + think-time; `soak` scenario polish; `compare-baselines` subcommand |
-| **M6** ✓ | Differentiators v1 | Real-time terminal TUI dashboard (live latency/throughput/RSS); server resource sampling beyond RSS (CPU, fd, threads); `race_detector` scenario; cross-server compare (`run --server srv-a --server srv-b`) |
-| **M7** ✓ | Differentiators v2 + v0.1 polish | Protocol fuzzer (basic — random/malformed payloads); coverage tracking (tools registered vs. exercised); per-tool SLO assertions; README rewrite with competitive positioning; `cargo install` smoke test on all 3 OS |
-| **Post-M7** ✓ | Pre-public-release close-out | Spike scenario; HTML reporter; WebSocket transport; hot-path zero-copy refactor; criterion benches (DESIGN §19 claims now reproducible). See CHANGELOG `[Unreleased]`. |
-| **v0.1.0-rc** | Pre-publish review in flight | repo back to **public**; crates.io publish; HN/lobste.rs/r/rust announce |
-| _M8+ stretch_ | Beyond | AI-assisted pattern generator; distributed mode (multi-worker); replay/record; PyO3 binding |
+| M             | Theme                            | Key deliverables                                                                                                                                                                                                                                  |
+| ------------- | -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| **M1** ✓      | stdio Session                    | `Session::spawn` → handshake → `list_tools`/`call_tool`/`shutdown`; mock-normal.py; happy-path integration test                                                                                                                                   |
+| **M2** ✓      | Scenarios + metrics core         | `Scenario` trait; `cold_start` + `sustained` + `deadlock_probe` impls; `hang_detector` (§15.1); hdrhistogram metrics; mocks `mock-broken`/`mock-slow`/`mock-crash` + tests                                                                        |
+| **M3** ✓      | Reports + first internal release | TOML config; markdown / JSON / console reporters; sysinfo-based process sampling; **regression test against real Vibe-Trading commit ~PR-85**                                                                                                     |
+| **M4** ✓      | Transport parity                 | HTTP transport (StreamableHTTP); SSE transport; HTTP/SSE fixtures; transport-aware concurrency profiles                                                                                                                                           |
+| **M5** ✓      | Analysis parity                  | `breaking_point` detection; performance grading (A-F per latency/concurrency/error); realistic patterns (explore-then-act, read-then-write, multi-step) with weighted random + think-time; `soak` scenario polish; `compare-baselines` subcommand |
+| **M6** ✓      | Differentiators v1               | Real-time terminal TUI dashboard (live latency/throughput/RSS); server resource sampling beyond RSS (CPU, fd, threads); `race_detector` scenario; cross-server compare (`run --server srv-a --server srv-b`)                                      |
+| **M7** ✓      | Differentiators v2 + v0.1 polish | Protocol fuzzer (basic — random/malformed payloads); coverage tracking (tools registered vs. exercised); per-tool SLO assertions; README rewrite with competitive positioning; `cargo install` smoke test on all 3 OS                             |
+| **Post-M7** ✓ | Pre-public-release close-out     | Spike scenario; HTML reporter; WebSocket transport; hot-path zero-copy refactor; criterion benches (DESIGN §19 claims now reproducible). See CHANGELOG `[Unreleased]`.                                                                            |
+| **v0.1.0-rc** | Pre-publish review in flight     | repo back to **public**; crates.io publish; HN/lobste.rs/r/rust announce                                                                                                                                                                          |
+| _M8+ stretch_ | Beyond                           | AI-assisted pattern generator; distributed mode (multi-worker); replay/record; PyO3 binding                                                                                                                                                       |
 
 **Definition of done for v0.1.0:**
+
 - `cargo install mcp-loadtest-cli` works on Linux/macOS/Windows.
 - `mcp-loadtest deadlock-probe -s "python -m vibe_trading_mcp"` reproduces the original bug on commit `~PR-85`.
 - All §10.5 parity-must-have rows are checked.
@@ -424,40 +436,42 @@ reaatech/mcp-load-test as of 2026-05-10 (TS monorepo, 77 source files, ~50% of R
 
 ### Parity — features they have, we must match before re-publishing public
 
-| Feature | reaatech | mcp-loadtest target | Milestone |
-|---|---|---|---|
-| stdio transport | ✓ | ✓ | M1 |
-| HTTP (StreamableHTTP) transport | ✓ | ✓ | M4 |
-| SSE transport | ✓ | ✓ | M4 |
-| WebSocket transport | ✗ | ✓ | Post-M7 |
-| Latency histograms p50/p95/p99/p999 per tool | ✓ | ✓ | M2 |
-| Breaking point detection | ✓ | ✓ | M5 |
-| Performance grading A-F | ✓ | ✓ | M5 |
-| Soak / leak detection | ✓ | ✓ | M5 |
-| Spike scenario | ✓ | ✓ | Post-M7 |
-| Compare baselines | ✓ | ✓ | M5 |
-| Realistic patterns (explore-then-act, multi-step) | ✓ | ✓ | M5 |
-| Console + markdown + JSON reporters | ✓ | ✓ | M3 |
-| HTML reporter (self-contained) | ✗ | ✓ | Post-M7 |
-| Programmatic library API | ✓ | ✓ | M2/M3 |
+| Feature                                           | reaatech | mcp-loadtest target | Milestone |
+| ------------------------------------------------- | -------- | ------------------- | --------- |
+| stdio transport                                   | ✓        | ✓                   | M1        |
+| HTTP (StreamableHTTP) transport                   | ✓        | ✓                   | M4        |
+| SSE transport                                     | ✓        | ✓                   | M4        |
+| WebSocket transport                               | ✗        | ✓                   | Post-M7   |
+| Latency histograms p50/p95/p99/p999 per tool      | ✓        | ✓                   | M2        |
+| Breaking point detection                          | ✓        | ✓                   | M5        |
+| Performance grading A-F                           | ✓        | ✓                   | M5        |
+| Soak / leak detection                             | ✓        | ✓                   | M5        |
+| Spike scenario                                    | ✓        | ✓                   | Post-M7   |
+| Compare baselines                                 | ✓        | ✓                   | M5        |
+| Realistic patterns (explore-then-act, multi-step) | ✓        | ✓                   | M5        |
+| Console + markdown + JSON reporters               | ✓        | ✓                   | M3        |
+| HTML reporter (self-contained)                    | ✗        | ✓                   | Post-M7   |
+| Programmatic library API                          | ✓        | ✓                   | M2/M3     |
 
 ### Differentiators — features we have/will have that they don't
 
-| Feature | reaatech | mcp-loadtest | Why it matters |
-|---|---|---|---|
-| **Deadlock detection (`deadlock_probe`)** | ✗ | ✓ M2 | Lazy-init / async-worker bugs that break in prod. Direct response to Vibe-Trading PR #85. |
-| **Race detector** | ✗ | ✓ M6 | Order-sensitive concurrent tool calls; finds protocol-level race bugs. |
-| **Real-time TUI dashboard** | ✗ (post-hoc only) | ✓ M6 | Watch perf cliff happen live during a run. |
-| **Cross-server compare** (run vs N targets) | partial (compare baselines = 2 runs) | ✓ M6 (1 run, N targets) | Side-by-side: vendor A vs vendor B vs your fork. |
-| **Server resource sampling** (CPU/fd/threads/RSS over time) | ✗ (latency only) | ✓ M6 | Find resource exhaustion before throughput collapses. |
-| **Protocol fuzzer (mcp-fuzz integrated)** | ✗ | ✓ M7 | Random/malformed payloads; finds parser bugs unit tests miss. |
-| **Coverage tracking** (registered vs exercised tools) | ✗ | ✓ M7 | Catch silently-broken tools that nobody tests in CI. |
-| **Per-tool SLO assertions** | partial (global) | ✓ M7 | Per-tool latency/error budgets in CI. |
-| **Rust perf** + static binary | ✗ (Node runtime required) | ✓ | `cargo install` → single ~5MB binary; no Node toolchain. |
-| **AI-assisted pattern generator** | ✗ | ⏳ M8 stretch | LLM reads tool schemas → generates realistic call sequences. |
-| **Distributed mode** | ✗ | ⏳ M8 stretch | Multiple workers driving one server (high-RPS targets). |
-| **Replay / record** | ✗ | ⏳ M8 stretch | Capture prod traffic, replay deterministically. |
-| **Self-hosted as MCP server** (`mcp-loadtest serve --mcp`) | ✗ | ✓ M7 | AI agents (Claude, Cursor, etc.) call `deadlock_probe` / `compare` / `report` directly via MCP. Recursive: load-test an MCP using an MCP. |
+| Feature                                                     | reaatech                             | mcp-loadtest            | Why it matters                                                                                                                                                                         |
+| ----------------------------------------------------------- | ------------------------------------ | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| **Deadlock detection (`deadlock_probe`)**                   | ✗                                    | ✓ M2                    | Lazy-init / async-worker bugs that break in prod. Direct response to Vibe-Trading PR #85.                                                                                              |
+| **Race detector**                                           | ✗                                    | ✓ M6                    | Order-sensitive concurrent tool calls; finds protocol-level race bugs.                                                                                                                 |
+| **Real-time TUI dashboard**                                 | ✗ (post-hoc only)                    | ✓ M6                    | Watch perf cliff happen live during a run.                                                                                                                                             |
+| **Cross-server compare** (run vs N targets)                 | partial (compare baselines = 2 runs) | ✓ M6 (1 run, N targets) | Side-by-side: vendor A vs vendor B vs your fork.                                                                                                                                       |
+| **Server resource sampling** (CPU/fd/threads/RSS over time) | ✗ (latency only)                     | ✓ M6                    | Find resource exhaustion before throughput collapses.                                                                                                                                  |
+| **Protocol fuzzer (mcp-fuzz integrated)**                   | ✗                                    | ✓ M7                    | Random/malformed payloads; finds parser bugs unit tests miss.                                                                                                                          |
+| **Coverage tracking** (registered vs exercised tools)       | ✗                                    | ✓ M7                    | Catch silently-broken tools that nobody tests in CI.                                                                                                                                   |
+| **Per-tool SLO assertions**                                 | partial (global)                     | ✓ M7                    | Per-tool latency/error budgets in CI.                                                                                                                                                  |
+| **Configurable regression thresholds**                      | ✗ (fixed)                            | ✓ v0.1                  | `compare` CLI flags + `compare_runs` MCP args override p99 / error-rate / deadlock policy; defaults unchanged (ADR 0009).                                                              |
+| **Protocol-aware assertions**                               | ✗                                    | ✓ v0.1                  | Opt-in strict mode validates `tools/call` args vs the server's advertised `inputSchema`; mismatch → `ProtocolError` gates the run. Forward-compatible, off by default (ADR 0005/0010). |
+| **Rust perf** + static binary                               | ✗ (Node runtime required)            | ✓                       | `cargo install` → single ~5MB binary; no Node toolchain.                                                                                                                               |
+| **AI-assisted pattern generator**                           | ✗                                    | ⏳ M8 stretch           | LLM reads tool schemas → generates realistic call sequences.                                                                                                                           |
+| **Distributed mode**                                        | ✗                                    | ⏳ M8 stretch           | Multiple workers driving one server (high-RPS targets).                                                                                                                                |
+| **Replay / record**                                         | ✗                                    | ⏳ M8 stretch           | Capture prod traffic, replay deterministically.                                                                                                                                        |
+| **Self-hosted as MCP server** (`mcp-loadtest serve --mcp`)  | ✗                                    | ✓ M7                    | AI agents (Claude, Cursor, etc.) call `deadlock_probe` / `compare` / `report` directly via MCP. Recursive: load-test an MCP using an MCP.                                              |
 
 ### Strategic positioning (for README at v0.1.0)
 
@@ -469,16 +483,16 @@ The README at re-publish must lead with the deadlock demo (replicated Vibe-Tradi
 
 ## 11. Decisions (resolved 2026-05-10)
 
-| # | Question | Decision | Rationale |
-|---|---|---|---|
-| 1 | Crate name | **`mcp-loadtest`** (lib) + **`mcp-loadtest-cli`** (bin) | descriptive, discoverable, doesn't pigeonhole to "bench" |
-| 2 | License | **MIT OR Apache-2.0** (dual) | Rust ecosystem standard; MIT for individuals, Apache-2.0 for corporate patent grant |
-| 3 | Repo location | **`github.com/Teerapat-Vatpitak/mcp-loadtest`** | personal handle for v0.1; transfer to `mcp-tools/` org if/when sister projects emerge |
-| 4 | MCP protocol versioning | v0.1 pin to spec v1.x, warn on mismatch; `--strict-protocol` flag for fail-on-mismatch; v0.2+ detect-and-adapt | ship v0.1 fast, add complexity when justified |
-| 5 | `deadlock_probe` | **both** subcommand (`mcp-loadtest deadlock-probe -s "..."`) and scenario in `run --scenario deadlock_probe` | subcommand for newcomer UX, scenario for CI; near-zero implementation cost |
-| 6 | Server stderr | **always capture** to `runs/<id>/server.stderr.log`; opt-in `--tee-stderr` to also stream | stderr critical for debugging; capture is cheap; tee opt-in to avoid CI log spam |
-| 7 | Diff-vs-baseline mode | **defer to M5 stretch** | v0.1 emits JSON; users diff externally. Proper baseline storage + regression detection has too many edge cases for v0.1 |
-| 8 | Library API → 1.0 | When **all three**: 3 months no breaking changes + 5+ external users + 1 real bug caught in wild | calendar time + adoption + value-prop validation, all required |
+| #   | Question                | Decision                                                                                                       | Rationale                                                                                                               |
+| --- | ----------------------- | -------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- |
+| 1   | Crate name              | **`mcp-loadtest`** (lib) + **`mcp-loadtest-cli`** (bin)                                                        | descriptive, discoverable, doesn't pigeonhole to "bench"                                                                |
+| 2   | License                 | **MIT OR Apache-2.0** (dual)                                                                                   | Rust ecosystem standard; MIT for individuals, Apache-2.0 for corporate patent grant                                     |
+| 3   | Repo location           | **`github.com/Teerapat-Vatpitak/mcp-loadtest`**                                                                | personal handle for v0.1; transfer to `mcp-tools/` org if/when sister projects emerge                                   |
+| 4   | MCP protocol versioning | v0.1 pin to spec v1.x, warn on mismatch; `--strict-protocol` flag for fail-on-mismatch; v0.2+ detect-and-adapt | ship v0.1 fast, add complexity when justified                                                                           |
+| 5   | `deadlock_probe`        | **both** subcommand (`mcp-loadtest deadlock-probe -s "..."`) and scenario in `run --scenario deadlock_probe`   | subcommand for newcomer UX, scenario for CI; near-zero implementation cost                                              |
+| 6   | Server stderr           | **always capture** to `runs/<id>/server.stderr.log`; opt-in `--tee-stderr` to also stream                      | stderr critical for debugging; capture is cheap; tee opt-in to avoid CI log spam                                        |
+| 7   | Diff-vs-baseline mode   | **defer to M5 stretch**                                                                                        | v0.1 emits JSON; users diff externally. Proper baseline storage + regression detection has too many edge cases for v0.1 |
+| 8   | Library API → 1.0       | When **all three**: 3 months no breaking changes + 5+ external users + 1 real bug caught in wild               | calendar time + adoption + value-prop validation, all required                                                          |
 
 ---
 
@@ -803,7 +817,7 @@ Simple, but worth specifying so the report's `passed()` is unambiguous.
 
 ## 16. Mock server specs
 
-Mocks live in `tests/fixtures/<name>.py`. Each is < 50 lines of Python — minimal MCP server using stdio + JSON-RPC by hand (no fastmcp dep, to avoid version coupling). Shipped fixtures: `mock-normal.py`, `mock-slow.py`, `mock-broken.py`, `mock-crash.py`, plus `mock-http-server.py` and `mock-sse-server.py` (transport parity coverage). Pseudocode for each below; entries marked *(planned for v0.2)* are documented for completeness but not yet shipped.
+Mocks live in `tests/fixtures/<name>.py`. Each is < 50 lines of Python — minimal MCP server using stdio + JSON-RPC by hand (no fastmcp dep, to avoid version coupling). Shipped fixtures: `mock-normal.py`, `mock-slow.py`, `mock-broken.py`, `mock-crash.py`, plus `mock-http-server.py` and `mock-sse-server.py` (transport parity coverage). Pseudocode for each below; entries marked _(planned for v0.2)_ are documented for completeness but not yet shipped.
 
 ### 16.1 mock-normal.py
 
@@ -866,7 +880,7 @@ while True:
 # Stdlib http.server only. Used by SseTransport integration tests.
 ```
 
-### 16.7 mock-leak.py *(planned for v0.2)*
+### 16.7 mock-leak.py _(planned for v0.2)_
 
 ```python
 # Allocates 10 KB per tools/call into a module-global list. Never frees.
@@ -875,7 +889,7 @@ while True:
 # (t, rss) series; a real leaking fixture is still useful for end-to-end coverage.
 ```
 
-### 16.8 mock-error.py *(planned for v0.2)*
+### 16.8 mock-error.py _(planned for v0.2)_
 
 ```python
 # Returns JSON-RPC errors per spec: -32601 method not found,
@@ -883,13 +897,13 @@ while True:
 # Cycles through error codes per call. Tests error classification (§18).
 ```
 
-### 16.9 mock-slow-init.py *(planned for v0.2)*
+### 16.9 mock-slow-init.py _(planned for v0.2)_
 
 ```python
 # Sleeps 5s on `initialize` before responding. Tests cold_start measurement.
 ```
 
-### 16.10 mock-malformed.py *(planned for v0.2)*
+### 16.10 mock-malformed.py _(planned for v0.2)_
 
 ```python
 # Returns invalid JSON every 10th response (truncated, missing field).
@@ -927,36 +941,52 @@ Stream-friendly. Can be processed with `jq` or any line-oriented tool.
 
 ```json
 {
-  "run_id": "01HXY...",
-  "started_at": "2026-05-10T07:30:00Z",
-  "duration_secs": 60.0,
-  "scenario": {"kind": "Sustained", "concurrent": 50, "duration_secs": 60.0},
-  "latency_ms": {
-    "p50": 12.3, "p95": 45.6, "p99": 123.4, "p999": 456.7,
-    "min": 1.2, "max": 999.9, "mean": 23.4, "stddev": 18.7,
-    "count": 12345
-  },
-  "throughput": {
-    "total_requests": 12345, "successful_requests": 12300,
-    "requests_per_sec": 205.75
-  },
-  "errors": {
-    "total": 45,
-    "by_category": {
-      "Hang": 0, "Timeout": 5, "ServerError": 30,
-      "ProtocolError": 10, "Crash": 0, "Malformed": 0
-    }
-  },
-  "process": {
-    "peak_rss_mb": 156.3, "final_rss_mb": 142.1,
-    "avg_cpu_pct": 23.4
-  },
-  "deadlock_count": 0,
-  "hang_count": 0,
-  "threshold_violations": [
-    {"metric": "p99_latency", "expected": "<=100ms", "actual": "123.4ms"}
-  ],
-  "passed": false
+    "run_id": "01HXY...",
+    "started_at": "2026-05-10T07:30:00Z",
+    "duration_secs": 60.0,
+    "scenario": {
+        "kind": "Sustained",
+        "concurrent": 50,
+        "duration_secs": 60.0
+    },
+    "latency_ms": {
+        "p50": 12.3,
+        "p95": 45.6,
+        "p99": 123.4,
+        "p999": 456.7,
+        "min": 1.2,
+        "max": 999.9,
+        "mean": 23.4,
+        "stddev": 18.7,
+        "count": 12345
+    },
+    "throughput": {
+        "total_requests": 12345,
+        "successful_requests": 12300,
+        "requests_per_sec": 205.75
+    },
+    "errors": {
+        "total": 45,
+        "by_category": {
+            "Hang": 0,
+            "Timeout": 5,
+            "ServerError": 30,
+            "ProtocolError": 10,
+            "Crash": 0,
+            "Malformed": 0
+        }
+    },
+    "process": {
+        "peak_rss_mb": 156.3,
+        "final_rss_mb": 142.1,
+        "avg_cpu_pct": 23.4
+    },
+    "deadlock_count": 0,
+    "hang_count": 0,
+    "threshold_violations": [
+        { "metric": "p99_latency", "expected": "<=100ms", "actual": "123.4ms" }
+    ],
+    "passed": false
 }
 ```
 
@@ -975,32 +1005,38 @@ JSON Schema published at `schema/metrics.v1.json` for downstream tooling.
 **Started:** 2026-05-10 07:30:00 UTC
 
 ## Summary
+
 - Total requests: 12,345
 - Throughput: 205.75 req/s
 - Error rate: 0.36%
-- Deadlocks: 0  Hangs: 0
+- Deadlocks: 0 Hangs: 0
 
 ## Latency
-| p50 | p95 | p99 | p999 | max |
-|---|---|---|---|---|
+
+| p50    | p95    | p99            | p999    | max     |
+| ------ | ------ | -------------- | ------- | ------- |
 | 12.3ms | 45.6ms | **123.4ms** ❌ | 456.7ms | 999.9ms |
 
 (latency histogram ASCII chart here)
 
 ## Errors
-| Category | Count |
-|---|---|
-| ServerError | 30 |
-| ProtocolError | 10 |
-| Timeout | 5 |
+
+| Category      | Count |
+| ------------- | ----- |
+| ServerError   | 30    |
+| ProtocolError | 10    |
+| Timeout       | 5     |
 
 ## Process
+
 Peak RSS: 156.3 MB · Final RSS: 142.1 MB · Avg CPU: 23.4%
 
 ## Threshold violations
+
 - ❌ **p99_latency**: expected ≤100ms, got 123.4ms
 
 ## Trace
+
 Full trace: `./trace.jsonl` (12,345 events, 8.2 MB)
 ```
 
@@ -1010,17 +1046,17 @@ Full trace: `./trace.jsonl` (12,345 events, 8.2 MB)
 
 Every failure is classified into exactly one category. Used for `ErrorStats.by_category` and reporting.
 
-| Category | Definition | Example |
-|---|---|---|
-| `Hang` | No response within `hang_threshold`, but response arrived before grace_period expires | tool genuinely slow under contention |
-| `Deadlock` | No response after `hang_threshold + grace_period` | Vibe-Trading PR #85 |
-| `Timeout` | Client-side configured deadline exceeded (separate from hang_threshold) | network buffer full |
-| `ServerError` | JSON-RPC error response with `code` in `[-32099..=-32000]` (server-defined) | tool returned business error |
-| `ProtocolError` | JSON-RPC error with `code` `-32600..=-32603` (transport / spec violations) | malformed request rejected |
-| `Crash` | Server process exited (non-zero or signal) during call | unhandled panic |
-| `Malformed` | Response was not valid JSON or didn't match JSON-RPC schema | partial response, broken framing |
-| `Disconnected` | Transport closed unexpectedly mid-call | broken pipe |
-| `Cancelled` | Client cancelled the request before response | scenario shutdown |
+| Category        | Definition                                                                            | Example                              |
+| --------------- | ------------------------------------------------------------------------------------- | ------------------------------------ |
+| `Hang`          | No response within `hang_threshold`, but response arrived before grace_period expires | tool genuinely slow under contention |
+| `Deadlock`      | No response after `hang_threshold + grace_period`                                     | Vibe-Trading PR #85                  |
+| `Timeout`       | Client-side configured deadline exceeded (separate from hang_threshold)               | network buffer full                  |
+| `ServerError`   | JSON-RPC error response with `code` in `[-32099..=-32000]` (server-defined)           | tool returned business error         |
+| `ProtocolError` | JSON-RPC error with `code` `-32600..=-32603` (transport / spec violations)            | malformed request rejected           |
+| `Crash`         | Server process exited (non-zero or signal) during call                                | unhandled panic                      |
+| `Malformed`     | Response was not valid JSON or didn't match JSON-RPC schema                           | partial response, broken framing     |
+| `Disconnected`  | Transport closed unexpectedly mid-call                                                | broken pipe                          |
+| `Cancelled`     | Client cancelled the request before response                                          | scenario shutdown                    |
 
 Classification precedence: top-down. A request that hangs and then the server crashes → classified as `Crash` (the terminal event), but trace.jsonl records both `hang` and `crash` events for forensics.
 
@@ -1030,13 +1066,13 @@ Classification precedence: top-down. A request that hangs and then the server cr
 
 `mcp-loadtest` should never be the bottleneck.
 
-| Aspect | Target |
-|---|---|
-| Driver per-request CPU overhead | < 50µs (excluding JSON serialization) |
-| Memory per concurrent worker | < 100KB |
-| Max sustainable concurrency on a 4-core laptop | ≥ 1000 workers |
-| Trace file write throughput | ≥ 100k events/sec |
-| Histogram update | lock-free per-worker, merged at end |
+| Aspect                                         | Target                                |
+| ---------------------------------------------- | ------------------------------------- |
+| Driver per-request CPU overhead                | < 50µs (excluding JSON serialization) |
+| Memory per concurrent worker                   | < 100KB                               |
+| Max sustainable concurrency on a 4-core laptop | ≥ 1000 workers                        |
+| Trace file write throughput                    | ≥ 100k events/sec                     |
+| Histogram update                               | lock-free per-worker, merged at end   |
 
 These are tested in `benches/` (criterion). v0.1 ships with reproducible numbers in the README.
 
@@ -1050,6 +1086,7 @@ These are tested in `benches/` (criterion). v0.1 ships with reproducible numbers
 - Library MSRV (minimum supported Rust version): stable - 2 (e.g. if 1.85 is current stable, MSRV is 1.83).
 
 When to commit to 1.0:
+
 - After 3 months of v0.x with no breaking changes
 - After 5+ external users have integrated
 - After at least 1 real bug caught in the wild and reported back
@@ -1070,15 +1107,15 @@ mcp-loadtest is a tool that AI agents will both **operate** (Claude Code running
 
 The single most important AI-friendly feature. mcp-loadtest exposes itself as an MCP server with these tools:
 
-| Tool | Args | Returns |
-|---|---|---|
-| `deadlock_probe` | `server_command`, `tool`, `concurrent` | `{ deadlock_count, hung_for_ms[], details }` |
-| `sustained_load` | `server_command`, `concurrent`, `duration_secs`, `tool`, `args` | `{ p50_ms, p99_ms, error_rate, requests_per_sec }` |
-| `compare_runs` | `baseline_run_dir`, `current_run_dir` | structured diff with regression flags |
-| `report_summary` | `run_dir` | markdown summary string |
-| `list_recent_runs` | `limit` | run dirs with metadata |
+| Tool               | Args                                                            | Returns                                            |
+| ------------------ | --------------------------------------------------------------- | -------------------------------------------------- |
+| `deadlock_probe`   | `server_command`, `tool`, `concurrent`                          | `{ deadlock_count, hung_for_ms[], details }`       |
+| `sustained_load`   | `server_command`, `concurrent`, `duration_secs`, `tool`, `args` | `{ p50_ms, p99_ms, error_rate, requests_per_sec }` |
+| `compare_runs`     | `baseline_run_dir`, `current_run_dir`                           | structured diff with regression flags              |
+| `report_summary`   | `run_dir`                                                       | markdown summary string                            |
+| `list_recent_runs` | `limit`                                                         | run dirs with metadata                             |
 
-A user can say to Claude / Cursor / any MCP-aware agent: *"Find deadlocks in my new MCP server at `python -m foo`"* — and the agent calls `deadlock_probe` directly. **No human-in-the-loop required to spawn a child process and parse stdout** — the agent gets structured JSON back.
+A user can say to Claude / Cursor / any MCP-aware agent: _"Find deadlocks in my new MCP server at `python -m foo`"_ — and the agent calls `deadlock_probe` directly. **No human-in-the-loop required to spawn a child process and parse stdout** — the agent gets structured JSON back.
 
 **Reaatech doesn't do this.** It's our most under-priced differentiator.
 
@@ -1094,6 +1131,7 @@ Hint: server may have crashed before responding. Check stderr at:
 ```
 
 vs. the bad version:
+
 ```
 Error: BrokenPipe(Os { code: 32, ... })
 ```
@@ -1130,6 +1168,7 @@ LLMs use this to plan the right invocation. Reduces "I tried it but it didn't do
 ### 21.6 `mcp-loadtest doctor`
 
 Diagnoses common setup issues:
+
 - Python interpreter not on PATH (for fixture-based tests).
 - MSVC vs GNU toolchain mismatch on Windows.
 - Stale `runs/` accumulation.
@@ -1158,6 +1197,7 @@ A report that says `p99 latency: 234ms` is data. A report that adds `"95% of use
 Per-scenario copy-pasteable commands + expected output. LLMs train on README-style examples; cookbook entries make those examples concrete and executable.
 
 Examples to ship at v0.1.0:
+
 - "Find deadlocks in my new MCP server"
 - "Add a regression gate to my CI"
 - "Compare two implementations of the same MCP server"
diff --git a/README.md b/README.md
index 705f863..6dea968 100644
--- a/README.md
+++ b/README.md
@@ -35,22 +35,45 @@ This is the bug class that breaks MCP servers in production. Unit tests don't ca
 ## What it does
 
 - **Bug-class detection**
-  - `deadlock_probe` — fires N concurrent `tools/call`s through `hang_detect`; classifies each as success / slow / deadlock (see [DESIGN.md §15.2](DESIGN.md#152-deadlock-probe-scenario)).
-  - `race_check` — issues identical calls and diffs the responses to surface non-determinism (clocks, RNG, leaked state).
-  - Hang detector watchdog wraps every call, so any scenario can surface a hung tool, not just the dedicated probe.
+    - `deadlock_probe` — fires N concurrent `tools/call`s through `hang_detect`; classifies each as success / slow / deadlock (see [DESIGN.md §15.2](DESIGN.md#152-deadlock-probe-scenario)).
+    - `race_check` — issues identical calls and diffs the responses to surface non-determinism (clocks, RNG, leaked state).
+    - Hang detector watchdog wraps every call, so any scenario can surface a hung tool, not just the dedicated probe.
 - **Load testing**
-  - `sustained` — constant concurrency over a duration; baseline p50/p95/p99/p999 + throughput.
-  - `ramp` — linear ramp of concurrency to find the break-point.
-  - `soak` — long-running sustained load with periodic RSS sampling for leak hunting.
-  - Cold-start measurement, weighted pattern mixes (explore-then-act, multi-step).
+    - `sustained` — constant concurrency over a duration; baseline p50/p95/p99/p999 + throughput.
+    - `ramp` — linear ramp of concurrency to find the break-point.
+    - `soak` — long-running sustained load with periodic RSS sampling for leak hunting.
+    - Cold-start measurement, weighted pattern mixes (explore-then-act, multi-step).
 - **Reporting**
-  - Markdown report at `runs/<ulid>/report.md`, self-contained `report.html` (no external deps), machine-readable `metrics.json`, ANSI terminal summary.
-  - Schema-stable JSON (see `docs/schema/metrics.v1.json`); snapshot-tested so downstream LLM agents don't break on patch versions.
-  - `mcp-loadtest compare baseline.json current.json` for regression diffs in CI.
-  - `mcp-loadtest cross --server "..." --server "..."` for side-by-side runs across N targets.
+    - Markdown report at `runs/<ulid>/report.md`, self-contained `report.html` (no external deps), machine-readable `metrics.json`, ANSI terminal summary.
+    - Schema-stable JSON (see `docs/schema/metrics.v1.json`); snapshot-tested so downstream LLM agents don't break on patch versions.
+    - `mcp-loadtest compare baseline.json current.json` for regression diffs in CI.
+    - `mcp-loadtest cross --server "..." --server "..."` for side-by-side runs across N targets.
 - **AI-agent friendly**
-  - Stable JSON output and structured error messages with `Hint:` lines.
-  - `mcp-loadtest serve --mcp` exposes the tool itself as an MCP server so Claude Code, Cursor, or any MCP-aware agent can call `deadlock_probe`, `sustained_load`, and `compare_runs` as MCP tools directly. See [DESIGN.md §21.2](DESIGN.md#212-self-hosted-mcp-server-mcp-loadtest-serve---mcp).
+    - Stable JSON output and structured error messages with `Hint:` lines.
+    - `mcp-loadtest serve --mcp` exposes the tool itself as an MCP server so Claude Code, Cursor, or any MCP-aware agent can call `deadlock_probe`, `sustained_load`, and `compare_runs` as MCP tools directly. See [DESIGN.md §21.2](DESIGN.md#212-self-hosted-mcp-server-mcp-loadtest-serve---mcp).
+
+## CI gating & protocol-aware assertions
+
+`mcp-loadtest` is built to be a **CI regression gate**, not just a profiler. Every run resolves to a pass/fail and a non-zero exit code, so it drops straight into a pipeline:
+
+- **Threshold gating** — `[thresholds]` (p50/p95/p99/p999 latency, error rate, memory growth, per-tool SLOs). Any breach → `report.passed() == false` → non-zero exit. Deadlocks are zero-tolerance.
+- **Baseline regression diff** — `mcp-loadtest compare baseline.json current.json` flags p99 / error-rate / deadlock regressions. Thresholds default to 10% p99 / 0.5pp error rate / deadlock-zero-tolerance and are now configurable:
+
+    ```bash
+    mcp-loadtest compare base.json cur.json \
+        --max-p99-regression-pct 15 --max-error-rate-regression-pp 1.0
+    ```
+
+    The same knobs are exposed as `compare_runs` MCP tool args for agent-driven gating (ADR 0009).
+
+- **Protocol-aware assertions** — opt-in strict mode validates every `tools/call`'s arguments against the server's advertised `inputSchema` _before_ the call. A contract mismatch is recorded as a `ProtocolError` and gates the run. Off by default (forward-compatible, ADR 0005/0010); enable per-config:
+
+    ```toml
+    [validation]
+    strict = true
+    ```
+
+Full GitHub Actions example: [docs/examples/ci-integration.md](docs/examples/ci-integration.md).
 
 ## Quick start
 
@@ -138,46 +161,48 @@ async fn no_deadlock_under_concurrent_calls() {
 
 [`reaatech/mcp-load-test`](https://github.com/reaatech/mcp-load-test) is the only other MCP load tester we're aware of. It's a TypeScript monorepo and covers the load-testing basics well. `mcp-loadtest` is built on a different axis: Rust performance + a static binary, plus a bug-detector layer that targets the classes of MCP failures unit tests miss.
 
-| Feature | reaatech | mcp-loadtest |
-|---|---|---|
-| **Deadlock detection (`deadlock_probe`)** | not available | yes |
-| **Race / non-determinism detector** | not available | yes (`race_check`) |
-| **Real-time TUI dashboard** | post-hoc only | yes |
-| **Cross-server compare (1 run, N targets)** | partial (2-run baseline diff) | yes (`cross` subcommand) |
-| **Server resource sampling over time** (RSS/CPU/fd) | latency only | yes |
-| **Protocol fuzzer** | not available | yes |
-| **Coverage tracking** (registered vs exercised tools) | not available | yes |
-| **Per-tool SLO assertions** | global only | yes |
-| **Self-hosted as MCP server** (LLM-agent control) | not available | yes |
-| **HTML report** | not available | yes |
-| **WebSocket transport** | not available | yes |
-| **Rust perf + static binary** | Node runtime required | single ~5 MB binary via `cargo install` |
-| stdio transport | yes | yes |
-| HTTP / SSE transports | yes | yes |
-| Latency histograms p50/p95/p99/p999 | yes | yes |
-| Breaking-point detection | yes | yes |
-| Performance grading A-F | yes | yes |
-| Soak / leak detection | yes | yes |
-| Spike scenario (sudden burst) | yes | yes |
-| Compare baselines | yes | yes |
-| Realistic patterns (explore-then-act, multi-step) | yes | yes |
-| Console + markdown + JSON reporters | yes | yes |
-| Programmatic library API | yes | yes |
+| Feature                                                            | reaatech                      | mcp-loadtest                            |
+| ------------------------------------------------------------------ | ----------------------------- | --------------------------------------- |
+| **Deadlock detection (`deadlock_probe`)**                          | not available                 | yes                                     |
+| **Race / non-determinism detector**                                | not available                 | yes (`race_check`)                      |
+| **Real-time TUI dashboard**                                        | post-hoc only                 | yes                                     |
+| **Cross-server compare (1 run, N targets)**                        | partial (2-run baseline diff) | yes (`cross` subcommand)                |
+| **Server resource sampling over time** (RSS/CPU/fd)                | latency only                  | yes                                     |
+| **Protocol fuzzer**                                                | not available                 | yes                                     |
+| **Coverage tracking** (registered vs exercised tools)              | not available                 | yes                                     |
+| **Per-tool SLO assertions**                                        | global only                   | yes                                     |
+| **Configurable regression thresholds** (CLI + MCP args)            | fixed                         | yes                                     |
+| **Protocol-aware assertions** (opt-in strict `inputSchema` gating) | not available                 | yes                                     |
+| **Self-hosted as MCP server** (LLM-agent control)                  | not available                 | yes                                     |
+| **HTML report**                                                    | not available                 | yes                                     |
+| **WebSocket transport**                                            | not available                 | yes                                     |
+| **Rust perf + static binary**                                      | Node runtime required         | single ~5 MB binary via `cargo install` |
+| stdio transport                                                    | yes                           | yes                                     |
+| HTTP / SSE transports                                              | yes                           | yes                                     |
+| Latency histograms p50/p95/p99/p999                                | yes                           | yes                                     |
+| Breaking-point detection                                           | yes                           | yes                                     |
+| Performance grading A-F                                            | yes                           | yes                                     |
+| Soak / leak detection                                              | yes                           | yes                                     |
+| Spike scenario (sudden burst)                                      | yes                           | yes                                     |
+| Compare baselines                                                  | yes                           | yes                                     |
+| Realistic patterns (explore-then-act, multi-step)                  | yes                           | yes                                     |
+| Console + markdown + JSON reporters                                | yes                           | yes                                     |
+| Programmatic library API                                           | yes                           | yes                                     |
 
 We tracked 4 direct competitors (reaatech, haakco/mcp-testing-framework, spbiju/MCP-Benchmark, IBM mcp-context-forge internal) and 6 adjacent LLM-eval frameworks (MCP-Bench, MCPBench, MCP-Universe, MCPMark, MCP-Inspector, k6-MCP). See [DESIGN.md §10.5](DESIGN.md#105-competitive-parity--differentiation-matrix) for the full matrix and [ADR 0004](docs/adr/0004-compete-with-reaatech.md) for the positioning decision.
 
 ## Built-in scenarios
 
-| Scenario | Detects |
-|---|---|
-| `cold_start` | startup time regressions, init-time deadlocks |
-| `sustained` | baseline p99 latency, throughput, error rate |
-| `ramp` | break-point — concurrency where p99 explodes |
-| `spike` | sudden-burst load — baseline → peak window → cooldown |
-| `soak` | memory leaks under sustained load |
-| `deadlock_probe` | lazy-init deadlocks (the canonical Vibe-Trading bug class) |
-| `race_check` | non-determinism / order-sensitive bugs |
-| `pattern` | weighted random mixes (explore-then-act, read-then-write, multi-step) |
+| Scenario         | Detects                                                               |
+| ---------------- | --------------------------------------------------------------------- |
+| `cold_start`     | startup time regressions, init-time deadlocks                         |
+| `sustained`      | baseline p99 latency, throughput, error rate                          |
+| `ramp`           | break-point — concurrency where p99 explodes                          |
+| `spike`          | sudden-burst load — baseline → peak window → cooldown                 |
+| `soak`           | memory leaks under sustained load                                     |
+| `deadlock_probe` | lazy-init deadlocks (the canonical Vibe-Trading bug class)            |
+| `race_check`     | non-determinism / order-sensitive bugs                                |
+| `pattern`        | weighted random mixes (explore-then-act, read-then-write, multi-step) |
 
 Each scenario is one `impl Scenario` in [`crates/mcp-loadtest/src/scenario/`](crates/mcp-loadtest/src/scenario/) with a JSON-Schema describing its config block. See [DESIGN.md §8](DESIGN.md#8-built-in-scenarios) for the full table.
 

From ad2e5c602598f2913cd5352b21f1c56b9c2b1468 Mon Sep 17 00:00:00 2001
From: Teerapat-Vatpitak <teerapatv.c@gmail.com>
Date: Sat, 16 May 2026 19:59:47 +0700
Subject: [PATCH 5/8] fix(security): harden B1/B2 surface before publish
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pre-publish security review of the configurable-thresholds and
strict-validation work surfaced one HIGH and three lower findings,
all in code added this branch:

- HIGH: protocol::schema::validate_at recursed with no depth bound. A
  malicious server-advertised inputSchema could stack-overflow the
  load test (an uncatchable abort) once strict mode is on. Cap at
  MAX_SCHEMA_DEPTH = 64; over-deep tails are skipped, not failed.
- MED: negative regression thresholds silently inverted the gate.
  Reject non-positive / non-finite values at both boundaries — a
  clap value_parser on the CLI flags and a finite-and-positive
  filter in compare_runs::regression_overrides.
- LOW: compare_runs canonicalize error echoed the raw caller path
  back to the MCP client; drop it.
- LOW: enum-violation messages rendered the full server enum array;
  cap the preview at 5 entries + a count.

Regression tests added for the depth cap and the threshold guard.
ci-checks green: fmt, clippy -D warnings, --locked tests, doc.
---
 CHANGELOG.md                                  |  1 +
 crates/mcp-loadtest-cli/src/main.rs           | 21 ++++++--
 crates/mcp-loadtest/src/protocol/schema.rs    | 54 +++++++++++++++++--
 .../src/serve/tools/compare_runs.rs           | 42 +++++++++++----
 4 files changed, 98 insertions(+), 20 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6410ab3..0ab992a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -140,6 +140,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - ✅ The M8 file-split pass completed in the pre-publish review. All source files have production code (excluding `#[cfg(test)] mod tests`) under the 300-line convention. See `POST_PUBLISH_ISSUES.md` for the per-wave summary of what split where.
 - `serve` and `tui` modules will move behind cargo feature flags in a future release to keep the default build slim.
 - HTTP / SSE transport host-allowlist for SSRF defense is deferred. Currently mitigated by `Policy::none()` on redirects; the allowlist is operator-facing config and will land alongside the broader transport-hardening pass.
+- ✅ Pre-publish security pass on the B1/B2 surface: bounded `protocol::schema` recursion (`MAX_SCHEMA_DEPTH`, defends strict mode against a maliciously deep server `inputSchema`); non-positive regression thresholds rejected at the CLI/MCP boundary (would otherwise invert the gate); `compare_runs` no longer echoes the raw caller path; `enum`-violation messages are length-capped.
 
 ## [0.1.0] — TBD
 
diff --git a/crates/mcp-loadtest-cli/src/main.rs b/crates/mcp-loadtest-cli/src/main.rs
index f1703ba..7928e5e 100644
--- a/crates/mcp-loadtest-cli/src/main.rs
+++ b/crates/mcp-loadtest-cli/src/main.rs
@@ -15,6 +15,19 @@ use clap::{Parser, Subcommand};
 
 use mcp_loadtest_cli::{cmd_compare, cmd_cross, cmd_deadlock, cmd_run};
 
+/// clap value parser: accept only a finite, strictly-positive f64. A
+/// non-positive regression threshold would invert the regression direction,
+/// so reject it at the CLI boundary with a clear message instead of
+/// silently mis-gating.
+fn positive_f64(s: &str) -> Result<f64, String> {
+    let v: f64 = s.parse().map_err(|_| format!("`{s}` is not a number"))?;
+    if v.is_finite() && v > 0.0 {
+        Ok(v)
+    } else {
+        Err("must be a finite number greater than 0".to_string())
+    }
+}
+
 #[derive(Parser)]
 #[command(name = "mcp-loadtest", version, about = "Load tester for MCP servers")]
 struct Cli {
@@ -46,11 +59,11 @@ enum Cmd {
         /// Output format: `markdown` (default, human-readable) or `json` (CI-friendly).
         #[arg(long, default_value = "markdown")]
         format: String,
-        /// p99 latency growth (percent) that flags a regression.
-        #[arg(long, default_value_t = cmd_compare::P99_REGRESSION_PCT)]
+        /// p99 latency growth (percent) that flags a regression. Must be > 0.
+        #[arg(long, default_value_t = cmd_compare::P99_REGRESSION_PCT, value_parser = positive_f64)]
         max_p99_regression_pct: f64,
-        /// Error-rate growth (percentage points) that flags a regression.
-        #[arg(long, default_value_t = cmd_compare::ERROR_RATE_REGRESSION_PP)]
+        /// Error-rate growth (percentage points) that flags a regression. Must be > 0.
+        #[arg(long, default_value_t = cmd_compare::ERROR_RATE_REGRESSION_PP, value_parser = positive_f64)]
         max_error_rate_regression_pp: f64,
         /// Don't flag a regression when the deadlock count increases.
         #[arg(long, default_value_t = false)]
diff --git a/crates/mcp-loadtest/src/protocol/schema.rs b/crates/mcp-loadtest/src/protocol/schema.rs
index 65c6f40..3f5081d 100644
--- a/crates/mcp-loadtest/src/protocol/schema.rs
+++ b/crates/mcp-loadtest/src/protocol/schema.rs
@@ -54,6 +54,14 @@ pub enum SchemaPolicy {
     Ignore,
 }
 
+/// Max schema nesting depth we descend. The `inputSchema` is advertised by
+/// the (untrusted) server under test; a maliciously deep schema would blow
+/// the stack (a Rust stack overflow is an uncatchable abort, not a panic).
+/// 64 covers every real MCP `inputSchema`; deeper is treated as
+/// unvalidatable (skipped, not a violation — consistent with the
+/// forward-compat stance) rather than crashing the load test.
+const MAX_SCHEMA_DEPTH: usize = 64;
+
 /// Validate `instance` against the JSON Schema `schema`, returning every
 /// mismatch found. Empty result = valid under the supported subset.
 ///
@@ -63,11 +71,20 @@ pub enum SchemaPolicy {
 /// schema validates everything.
 pub fn validate(schema: &Value, instance: &Value) -> Vec<SchemaViolation> {
     let mut out = Vec::new();
-    validate_at(schema, instance, "", &mut out);
+    validate_at(schema, instance, "", 0, &mut out);
     out
 }
 
-fn validate_at(schema: &Value, instance: &Value, path: &str, out: &mut Vec<SchemaViolation>) {
+fn validate_at(
+    schema: &Value,
+    instance: &Value,
+    path: &str,
+    depth: usize,
+    out: &mut Vec<SchemaViolation>,
+) {
+    if depth > MAX_SCHEMA_DEPTH {
+        return;
+    }
     let Some(obj) = schema.as_object() else {
         // Non-object schema (e.g. `true`) — nothing to enforce.
         return;
@@ -87,9 +104,17 @@ fn validate_at(schema: &Value, instance: &Value, path: &str, out: &mut Vec<Schem
     if let Some(enum_vals) = obj.get("enum").and_then(Value::as_array)
         && !enum_vals.iter().any(|v| v == instance)
     {
+        // The server controls `enum`; cap the rendered preview so a
+        // pathological multi-million-entry enum can't balloon the message.
+        let preview: Vec<&Value> = enum_vals.iter().take(5).collect();
+        let suffix = if enum_vals.len() > 5 {
+            format!(" … ({} total)", enum_vals.len())
+        } else {
+            String::new()
+        };
         out.push(SchemaViolation {
             path: loc(path),
-            message: format!("value not in enum {enum_vals:?}"),
+            message: format!("value not in enum {preview:?}{suffix}"),
         });
     }
 
@@ -103,7 +128,7 @@ fn validate_at(schema: &Value, instance: &Value, path: &str, out: &mut Vec<Schem
                 } else {
                     format!("{path}.{key}")
                 };
-                validate_at(sub_schema, sub_inst, &child, out);
+                validate_at(sub_schema, sub_inst, &child, depth + 1, out);
             }
         }
     }
@@ -125,7 +150,7 @@ fn validate_at(schema: &Value, instance: &Value, path: &str, out: &mut Vec<Schem
         && let Some(arr) = instance.as_array()
     {
         for (i, item) in arr.iter().enumerate() {
-            validate_at(items, item, &format!("{path}[{i}]"), out);
+            validate_at(items, item, &format!("{path}[{i}]"), depth + 1, out);
         }
     }
 }
@@ -291,6 +316,25 @@ mod tests {
         );
     }
 
+    #[test]
+    fn deeply_nested_schema_is_bounded_not_a_stack_overflow() {
+        // 300 levels — comfortably past MAX_SCHEMA_DEPTH (64) so the cap is
+        // exercised, but shallow enough that building/dropping the nested
+        // `serde_json::Value` (whose Drop is itself recursive) is safe. The
+        // real attack vector (a multi-thousand-deep schema sent on the
+        // wire) is rejected even earlier by serde_json's own deserialize
+        // recursion limit; this guards the in-memory traversal.
+        let mut schema = json!({ "type": "object" });
+        let mut instance = json!({});
+        for _ in 0..300 {
+            schema = json!({ "type": "object", "properties": { "n": schema } });
+            instance = json!({ "n": instance });
+        }
+        // Must return (not abort) and not spuriously flag the over-deep tail.
+        let v = validate(&schema, &instance);
+        assert!(v.is_empty(), "over-deep schema must be skipped, got {v:?}");
+    }
+
     #[test]
     fn policy_ignores_when_no_violations() {
         assert_eq!(
diff --git a/crates/mcp-loadtest/src/serve/tools/compare_runs.rs b/crates/mcp-loadtest/src/serve/tools/compare_runs.rs
index 907cc2b..e5c2544 100644
--- a/crates/mcp-loadtest/src/serve/tools/compare_runs.rs
+++ b/crates/mcp-loadtest/src/serve/tools/compare_runs.rs
@@ -171,9 +171,10 @@ fn validate_metrics_path(p: &str) -> Result<PathBuf, ToolError> {
         // Wrap canonicalize errors as InvalidArgs (not Io) so the client sees
         // a single consistent rejection reason for "bad path" regardless of
         // whether the failure is "doesn't exist", "permission denied", etc.
-        ToolError::InvalidArgs(format!(
-            "metrics path rejected: cannot canonicalize {p}: {e}"
-        ))
+        // Do not echo the raw caller-supplied path back — the OS error
+        // alone is enough to act on and avoids reflecting attacker input /
+        // partial filesystem layout to the MCP client.
+        ToolError::InvalidArgs(format!("metrics path rejected: cannot resolve path: {e}"))
     })?;
 
     // Allow #1: under cwd/runs/.
@@ -223,17 +224,23 @@ fn json_path_u64(v: &Value, path: &[&str]) -> u64 {
 
 /// Build the regression policy from optional tool args, falling back to
 /// [`RegressionThresholds::default`] for anything not supplied.
+///
+/// A threshold is only honoured when it is **finite and > 0**. A
+/// non-positive value would invert the regression direction (a regression
+/// would read as an improvement and vice-versa); since this is untrusted
+/// MCP tool input, such values are rejected back to the default rather
+/// than silently disabling the gate.
 fn regression_overrides(args: &Value) -> RegressionThresholds {
     let d = RegressionThresholds::default();
-    RegressionThresholds {
-        p99_pct: args
-            .get("max_p99_regression_pct")
+    let positive_or = |key: &str, fallback: f64| {
+        args.get(key)
             .and_then(Value::as_f64)
-            .unwrap_or(d.p99_pct),
-        error_rate_pp: args
-            .get("max_error_rate_regression_pp")
-            .and_then(Value::as_f64)
-            .unwrap_or(d.error_rate_pp),
+            .filter(|v| v.is_finite() && *v > 0.0)
+            .unwrap_or(fallback)
+    };
+    RegressionThresholds {
+        p99_pct: positive_or("max_p99_regression_pct", d.p99_pct),
+        error_rate_pp: positive_or("max_error_rate_regression_pp", d.error_rate_pp),
         deadlock_zero_tolerance: !args
             .get("allow_deadlock_increase")
             .and_then(Value::as_bool)
@@ -320,4 +327,17 @@ mod tests {
         assert_eq!(t.error_rate_pp, 2.0);
         assert!(!t.deadlock_zero_tolerance);
     }
+
+    #[test]
+    fn regression_overrides_rejects_non_positive_thresholds() {
+        // Negative / zero would invert the regression direction — must fall
+        // back to the safe defaults rather than be honoured.
+        let d = RegressionThresholds::default();
+        let t = regression_overrides(&json!({
+            "max_p99_regression_pct": -10.0,
+            "max_error_rate_regression_pp": 0.0
+        }));
+        assert_eq!(t.p99_pct, d.p99_pct);
+        assert_eq!(t.error_rate_pp, d.error_rate_pp);
+    }
 }

From 95783e6a026f4e36559e463e724f2fb3b4ff97d7 Mon Sep 17 00:00:00 2001
From: Teerapat-Vatpitak <teerapatv.c@gmail.com>
Date: Sat, 16 May 2026 20:42:52 +0700
Subject: [PATCH 6/8] chore(release): stabilize for v0.1.0 publish
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pre-publish stabilization from a read-only audit:

- packaging: exclude the four CLAUDE.md scaffolding files from the
  published `mcp-loadtest` crate (108 -> 104 files; verified via
  `cargo package --list`). runs/ was already gitignored.
- test gap: add `crates/mcp-loadtest-cli/tests/run_strict.rs` — the
  existing strict_validation.rs drives scenario.drive() directly;
  this covers the real production entrypoint end-to-end:
  `cmd_run::run_from_config` -> Config::from_file (incl. [validation]
  TOML deserialize) -> build_scenario -> Run::execute (run.rs strict
  wiring) -> emit_reports (report.md/metrics.json on disk) ->
  non-zero gate. Asserts both the fail (bad args -> ProtocolError ->
  not passed) and pass (compliant args) paths.
- docs: tick AGENTS.md M7 `/security-review + /release-checks`;
  annotate POST_PUBLISH_ISSUES dry-run/smoke status. CHANGELOG noted.

Verified on **x86_64-pc-windows-msvc** (the target crates.io ships to
Windows users — previously only gnu was built locally): fmt, clippy
-D warnings, build --locked, full test suite (incl. run_strict /
strict_validation), doc -D warnings, publish --dry-run (lib, 104
files), release-binary smoke — all green.
---
 AGENTS.md                                   |  86 +++++------
 CHANGELOG.md                                |   1 +
 POST_PUBLISH_ISSUES.md                      |  25 +++-
 crates/mcp-loadtest-cli/tests/run_strict.rs | 158 ++++++++++++++++++++
 crates/mcp-loadtest/Cargo.toml              |   8 +
 5 files changed, 233 insertions(+), 45 deletions(-)
 create mode 100644 crates/mcp-loadtest-cli/tests/run_strict.rs

diff --git a/AGENTS.md b/AGENTS.md
index 4d7f428..b9901ec 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -21,13 +21,13 @@ Started 2026-05-10. Target end: 2026-05-17 (1 week with 4 parallel agents instea
 
 ### Active agent registry
 
-| Agent | Scope | Branch | Files OWNED | Read-only deps | Status |
-|---|---|---|---|---|---|
-| **A** | `cold_start` + `sustained` scenarios | `feat/m2-scenarios-cs` | `crates/mcp-loadtest/src/scenario/{cold_start,sustained}.rs`; `tests/scenarios_basic.rs` | `session.rs`, `scenario/mod.rs` (trait), `metrics/*` | ✅ done |
-| **B** | `hang_detect` + `deadlock_probe` scenario | `feat/m2-deadlock` | `src/hang_detector.rs` (body); `src/scenario/deadlock_probe.rs`; `tests/deadlock.rs` | `session.rs`, `scenario/mod.rs` (trait), `metrics/*` | ✅ done |
-| **C** | hdrhistogram metrics layer | `feat/m2-metrics` | `src/metrics/{mod,histogram,throughput}.rs` | none — fully independent | ✅ done |
-| **D** | Mock servers + Python framing fixes | `feat/m2-mocks` | `tests/fixtures/mock-{broken,slow,crash}.py` | (none) | ✅ done |
-| **integration** | Wire-up + ci-checks + commit | `main` | `scenario/mod.rs` (uncomment registrations); `tests/deadlock.rs` (replace inline helper with real `DeadlockProbe`) | all M2 outputs | ✅ done |
+| Agent           | Scope                                     | Branch                 | Files OWNED                                                                                                        | Read-only deps                                       | Status  |
+| --------------- | ----------------------------------------- | ---------------------- | ------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------- | ------- |
+| **A**           | `cold_start` + `sustained` scenarios      | `feat/m2-scenarios-cs` | `crates/mcp-loadtest/src/scenario/{cold_start,sustained}.rs`; `tests/scenarios_basic.rs`                           | `session.rs`, `scenario/mod.rs` (trait), `metrics/*` | ✅ done |
+| **B**           | `hang_detect` + `deadlock_probe` scenario | `feat/m2-deadlock`     | `src/hang_detector.rs` (body); `src/scenario/deadlock_probe.rs`; `tests/deadlock.rs`                               | `session.rs`, `scenario/mod.rs` (trait), `metrics/*` | ✅ done |
+| **C**           | hdrhistogram metrics layer                | `feat/m2-metrics`      | `src/metrics/{mod,histogram,throughput}.rs`                                                                        | none — fully independent                             | ✅ done |
+| **D**           | Mock servers + Python framing fixes       | `feat/m2-mocks`        | `tests/fixtures/mock-{broken,slow,crash}.py`                                                                       | (none)                                               | ✅ done |
+| **integration** | Wire-up + ci-checks + commit              | `main`                 | `scenario/mod.rs` (uncomment registrations); `tests/deadlock.rs` (replace inline helper with real `DeadlockProbe`) | all M2 outputs                                       | ✅ done |
 
 ### Conflicts to expect
 
@@ -117,6 +117,7 @@ Algorithm: see DESIGN.md §15.1 for the precise decision tree.
 ### Mock fixtures — `tests/fixtures/mock-*.py`
 
 Contract enforced by `_common.py`:
+
 - Read JSON-RPC frames from stdin (one per line).
 - Write JSON-RPC frames to stdout (one per line).
 - Exit 0 on stdin EOF.
@@ -152,6 +153,7 @@ If you're a fresh agent / contributor entering this sprint:
 7. Wait for the integration step.
 
 If your work needs an interface change:
+
 - **STOP.** Do not break the contract.
 - Open an issue or sync with the main session.
 - All agents pause until the contract is renegotiated.
@@ -164,12 +166,12 @@ Started 2026-05-11. Final sprint before user-review hand-off. **Repo stays priva
 
 ### M7 active agent registry
 
-| Agent | Scope | Files OWNED | Status |
-|---|---|---|---|
-| **U** | Protocol fuzzer scenario — random/malformed JSON-RPC payloads | `src/scenario/fuzzer.rs` (new); `src/analysis/fuzz_report.rs` (new); `tests/fuzzer.rs` (new) | done (2026-05-11) |
-| **V** | Coverage tracking + per-tool SLO assertions | `src/analysis/coverage.rs` (new); add `coverage: Option<CoverageReport>` to `Report`; extend `Run::execute`; `tests/coverage.rs` (new) | done (2026-05-11) |
-| **W** | `mcp-loadtest serve --mcp` self-hosted MCP server | `src/serve/mod.rs` + `src/serve/tools.rs` (new); add `Serve` Cmd; `tests/serve_smoke.rs` (new) | done (2026-05-11) |
-| **X** | README rewrite + cookbook | `README.md`; `docs/examples/{ci-integration,custom-scenario,debugging-deadlocks}.md` (new) | done (2026-05-11) |
+| Agent | Scope                                                         | Files OWNED                                                                                                                            | Status            |
+| ----- | ------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------- | ----------------- |
+| **U** | Protocol fuzzer scenario — random/malformed JSON-RPC payloads | `src/scenario/fuzzer.rs` (new); `src/analysis/fuzz_report.rs` (new); `tests/fuzzer.rs` (new)                                           | done (2026-05-11) |
+| **V** | Coverage tracking + per-tool SLO assertions                   | `src/analysis/coverage.rs` (new); add `coverage: Option<CoverageReport>` to `Report`; extend `Run::execute`; `tests/coverage.rs` (new) | done (2026-05-11) |
+| **W** | `mcp-loadtest serve --mcp` self-hosted MCP server             | `src/serve/mod.rs` + `src/serve/tools.rs` (new); add `Serve` Cmd; `tests/serve_smoke.rs` (new)                                         | done (2026-05-11) |
+| **X** | README rewrite + cookbook                                     | `README.md`; `docs/examples/{ci-integration,custom-scenario,debugging-deadlocks}.md` (new)                                             | done (2026-05-11) |
 
 ### M7 sprint exit criteria
 
@@ -178,7 +180,7 @@ Started 2026-05-11. Final sprint before user-review hand-off. **Repo stays priva
 - [x] Coverage shows "echo" exercised in coverage/scenario tests
 - [x] `mcp-loadtest serve --mcp` boots; `tools/list` returns `deadlock_probe`/`sustained_load`/`compare_runs` (minimum)
 - [x] README leads with the deadlock demo + has 3 cookbook examples
-- [ ] /security-review + /release-checks pass
+- [x] /security-review + /release-checks pass (security review: 1 HIGH + 3 lower findings, all fixed in `ad2e5c6`; release-checks run — see CHANGELOG Notes + the `run_strict` / `strict_validation` e2e tests)
 
 After M7: STOP. Hand off to user for review BEFORE flipping repo public.
 
@@ -190,12 +192,12 @@ Started 2026-05-11. Target: 1 week.
 
 ### M6 active agent registry
 
-| Agent | Scope | Branch | Files OWNED | Read-only deps | Status |
-|---|---|---|---|---|---|
-| **Q** | Real-time TUI dashboard via ratatui + crossterm | `feat/m6-tui` | `src/tui/mod.rs` (uncomment + enrich); `src/tui/dashboard.rs` (new); `tests/tui_smoke.rs` (new) | `metrics::Recorder` (clone), `report/*` | done (2026-05-11) |
-| **R** | `race_detector` analyzer + `RaceCheck` scenario | `feat/m6-race` | `src/analysis/race_detector.rs`; `src/scenario/race_check.rs` (new); `tests/race_check.rs` (new) | session, scenario trait | done (2026-05-11) |
-| **S** | Cross-server CLI subcommand `mcp-loadtest cross` | `feat/m6-cross` | `crates/mcp-loadtest-cli/src/cmd_cross.rs` (new); add `Cross` Cmd variant in main.rs | run, scenario, report (read) | done (2026-05-11) |
-| **T** | ProcessSampler enrichment (fd + threads) + Soak leak-regression | `feat/m6-process-leak` | `src/metrics/process.rs` (extend); `src/report/mod.rs` ONLY to add `peak_fd: u64`, `peak_threads: u64` to ProcessStats (locked-API expansion); refactor `Soak` to use the new fields | sysinfo API | done (2026-05-11) |
+| Agent | Scope                                                           | Branch                 | Files OWNED                                                                                                                                                                          | Read-only deps                          | Status            |
+| ----- | --------------------------------------------------------------- | ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------- | ----------------- |
+| **Q** | Real-time TUI dashboard via ratatui + crossterm                 | `feat/m6-tui`          | `src/tui/mod.rs` (uncomment + enrich); `src/tui/dashboard.rs` (new); `tests/tui_smoke.rs` (new)                                                                                      | `metrics::Recorder` (clone), `report/*` | done (2026-05-11) |
+| **R** | `race_detector` analyzer + `RaceCheck` scenario                 | `feat/m6-race`         | `src/analysis/race_detector.rs`; `src/scenario/race_check.rs` (new); `tests/race_check.rs` (new)                                                                                     | session, scenario trait                 | done (2026-05-11) |
+| **S** | Cross-server CLI subcommand `mcp-loadtest cross`                | `feat/m6-cross`        | `crates/mcp-loadtest-cli/src/cmd_cross.rs` (new); add `Cross` Cmd variant in main.rs                                                                                                 | run, scenario, report (read)            | done (2026-05-11) |
+| **T** | ProcessSampler enrichment (fd + threads) + Soak leak-regression | `feat/m6-process-leak` | `src/metrics/process.rs` (extend); `src/report/mod.rs` ONLY to add `peak_fd: u64`, `peak_threads: u64` to ProcessStats (locked-API expansion); refactor `Soak` to use the new fields | sysinfo API                             | done (2026-05-11) |
 
 ### M6 conflicts to expect
 
@@ -218,22 +220,22 @@ Started 2026-05-11. Target: 1 week. Mostly post-hoc analysis on top of the M3 `R
 
 ### M5 active agent registry
 
-| Agent | Scope | Branch | Files OWNED | Read-only deps | Status |
-|---|---|---|---|---|---|
-| **M** | `Ramp` scenario + `BreakingPointDetector` | `feat/m5-breaking-point` | `src/analysis/breaking_point.rs` (new); `src/scenario/ramp.rs` (new); `tests/breaking_point.rs` (new) | `analysis/mod.rs` (declare module), `scenario/mod.rs` (register), `metrics/*`, locked `Report` | done (2026-05-11) |
-| **N** | Performance grading (A-F per latency / concurrency / error) | `feat/m5-grading` | `src/analysis/grading.rs` (new); `tests/grading.rs` (new) | `report/*` (read), `metrics/*` | done (2026-05-11) |
-| **O** | Pattern engine — multi-step + weighted random + think-time | `feat/m5-patterns` | `src/scenario/pattern.rs` (new); refactor `src/scenario/sustained.rs` to drive Patterns; `tests/patterns.rs` (new) | `Session::call_tool`, locked `Scenario` trait | done (2026-05-11) |
-| **P** | `compare` subcommand + soak scenario polish | `feat/m5-compare-soak` | `src/scenario/soak.rs` (new); `crates/mcp-loadtest-cli/src/cmd_compare.rs` (new); add `mcp-loadtest compare baseline.json current.json` to CLI | `report/json.rs` (read) for the on-disk shape | done |
+| Agent | Scope                                                       | Branch                   | Files OWNED                                                                                                                                    | Read-only deps                                                                                 | Status            |
+| ----- | ----------------------------------------------------------- | ------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------- | ----------------- |
+| **M** | `Ramp` scenario + `BreakingPointDetector`                   | `feat/m5-breaking-point` | `src/analysis/breaking_point.rs` (new); `src/scenario/ramp.rs` (new); `tests/breaking_point.rs` (new)                                          | `analysis/mod.rs` (declare module), `scenario/mod.rs` (register), `metrics/*`, locked `Report` | done (2026-05-11) |
+| **N** | Performance grading (A-F per latency / concurrency / error) | `feat/m5-grading`        | `src/analysis/grading.rs` (new); `tests/grading.rs` (new)                                                                                      | `report/*` (read), `metrics/*`                                                                 | done (2026-05-11) |
+| **O** | Pattern engine — multi-step + weighted random + think-time  | `feat/m5-patterns`       | `src/scenario/pattern.rs` (new); refactor `src/scenario/sustained.rs` to drive Patterns; `tests/patterns.rs` (new)                             | `Session::call_tool`, locked `Scenario` trait                                                  | done (2026-05-11) |
+| **P** | `compare` subcommand + soak scenario polish                 | `feat/m5-compare-soak`   | `src/scenario/soak.rs` (new); `crates/mcp-loadtest-cli/src/cmd_compare.rs` (new); add `mcp-loadtest compare baseline.json current.json` to CLI | `report/json.rs` (read) for the on-disk shape                                                  | done              |
 
 ### M5 conflicts / notes
 
 - All four touch `scenario/mod.rs` (register their new module). Convention: each appends one line at the bottom; integration agent resolves any merge.
 - Agent O refactors `Sustained` — must keep the existing TOML config that has `tool` + `args` working (single-step pattern fallback). Existing `tests/scenarios_basic.rs` must still pass without modification.
 - Agent P's `compare` subcommand shape:
-  ```
-  mcp-loadtest compare <baseline-metrics.json> <current-metrics.json>
-  ```
-  Outputs a markdown diff highlighting regressions. JSON output via `--format json` for CI use.
+    ```
+    mcp-loadtest compare <baseline-metrics.json> <current-metrics.json>
+    ```
+    Outputs a markdown diff highlighting regressions. JSON output via `--format json` for CI use.
 - DESIGN §21 template-var pattern feature (using previous step's output in next step) is **deferred to M6**. M5's pattern engine: multi-step + weighted + think-time only.
 
 ### M5 sprint exit criteria
@@ -253,11 +255,11 @@ Pre-M4 (integration agent, done): toolchain bumped 1.85→stable; `Transport` tr
 
 ### M4 active agent registry
 
-| Agent | Scope | Branch | Files OWNED | Read-only deps | Status |
-|---|---|---|---|---|---|
-| **J** | HTTP transport (Streamable HTTP simple variant) | `feat/m4-http` | `src/protocol/transport/http.rs`; `tests/http_transport.rs` (new) | `protocol/transport/mod.rs` (locked Transport trait), `session.rs` | done (2026-05-11) |
-| **K** | SSE transport (event-stream subscribe + POST send) | `feat/m4-sse` | `src/protocol/transport/sse.rs`; `tests/sse_transport.rs` (new) | `protocol/transport/mod.rs`, `session.rs` | done (2026-05-11) |
-| **L** | Python HTTP+SSE mock fixtures | `feat/m4-fixtures-http` | `tests/fixtures/mock-http-server.py`; `tests/fixtures/mock-sse-server.py`; `tests/fixtures/_http_common.py` (allowed) | existing fixture conventions | done (2026-05-11) |
+| Agent | Scope                                              | Branch                  | Files OWNED                                                                                                           | Read-only deps                                                     | Status            |
+| ----- | -------------------------------------------------- | ----------------------- | --------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------ | ----------------- |
+| **J** | HTTP transport (Streamable HTTP simple variant)    | `feat/m4-http`          | `src/protocol/transport/http.rs`; `tests/http_transport.rs` (new)                                                     | `protocol/transport/mod.rs` (locked Transport trait), `session.rs` | done (2026-05-11) |
+| **K** | SSE transport (event-stream subscribe + POST send) | `feat/m4-sse`           | `src/protocol/transport/sse.rs`; `tests/sse_transport.rs` (new)                                                       | `protocol/transport/mod.rs`, `session.rs`                          | done (2026-05-11) |
+| **L** | Python HTTP+SSE mock fixtures                      | `feat/m4-fixtures-http` | `tests/fixtures/mock-http-server.py`; `tests/fixtures/mock-sse-server.py`; `tests/fixtures/_http_common.py` (allowed) | existing fixture conventions                                       | done (2026-05-11) |
 
 Agent I (stdio extraction + Transport trait) done by integration agent pre-sprint; left as 3 agents instead of 4 to reduce coordination risk.
 
@@ -281,13 +283,13 @@ Started 2026-05-10 (right after M2 close). Target: 1 week.
 
 ### M3 active agent registry
 
-| Agent | Scope | Branch | Files OWNED | Read-only deps | Status |
-|---|---|---|---|---|---|
-| **E** | TOML config + parse | `feat/m3-config` | `src/config.rs`; `tests/config_parse.rs` | `metrics/*`, `scenario/mod.rs`, locked types in config.rs | ✅ done |
-| **F** | Markdown + JSON reporters | `feat/m3-reporters-md-json` | `src/report/markdown.rs` + `src/report/json.rs`; `tests/reporter_snapshots.rs` + 4 insta snapshots | `report/mod.rs` (locked Report + Reporter trait), `metrics/*`, `scenario/mod.rs` | ✅ done |
-| **G** | Terminal reporter + sysinfo process metrics | `feat/m3-terminal-process` | `src/report/terminal.rs`; `src/metrics/process.rs`; `tests/process_metrics.rs` | `report/mod.rs`, `metrics/mod.rs` | ✅ done |
-| **H** | Run orchestrator + Vibe-Trading regression test | `feat/m3-run-orchestrator` | `src/run.rs` (full body + 6 unit tests); `tests/vibe_trading_regression.rs` | everything else read-only | ✅ done |
-| **integration** | CLI + reporters wiring + ci-checks + commit | `main` | `crates/mcp-loadtest-cli/src/main.rs` (clap subcommands); `crates/mcp-loadtest-cli/Cargo.toml` (deps); CHANGELOG; AGENTS | all M3 outputs | ✅ done |
+| Agent           | Scope                                           | Branch                      | Files OWNED                                                                                                              | Read-only deps                                                                   | Status  |
+| --------------- | ----------------------------------------------- | --------------------------- | ------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------- | ------- |
+| **E**           | TOML config + parse                             | `feat/m3-config`            | `src/config.rs`; `tests/config_parse.rs`                                                                                 | `metrics/*`, `scenario/mod.rs`, locked types in config.rs                        | ✅ done |
+| **F**           | Markdown + JSON reporters                       | `feat/m3-reporters-md-json` | `src/report/markdown.rs` + `src/report/json.rs`; `tests/reporter_snapshots.rs` + 4 insta snapshots                       | `report/mod.rs` (locked Report + Reporter trait), `metrics/*`, `scenario/mod.rs` | ✅ done |
+| **G**           | Terminal reporter + sysinfo process metrics     | `feat/m3-terminal-process`  | `src/report/terminal.rs`; `src/metrics/process.rs`; `tests/process_metrics.rs`                                           | `report/mod.rs`, `metrics/mod.rs`                                                | ✅ done |
+| **H**           | Run orchestrator + Vibe-Trading regression test | `feat/m3-run-orchestrator`  | `src/run.rs` (full body + 6 unit tests); `tests/vibe_trading_regression.rs`                                              | everything else read-only                                                        | ✅ done |
+| **integration** | CLI + reporters wiring + ci-checks + commit     | `main`                      | `crates/mcp-loadtest-cli/src/main.rs` (clap subcommands); `crates/mcp-loadtest-cli/Cargo.toml` (deps); CHANGELOG; AGENTS | all M3 outputs                                                                   | ✅ done |
 
 ### M3 locked contracts (DO NOT modify)
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0ab992a..78f0163 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -141,6 +141,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - `serve` and `tui` modules will move behind cargo feature flags in a future release to keep the default build slim.
 - HTTP / SSE transport host-allowlist for SSRF defense is deferred. Currently mitigated by `Policy::none()` on redirects; the allowlist is operator-facing config and will land alongside the broader transport-hardening pass.
 - ✅ Pre-publish security pass on the B1/B2 surface: bounded `protocol::schema` recursion (`MAX_SCHEMA_DEPTH`, defends strict mode against a maliciously deep server `inputSchema`); non-positive regression thresholds rejected at the CLI/MCP boundary (would otherwise invert the gate); `compare_runs` no longer echoes the raw caller path; `enum`-violation messages are length-capped.
+- ✅ Pre-publish stabilization: `crates/mcp-loadtest` now `exclude`s the four `CLAUDE.md` scaffolding files from the published package (108 → 104 files); added `run_strict` CLI integration test covering the real `run --config` entrypoint with `[validation] strict = true` end-to-end (TOML → `Run::execute` → report-on-disk → non-zero gate); full pipeline (fmt, clippy `-D warnings`, `--locked` build/test, doc) verified on the **x86_64-pc-windows-msvc** target — the toolchain crates.io ships to Windows users.
 
 ## [0.1.0] — TBD
 
diff --git a/POST_PUBLISH_ISSUES.md b/POST_PUBLISH_ISSUES.md
index 91e2b96..d258962 100644
--- a/POST_PUBLISH_ISSUES.md
+++ b/POST_PUBLISH_ISSUES.md
@@ -11,6 +11,7 @@ Each block below is in `gh issue create` shape — copy-paste-ready once the rep
 ### `feat(transport): host-allowlist for HTTP/SSE/WS to defend against SSRF`
 
 **Body:**
+
 > Currently the redirect policy is hardened to `Policy::none()` per [ADR 0007](docs/adr/0007-transport-security-posture.md), but an operator pointing the load tester at a malicious URL can still hit any internal endpoint resolvable from the machine running the test. v0.2 should add an operator-facing allowlist (TOML config: `server.allowed_hosts = ["app.example.com"]`) and reject `connect_async`/`reqwest::send` calls when the resolved host isn't matched. See CHANGELOG `[Unreleased]` Notes.
 
 **Labels:** `security`, `enhancement`, `v0.2`
@@ -19,9 +20,10 @@ Each block below is in `gh issue create` shape — copy-paste-ready once the rep
 
 ## ⚡ Performance
 
-### `perf(scenarios): switch `args: Value` to `Arc<Value>` for hot-loop sharing`
+### `perf(scenarios): switch `args: Value`to`Arc<Value>` for hot-loop sharing`
 
 **Body:**
+
 > Pre-publish Phase 1 audit found scenarios still deep-clone `Value` once per worker spawn (the `&Value` change in commit `c8dee52` cut per-call clones, not per-worker setup). Wrapping `Sustained.args` / `Pattern.args` etc. in `Arc<Value>` lets concurrent workers share the JSON tree by reference. Touches `Session::call_tool` signature → breaking v0.2 API change.
 
 **Labels:** `performance`, `breaking`, `v0.2`
@@ -29,13 +31,15 @@ Each block below is in `gh issue create` shape — copy-paste-ready once the rep
 ### `perf(transport): drop double-parse in SSE/WS id-extract`
 
 **Body:**
+
 > `extract_id` parses the entire JSON twice (once for id-probe, once for full body after match). Use `simd-json` or a streaming id-extractor (parse only the first `"id":N` key). Matters at >100K iter/s — not a v0.1 blocker.
 
 **Labels:** `performance`, `v0.2`
 
-### `perf(session): drop `String` allocation in `stdio` line trim`
+### `perf(session): drop `String`allocation in`stdio` line trim`
 
 **Body:**
+
 > `stdio.rs::request` does `self.line_buf.trim_end().to_string()`. In-place truncate plus returning `&str` would save one alloc per call. Small win at high call rates.
 
 **Labels:** `performance`, `v0.2`
@@ -47,6 +51,7 @@ Each block below is in `gh issue create` shape — copy-paste-ready once the rep
 ### `test(scenario): land real cold_start handshake-latency test`
 
 **Body:**
+
 > `ColdStart` is intentionally a placeholder in v0.1.0; the integration test pins the inert-placeholder contract. v0.2: implement real cold-start sampling (`Session::reinitialize` loop + per-iteration histogram) and replace the placeholder assertion with a measured-latency assertion.
 
 **Labels:** `test`, `feature`, `v0.2`
@@ -54,18 +59,21 @@ Each block below is in `gh issue create` shape — copy-paste-ready once the rep
 ### `test(fixtures): add `mock-leak.py`, `mock-error.py`, `mock-slow-init.py`, `mock-malformed.py``
 
 **Body:**
+
 > DESIGN.md §16 lists 10 mock fixtures; v0.1 ships 6 (normal/slow/broken/crash/http/sse). The 4 missing fixtures gate richer scenario coverage:
+>
 > - `mock-leak.py` — RSS grows over time → exercises `soak::detect_leak`
 > - `mock-error.py` — returns JSON-RPC errors deterministically → exercises error-classification scenarios
 > - `mock-slow-init.py` — slow initialize handshake → exercises `cold_start` (post-real-impl)
 > - `mock-malformed.py` — emits malformed JSON → exercises fuzzer's defensive parse paths
-> Each is < 50 lines of stdlib-only Python following `_common.py`.
+>   Each is < 50 lines of stdlib-only Python following `_common.py`.
 
 **Labels:** `test`, `v0.2`
 
 ### `test(bench): wire criterion benches into CI baseline comparison`
 
 **Body:**
+
 > Phase 1 added `benches/{record,histogram,session_loopback,hang_detect}.rs`. v0.2: capture baseline numbers in `bench-baseline.json` and add a `cargo bench-check` CI step that flags regressions > 10% vs baseline (same threshold as `compare` subcommand).
 
 **Labels:** `test`, `ci`, `v0.2`
@@ -97,6 +105,7 @@ Public API paths preserved via `pub use` re-exports throughout. 264 tests pass,
 ### `feat(transport): add `Transport::raw_send(&[u8])` hook for fuzzer raw-byte payloads`
 
 **Body:**
+
 > `Fuzzer` currently skips raw-transport payloads (`GiantPayload` raw variant, etc.) because there's no API to bypass JSON-RPC framing. v0.2: add a `raw_send` method to the `Transport` trait that lets `Fuzzer` send arbitrary bytes. Enables full coverage of the malformed-input attack surface.
 
 **Labels:** `feature`, `v0.2`
@@ -104,6 +113,7 @@ Public API paths preserved via `pub use` re-exports throughout. 264 tests pass,
 ### `feat(cli): `--capture-stderr` flag for stdio transport`
 
 **Body:**
+
 > Currently the spawned MCP server's stderr inherits the parent's stderr. When `mcp-loadtest` runs as a child of an LLM agent, the target server's stderr blends into the agent's view. Add a flag to redirect to a per-run file (`runs/<ulid>/server-stderr.log`).
 
 **Labels:** `feature`, `v0.2`
@@ -111,6 +121,7 @@ Public API paths preserved via `pub use` re-exports throughout. 264 tests pass,
 ### `feat(cli): docker-compose generator for the `cross` subcommand`
 
 **Body:**
+
 > IBM's mcp-context-forge perf testing wanted Docker Compose multi-server setup. We have `cross` which drives N servers but doesn't scaffold the compose file. Add `mcp-loadtest cross --emit-compose > docker-compose.yml`.
 
 **Labels:** `feature`, `v0.2`
@@ -118,6 +129,7 @@ Public API paths preserved via `pub use` re-exports throughout. 264 tests pass,
 ### `feat(report): HTML report charts using inline JS for interactivity`
 
 **Body:**
+
 > Current HTML reporter uses inline SVG (static, no JS). v0.2 could optionally embed Chart.js + interactive percentile sliders. Stays self-contained (`<script>` block, no CDN). Trade: report file grows from ~20 KB to ~200 KB.
 
 **Labels:** `feature`, `enhancement`, `v0.2`
@@ -129,6 +141,7 @@ Public API paths preserved via `pub use` re-exports throughout. 264 tests pass,
 ### `docs(adr): add ADRs 0010+ for v0.2 decisions as they land`
 
 **Body:**
+
 > Pre-publish review added ADRs 0005–0009. As v0.2 features land (host-allowlist, raw-byte transport, etc.) each architectural decision should get an ADR. Template: copy `docs/adr/0001-language-rust.md`.
 
 **Labels:** `docs`, `process`
@@ -136,6 +149,7 @@ Public API paths preserved via `pub use` re-exports throughout. 264 tests pass,
 ### `docs(readme): add real benchmark numbers from criterion runs`
 
 **Body:**
+
 > Once v0.2 wires bench baselines into CI (above), the README can quote real numbers: `Recorder::record: 47 ns`, `Session::call_tool loopback: 8.2 µs`, etc. Replaces the current "Rust performance" handwave.
 
 **Labels:** `docs`, `v0.2`
@@ -147,6 +161,7 @@ Public API paths preserved via `pub use` re-exports throughout. 264 tests pass,
 ### `release: confirm new GitHub repo URL is live before `cargo publish``
 
 **Body:**
+
 > The `repository` and `homepage` fields in `Cargo.toml` point at `https://github.com/Teerapat-Vatpitak/mcp-loadtest`. The previous repo at that URL was deleted during pre-publish review. Before running `cargo publish`, recreate the public repo at the same URL OR update both fields to the new URL.
 
 **Labels:** `release`, `process`
@@ -154,10 +169,14 @@ Public API paths preserved via `pub use` re-exports throughout. 264 tests pass,
 ### `release: cargo publish dry-run + smoke install`
 
 **Body:**
+
 > Pre-publish checklist:
+>
 > 1. `cargo publish --dry-run -p mcp-loadtest` (lib first)
 > 2. `cargo publish --dry-run -p mcp-loadtest-cli` (then CLI)
 > 3. Both should be clean (no warnings about missing fields, no API surface issues)
 > 4. After actual publish: `cargo install mcp-loadtest-cli` on a fresh shell to verify the binary works end-to-end
 
+**Status (pre-publish, 2026-05-16):** steps 1–3 done — `mcp-loadtest` dry-run packages clean; `mcp-loadtest-cli` dry-run fails with "no matching package `mcp-loadtest`" which is the expected workspace publish-ordering constraint (publish the lib first, then the CLI), not a defect. Local release-build smoke (`--version` / `list-scenarios` / a strict `run`) passed. Step 4 (`cargo install` from crates.io) remains for after the real publish.
+
 **Labels:** `release`, `process`
diff --git a/crates/mcp-loadtest-cli/tests/run_strict.rs b/crates/mcp-loadtest-cli/tests/run_strict.rs
new file mode 100644
index 0000000..a4e3ed2
--- /dev/null
+++ b/crates/mcp-loadtest-cli/tests/run_strict.rs
@@ -0,0 +1,158 @@
+//! End-to-end coverage for the real `run --config` entrypoint with
+//! `[validation] strict = true` (ADR 0010).
+//!
+//! `tests/strict_validation.rs` (lib crate) drives `scenario.drive()`
+//! directly. This test exercises the *production* path that a CI user
+//! actually hits: `cmd_run::run_from_config` → `Config::from_file`
+//! (incl. `[validation]` TOML deserialize) → `build_scenario` →
+//! `Run::execute` (the `run.rs` strict-wiring) → `emit_reports`
+//! (report-on-disk) → non-zero exit on the threshold gate.
+
+use std::fs;
+use std::path::PathBuf;
+
+use mcp_loadtest_cli::cmd_run;
+use tempfile::tempdir;
+
+fn mock_schema_fixture() -> PathBuf {
+    // Fixtures live in the lib crate; this test crate is its sibling.
+    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .join("..")
+        .join("mcp-loadtest")
+        .join("tests")
+        .join("fixtures")
+        .join("mock-schema.py")
+}
+
+/// Build a TOML config string. `args_toml` is the inline `scenario.args`
+/// table; `report_dir` is where `runs/<ulid>/` lands.
+fn config_toml(args_toml: &str, report_dir: &str) -> String {
+    let mock = mock_schema_fixture();
+    // Forward slashes are valid in TOML double-quoted strings and accepted
+    // by Windows path APIs — avoids backslash-escaping headaches.
+    let mock = mock.to_string_lossy().replace('\\', "/");
+    let report_dir = report_dir.replace('\\', "/");
+    format!(
+        r#"
+[server]
+command = "python"
+args = ["{mock}"]
+transport = "stdio"
+
+[scenario]
+type = "sustained"
+tool = "echo"
+duration = "400ms"
+concurrent = 1
+args = {args_toml}
+
+[thresholds]
+# Any error at all trips the gate — strict-mode ProtocolErrors must fail CI.
+error_rate = 0.0
+
+[validation]
+strict = true
+
+[output]
+report_dir = "{report_dir}"
+formats = ["json", "markdown"]
+"#
+    )
+}
+
+/// The single `runs/<ulid>/` dir created under `report_dir`.
+fn sole_run_dir(report_dir: &std::path::Path) -> PathBuf {
+    let mut entries: Vec<PathBuf> = fs::read_dir(report_dir)
+        .expect("read report_dir")
+        .filter_map(|e| e.ok().map(|e| e.path()))
+        .filter(|p| p.is_dir())
+        .collect();
+    assert_eq!(
+        entries.len(),
+        1,
+        "expected exactly one run dir under {report_dir:?}, got {entries:?}"
+    );
+    entries.pop().unwrap()
+}
+
+#[tokio::test]
+async fn strict_mode_gates_the_run_and_writes_reports() {
+    let tmp = tempdir().expect("tempdir");
+    let report_dir = tmp.path().join("runs");
+    let cfg_path = tmp.path().join("bench.toml");
+
+    // `echo` requires a string `msg`; send an int → schema violation on
+    // every call → ProtocolError → error_rate > 0.0 → gate fails.
+    fs::write(
+        &cfg_path,
+        config_toml(r#"{ msg = 123 }"#, report_dir.to_str().unwrap()),
+    )
+    .expect("write config");
+
+    let result = cmd_run::run_from_config(&cfg_path).await;
+
+    // 1. The production entrypoint must surface a non-zero exit (Err).
+    assert!(
+        result.is_err(),
+        "strict-mode schema violations must fail the run; got {result:?}"
+    );
+
+    // 2. Reports were still written before the gate tripped.
+    let run_dir = sole_run_dir(&report_dir);
+    let metrics_path = run_dir.join("metrics.json");
+    assert!(metrics_path.is_file(), "metrics.json missing");
+    assert!(run_dir.join("report.md").is_file(), "report.md missing");
+
+    // 3. The metrics reflect strict rejection: not passed, ProtocolErrors.
+    let v: serde_json::Value =
+        serde_json::from_str(&fs::read_to_string(&metrics_path).expect("read metrics.json"))
+            .expect("parse metrics.json");
+    assert_eq!(v["passed"], false, "run should not have passed: {v}");
+    assert!(
+        v["errors"]["total"].as_u64().unwrap_or(0) > 0,
+        "expected errors > 0: {v}"
+    );
+    assert!(
+        v["errors"]["by_category"]["ProtocolError"]
+            .as_u64()
+            .unwrap_or(0)
+            > 0,
+        "expected ProtocolError category > 0: {v}"
+    );
+}
+
+#[tokio::test]
+async fn strict_mode_lets_schema_compliant_runs_pass() {
+    let tmp = tempdir().expect("tempdir");
+    let report_dir = tmp.path().join("runs");
+    let cfg_path = tmp.path().join("bench.toml");
+
+    // Compliant args: `msg` present and a string.
+    fs::write(
+        &cfg_path,
+        config_toml(r#"{ msg = "hello" }"#, report_dir.to_str().unwrap()),
+    )
+    .expect("write config");
+
+    let result = cmd_run::run_from_config(&cfg_path).await;
+    assert!(
+        result.is_ok(),
+        "strict mode must not gate schema-compliant runs; got {result:?}"
+    );
+
+    let run_dir = sole_run_dir(&report_dir);
+    let v: serde_json::Value = serde_json::from_str(
+        &fs::read_to_string(run_dir.join("metrics.json")).expect("read metrics.json"),
+    )
+    .expect("parse metrics.json");
+    assert_eq!(v["passed"], true, "clean strict run should pass: {v}");
+    assert_eq!(
+        v["errors"]["total"].as_u64().unwrap_or(99),
+        0,
+        "no errors expected: {v}"
+    );
+    assert!(
+        v["throughput"]["successful_requests"].as_u64().unwrap_or(0) > 0,
+        "expected successful calls: {v}"
+    );
+}
diff --git a/crates/mcp-loadtest/Cargo.toml b/crates/mcp-loadtest/Cargo.toml
index 904f347..d654b11 100644
--- a/crates/mcp-loadtest/Cargo.toml
+++ b/crates/mcp-loadtest/Cargo.toml
@@ -11,6 +11,14 @@ repository.workspace = true
 homepage.workspace = true
 authors.workspace = true
 rust-version.workspace = true
+# CLAUDE.md files are agent/dev scaffolding, not library docs — keep them out
+# of the published crate.
+exclude = [
+    "CLAUDE.md",
+    "src/scenario/CLAUDE.md",
+    "tests/CLAUDE.md",
+    "tests/fixtures/CLAUDE.md",
+]
 
 [lints.rust]
 missing_docs = "warn"

From fec989e542f10aeb7c2d6549983d9c953fb752fa Mon Sep 17 00:00:00 2001
From: Teerapat-Vatpitak <teerapatv.c@gmail.com>
Date: Sat, 16 May 2026 20:55:43 +0700
Subject: [PATCH 7/8] chore(deny): allowlist CDLA + triage RUSTSEC ignores
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`cargo deny check` (a release-checks must-pass gate) failed on the
pre-publish audit: webpki-roots' CDLA-Permissive-2.0 license wasn't
allowlisted, and three advisories tripped the default error. `cargo
audit` exits 0 — all three are unmaintained/unsound notices, not
scored vulnerabilities. Advisory checks hit the live RustSec DB, so
this surfaces regardless of the (main-identical) lockfile.

- licenses: allow CDLA-Permissive-2.0 (webpki-roots' Mozilla CA
  bundle, via tokio-tungstenite — permissive, publish-safe).
- advisories: documented per-ID ignore of RUSTSEC-2025-0052
  (async-std, dev-only via httpmock — never shipped), -2024-0436
  (paste, transitive proc-macro), -2026-0002 (lru IterMut
  unsoundness — transitive via ratatui 0.29, optional `tui` feature
  only, no semver-compatible fix). Nothing else ignored; the gate
  still hard-fails on any new or actual-vulnerability advisory.

Decision recorded in ADR 0011; revisit conditions tracked in
POST_PUBLISH_ISSUES.md; CHANGELOG noted. `cargo deny check` now
passes (advisories/bans/licenses/sources ok); `cargo audit` rc 0.
---
 CHANGELOG.md                                  |  1 +
 POST_PUBLISH_ISSUES.md                        | 13 +++++
 deny.toml                                     | 22 +++++++
 docs/adr/0011-supply-chain-advisory-policy.md | 57 +++++++++++++++++++
 docs/adr/README.md                            | 25 ++++----
 5 files changed, 106 insertions(+), 12 deletions(-)
 create mode 100644 docs/adr/0011-supply-chain-advisory-policy.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 78f0163..0c48ad3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -142,6 +142,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - HTTP / SSE transport host-allowlist for SSRF defense is deferred. Currently mitigated by `Policy::none()` on redirects; the allowlist is operator-facing config and will land alongside the broader transport-hardening pass.
 - ✅ Pre-publish security pass on the B1/B2 surface: bounded `protocol::schema` recursion (`MAX_SCHEMA_DEPTH`, defends strict mode against a maliciously deep server `inputSchema`); non-positive regression thresholds rejected at the CLI/MCP boundary (would otherwise invert the gate); `compare_runs` no longer echoes the raw caller path; `enum`-violation messages are length-capped.
 - ✅ Pre-publish stabilization: `crates/mcp-loadtest` now `exclude`s the four `CLAUDE.md` scaffolding files from the published package (108 → 104 files); added `run_strict` CLI integration test covering the real `run --config` entrypoint with `[validation] strict = true` end-to-end (TOML → `Run::execute` → report-on-disk → non-zero gate); full pipeline (fmt, clippy `-D warnings`, `--locked` build/test, doc) verified on the **x86_64-pc-windows-msvc** target — the toolchain crates.io ships to Windows users.
+- ✅ Supply-chain gate (`cargo deny check` / `cargo audit`): allowlisted `CDLA-Permissive-2.0` (the `webpki-roots` CA-bundle license, via `tokio-tungstenite`) and added three individually-triaged, documented advisory `ignore`s — RUSTSEC-2025-0052 (`async-std`, dev-only via `httpmock`), RUSTSEC-2024-0436 (`paste`, transitive), RUSTSEC-2026-0002 (`lru` unsoundness, transitive via `ratatui`, optional `tui` feature only, no semver fix). Rationale + revisit conditions in ADR 0011 + `POST_PUBLISH_ISSUES.md`; no actual vulnerabilities, gate still hard-fails on anything new.
 
 ## [0.1.0] — TBD
 
diff --git a/POST_PUBLISH_ISSUES.md b/POST_PUBLISH_ISSUES.md
index d258962..64d0c55 100644
--- a/POST_PUBLISH_ISSUES.md
+++ b/POST_PUBLISH_ISSUES.md
@@ -8,6 +8,19 @@ Each block below is in `gh issue create` shape — copy-paste-ready once the rep
 
 ## 🔒 Security
 
+### `chore(deps): revisit triaged RUSTSEC ignores (ADR 0011)`
+
+**Body:**
+
+> `deny.toml` ignores three advisories, each justified in ADR 0011. Re-evaluate each release:
+>
+> - **RUSTSEC-2026-0002** (`lru` `IterMut` Stacked-Borrows unsoundness) — transitive via `ratatui 0.29` (`lru ^0.12`), only under the optional `tui` feature. Drop the ignore + bump once `ratatui` ships on patched `lru` (≥0.13).
+> - **RUSTSEC-2025-0052** (`async-std` discontinued) — dev-only via `httpmock`. Drop when `httpmock` releases without `async-std` (or swap the mock).
+> - **RUSTSEC-2024-0436** (`paste` unmaintained) — transitive proc-macro. Drop when the dep tree moves off `paste` (e.g. `pastey`).
+>   Action: `cargo deny check` after `cargo update`; if any becomes an actual vulnerability or `lru` reaches the default build, fix immediately rather than re-ignoring.
+
+**Labels:** `security`, `dependencies`, `chore`
+
 ### `feat(transport): host-allowlist for HTTP/SSE/WS to defend against SSRF`
 
 **Body:**
diff --git a/deny.toml b/deny.toml
index fdf2fa1..a378e31 100644
--- a/deny.toml
+++ b/deny.toml
@@ -1,6 +1,24 @@
 [advisories]
 version = 2
 yanked = "warn"
+# Each entry is individually triaged — see ADR 0011. These are
+# unmaintained/unsound *notices*, not exploitable vulnerabilities; cargo-deny
+# still hard-fails on any new or actual-vulnerability advisory.
+ignore = [
+    # async-std discontinued. Dev-only: async-std → async-object-pool →
+    # httpmock (a [dev-dependencies] HTTP-test mock). Never in the published
+    # crate; no safe upgrade exists. Revisit if httpmock drops it.
+    "RUSTSEC-2025-0052",
+    # `paste` no longer maintained. Transitive proc-macro (no direct dep);
+    # not a vulnerability; no drop-in upstream migration available yet.
+    "RUSTSEC-2024-0436",
+    # `lru` IterMut Stacked-Borrows unsoundness. Transitive via ratatui 0.29,
+    # and only present under the optional `tui` feature (default lib build
+    # excludes it). No semver-compatible fix within ratatui 0.29's `lru ^0.12`
+    # constraint. Low practical impact (internal TUI cache). Tracked in
+    # POST_PUBLISH_ISSUES — revisit when ratatui ships on patched lru.
+    "RUSTSEC-2026-0002",
+]
 
 [licenses]
 version = 2
@@ -16,6 +34,10 @@ allow = [
     "Zlib",
     "MPL-2.0",
     "0BSD",
+    # webpki-roots (0.26 + 1.0, via tokio-tungstenite rustls-tls-webpki-roots)
+    # ships Mozilla's CA bundle under CDLA-Permissive-2.0 — a permissive,
+    # publish-safe data license. See ADR 0011.
+    "CDLA-Permissive-2.0",
 ]
 confidence-threshold = 0.93
 
diff --git a/docs/adr/0011-supply-chain-advisory-policy.md b/docs/adr/0011-supply-chain-advisory-policy.md
new file mode 100644
index 0000000..93359f9
--- /dev/null
+++ b/docs/adr/0011-supply-chain-advisory-policy.md
@@ -0,0 +1,57 @@
+# 11. Supply-chain policy: CDLA-Permissive-2.0 + triaged advisory ignores
+
+Date: 2026-05-16
+Status: Accepted
+
+## Context
+
+`cargo deny check` (a `release-checks.md` must-pass gate) failed during the
+pre-v0.1.0 stabilization pass with a license rejection and three advisories.
+`cargo audit` exited 0 — all three are `unmaintained`/`unsound` _notices_,
+not severity-scored vulnerabilities. The `Cargo.lock` is byte-identical to
+`main`, so these are pre-existing in the dependency tree; the RustSec DB is
+live, so they surface now regardless of when CI last ran.
+
+Triage:
+
+| Item              | Crate                                                  | Path                                                                | Verdict                                                                                                                                                                                                         |
+| ----------------- | ------------------------------------------------------ | ------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| License rejected  | `webpki-roots` 0.26.11 + 1.0.7 (`CDLA-Permissive-2.0`) | `tokio-tungstenite` (rustls-tls-webpki-roots) → WS transport        | Legitimate permissive data license (Mozilla CA bundle). Allowlist it.                                                                                                                                           |
+| RUSTSEC-2025-0052 | `async-std` discontinued                               | `async-std → async-object-pool → httpmock` (**[dev-dependencies]**) | Never in the published crate. No safe upgrade. Ignore.                                                                                                                                                          |
+| RUSTSEC-2024-0436 | `paste` unmaintained                                   | transitive proc-macro                                               | Not a vulnerability; no drop-in upstream path. Ignore.                                                                                                                                                          |
+| RUSTSEC-2026-0002 | `lru` `IterMut` Stacked-Borrows unsoundness            | `ratatui 0.29 → lru ^0.12`                                          | Only under the optional `tui` feature (default lib build excludes it). No semver-compatible fix (ratatui pins `lru ^0.12`; 0.12.5 is latest 0.12.x). Low practical impact (internal TUI cache). Ignore + track. |
+
+## Decision
+
+1. Add `CDLA-Permissive-2.0` to `deny.toml` `[licenses].allow`. It is a
+   permissive, publish-safe license; `webpki-roots` is a standard rustls
+   dependency used widely across the ecosystem.
+2. Add a `deny.toml` `[advisories].ignore` list containing exactly the three
+   triaged IDs above, each with an inline rationale. Nothing else is
+   ignored; `version = 2`, `yanked = "warn"`, `[bans]`, `[sources]` are
+   unchanged, so cargo-deny still hard-fails on any _new_ or
+   actual-vulnerability advisory.
+3. Track the two that may become fixable upstream in
+   `POST_PUBLISH_ISSUES.md` (revisit when httpmock drops async-std / when
+   ratatui ships on patched `lru`).
+
+## Alternatives considered
+
+| Option                                            | Why rejected                                                                                                                                                                 |
+| ------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Bump `ratatui` past 0.29 to pull patched `lru`    | Out of scope for a stabilization pass; ratatui major/minor bump risks API + MSRV (1.86) regressions across the TUI; the project deliberately pinned 0.29.                    |
+| Drop the `httpmock` dev-dep to remove `async-std` | Loses HTTP-transport test coverage for a dev-only, not-shipped advisory — net negative.                                                                                      |
+| Lower `deny.toml` to warn-only on advisories      | Blanket weakening — would also silence _future real_ vulns. Targeted, documented per-ID ignores keep the gate sharp.                                                         |
+| Block v0.1.0 until upstreams fix                  | The only soundness item (`lru`) is transitive, optional-feature-gated, low-impact, and unfixable from here; the rest are unmaintained notices. Not a release-worthy blocker. |
+
+## Consequences
+
+**Positive:** `cargo deny check` passes deterministically; the policy and its
+rationale are reviewable in one place; the gate still catches anything new.
+
+**Negative:** Three advisories are intentionally suppressed — acceptable given
+each is individually justified and re-evaluated via POST_PUBLISH_ISSUES.
+
+**Open:** Re-audit when `ratatui`/`httpmock` release fixes; if RUSTSEC-2026-0002
+ever gains an exploit vector or `lru` lands in the default build, revisit
+immediately (drop the ignore, force the upgrade).
diff --git a/docs/adr/README.md b/docs/adr/README.md
index 7692552..6067daa 100644
--- a/docs/adr/README.md
+++ b/docs/adr/README.md
@@ -4,18 +4,19 @@ Append-only log of significant decisions. New ADRs are added with the next numbe
 
 ## Index
 
-| #                                             | Title                                                              | Status   |
-| --------------------------------------------- | ------------------------------------------------------------------ | -------- |
-| [0001](0001-language-rust.md)                 | Language: Rust                                                     | Accepted |
-| [0002](0002-runtime-tokio.md)                 | Async runtime: tokio                                               | Accepted |
-| [0003](0003-histogram-hdr.md)                 | Latency histogram: hdrhistogram                                    | Accepted |
-| [0004](0004-compete-with-reaatech.md)         | Strategy: head-on competition with reaatech/mcp-load-test          | Accepted |
-| [0005](0005-serve-mcp-mode.md)                | Serve mode: expose load tester as MCP server over stdio            | Accepted |
-| [0006](0006-zero-copy-protocol-types.md)      | Zero-copy protocol types on the request hot path                   | Accepted |
-| [0007](0007-transport-security-posture.md)    | Transport security posture: redirects, frame caps, path validation | Accepted |
-| [0008](0008-release-profile-panic-abort.md)   | Release profile: `panic = "abort"`                                 | Accepted |
-| [0009](0009-regression-threshold-defaults.md) | Regression threshold defaults: 10% p99 / 0.5pp error rate          | Accepted |
-| [0010](0010-strict-schema-validation.md)      | Opt-in strict MCP schema validation                                | Accepted |
+| #                                             | Title                                                               | Status   |
+| --------------------------------------------- | ------------------------------------------------------------------- | -------- |
+| [0001](0001-language-rust.md)                 | Language: Rust                                                      | Accepted |
+| [0002](0002-runtime-tokio.md)                 | Async runtime: tokio                                                | Accepted |
+| [0003](0003-histogram-hdr.md)                 | Latency histogram: hdrhistogram                                     | Accepted |
+| [0004](0004-compete-with-reaatech.md)         | Strategy: head-on competition with reaatech/mcp-load-test           | Accepted |
+| [0005](0005-serve-mcp-mode.md)                | Serve mode: expose load tester as MCP server over stdio             | Accepted |
+| [0006](0006-zero-copy-protocol-types.md)      | Zero-copy protocol types on the request hot path                    | Accepted |
+| [0007](0007-transport-security-posture.md)    | Transport security posture: redirects, frame caps, path validation  | Accepted |
+| [0008](0008-release-profile-panic-abort.md)   | Release profile: `panic = "abort"`                                  | Accepted |
+| [0009](0009-regression-threshold-defaults.md) | Regression threshold defaults: 10% p99 / 0.5pp error rate           | Accepted |
+| [0010](0010-strict-schema-validation.md)      | Opt-in strict MCP schema validation                                 | Accepted |
+| [0011](0011-supply-chain-advisory-policy.md)  | Supply-chain policy: CDLA-Permissive-2.0 + triaged advisory ignores | Accepted |
 
 ## When to write a new ADR
 

From b7e75a2a73427e557ac28dcfa09b5d0e20de0b7d Mon Sep 17 00:00:00 2001
From: Teerapat-Vatpitak <teerapatv.c@gmail.com>
Date: Sun, 17 May 2026 11:18:20 +0700
Subject: [PATCH 8/8] refactor(api): non_exhaustive + publish-readiness
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pre-publish FULL hardening so v0.1.0's public API and crate page are
safe to commit to permanently (from a market + quality/standards
audit before `cargo publish`).

- API durability: `#[non_exhaustive]` on the Config family + public
  error/outcome enums + RunContext (adding a field/variant is no
  longer breaking — the exact pain hit internally when `validation`
  was added). Constructors keep cross-crate use ergonomic:
  Config::new + with_*, ScenarioConfig::new, OutputConfig::new,
  RunContext::new. cli + 12 integration-test helpers refactored to
  the constructors.
- Crate-root docs: dropped the stale "v0.0.0 scaffolding" status and
  the fictional-API example; replaced with a real compiling `no_run`
  example. ValidationConfig added to the re-export facade.
- Standards: docs.rs all-features metadata; missing_docs = deny
  (lib + cli; zero gaps); README repo links → absolute so they
  render on crates.io.
- MSRV corrected 1.86 -> 1.88 (the real floor: edition-2024
  let-chains; verified by an actual build on 1.88) + new CI `msrv`
  job; fixed the clippy nits it surfaced (collapsible_if,
  is_multiple_of).

Edition stays 2024 (an audit "downgrade to 2021" suggestion was
wrong — let-chains require it and dry-run already accepted it).

Re-verified on x86_64-pc-windows-msvc: fmt, clippy -D warnings,
--locked build/test (incl. doctests), doc -D warnings, cargo deny,
cargo audit, publish --dry-run (104 files). CHANGELOG noted.
---
 .github/workflows/ci.yml                      | 14 ++++
 CHANGELOG.md                                  |  6 ++
 Cargo.toml                                    |  2 +-
 README.md                                     | 44 ++++++------
 crates/mcp-loadtest-cli/src/cmd_cross.rs      | 23 ++-----
 crates/mcp-loadtest-cli/src/cmd_deadlock.rs   | 20 ++----
 crates/mcp-loadtest-cli/src/lib.rs            |  2 +-
 crates/mcp-loadtest/Cargo.toml                |  7 +-
 crates/mcp-loadtest/src/config.rs             | 67 +++++++++++++++++++
 crates/mcp-loadtest/src/config/validate.rs    | 36 +++++-----
 crates/mcp-loadtest/src/hang_detector.rs      |  1 +
 crates/mcp-loadtest/src/lib.rs                | 58 +++++++++++-----
 crates/mcp-loadtest/src/metrics/types.rs      |  1 +
 crates/mcp-loadtest/src/protocol/schema.rs    |  2 +
 .../src/protocol/transport/mod.rs             |  1 +
 crates/mcp-loadtest/src/report/common.rs      |  2 +-
 crates/mcp-loadtest/src/report/mod.rs         |  2 +
 crates/mcp-loadtest/src/run.rs                |  1 +
 crates/mcp-loadtest/src/scenario/mod.rs       | 22 +++++-
 crates/mcp-loadtest/src/session.rs            |  1 +
 crates/mcp-loadtest/tests/breaking_point.rs   | 14 ++--
 crates/mcp-loadtest/tests/coverage.rs         | 14 ++--
 crates/mcp-loadtest/tests/deadlock.rs         | 14 ++--
 crates/mcp-loadtest/tests/fuzzer.rs           | 14 ++--
 crates/mcp-loadtest/tests/patterns.rs         | 14 ++--
 crates/mcp-loadtest/tests/race_check.rs       | 14 ++--
 crates/mcp-loadtest/tests/ramp.rs             | 14 ++--
 crates/mcp-loadtest/tests/scenarios_basic.rs  | 14 ++--
 crates/mcp-loadtest/tests/soak.rs             | 14 ++--
 crates/mcp-loadtest/tests/spike.rs            | 14 ++--
 .../mcp-loadtest/tests/strict_validation.rs   | 14 ++--
 .../tests/vibe_trading_regression.rs          | 14 ++--
 32 files changed, 305 insertions(+), 175 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index bdce6e3..f1d6d5e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -57,3 +57,17 @@ jobs:
         with:
           tool: cargo-audit
       - run: cargo audit
+
+  msrv:
+    name: MSRV (Rust 1.88)
+    runs-on: ubuntu-latest
+    # Verifies the declared `rust-version`. Lints are capped to `warn` here so
+    # this stays a pure compile/edition/dependency gate — the `checks` job
+    # enforces `-D warnings` on stable.
+    env:
+      RUSTFLAGS: --cap-lints=warn
+    steps:
+      - uses: actions/checkout@v4
+      - uses: dtolnay/rust-toolchain@1.88
+      - uses: Swatinem/rust-cache@v2
+      - run: cargo build --workspace --all-features --locked
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0c48ad3..50edbe7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -143,6 +143,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - ✅ Pre-publish security pass on the B1/B2 surface: bounded `protocol::schema` recursion (`MAX_SCHEMA_DEPTH`, defends strict mode against a maliciously deep server `inputSchema`); non-positive regression thresholds rejected at the CLI/MCP boundary (would otherwise invert the gate); `compare_runs` no longer echoes the raw caller path; `enum`-violation messages are length-capped.
 - ✅ Pre-publish stabilization: `crates/mcp-loadtest` now `exclude`s the four `CLAUDE.md` scaffolding files from the published package (108 → 104 files); added `run_strict` CLI integration test covering the real `run --config` entrypoint with `[validation] strict = true` end-to-end (TOML → `Run::execute` → report-on-disk → non-zero gate); full pipeline (fmt, clippy `-D warnings`, `--locked` build/test, doc) verified on the **x86_64-pc-windows-msvc** target — the toolchain crates.io ships to Windows users.
 - ✅ Supply-chain gate (`cargo deny check` / `cargo audit`): allowlisted `CDLA-Permissive-2.0` (the `webpki-roots` CA-bundle license, via `tokio-tungstenite`) and added three individually-triaged, documented advisory `ignore`s — RUSTSEC-2025-0052 (`async-std`, dev-only via `httpmock`), RUSTSEC-2024-0436 (`paste`, transitive), RUSTSEC-2026-0002 (`lru` unsoundness, transitive via `ratatui`, optional `tui` feature only, no semver fix). Rationale + revisit conditions in ADR 0011 + `POST_PUBLISH_ISSUES.md`; no actual vulnerabilities, gate still hard-fails on anything new.
+- ✅ Pre-publish API-durability & standards pass (so v0.1.0's public surface is safe to commit to):
+    - `#[non_exhaustive]` on the `Config` family (`Config`/`ServerConfig`/`ScenarioConfig`/`ThresholdsConfig`/`OutputConfig`/`ValidationConfig`/`ConfigError`), the public error/outcome enums (`SessionError`, `RunError`, `CallOutcome`, `ThresholdKind`, `ReportError`, `TransportError`, `HangOutcome`, `SchemaPolicy`, `ValidationSite`) and `RunContext` — adding a field/variant is no longer a breaking change. New constructors keep ergonomic cross-crate construction: `Config::new` + `with_thresholds`/`with_output`/`with_validation`, `ScenarioConfig::new`, `OutputConfig::new`, `RunContext::new`.
+    - Crate-root docs rewritten: dropped the stale "v0.0.0 — scaffolding" status and replaced the fictional-API example with a real, compiling `no_run` one; `ValidationConfig` added to the public re-export facade.
+    - docs.rs builds `--all-features` (so `serve`/`tui` render); `missing_docs` raised to `deny` (lib + CLI); README repo links rewritten to absolute URLs so they resolve on crates.io.
+    - MSRV corrected `1.86` → **`1.88`** (the real floor — edition-2024 let-chains; verified by an actual `cargo build` on 1.88) with a new CI `msrv` job pinning it; modernized the clippy nits it surfaced (`collapsible_if`, `is_multiple_of`).
+    - Re-verified end-to-end on the **x86_64-pc-windows-msvc** target: fmt, clippy `-D warnings`, `--locked` build/test (incl. doctests), `doc -D warnings`, `cargo deny`, `cargo audit`, `publish --dry-run` (104 files).
 
 ## [0.1.0] — TBD
 
diff --git a/Cargo.toml b/Cargo.toml
index 59bf323..1f86194 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,7 +12,7 @@ license = "MIT OR Apache-2.0"
 repository = "https://github.com/Teerapat-Vatpitak/mcp-loadtest"
 homepage = "https://github.com/Teerapat-Vatpitak/mcp-loadtest"
 authors = ["Teerapat Vatpitak"]
-rust-version = "1.86"
+rust-version = "1.88"
 
 [workspace.dependencies]
 tokio = { version = "1", features = ["full"] }
diff --git a/README.md b/README.md
index 6dea968..b70f525 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 [![CI](https://github.com/Teerapat-Vatpitak/mcp-loadtest/actions/workflows/ci.yml/badge.svg)](https://github.com/Teerapat-Vatpitak/mcp-loadtest/actions/workflows/ci.yml)
 [![License: MIT OR Apache-2.0](https://img.shields.io/badge/license-MIT%20OR%20Apache--2.0-blue.svg)](#license)
-[![Rust 1.86+](https://img.shields.io/badge/rust-1.86%2B-orange.svg)](rust-toolchain.toml)
+[![Rust 1.88+](https://img.shields.io/badge/rust-1.88%2B-orange.svg)](https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/rust-toolchain.toml)
 
 > Load tester **and bug detector** for MCP (Model Context Protocol) servers. Catches lazy-init deadlocks, concurrency races, hangs, and perf regressions that unit tests miss.
 
@@ -30,12 +30,12 @@ $ echo $?
 1
 ```
 
-This is the bug class that breaks MCP servers in production. Unit tests don't catch it. `mcp-loadtest` does — and exits non-zero so it can fail your CI gate. The regression that catches the exact Vibe-Trading commit lives at [`crates/mcp-loadtest/tests/vibe_trading_regression.rs`](crates/mcp-loadtest/tests/vibe_trading_regression.rs) (pinned to commit `71220c7c` — the parent of PR #85).
+This is the bug class that breaks MCP servers in production. Unit tests don't catch it. `mcp-loadtest` does — and exits non-zero so it can fail your CI gate. The regression that catches the exact Vibe-Trading commit lives at [`crates/mcp-loadtest/tests/vibe_trading_regression.rs`](https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/crates/mcp-loadtest/tests/vibe_trading_regression.rs) (pinned to commit `71220c7c` — the parent of PR #85).
 
 ## What it does
 
 - **Bug-class detection**
-    - `deadlock_probe` — fires N concurrent `tools/call`s through `hang_detect`; classifies each as success / slow / deadlock (see [DESIGN.md §15.2](DESIGN.md#152-deadlock-probe-scenario)).
+    - `deadlock_probe` — fires N concurrent `tools/call`s through `hang_detect`; classifies each as success / slow / deadlock (see [DESIGN.md §15.2](https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/DESIGN.md#152-deadlock-probe-scenario)).
     - `race_check` — issues identical calls and diffs the responses to surface non-determinism (clocks, RNG, leaked state).
     - Hang detector watchdog wraps every call, so any scenario can surface a hung tool, not just the dedicated probe.
 - **Load testing**
@@ -50,7 +50,7 @@ This is the bug class that breaks MCP servers in production. Unit tests don't ca
     - `mcp-loadtest cross --server "..." --server "..."` for side-by-side runs across N targets.
 - **AI-agent friendly**
     - Stable JSON output and structured error messages with `Hint:` lines.
-    - `mcp-loadtest serve --mcp` exposes the tool itself as an MCP server so Claude Code, Cursor, or any MCP-aware agent can call `deadlock_probe`, `sustained_load`, and `compare_runs` as MCP tools directly. See [DESIGN.md §21.2](DESIGN.md#212-self-hosted-mcp-server-mcp-loadtest-serve---mcp).
+    - `mcp-loadtest serve --mcp` exposes the tool itself as an MCP server so Claude Code, Cursor, or any MCP-aware agent can call `deadlock_probe`, `sustained_load`, and `compare_runs` as MCP tools directly. See [DESIGN.md §21.2](https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/DESIGN.md#212-self-hosted-mcp-server-mcp-loadtest-serve---mcp).
 
 ## CI gating & protocol-aware assertions
 
@@ -73,7 +73,7 @@ This is the bug class that breaks MCP servers in production. Unit tests don't ca
     strict = true
     ```
 
-Full GitHub Actions example: [docs/examples/ci-integration.md](docs/examples/ci-integration.md).
+Full GitHub Actions example: [docs/examples/ci-integration.md](https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/docs/examples/ci-integration.md).
 
 ## Quick start
 
@@ -129,7 +129,7 @@ cargo run -p mcp-loadtest-cli -- deadlock-probe \
     --args '{"ticker":"AAPL"}'
 ```
 
-Library usage (pseudocode — see [`crates/mcp-loadtest/tests/vibe_trading_regression.rs`](crates/mcp-loadtest/tests/vibe_trading_regression.rs) for a runnable example):
+Library usage (pseudocode — see [`crates/mcp-loadtest/tests/vibe_trading_regression.rs`](https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/crates/mcp-loadtest/tests/vibe_trading_regression.rs) for a runnable example):
 
 ```rust,ignore
 // Sketch of the library API. RunContext requires run_start, cancel_token,
@@ -189,7 +189,7 @@ async fn no_deadlock_under_concurrent_calls() {
 | Console + markdown + JSON reporters                                | yes                           | yes                                     |
 | Programmatic library API                                           | yes                           | yes                                     |
 
-We tracked 4 direct competitors (reaatech, haakco/mcp-testing-framework, spbiju/MCP-Benchmark, IBM mcp-context-forge internal) and 6 adjacent LLM-eval frameworks (MCP-Bench, MCPBench, MCP-Universe, MCPMark, MCP-Inspector, k6-MCP). See [DESIGN.md §10.5](DESIGN.md#105-competitive-parity--differentiation-matrix) for the full matrix and [ADR 0004](docs/adr/0004-compete-with-reaatech.md) for the positioning decision.
+We tracked 4 direct competitors (reaatech, haakco/mcp-testing-framework, spbiju/MCP-Benchmark, IBM mcp-context-forge internal) and 6 adjacent LLM-eval frameworks (MCP-Bench, MCPBench, MCP-Universe, MCPMark, MCP-Inspector, k6-MCP). See [DESIGN.md §10.5](https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/DESIGN.md#105-competitive-parity--differentiation-matrix) for the full matrix and [ADR 0004](https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/docs/adr/0004-compete-with-reaatech.md) for the positioning decision.
 
 ## Built-in scenarios
 
@@ -204,15 +204,15 @@ We tracked 4 direct competitors (reaatech, haakco/mcp-testing-framework, spbiju/
 | `race_check`     | non-determinism / order-sensitive bugs                                |
 | `pattern`        | weighted random mixes (explore-then-act, read-then-write, multi-step) |
 
-Each scenario is one `impl Scenario` in [`crates/mcp-loadtest/src/scenario/`](crates/mcp-loadtest/src/scenario/) with a JSON-Schema describing its config block. See [DESIGN.md §8](DESIGN.md#8-built-in-scenarios) for the full table.
+Each scenario is one `impl Scenario` in [`crates/mcp-loadtest/src/scenario/`](https://github.com/Teerapat-Vatpitak/mcp-loadtest/tree/main/crates/mcp-loadtest/src/scenario/) with a JSON-Schema describing its config block. See [DESIGN.md §8](https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/DESIGN.md#8-built-in-scenarios) for the full table.
 
 ## Cookbook
 
-Three worked examples in [`docs/examples/`](docs/examples/):
+Three worked examples in [`docs/examples/`](https://github.com/Teerapat-Vatpitak/mcp-loadtest/tree/main/docs/examples/):
 
-- **[CI integration](docs/examples/ci-integration.md)** — GitHub Actions workflow that runs `mcp-loadtest` on every PR and fails the build on threshold violations.
-- **[Custom scenario](docs/examples/custom-scenario.md)** — write `impl Scenario for MyThing`, register it, drive it from a TOML config. Uses `DeadlockProbe` as a reference.
-- **[Debugging deadlocks](docs/examples/debugging-deadlocks.md)** — narrative walkthrough of what to do when `deadlock-probe` says DEADLOCK DETECTED. Stderr inspection, the lazy-init pattern that caused Vibe-Trading PR #85, and a worked-example test you can copy.
+- **[CI integration](https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/docs/examples/ci-integration.md)** — GitHub Actions workflow that runs `mcp-loadtest` on every PR and fails the build on threshold violations.
+- **[Custom scenario](https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/docs/examples/custom-scenario.md)** — write `impl Scenario for MyThing`, register it, drive it from a TOML config. Uses `DeadlockProbe` as a reference.
+- **[Debugging deadlocks](https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/docs/examples/debugging-deadlocks.md)** — narrative walkthrough of what to do when `deadlock-probe` says DEADLOCK DETECTED. Stderr inspection, the lazy-init pattern that caused Vibe-Trading PR #85, and a worked-example test you can copy.
 
 ## Install
 
@@ -235,19 +235,19 @@ bash scripts/ci-checks.sh        # or: pwsh scripts/ci-checks.ps1 on Windows
 cargo nextest run --workspace --all-features
 ```
 
-See [CLAUDE.md](CLAUDE.md) for project conventions and [CONTRIBUTING.md](CONTRIBUTING.md) before opening a PR.
+See [CLAUDE.md](https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/CLAUDE.md) for project conventions and [CONTRIBUTING.md](https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/CONTRIBUTING.md) before opening a PR.
 
 ## Documents
 
-- [DESIGN.md](DESIGN.md) — what this is + how it works (21 sections)
-- [PROJECT-STRUCTURE.md](PROJECT-STRUCTURE.md) — how the repo is laid out for AI-assisted development
-- [CONTRIBUTING.md](CONTRIBUTING.md) — how to propose changes and open a PR
-- [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) — contributor expectations
-- [SECURITY.md](SECURITY.md) — how to report vulnerabilities
-- [docs/adr/](docs/adr/) — architecture decision records
-- [docs/examples/](docs/examples/) — cookbook
-- [CHANGELOG.md](CHANGELOG.md) — release history
+- [DESIGN.md](https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/DESIGN.md) — what this is + how it works (21 sections)
+- [PROJECT-STRUCTURE.md](https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/PROJECT-STRUCTURE.md) — how the repo is laid out for AI-assisted development
+- [CONTRIBUTING.md](https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/CONTRIBUTING.md) — how to propose changes and open a PR
+- [CODE_OF_CONDUCT.md](https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/CODE_OF_CONDUCT.md) — contributor expectations
+- [SECURITY.md](https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/SECURITY.md) — how to report vulnerabilities
+- [docs/adr/](https://github.com/Teerapat-Vatpitak/mcp-loadtest/tree/main/docs/adr/) — architecture decision records
+- [docs/examples/](https://github.com/Teerapat-Vatpitak/mcp-loadtest/tree/main/docs/examples/) — cookbook
+- [CHANGELOG.md](https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/CHANGELOG.md) — release history
 
 ## License
 
-Dual-licensed under [MIT](LICENSE-MIT) **OR** [Apache-2.0](LICENSE-APACHE), at your option.
+Dual-licensed under [MIT](https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/LICENSE-MIT) **OR** [Apache-2.0](https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/LICENSE-APACHE), at your option.
diff --git a/crates/mcp-loadtest-cli/src/cmd_cross.rs b/crates/mcp-loadtest-cli/src/cmd_cross.rs
index 54dab0e..8748a4d 100644
--- a/crates/mcp-loadtest-cli/src/cmd_cross.rs
+++ b/crates/mcp-loadtest-cli/src/cmd_cross.rs
@@ -16,7 +16,7 @@ use std::time::Duration;
 use anyhow::{Context, Result, anyhow};
 use futures::StreamExt;
 use mcp_loadtest::config::{
-    Config, OutputConfig, ScenarioConfig, ServerConfig, ThresholdsConfig, split_server_command,
+    Config, OutputConfig, ScenarioConfig, ServerConfig, split_server_command,
 };
 use mcp_loadtest::report::Report;
 use mcp_loadtest::run::Run;
@@ -163,22 +163,11 @@ async fn run_one(server: &str, args: &CrossArgs, args_value: &Value) -> Result<R
         CrossScenario::DeadlockProbe => "deadlock_probe",
     };
 
-    let config = Config {
-        server: server_cfg,
-        scenario: ScenarioConfig {
-            kind: scenario_kind.into(),
-            params: json!({}),
-        },
-        thresholds: ThresholdsConfig::default(),
-        output: OutputConfig {
-            report_dir: args.output_dir.clone(),
-            // Skip writing artifacts for cross-runs — the comparison itself
-            // is the output. Callers that want per-server reports can run
-            // individual `mcp-loadtest run` invocations.
-            formats: Vec::new(),
-        },
-        validation: mcp_loadtest::config::ValidationConfig::default(),
-    };
+    // Skip writing artifacts for cross-runs — the comparison itself is the
+    // output. Callers wanting per-server reports run individual
+    // `mcp-loadtest run` invocations.
+    let config = Config::new(server_cfg, ScenarioConfig::new(scenario_kind, json!({})))
+        .with_output(OutputConfig::new(args.output_dir.clone(), Vec::new()));
 
     let run = Run::new(config, scenario, args.output_dir.clone());
     let report = run
diff --git a/crates/mcp-loadtest-cli/src/cmd_deadlock.rs b/crates/mcp-loadtest-cli/src/cmd_deadlock.rs
index 21983cb..8fa73c0 100644
--- a/crates/mcp-loadtest-cli/src/cmd_deadlock.rs
+++ b/crates/mcp-loadtest-cli/src/cmd_deadlock.rs
@@ -8,7 +8,7 @@ use anyhow::{Context, Result};
 use serde_json::{Value, json};
 
 use mcp_loadtest::config::{
-    Config, OutputConfig, ScenarioConfig, ServerConfig, ThresholdsConfig, split_server_command,
+    Config, OutputConfig, ScenarioConfig, ServerConfig, split_server_command,
 };
 use mcp_loadtest::run::Run;
 use mcp_loadtest::scenario::deadlock_probe::DeadlockProbe;
@@ -42,19 +42,11 @@ pub async fn run_deadlock_probe(
         args: args_value,
     };
 
-    let config = Config {
-        server: server_cfg,
-        scenario: ScenarioConfig {
-            kind: "deadlock_probe".into(),
-            params: json!({}),
-        },
-        thresholds: ThresholdsConfig::default(),
-        output: OutputConfig {
-            report_dir: output_dir.to_path_buf(),
-            formats: vec!["terminal".into(), "markdown".into(), "json".into()],
-        },
-        validation: mcp_loadtest::config::ValidationConfig::default(),
-    };
+    let config = Config::new(server_cfg, ScenarioConfig::new("deadlock_probe", json!({})))
+        .with_output(OutputConfig::new(
+            output_dir.to_path_buf(),
+            vec!["terminal".into(), "markdown".into(), "json".into()],
+        ));
 
     let run = Run::new(config, Box::new(scenario), output_dir.to_path_buf());
     let report = run.execute().await?;
diff --git a/crates/mcp-loadtest-cli/src/lib.rs b/crates/mcp-loadtest-cli/src/lib.rs
index 62cd05d..6a55e6d 100644
--- a/crates/mcp-loadtest-cli/src/lib.rs
+++ b/crates/mcp-loadtest-cli/src/lib.rs
@@ -6,7 +6,7 @@
 //! re-exported here.
 
 #![deny(unsafe_code)]
-#![warn(missing_docs)]
+#![deny(missing_docs)]
 
 pub mod cmd_compare;
 pub mod cmd_cross;
diff --git a/crates/mcp-loadtest/Cargo.toml b/crates/mcp-loadtest/Cargo.toml
index d654b11..d777e7f 100644
--- a/crates/mcp-loadtest/Cargo.toml
+++ b/crates/mcp-loadtest/Cargo.toml
@@ -20,8 +20,13 @@ exclude = [
     "tests/fixtures/CLAUDE.md",
 ]
 
+# docs.rs builds with all features so the feature-gated `serve` / `tui`
+# modules render in the published API docs.
+[package.metadata.docs.rs]
+all-features = true
+
 [lints.rust]
-missing_docs = "warn"
+missing_docs = "deny"
 unsafe_code = "forbid"
 
 [lints.clippy]
diff --git a/crates/mcp-loadtest/src/config.rs b/crates/mcp-loadtest/src/config.rs
index 80add4d..ad7d8b6 100644
--- a/crates/mcp-loadtest/src/config.rs
+++ b/crates/mcp-loadtest/src/config.rs
@@ -20,7 +20,13 @@ pub use example::example_config;
 /// `Config::from_file`.
 ///
 /// **Locked for M3.** Field additions OK; removal requires sync.
+///
+/// `#[non_exhaustive]`: construct via [`Config::from_toml_str`] /
+/// [`Config::from_file`] (the canonical path) or [`Config::new`] +
+/// `with_*` builders — never a struct literal from another crate, so adding
+/// a field stays non-breaking.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[non_exhaustive]
 pub struct Config {
     /// Server invocation under test.
     pub server: ServerConfig,
@@ -46,6 +52,7 @@ pub struct Config {
 /// `inputSchema` and mismatches are classified via
 /// [`crate::protocol::schema::classify_schema_violation`].
 #[derive(Debug, Default, Clone, Serialize, Deserialize)]
+#[non_exhaustive]
 pub struct ValidationConfig {
     /// Enable strict MCP schema validation. Default `false`.
     #[serde(default)]
@@ -60,6 +67,7 @@ pub struct ValidationConfig {
 /// - `transport = "http"` or `"sse"` → `url` required; `command`/`args`/`env`
 ///   ignored.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[non_exhaustive]
 pub struct ServerConfig {
     /// Stdio: command to spawn (e.g., `"python"`). Required when `transport = "stdio"`.
     #[serde(default)]
@@ -129,6 +137,7 @@ pub fn split_server_command(s: &str) -> Result<(String, Vec<String>), ConfigErro
 
 /// Workload block.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[non_exhaustive]
 pub struct ScenarioConfig {
     /// Scenario name: `"sustained"`, `"deadlock_probe"`, etc.
     #[serde(rename = "type")]
@@ -138,8 +147,19 @@ pub struct ScenarioConfig {
     pub params: serde_json::Value,
 }
 
+impl ScenarioConfig {
+    /// Build a scenario block from a kind name + free-form params object.
+    pub fn new(kind: impl Into<String>, params: serde_json::Value) -> Self {
+        Self {
+            kind: kind.into(),
+            params,
+        }
+    }
+}
+
 /// Threshold block. All fields optional — missing means "no constraint".
 #[derive(Debug, Default, Clone, Serialize, Deserialize)]
+#[non_exhaustive]
 pub struct ThresholdsConfig {
     #[serde(default, with = "humantime_serde::option")]
     /// p50 latency budget.
@@ -172,6 +192,7 @@ pub struct ThresholdsConfig {
 
 /// Output block.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[non_exhaustive]
 pub struct OutputConfig {
     /// Where per-run dirs are created.
     #[serde(default = "default_report_dir")]
@@ -181,6 +202,16 @@ pub struct OutputConfig {
     pub formats: Vec<String>,
 }
 
+impl OutputConfig {
+    /// Build an output block from an explicit run dir + format list.
+    pub fn new(report_dir: PathBuf, formats: Vec<String>) -> Self {
+        Self {
+            report_dir,
+            formats,
+        }
+    }
+}
+
 impl Default for OutputConfig {
     fn default() -> Self {
         Self {
@@ -199,6 +230,7 @@ fn default_formats() -> Vec<String> {
 
 /// Errors returned by `Config::from_*`.
 #[derive(Debug, thiserror::Error)]
+#[non_exhaustive]
 pub enum ConfigError {
     /// File I/O error.
     #[error("io: {0}")]
@@ -212,6 +244,41 @@ pub enum ConfigError {
 }
 
 impl Config {
+    /// Construct a `Config` programmatically (thresholds / output /
+    /// validation default). Pair with the `with_*` builders. The usual
+    /// path is [`Config::from_toml_str`] / [`Config::from_file`]; this is
+    /// for callers building a run without a TOML file.
+    pub fn new(server: ServerConfig, scenario: ScenarioConfig) -> Self {
+        Self {
+            server,
+            scenario,
+            thresholds: ThresholdsConfig::default(),
+            output: OutputConfig::default(),
+            validation: ValidationConfig::default(),
+        }
+    }
+
+    /// Builder: set the pass/fail thresholds block.
+    #[must_use]
+    pub fn with_thresholds(mut self, thresholds: ThresholdsConfig) -> Self {
+        self.thresholds = thresholds;
+        self
+    }
+
+    /// Builder: set the output (run dir / formats) block.
+    #[must_use]
+    pub fn with_output(mut self, output: OutputConfig) -> Self {
+        self.output = output;
+        self
+    }
+
+    /// Builder: set the protocol-validation block.
+    #[must_use]
+    pub fn with_validation(mut self, validation: ValidationConfig) -> Self {
+        self.validation = validation;
+        self
+    }
+
     /// Parse a TOML string into a `Config` and run semantic validation.
     ///
     /// Returns [`ConfigError::Toml`] on syntactic failures and
diff --git a/crates/mcp-loadtest/src/config/validate.rs b/crates/mcp-loadtest/src/config/validate.rs
index f54aaea..cca7136 100644
--- a/crates/mcp-loadtest/src/config/validate.rs
+++ b/crates/mcp-loadtest/src/config/validate.rs
@@ -61,30 +61,30 @@ pub(super) fn validate(cfg: &Config) -> Result<(), ConfigError> {
     }
 
     // Thresholds: every Some(_) value must be sensible.
-    if let Some(rate) = cfg.thresholds.error_rate {
-        if !(0.0..=1.0).contains(&rate) {
-            return Err(ConfigError::Invalid(format!(
-                "thresholds.error_rate: must be in [0.0, 1.0], got {rate}"
-            )));
-        }
+    if let Some(rate) = cfg.thresholds.error_rate
+        && !(0.0..=1.0).contains(&rate)
+    {
+        return Err(ConfigError::Invalid(format!(
+            "thresholds.error_rate: must be in [0.0, 1.0], got {rate}"
+        )));
     }
-    if let Some(growth) = cfg.thresholds.memory_growth_mb {
-        if growth < 0.0 {
-            return Err(ConfigError::Invalid(format!(
-                "thresholds.memory_growth_mb: must be >= 0, got {growth}"
-            )));
-        }
+    if let Some(growth) = cfg.thresholds.memory_growth_mb
+        && growth < 0.0
+    {
+        return Err(ConfigError::Invalid(format!(
+            "thresholds.memory_growth_mb: must be >= 0, got {growth}"
+        )));
     }
     // Duration is unsigned (`std::time::Duration`) so it can't be negative;
     // we don't need to check for negative latencies. We do check for the
     // pathological `0` case on the per-call hang timeout which would make
     // every call instantly hang.
-    if let Some(d) = cfg.thresholds.hang_timeout {
-        if d.is_zero() {
-            return Err(ConfigError::Invalid(
-                "thresholds.hang_timeout: must be > 0".to_string(),
-            ));
-        }
+    if let Some(d) = cfg.thresholds.hang_timeout
+        && d.is_zero()
+    {
+        return Err(ConfigError::Invalid(
+            "thresholds.hang_timeout: must be > 0".to_string(),
+        ));
     }
 
     Ok(())
diff --git a/crates/mcp-loadtest/src/hang_detector.rs b/crates/mcp-loadtest/src/hang_detector.rs
index 3a92910..eca9b9d 100644
--- a/crates/mcp-loadtest/src/hang_detector.rs
+++ b/crates/mcp-loadtest/src/hang_detector.rs
@@ -12,6 +12,7 @@ use crate::protocol::mcp::CallToolResult;
 ///
 /// **Locked for M2.**
 #[derive(Debug)]
+#[non_exhaustive]
 pub enum HangOutcome {
     /// Call returned within `hang_threshold`.
     Ok {
diff --git a/crates/mcp-loadtest/src/lib.rs b/crates/mcp-loadtest/src/lib.rs
index 8e387a6..e761ed5 100644
--- a/crates/mcp-loadtest/src/lib.rs
+++ b/crates/mcp-loadtest/src/lib.rs
@@ -2,29 +2,56 @@
 //!
 //! Detects deadlocks, hangs, and perf regressions that unit tests miss.
 //!
-//! # Status
+//! Drives an MCP server under concurrent / ramp / soak / spike / pattern
+//! load and surfaces the failure classes unit tests miss — lazy-init
+//! **deadlocks**, non-determinism **races**, **hangs**, memory leaks, and
+//! latency / throughput **regressions** — resolving every run to a
+//! CI-gating pass/fail plus machine-readable reports. Strict MCP-schema
+//! validation is available opt-in (see [`config::ValidationConfig`]).
 //!
-//! v0.0.0 — scaffolding. Implementation in progress per [DESIGN.md][1].
+//! Most users drive it via the `mcp-loadtest` CLI (`mcp-loadtest run
+//! --config bench.toml`); the library API below is for embedding it in a
+//! Rust test/CI harness. See the [README] and [DESIGN] for the full story.
 //!
-//! [1]: https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/DESIGN.md
+//! [README]: https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/README.md
+//! [DESIGN]: https://github.com/Teerapat-Vatpitak/mcp-loadtest/blob/main/DESIGN.md
 //!
-//! # Quick example (target API for v0.1)
+//! # Library example
 //!
-//! ```ignore
+//! ```rust,no_run
+//! use std::path::PathBuf;
 //! use std::time::Duration;
-//! use mcp_loadtest::{Run, Scenario, Server};
-//! use serde_json::json;
+//!
+//! use mcp_loadtest::{Config, Run};
+//! use mcp_loadtest::scenario::sustained::Sustained;
 //!
 //! # async fn _example() -> Result<(), Box<dyn std::error::Error>> {
-//! let server = Server::stdio("python").args(["-m", "my_mcp"]).build();
-//! let scenario = Scenario::sustained()
-//!     .concurrent(20)
-//!     .duration(Duration::from_secs(30))
-//!     .tool("get_market_data")
-//!     .args(json!({ "ticker": "AAPL" }));
+//! // Parse a run config (server + thresholds + optional strict validation).
+//! let config = Config::from_toml_str(
+//!     r#"
+//!         [server]
+//!         command = "python"
+//!         args = ["-m", "my_mcp"]
+//!         [scenario]
+//!         type = "sustained"
+//!         tool = "echo"
+//!     "#,
+//! )?;
+//!
+//! // Drive a constant-load workload; the run is gated on the config's
+//! // `[thresholds]` and exits non-zero (here: `report.passed()` is false)
+//! // on any breach — drop it straight into CI.
+//! let scenario = Box::new(Sustained {
+//!     concurrent: 10,
+//!     duration: Duration::from_secs(10),
+//!     tool: "echo".to_string(),
+//!     args: Default::default(),
+//! });
 //!
-//! let report = Run::new(server, scenario).execute().await?;
-//! assert!(report.passed());
+//! let report = Run::new(config, scenario, PathBuf::from("./runs"))
+//!     .execute()
+//!     .await?;
+//! assert!(report.passed(), "thresholds breached");
 //! # Ok(())
 //! # }
 //! ```
@@ -56,6 +83,7 @@ pub mod tui;
 
 pub use config::{
     Config, ConfigError, OutputConfig, ScenarioConfig, ServerConfig, ThresholdsConfig,
+    ValidationConfig,
 };
 pub use hang_detector::{HangOutcome, hang_detect};
 pub use metrics::{
diff --git a/crates/mcp-loadtest/src/metrics/types.rs b/crates/mcp-loadtest/src/metrics/types.rs
index d951950..79a0680 100644
--- a/crates/mcp-loadtest/src/metrics/types.rs
+++ b/crates/mcp-loadtest/src/metrics/types.rs
@@ -15,6 +15,7 @@ use std::time::Duration;
 ///
 /// **Locked for M2.** New variants are non-breaking only if added at the end.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[non_exhaustive]
 pub enum CallOutcome {
     /// Call completed within `hang_threshold`.
     Success,
diff --git a/crates/mcp-loadtest/src/protocol/schema.rs b/crates/mcp-loadtest/src/protocol/schema.rs
index 3f5081d..2060bec 100644
--- a/crates/mcp-loadtest/src/protocol/schema.rs
+++ b/crates/mcp-loadtest/src/protocol/schema.rs
@@ -32,6 +32,7 @@ pub struct SchemaViolation {
 /// the tool-call **result** (what the server sent back). The policy can
 /// treat the two sides differently.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[non_exhaustive]
 pub enum ValidationSite {
     /// Arguments we are about to send in a `tools/call`.
     ToolCallArgs,
@@ -45,6 +46,7 @@ pub enum ValidationSite {
 /// count against the run (it maps to `CallOutcome::ProtocolError`), while
 /// [`SchemaPolicy::Warn`]/[`SchemaPolicy::Ignore`] keep the run green.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[non_exhaustive]
 pub enum SchemaPolicy {
     /// Mismatch is a protocol error — gate the run on it.
     Fail,
diff --git a/crates/mcp-loadtest/src/protocol/transport/mod.rs b/crates/mcp-loadtest/src/protocol/transport/mod.rs
index fe7a794..13d54a4 100644
--- a/crates/mcp-loadtest/src/protocol/transport/mod.rs
+++ b/crates/mcp-loadtest/src/protocol/transport/mod.rs
@@ -20,6 +20,7 @@ use thiserror::Error;
 ///
 /// **Locked for M4.** New variants are non-breaking only if added at the end.
 #[derive(Debug, Error)]
+#[non_exhaustive]
 pub enum TransportError {
     /// I/O error from the underlying pipe / socket.
     #[error("io: {0}")]
diff --git a/crates/mcp-loadtest/src/report/common.rs b/crates/mcp-loadtest/src/report/common.rs
index 6d8a77f..8c73272 100644
--- a/crates/mcp-loadtest/src/report/common.rs
+++ b/crates/mcp-loadtest/src/report/common.rs
@@ -48,7 +48,7 @@ pub(crate) fn fmt_count(n: u64) -> String {
     let mut out = String::with_capacity(s.len() + s.len() / 3);
     for (i, b) in bytes.iter().enumerate() {
         let from_right = bytes.len() - i;
-        if i > 0 && from_right % 3 == 0 {
+        if i > 0 && from_right.is_multiple_of(3) {
             out.push(',');
         }
         out.push(*b as char);
diff --git a/crates/mcp-loadtest/src/report/mod.rs b/crates/mcp-loadtest/src/report/mod.rs
index 94ab894..efc2709 100644
--- a/crates/mcp-loadtest/src/report/mod.rs
+++ b/crates/mcp-loadtest/src/report/mod.rs
@@ -141,6 +141,7 @@ pub struct ProcessSample {
 /// names match the original strings (`"p99_latency"`, `"error_rate"`, ...)
 /// so `metrics.json` consumers don't break.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[non_exhaustive]
 pub enum ThresholdKind {
     /// `thresholds.p50_latency` — 50th percentile latency budget.
     #[serde(rename = "p50_latency")]
@@ -197,6 +198,7 @@ pub struct ThresholdViolation {
 
 /// Errors a Reporter may surface during render.
 #[derive(Debug, Error)]
+#[non_exhaustive]
 pub enum ReportError {
     /// I/O writing the rendered output.
     #[error("io: {0}")]
diff --git a/crates/mcp-loadtest/src/run.rs b/crates/mcp-loadtest/src/run.rs
index eb5fb23..090f27d 100644
--- a/crates/mcp-loadtest/src/run.rs
+++ b/crates/mcp-loadtest/src/run.rs
@@ -44,6 +44,7 @@ pub struct Run {
 
 /// Errors that can fail an entire run.
 #[derive(Debug, Error)]
+#[non_exhaustive]
 pub enum RunError {
     /// Server-side failure during spawn / handshake.
     #[error("session: {0}")]
diff --git a/crates/mcp-loadtest/src/scenario/mod.rs b/crates/mcp-loadtest/src/scenario/mod.rs
index 7825f3c..c75c6e0 100644
--- a/crates/mcp-loadtest/src/scenario/mod.rs
+++ b/crates/mcp-loadtest/src/scenario/mod.rs
@@ -53,7 +53,9 @@ pub trait Scenario: Send + Sync {
 
 /// Per-run context shared between scenarios and the orchestrator.
 ///
-/// **Locked for M2.**
+/// **Locked for M2.** Construct via [`RunContext::new`] — `#[non_exhaustive]`
+/// so adding a field stays non-breaking for external scenario authors.
+#[non_exhaustive]
 pub struct RunContext {
     /// Time the run started; used to compute elapsed offsets in metrics.
     pub run_start: Instant,
@@ -68,6 +70,24 @@ pub struct RunContext {
 }
 
 impl RunContext {
+    /// Construct a run context. External scenario authors and tests use
+    /// this instead of a struct literal (the struct is `#[non_exhaustive]`).
+    pub fn new(
+        run_start: Instant,
+        cancel_token: CancellationToken,
+        metrics: Recorder,
+        hang_threshold: Duration,
+        grace_period: Duration,
+    ) -> Self {
+        Self {
+            run_start,
+            cancel_token,
+            metrics,
+            hang_threshold,
+            grace_period,
+        }
+    }
+
     /// Convenience: time elapsed since run start.
     pub fn elapsed(&self) -> Duration {
         self.run_start.elapsed()
diff --git a/crates/mcp-loadtest/src/session.rs b/crates/mcp-loadtest/src/session.rs
index dfaae83..b0c4c4f 100644
--- a/crates/mcp-loadtest/src/session.rs
+++ b/crates/mcp-loadtest/src/session.rs
@@ -34,6 +34,7 @@ const DEFAULT_STARTUP_TIMEOUT: Duration = Duration::from_secs(10);
 
 /// Errors returned by [`Session`] operations.
 #[derive(Debug, Error)]
+#[non_exhaustive]
 pub enum SessionError {
     /// I/O error from the underlying transport (pipe closed, write failed, etc.).
     #[error("io: {0}")]
diff --git a/crates/mcp-loadtest/tests/breaking_point.rs b/crates/mcp-loadtest/tests/breaking_point.rs
index 643c53f..767503f 100644
--- a/crates/mcp-loadtest/tests/breaking_point.rs
+++ b/crates/mcp-loadtest/tests/breaking_point.rs
@@ -22,13 +22,13 @@ use serde_json::json;
 use tokio_util::sync::CancellationToken;
 
 fn make_ctx() -> RunContext {
-    RunContext {
-        run_start: Instant::now(),
-        cancel_token: CancellationToken::new(),
-        metrics: Recorder::new(),
-        hang_threshold: Duration::from_secs(5),
-        grace_period: Duration::from_secs(10),
-    }
+    RunContext::new(
+        Instant::now(),
+        CancellationToken::new(),
+        Recorder::new(),
+        Duration::from_secs(5),
+        Duration::from_secs(10),
+    )
 }
 
 fn synthetic_metrics(p99_ms: u64, total: u64, successful: u64) -> ScenarioMetrics {
diff --git a/crates/mcp-loadtest/tests/coverage.rs b/crates/mcp-loadtest/tests/coverage.rs
index 5a1214d..5c48c32 100644
--- a/crates/mcp-loadtest/tests/coverage.rs
+++ b/crates/mcp-loadtest/tests/coverage.rs
@@ -21,13 +21,13 @@ use serde_json::json;
 use tokio_util::sync::CancellationToken;
 
 fn make_ctx() -> RunContext {
-    RunContext {
-        run_start: Instant::now(),
-        cancel_token: CancellationToken::new(),
-        metrics: Recorder::new(),
-        hang_threshold: Duration::from_secs(5),
-        grace_period: Duration::from_secs(10),
-    }
+    RunContext::new(
+        Instant::now(),
+        CancellationToken::new(),
+        Recorder::new(),
+        Duration::from_secs(5),
+        Duration::from_secs(10),
+    )
 }
 
 #[tokio::test]
diff --git a/crates/mcp-loadtest/tests/deadlock.rs b/crates/mcp-loadtest/tests/deadlock.rs
index 8733161..9a72d04 100644
--- a/crates/mcp-loadtest/tests/deadlock.rs
+++ b/crates/mcp-loadtest/tests/deadlock.rs
@@ -16,13 +16,13 @@ use serde_json::json;
 use tokio_util::sync::CancellationToken;
 
 fn make_ctx() -> RunContext {
-    RunContext {
-        run_start: Instant::now(),
-        cancel_token: CancellationToken::new(),
-        metrics: Recorder::new(),
-        hang_threshold: Duration::from_secs(2),
-        grace_period: Duration::from_secs(5),
-    }
+    RunContext::new(
+        Instant::now(),
+        CancellationToken::new(),
+        Recorder::new(),
+        Duration::from_secs(2),
+        Duration::from_secs(5),
+    )
 }
 
 #[tokio::test]
diff --git a/crates/mcp-loadtest/tests/fuzzer.rs b/crates/mcp-loadtest/tests/fuzzer.rs
index a8d26a9..0728aa1 100644
--- a/crates/mcp-loadtest/tests/fuzzer.rs
+++ b/crates/mcp-loadtest/tests/fuzzer.rs
@@ -18,13 +18,13 @@ use mcp_loadtest::scenario::{RunContext, Scenario};
 use tokio_util::sync::CancellationToken;
 
 fn make_ctx() -> RunContext {
-    RunContext {
-        run_start: Instant::now(),
-        cancel_token: CancellationToken::new(),
-        metrics: Recorder::new(),
-        hang_threshold: Duration::from_secs(2),
-        grace_period: Duration::from_secs(3),
-    }
+    RunContext::new(
+        Instant::now(),
+        CancellationToken::new(),
+        Recorder::new(),
+        Duration::from_secs(2),
+        Duration::from_secs(3),
+    )
 }
 
 #[test]
diff --git a/crates/mcp-loadtest/tests/patterns.rs b/crates/mcp-loadtest/tests/patterns.rs
index 6a72442..bc4b10f 100644
--- a/crates/mcp-loadtest/tests/patterns.rs
+++ b/crates/mcp-loadtest/tests/patterns.rs
@@ -24,13 +24,13 @@ use serde_json::json;
 use tokio_util::sync::CancellationToken;
 
 fn make_ctx() -> RunContext {
-    RunContext {
-        run_start: Instant::now(),
-        cancel_token: CancellationToken::new(),
-        metrics: Recorder::new(),
-        hang_threshold: Duration::from_secs(5),
-        grace_period: Duration::from_secs(10),
-    }
+    RunContext::new(
+        Instant::now(),
+        CancellationToken::new(),
+        Recorder::new(),
+        Duration::from_secs(5),
+        Duration::from_secs(10),
+    )
 }
 
 fn echo_step(payload: serde_json::Value) -> PatternStep {
diff --git a/crates/mcp-loadtest/tests/race_check.rs b/crates/mcp-loadtest/tests/race_check.rs
index 64a60c9..74c51d7 100644
--- a/crates/mcp-loadtest/tests/race_check.rs
+++ b/crates/mcp-loadtest/tests/race_check.rs
@@ -19,13 +19,13 @@ use serde_json::json;
 use tokio_util::sync::CancellationToken;
 
 fn make_ctx() -> RunContext {
-    RunContext {
-        run_start: Instant::now(),
-        cancel_token: CancellationToken::new(),
-        metrics: Recorder::new(),
-        hang_threshold: Duration::from_secs(5),
-        grace_period: Duration::from_secs(10),
-    }
+    RunContext::new(
+        Instant::now(),
+        CancellationToken::new(),
+        Recorder::new(),
+        Duration::from_secs(5),
+        Duration::from_secs(10),
+    )
 }
 
 #[tokio::test]
diff --git a/crates/mcp-loadtest/tests/ramp.rs b/crates/mcp-loadtest/tests/ramp.rs
index a627ac6..243788e 100644
--- a/crates/mcp-loadtest/tests/ramp.rs
+++ b/crates/mcp-loadtest/tests/ramp.rs
@@ -15,13 +15,13 @@ use serde_json::json;
 use tokio_util::sync::CancellationToken;
 
 fn make_ctx() -> RunContext {
-    RunContext {
-        run_start: Instant::now(),
-        cancel_token: CancellationToken::new(),
-        metrics: Recorder::new(),
-        hang_threshold: Duration::from_secs(2),
-        grace_period: Duration::from_secs(5),
-    }
+    RunContext::new(
+        Instant::now(),
+        CancellationToken::new(),
+        Recorder::new(),
+        Duration::from_secs(2),
+        Duration::from_secs(5),
+    )
 }
 
 #[tokio::test]
diff --git a/crates/mcp-loadtest/tests/scenarios_basic.rs b/crates/mcp-loadtest/tests/scenarios_basic.rs
index 292cd88..2b02fef 100644
--- a/crates/mcp-loadtest/tests/scenarios_basic.rs
+++ b/crates/mcp-loadtest/tests/scenarios_basic.rs
@@ -17,13 +17,13 @@ use serde_json::json;
 use tokio_util::sync::CancellationToken;
 
 fn make_ctx() -> RunContext {
-    RunContext {
-        run_start: Instant::now(),
-        cancel_token: CancellationToken::new(),
-        metrics: Recorder::new(),
-        hang_threshold: Duration::from_secs(5),
-        grace_period: Duration::from_secs(10),
-    }
+    RunContext::new(
+        Instant::now(),
+        CancellationToken::new(),
+        Recorder::new(),
+        Duration::from_secs(5),
+        Duration::from_secs(10),
+    )
 }
 
 #[tokio::test]
diff --git a/crates/mcp-loadtest/tests/soak.rs b/crates/mcp-loadtest/tests/soak.rs
index b40e780..9edb1a8 100644
--- a/crates/mcp-loadtest/tests/soak.rs
+++ b/crates/mcp-loadtest/tests/soak.rs
@@ -15,13 +15,13 @@ use serde_json::json;
 use tokio_util::sync::CancellationToken;
 
 fn make_ctx() -> RunContext {
-    RunContext {
-        run_start: Instant::now(),
-        cancel_token: CancellationToken::new(),
-        metrics: Recorder::new(),
-        hang_threshold: Duration::from_secs(5),
-        grace_period: Duration::from_secs(10),
-    }
+    RunContext::new(
+        Instant::now(),
+        CancellationToken::new(),
+        Recorder::new(),
+        Duration::from_secs(5),
+        Duration::from_secs(10),
+    )
 }
 
 #[tokio::test]
diff --git a/crates/mcp-loadtest/tests/spike.rs b/crates/mcp-loadtest/tests/spike.rs
index ed0a924..e543962 100644
--- a/crates/mcp-loadtest/tests/spike.rs
+++ b/crates/mcp-loadtest/tests/spike.rs
@@ -16,13 +16,13 @@ use serde_json::json;
 use tokio_util::sync::CancellationToken;
 
 fn make_ctx() -> RunContext {
-    RunContext {
-        run_start: Instant::now(),
-        cancel_token: CancellationToken::new(),
-        metrics: Recorder::new(),
-        hang_threshold: Duration::from_secs(2),
-        grace_period: Duration::from_secs(5),
-    }
+    RunContext::new(
+        Instant::now(),
+        CancellationToken::new(),
+        Recorder::new(),
+        Duration::from_secs(2),
+        Duration::from_secs(5),
+    )
 }
 
 #[tokio::test]
diff --git a/crates/mcp-loadtest/tests/strict_validation.rs b/crates/mcp-loadtest/tests/strict_validation.rs
index d0759d8..26a576f 100644
--- a/crates/mcp-loadtest/tests/strict_validation.rs
+++ b/crates/mcp-loadtest/tests/strict_validation.rs
@@ -20,13 +20,13 @@ use tokio_util::sync::CancellationToken;
 
 fn make_ctx() -> (RunContext, Recorder) {
     let metrics = Recorder::new();
-    let ctx = RunContext {
-        run_start: Instant::now(),
-        cancel_token: CancellationToken::new(),
-        metrics: metrics.clone(),
-        hang_threshold: Duration::from_secs(5),
-        grace_period: Duration::from_secs(10),
-    };
+    let ctx = RunContext::new(
+        Instant::now(),
+        CancellationToken::new(),
+        metrics.clone(),
+        Duration::from_secs(5),
+        Duration::from_secs(10),
+    );
     (ctx, metrics)
 }
 
diff --git a/crates/mcp-loadtest/tests/vibe_trading_regression.rs b/crates/mcp-loadtest/tests/vibe_trading_regression.rs
index 3bf7925..7c3a4e0 100644
--- a/crates/mcp-loadtest/tests/vibe_trading_regression.rs
+++ b/crates/mcp-loadtest/tests/vibe_trading_regression.rs
@@ -223,13 +223,13 @@ fn vibe_trading_python() -> String {
 }
 
 fn make_ctx() -> RunContext {
-    RunContext {
-        run_start: Instant::now(),
-        cancel_token: CancellationToken::new(),
-        metrics: Recorder::new(),
-        hang_threshold: Duration::from_secs(2),
-        grace_period: Duration::from_secs(5),
-    }
+    RunContext::new(
+        Instant::now(),
+        CancellationToken::new(),
+        Recorder::new(),
+        Duration::from_secs(2),
+        Duration::from_secs(5),
+    )
 }
 
 /// Drive `DeadlockProbe` against the *buggy* Vibe-Trading commit and assert