Skip to content

Commit 893f7aa

Browse files
test(mcp): catalog exposure integration tests and benchmarks (#913)
Add 6 integration tests validating the full catalog exposure pipeline (spec §12) and a benchmark script measuring description size and rendering latency at various tool counts (spec §13). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent ec12f2b commit 893f7aa

2 files changed

Lines changed: 538 additions & 0 deletions

File tree

mcp_server/bench/catalog_bench.exs

Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
# Catalog exposure benchmark: description size and tools/list latency.
2+
#
3+
# Measures rendered description size (characters) and rendering latency
4+
# at 10, 30, 50, 100, and 200 tools for both inline and lazy modes.
5+
# Outputs a markdown table suitable for pasting into a PR comment to
6+
# justify or revise the default threshold values.
7+
#
8+
# Spec: `Plans/ptc-runner-mcp-catalog-exposure.md` §13.
9+
#
10+
# Usage (from repo root):
11+
#
12+
# mix run mcp_server/bench/catalog_bench.exs
13+
# mix run mcp_server/bench/catalog_bench.exs --runs=100
14+
# mix run mcp_server/bench/catalog_bench.exs --out=catalog_bench_report.md
15+
16+
# Parse command-line switches. `invalid` collects unknown switches and values
# that failed type conversion (for example `--runs=abc`).
{opts, _args, invalid} =
  OptionParser.parse(System.argv(),
    strict: [runs: :integer, out: :string, help: :boolean],
    aliases: [h: :help, n: :runs, o: :out]
  )

if Keyword.get(opts, :help, false) do
  IO.puts("""
  Usage:
    mix run mcp_server/bench/catalog_bench.exs [options]

  Options:
    --runs=N     Number of iterations per measurement (default: 50)
    --out=PATH   Write markdown report to file
    -h, --help   Show this help
  """)

  System.halt(0)
end

# Fail loudly on typos and malformed values instead of silently falling back to
# the defaults, which would produce a report labelled with the wrong settings.
if invalid != [] do
  rejected = Enum.map_join(invalid, ", ", fn {switch, _value} -> switch end)
  IO.puts(:stderr, "Invalid option(s): #{rejected}. Run with --help for usage.")
  System.halt(1)
end

runs = Keyword.get(opts, :runs, 50)

# A non-positive run count would make `1..runs` a descending range, so the
# benchmark would still execute while the header reports a misleading count.
if runs < 1 do
  IO.puts(:stderr, "--runs must be a positive integer, got: #{runs}")
  System.halt(1)
end

# Optional path for the markdown report; nil means print to stdout only.
out_path = Keyword.get(opts, :out)
38+
39+
alias PtcRunnerMcp.{CatalogConfig, CatalogDescription}
40+
41+
defmodule CatalogBench.Helpers do
  @moduledoc false

  # Fixture builders and a small timing harness used by the catalog benchmark.

  @doc """
  Builds `servers` synthetic upstream entries holding `tool_count` tools in total.

  Tools are distributed as evenly as possible: every server gets
  `div(tool_count, servers)` tools and the first `rem(tool_count, servers)`
  servers get one extra. A server whose share is zero gets an empty tool list.

  Returns a list of maps with `:name`, `:tools`, `:impl`, and `:metadata` keys.
  """
  def make_entries(tool_count, servers \\ 3)
      when is_integer(tool_count) and tool_count >= 0 and is_integer(servers) and servers > 0 do
    per_server = div(tool_count, servers)
    remainder = rem(tool_count, servers)

    for i <- 1..servers do
      # The first `remainder` servers absorb the tools left over by integer division.
      share = if i <= remainder, do: per_server + 1, else: per_server
      server_name = "server_#{i}"

      %{
        name: server_name,
        tools: make_tools(share, server_name),
        impl: PtcRunnerMcp.Upstream.Stdio,
        metadata: %{
          description: "Test server #{i} for benchmarking catalog rendering",
          capabilities: ["capability_a", "capability_b"]
        }
      }
    end
  end

  # Builds `count` synthetic tool definitions named after `server_name`.
  defp make_tools(count, server_name) do
    # The explicit `//1` step makes the range empty when `count` is 0. A bare
    # `1..0` counts down and would produce two bogus tools.
    for i <- 1..count//1 do
      %{
        name: "#{server_name}_tool_#{i}",
        description:
          "Performs operation #{i} on #{server_name} resources. " <>
            "Accepts standard parameters and returns structured results.",
        input_schema: %{
          "type" => "object",
          "properties" => %{
            "query" => %{"type" => "string", "description" => "Search query"},
            "limit" => %{"type" => "integer", "description" => "Max results"}
          },
          "required" => ["query"]
        }
      }
    end
  end

  @doc """
  Calls the zero-arity `fun` `runs` times and summarises the elapsed time.

  Returns `%{median_us: _, p99_us: _, mean_us: _}` with all values in whole
  microseconds as reported by `:timer.tc/1`. The median is the upper middle
  element for an even number of runs. `runs` must be a positive integer.
  """
  def measure_us(fun, runs) when is_function(fun, 0) and is_integer(runs) and runs > 0 do
    sorted =
      1..runs
      |> Enum.map(fn _ ->
        {elapsed_us, _result} = :timer.tc(fun)
        elapsed_us
      end)
      |> Enum.sort()

    %{
      median_us: Enum.at(sorted, div(runs, 2)),
      # Integer arithmetic avoids float rounding pushing the index one slot low.
      p99_us: Enum.at(sorted, div(runs * 99, 100)),
      mean_us: div(Enum.sum(sorted), runs)
    }
  end
end
97+
98+
alias CatalogBench.Helpers
99+
100+
# Tool counts to benchmark; each one becomes a row in every report table.
tool_counts = [10, 30, 50, 100, 200]

# Default configuration with the catalog mode forced, so both renderings can
# be compared for the same entries.
inline_config = CatalogConfig.defaults() |> Map.put(:catalog_mode, :inline)
lazy_config = CatalogConfig.defaults() |> Map.put(:catalog_mode, :lazy)

IO.puts("\nCatalog Exposure Benchmark (#{runs} runs per measurement)")
IO.puts(String.duplicate("=", 60))

results =
  for count <- tool_counts do
    entries = Helpers.make_entries(count)

    render = fn config -> CatalogDescription.render_for_entries(entries, config) end
    # A falsy render result is reported as zero characters.
    char_count = fn text -> if text, do: String.length(text), else: 0 end

    inline_chars = char_count.(render.(inline_config))
    lazy_chars = char_count.(render.(lazy_config))

    inline_timing = Helpers.measure_us(fn -> render.(inline_config) end, runs)
    lazy_timing = Helpers.measure_us(fn -> render.(lazy_config) end, runs)

    # Render with unmodified defaults and classify the output as lazy when it
    # mentions the search tool, inline otherwise.
    auto_text = render.(CatalogConfig.defaults())

    auto_mode =
      cond do
        auto_text && String.contains?(auto_text, "catalog/search-tools") -> "lazy"
        true -> "inline"
      end

    %{
      tools: count,
      inline_chars: inline_chars,
      lazy_chars: lazy_chars,
      inline_median_us: inline_timing.median_us,
      inline_p99_us: inline_timing.p99_us,
      lazy_median_us: lazy_timing.median_us,
      lazy_p99_us: lazy_timing.p99_us,
      auto_mode: auto_mode
    }
  end
149+
150+
# Threshold values quoted in the report. Kept in one place so the per-row
# analysis and the "Default thresholds" sentence cannot drift apart.
# NOTE(review): these presumably mirror the values in `CatalogConfig.defaults()`;
# confirm and read them from there if the keys are available.
max_tools = 40
max_chars = 12_000

# One markdown table row per tool count: rendered size and the auto-selected mode.
size_rows =
  Enum.map_join(results, "\n", fn r ->
    "| #{r.tools} | #{r.inline_chars} | #{r.lazy_chars} | #{r.auto_mode} |"
  end)

# One markdown table row per tool count: median and p99 latency for both modes.
latency_rows =
  Enum.map_join(results, "\n", fn r ->
    "| #{r.tools} | #{r.inline_median_us} | #{r.inline_p99_us} | " <>
      "#{r.lazy_median_us} | #{r.lazy_p99_us} |"
  end)

# One bullet per tool count. The tool-count threshold is checked first, so a
# row over both limits is reported as over the tool threshold.
threshold_rows =
  Enum.map_join(results, "\n", fn r ->
    threshold_note =
      cond do
        r.tools > max_tools -> "Over tool threshold (#{r.tools} > #{max_tools})"
        r.inline_chars > max_chars -> "Over char threshold (#{r.inline_chars} > #{max_chars})"
        true -> "Under both thresholds"
      end

    "- **#{r.tools} tools**: #{threshold_note}. Auto selects **#{r.auto_mode}**. " <>
      "Inline: #{r.inline_chars} chars, lazy: #{r.lazy_chars} chars."
  end)

report =
  """
  ## Catalog Description Size

  | Tools | Inline (chars) | Lazy (chars) | Auto mode |
  |------:|---------------:|-------------:|-----------|
  #{size_rows}

  ## Rendering Latency

  | Tools | Inline median (µs) | Inline p99 (µs) | Lazy median (µs) | Lazy p99 (µs) |
  |------:|--------------------:|----------------:|------------------:|---------------:|
  #{latency_rows}

  ## Threshold Analysis

  Default thresholds: `catalog_inline_max_chars=#{max_chars}`, `catalog_inline_max_tools=#{max_tools}`.

  #{threshold_rows}
  """
  |> String.trim_leading()

IO.puts(report)

# Also persist the report when --out was given.
if out_path do
  File.write!(out_path, report)
  IO.puts("Report written to #{out_path}")
end

0 commit comments

Comments
 (0)