# Catalog exposure benchmark: description size and tools/list latency.
#
# Measures rendered description size (characters) and rendering latency
# at 10, 30, 50, 100, and 200 tools for both inline and lazy modes.
# Outputs a markdown report (two tables plus a threshold analysis) suitable
# for pasting into a PR comment to justify or revise the default threshold
# values.
#
# Spec: `Plans/ptc-runner-mcp-catalog-exposure.md` §13.
#
# Usage (from repo root):
#
#     mix run mcp_server/bench/catalog_bench.exs
#     mix run mcp_server/bench/catalog_bench.exs --runs=100
#     mix run mcp_server/bench/catalog_bench.exs --out=catalog_bench_report.md

# Parse CLI flags. With `strict:`, OptionParser puts unknown switches and
# values that fail type-casting (e.g. `--runs=abc`) into the third tuple
# element; we keep it so bad input is reported instead of silently ignored.
{opts, _args, invalid} =
  OptionParser.parse(System.argv(),
    strict: [runs: :integer, out: :string, help: :boolean],
    aliases: [h: :help, n: :runs, o: :out]
  )

if Keyword.get(opts, :help, false) do
  IO.puts("""
  Usage:
    mix run mcp_server/bench/catalog_bench.exs [options]

  Options:
    --runs=N     Number of iterations per measurement (default: 50)
    --out=PATH   Write markdown report to file
    -h, --help   Show this help
  """)

  System.halt(0)
end

# A benchmark that quietly falls back to defaults produces misleading
# numbers, so rejected flags abort the run with a non-zero exit status.
if invalid != [] do
  rejected = Enum.map_join(invalid, ", ", fn {switch, _value} -> switch end)
  IO.puts(:stderr, "Invalid option(s): #{rejected}. Run with --help for usage.")
  System.halt(1)
end

# Iterations per measurement; at least one sample is needed for the
# median/p99/mean statistics to be defined.
runs = Keyword.get(opts, :runs, 50)

if runs < 1 do
  IO.puts(:stderr, "--runs must be a positive integer, got: #{runs}")
  System.halt(1)
end

# Optional path for the markdown report; `nil` means stdout only.
out_path = Keyword.get(opts, :out)

alias PtcRunnerMcp.{CatalogConfig, CatalogDescription}

defmodule CatalogBench.Helpers do
  @moduledoc false

  @doc """
  Builds `servers` synthetic upstream entries holding `tool_count` tools in total.

  Tools are spread as evenly as possible: the first `rem(tool_count, servers)`
  servers receive one extra tool. A server whose share is zero gets an empty
  tool list.

  Each entry is a map with `:name`, `:tools`, `:impl` and `:metadata` keys.
  """
  @spec make_entries(non_neg_integer(), pos_integer()) :: [map()]
  def make_entries(tool_count, servers \\ 3)
      when is_integer(tool_count) and tool_count >= 0 and is_integer(servers) and servers > 0 do
    per_server = div(tool_count, servers)
    remainder = rem(tool_count, servers)

    Enum.map(1..servers, fn i ->
      # The first `remainder` servers absorb the tools left over by the division.
      count = if i <= remainder, do: per_server + 1, else: per_server
      server_name = "server_#{i}"

      %{
        name: server_name,
        tools: make_tools(count, server_name),
        impl: PtcRunnerMcp.Upstream.Stdio,
        metadata: %{
          description: "Test server #{i} for benchmarking catalog rendering",
          capabilities: ["capability_a", "capability_b"]
        }
      }
    end)
  end

  # Builds `count` synthetic tool definitions named after `server_name`.
  #
  # `1..0` is a descending range in Elixir (it yields 1 and 0), so a zero
  # count gets its own clause instead of being mapped over `1..count`.
  defp make_tools(count, _server_name) when count <= 0, do: []

  defp make_tools(count, server_name) do
    Enum.map(1..count, fn i ->
      %{
        name: "#{server_name}_tool_#{i}",
        description:
          "Performs operation #{i} on #{server_name} resources. " <>
            "Accepts standard parameters and returns structured results.",
        input_schema: %{
          "type" => "object",
          "properties" => %{
            "query" => %{"type" => "string", "description" => "Search query"},
            "limit" => %{"type" => "integer", "description" => "Max results"}
          },
          "required" => ["query"]
        }
      }
    end)
  end

  @doc """
  Calls the zero-arity `fun` `runs` times and summarises the wall-clock time
  of each call, measured with `:timer.tc/1` in microseconds.

  Returns a map with:

    * `:median_us` - the sample at index `div(n, 2)` of the sorted timings
      (the upper middle value when `n` is even)
    * `:p99_us` - the sample at index `trunc(n * 0.99)` of the sorted timings
    * `:mean_us` - the integer (truncated) mean

  Raises `ArgumentError` unless `runs` is a positive integer, because the
  statistics are undefined without at least one sample.
  """
  @spec measure_us((() -> any()), pos_integer()) :: %{
          median_us: non_neg_integer(),
          p99_us: non_neg_integer(),
          mean_us: non_neg_integer()
        }
  def measure_us(fun, runs) when is_integer(runs) and runs > 0 do
    times =
      Enum.map(1..runs, fn _ ->
        {time_us, _result} = :timer.tc(fun)
        time_us
      end)

    sorted = Enum.sort(times)
    sample_count = length(sorted)

    %{
      median_us: Enum.at(sorted, div(sample_count, 2)),
      # trunc(n * 0.99) is always < n for n >= 1, so the index stays in bounds.
      p99_us: Enum.at(sorted, trunc(sample_count * 0.99)),
      mean_us: div(Enum.sum(sorted), sample_count)
    }
  end

  def measure_us(_fun, runs) do
    raise ArgumentError, "runs must be a positive integer, got: #{inspect(runs)}"
  end
end

alias CatalogBench.Helpers

# Catalog sizes (total tool count) to benchmark.
tool_counts = [10, 30, 50, 100, 200]

# Thresholds quoted in the report. Bound once so the per-row notes and the
# summary line cannot drift apart.
# NOTE(review): these are the literal values this script reports as defaults;
# confirm they still match what `CatalogConfig.defaults()` returns.
max_inline_tools = 40
max_inline_chars = 12_000

# The defaults are loop-invariant, so fetch them once. Forcing `:catalog_mode`
# gives the inline and lazy variants; the untouched defaults are used to see
# which mode is picked automatically.
auto_config = CatalogConfig.defaults()
inline_config = Map.put(auto_config, :catalog_mode, :inline)
lazy_config = Map.put(auto_config, :catalog_mode, :lazy)

# A render may yield a falsy value (no description); count that as 0 chars.
char_count = fn text -> if text, do: String.length(text), else: 0 end

IO.puts("\nCatalog Exposure Benchmark (#{runs} runs per measurement)")
IO.puts(String.duplicate("=", 60))

results =
  Enum.map(tool_counts, fn count ->
    entries = Helpers.make_entries(count)

    # Rendered once up front for the size columns, before any timing starts.
    inline_chars = char_count.(CatalogDescription.render_for_entries(entries, inline_config))
    lazy_chars = char_count.(CatalogDescription.render_for_entries(entries, lazy_config))

    inline_timing =
      Helpers.measure_us(
        fn -> CatalogDescription.render_for_entries(entries, inline_config) end,
        runs
      )

    lazy_timing =
      Helpers.measure_us(
        fn -> CatalogDescription.render_for_entries(entries, lazy_config) end,
        runs
      )

    auto_text = CatalogDescription.render_for_entries(entries, auto_config)

    # The auto-selected mode is inferred from the rendered text: output that
    # mentions "catalog/search-tools" is treated as lazy, anything else as inline.
    auto_mode =
      if auto_text && String.contains?(auto_text, "catalog/search-tools"),
        do: "lazy",
        else: "inline"

    %{
      tools: count,
      inline_chars: inline_chars,
      lazy_chars: lazy_chars,
      inline_median_us: inline_timing.median_us,
      inline_p99_us: inline_timing.p99_us,
      lazy_median_us: lazy_timing.median_us,
      lazy_p99_us: lazy_timing.p99_us,
      auto_mode: auto_mode
    }
  end)

size_rows =
  Enum.map_join(results, "\n", fn r ->
    "| #{r.tools} | #{r.inline_chars} | #{r.lazy_chars} | #{r.auto_mode} |"
  end)

latency_rows =
  Enum.map_join(results, "\n", fn r ->
    "| #{r.tools} | #{r.inline_median_us} | #{r.inline_p99_us} | #{r.lazy_median_us} | #{r.lazy_p99_us} |"
  end)

threshold_rows =
  Enum.map_join(results, "\n", fn r ->
    # The tool-count check comes first, so a row over both limits is reported
    # as over the tool threshold.
    threshold_note =
      cond do
        r.tools > max_inline_tools ->
          "Over tool threshold (#{r.tools} > #{max_inline_tools})"

        r.inline_chars > max_inline_chars ->
          "Over char threshold (#{r.inline_chars} > #{max_inline_chars})"

        true ->
          "Under both thresholds"
      end

    "- **#{r.tools} tools**: #{threshold_note}. Auto selects **#{r.auto_mode}**. " <>
      "Inline: #{r.inline_chars} chars, lazy: #{r.lazy_chars} chars."
  end)

report =
  """
  ## Catalog Description Size

  | Tools | Inline (chars) | Lazy (chars) | Auto mode |
  |------:|---------------:|-------------:|-----------|
  #{size_rows}

  ## Rendering Latency

  | Tools | Inline median (µs) | Inline p99 (µs) | Lazy median (µs) | Lazy p99 (µs) |
  |------:|--------------------:|----------------:|------------------:|---------------:|
  #{latency_rows}

  ## Threshold Analysis

  Default thresholds: `catalog_inline_max_chars=#{max_inline_chars}`, `catalog_inline_max_tools=#{max_inline_tools}`.

  #{threshold_rows}
  """
  |> String.trim_leading()

IO.puts(report)

# The report is always printed; it is also written to disk when --out is given.
if out_path do
  File.write!(out_path, report)
  IO.puts("Report written to #{out_path}")
end