|
| 1 | +//////////////////////////////////////////////////////////////////////////////// |
| 2 | +// |
| 3 | +// MIT License |
| 4 | +// |
| 5 | +// Copyright (c) 2024 - 2026 Advanced Micro Devices, Inc. |
| 6 | +// |
| 7 | +// Permission is hereby granted, free of charge, to any person obtaining a copy |
| 8 | +// of this software and associated documentation files (the "Software"), to deal |
| 9 | +// in the Software without restriction, including without limitation the rights |
| 10 | +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 11 | +// copies of the Software, and to permit persons to whom the Software is |
| 12 | +// furnished to do so, subject to the following conditions: |
| 13 | +// |
| 14 | +// The above copyright notice and this permission notice shall be included in |
| 15 | +// all copies or substantial portions of the Software. |
| 16 | +// |
| 17 | +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 18 | +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 19 | +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 20 | +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 21 | +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 22 | +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 23 | +// SOFTWARE. |
| 24 | +// |
| 25 | +//////////////////////////////////////////////////////////////////////////////// |
| 26 | +// |
| 27 | +// Framework benchmarks measure the OpenVX graph runtime itself rather than |
| 28 | +// individual kernel throughput. The single most important framework metric is |
| 29 | +// the "graph dividend": for an N-stage processing chain, how much faster is |
| 30 | +// the graph form (one verified DAG, intermediates managed by the runtime) than |
| 31 | +// the equivalent N back-to-back immediate-mode (vxu*) calls? |
| 32 | +// |
| 33 | +// This file implements graph_dividend across two chain shapes: |
| 34 | +// - GraphDividend_Box3x3_x4: 4 chained Box3x3 nodes. Same kernel everywhere |
| 35 | +// so the headline number isolates framework orchestration overhead from |
| 36 | +// per-kernel work mix. |
| 37 | +// - GraphDividend_MixedFilters: Gaussian3x3 -> Box3x3 -> Median3x3 -> |
| 38 | +// Erode3x3. Realistic mix of filter types; the dividend here reflects what |
| 39 | +// a real pipeline would see. |
| 40 | +// |
| 41 | +// For each chain we report: |
| 42 | +// sum_immediate_ms - sum of N standalone vxu* calls per iteration |
| 43 | +// graph_real_ms - one graph with real (non-virtual) intermediates |
| 44 | +// graph_virtual_ms - one graph with virtual intermediates (the form most |
| 45 | +// OpenVX implementations can optimize aggressively) |
| 46 | +// graph_speedup - sum_immediate_ms / graph_virtual_ms (>1 = graph wins) |
| 47 | +// virtual_dividend - graph_real_ms / graph_virtual_ms (>1 = virtual |
| 48 | +// intermediates help) |
| 49 | +// |
| 50 | +//////////////////////////////////////////////////////////////////////////////// |
| 51 | + |
| 52 | +#include "benchmark_runner.h" |
| 53 | +#include "benchmark_stats.h" |
| 54 | +#include "benchmark_timer.h" |
| 55 | +#include "resource_tracker.h" |
| 56 | +#include "test_data_generator.h" |
| 57 | +#include <VX/vx.h> |
| 58 | +#include <VX/vxu.h> |
| 59 | +#include <functional> |
| 60 | +#include <string> |
| 61 | +#include <vector> |
| 62 | + |
| 63 | +namespace { |
| 64 | + |
| 65 | +// A single stage of a U8->U8 filter chain. Both forms (graph node + immediate |
| 66 | +// function) take exactly one input image and one output image so the chain is |
| 67 | +// trivially composable. |
| 68 | +struct ChainStage { |
| 69 | + std::string kernel_name; |
| 70 | + std::function<vx_node(vx_graph, vx_image, vx_image)> make_node; |
| 71 | + std::function<vx_status(vx_context, vx_image, vx_image)> immediate; |
| 72 | +}; |
| 73 | + |
| 74 | +// Time the chain executed as N back-to-back vxu* immediate-mode calls. |
| 75 | +// Returns the median wall-clock time (ns) for one full chain pass. |
| 76 | +double timeImmediateChain(vx_context ctx, uint32_t width, uint32_t height, |
| 77 | + const std::vector<ChainStage>& stages, |
| 78 | + int warmup, int iterations, |
| 79 | + TestDataGenerator& gen) { |
| 80 | + ResourceTracker tracker; |
| 81 | + |
| 82 | + vx_image input = gen.createFilledImage(ctx, width, height, VX_DF_IMAGE_U8); |
| 83 | + if (vxGetStatus((vx_reference)input) != VX_SUCCESS) return 0.0; |
| 84 | + tracker.trackImage(input); |
| 85 | + |
| 86 | + // Reusable intermediates, one per stage boundary, plus the final output. |
| 87 | + std::vector<vx_image> buffers; |
| 88 | + buffers.reserve(stages.size()); |
| 89 | + for (size_t i = 0; i < stages.size(); i++) { |
| 90 | + vx_image buf = vxCreateImage(ctx, width, height, VX_DF_IMAGE_U8); |
| 91 | + if (vxGetStatus((vx_reference)buf) != VX_SUCCESS) return 0.0; |
| 92 | + tracker.trackImage(buf); |
| 93 | + buffers.push_back(buf); |
| 94 | + } |
| 95 | + |
| 96 | + auto runOnce = [&]() -> vx_status { |
| 97 | + vx_image src = input; |
| 98 | + for (size_t i = 0; i < stages.size(); i++) { |
| 99 | + vx_status s = stages[i].immediate(ctx, src, buffers[i]); |
| 100 | + if (s != VX_SUCCESS) return s; |
| 101 | + src = buffers[i]; |
| 102 | + } |
| 103 | + return VX_SUCCESS; |
| 104 | + }; |
| 105 | + |
| 106 | + for (int i = 0; i < warmup; i++) runOnce(); |
| 107 | + |
| 108 | + std::vector<double> samples; |
| 109 | + samples.reserve(iterations); |
| 110 | + BenchmarkTimer timer; |
| 111 | + for (int i = 0; i < iterations; i++) { |
| 112 | + timer.start(); |
| 113 | + if (runOnce() != VX_SUCCESS) return 0.0; |
| 114 | + timer.stop(); |
| 115 | + samples.push_back(timer.elapsed_ns()); |
| 116 | + } |
| 117 | + |
| 118 | + return BenchmarkStats::compute(samples).median_ns; |
| 119 | +} |
| 120 | + |
| 121 | +// Time the chain executed as one verified graph. When use_virtual=true the |
| 122 | +// intermediates are vxCreateVirtualImage so the runtime is free to fuse, |
| 123 | +// alias, or tile them; when false they are real vxCreateImage objects with |
| 124 | +// host-visible storage. |
| 125 | +// |
| 126 | +// Returns the median wall-clock time (ns) of one vxProcessGraph call. |
| 127 | +double timeGraphChain(vx_context ctx, uint32_t width, uint32_t height, |
| 128 | + const std::vector<ChainStage>& stages, |
| 129 | + bool use_virtual, |
| 130 | + int warmup, int iterations, |
| 131 | + TestDataGenerator& gen) { |
| 132 | + ResourceTracker tracker; |
| 133 | + |
| 134 | + vx_graph graph = vxCreateGraph(ctx); |
| 135 | + if (vxGetStatus((vx_reference)graph) != VX_SUCCESS) return 0.0; |
| 136 | + tracker.trackGraph(graph); |
| 137 | + |
| 138 | + vx_image input = gen.createFilledImage(ctx, width, height, VX_DF_IMAGE_U8); |
| 139 | + if (vxGetStatus((vx_reference)input) != VX_SUCCESS) return 0.0; |
| 140 | + tracker.trackImage(input); |
| 141 | + |
| 142 | + // Final output is always real so the runtime has somewhere observable to |
| 143 | + // write to (otherwise dead-code elimination could in principle drop the |
| 144 | + // whole chain). |
| 145 | + vx_image output = vxCreateImage(ctx, width, height, VX_DF_IMAGE_U8); |
| 146 | + if (vxGetStatus((vx_reference)output) != VX_SUCCESS) return 0.0; |
| 147 | + tracker.trackImage(output); |
| 148 | + |
| 149 | + vx_image src = input; |
| 150 | + for (size_t i = 0; i < stages.size(); i++) { |
| 151 | + bool is_last = (i + 1 == stages.size()); |
| 152 | + vx_image dst; |
| 153 | + if (is_last) { |
| 154 | + dst = output; |
| 155 | + } else if (use_virtual) { |
| 156 | + dst = vxCreateVirtualImage(graph, width, height, VX_DF_IMAGE_U8); |
| 157 | + } else { |
| 158 | + dst = vxCreateImage(ctx, width, height, VX_DF_IMAGE_U8); |
| 159 | + } |
| 160 | + if (vxGetStatus((vx_reference)dst) != VX_SUCCESS) return 0.0; |
| 161 | + if (!is_last) tracker.trackImage(dst); |
| 162 | + |
| 163 | + vx_node node = stages[i].make_node(graph, src, dst); |
| 164 | + if (vxGetStatus((vx_reference)node) != VX_SUCCESS) return 0.0; |
| 165 | + tracker.trackNode(node); |
| 166 | + |
| 167 | + src = dst; |
| 168 | + } |
| 169 | + |
| 170 | + if (vxVerifyGraph(graph) != VX_SUCCESS) return 0.0; |
| 171 | + |
| 172 | + for (int i = 0; i < warmup; i++) vxProcessGraph(graph); |
| 173 | + |
| 174 | + std::vector<double> samples; |
| 175 | + samples.reserve(iterations); |
| 176 | + BenchmarkTimer timer; |
| 177 | + for (int i = 0; i < iterations; i++) { |
| 178 | + timer.start(); |
| 179 | + if (vxProcessGraph(graph) != VX_SUCCESS) return 0.0; |
| 180 | + timer.stop(); |
| 181 | + samples.push_back(timer.elapsed_ns()); |
| 182 | + } |
| 183 | + |
| 184 | + return BenchmarkStats::compute(samples).median_ns; |
| 185 | +} |
| 186 | + |
| 187 | +// Run all three timing modes for a chain and return a populated |
| 188 | +// BenchmarkResult with framework_metrics filled. The runner backfills name / |
| 189 | +// category / feature_set / resolution after this returns. |
| 190 | +BenchmarkResult runGraphDividend(const std::vector<ChainStage>& stages, |
| 191 | + vx_context ctx, const Resolution& res, |
| 192 | + const BenchmarkConfig& cfg) { |
| 193 | + BenchmarkResult r; |
| 194 | + r.iterations = cfg.iterations; |
| 195 | + r.warmup = cfg.warmup; |
| 196 | + |
| 197 | + TestDataGenerator gen(cfg.seed); |
| 198 | + |
| 199 | + double t_imm = timeImmediateChain(ctx, res.width, res.height, stages, |
| 200 | + cfg.warmup, cfg.iterations, gen); |
| 201 | + double t_real = timeGraphChain(ctx, res.width, res.height, stages, |
| 202 | + /*use_virtual=*/false, |
| 203 | + cfg.warmup, cfg.iterations, gen); |
| 204 | + double t_virt = timeGraphChain(ctx, res.width, res.height, stages, |
| 205 | + /*use_virtual=*/true, |
| 206 | + cfg.warmup, cfg.iterations, gen); |
| 207 | + |
| 208 | + if (t_imm <= 0.0 || t_real <= 0.0 || t_virt <= 0.0) { |
| 209 | + r.supported = false; |
| 210 | + r.skip_reason = "chain timing failed (resource creation or graph verify error)"; |
| 211 | + return r; |
| 212 | + } |
| 213 | + |
| 214 | + double speedup = t_imm / t_virt; |
| 215 | + double virt_div = t_real / t_virt; |
| 216 | + |
| 217 | + r.framework_metrics = { |
| 218 | + {"sum_immediate_ms", t_imm / 1e6, "ms", false}, |
| 219 | + {"graph_real_ms", t_real / 1e6, "ms", false}, |
| 220 | + {"graph_virtual_ms", t_virt / 1e6, "ms", false}, |
| 221 | + {"graph_speedup", speedup, "x", true}, |
| 222 | + {"virtual_dividend", virt_div, "x", true}, |
| 223 | + }; |
| 224 | + |
| 225 | + // Surface the best graph form as the canonical wall-clock / MP/s so the |
| 226 | + // result aggregates sensibly in scaling and top-N views without polluting |
| 227 | + // the existing per-feature-set Vision Score (which only includes |
| 228 | + // feature_set == "vision" / "enhanced_vision"). |
| 229 | + r.wall_clock.median_ns = t_virt; |
| 230 | + r.wall_clock.mean_ns = t_virt; |
| 231 | + r.wall_clock.min_ns = t_virt; |
| 232 | + r.wall_clock.max_ns = t_virt; |
| 233 | + r.wall_clock.sample_count = static_cast<size_t>(cfg.iterations); |
| 234 | + r.megapixels_per_sec = BenchmarkStats::computeThroughput( |
| 235 | + res.width, res.height, t_virt); |
| 236 | + |
| 237 | + return r; |
| 238 | +} |
| 239 | + |
| 240 | +// Build the canonical "pure framework" chain: 4 Box3x3 nodes back-to-back. |
| 241 | +std::vector<ChainStage> makeBox3x3Chain() { |
| 242 | + ChainStage box; |
| 243 | + box.kernel_name = "Box3x3"; |
| 244 | + box.make_node = [](vx_graph g, vx_image in, vx_image out) { |
| 245 | + return vxBox3x3Node(g, in, out); |
| 246 | + }; |
| 247 | + box.immediate = [](vx_context c, vx_image in, vx_image out) { |
| 248 | + return vxuBox3x3(c, in, out); |
| 249 | + }; |
| 250 | + return {box, box, box, box}; |
| 251 | +} |
| 252 | + |
| 253 | +// Build the "realistic" chain: Gaussian3x3 -> Box3x3 -> Median3x3 -> Erode3x3. |
| 254 | +std::vector<ChainStage> makeMixedFilterChain() { |
| 255 | + ChainStage gauss; |
| 256 | + gauss.kernel_name = "Gaussian3x3"; |
| 257 | + gauss.make_node = [](vx_graph g, vx_image in, vx_image out) { |
| 258 | + return vxGaussian3x3Node(g, in, out); |
| 259 | + }; |
| 260 | + gauss.immediate = [](vx_context c, vx_image in, vx_image out) { |
| 261 | + return vxuGaussian3x3(c, in, out); |
| 262 | + }; |
| 263 | + |
| 264 | + ChainStage box; |
| 265 | + box.kernel_name = "Box3x3"; |
| 266 | + box.make_node = [](vx_graph g, vx_image in, vx_image out) { |
| 267 | + return vxBox3x3Node(g, in, out); |
| 268 | + }; |
| 269 | + box.immediate = [](vx_context c, vx_image in, vx_image out) { |
| 270 | + return vxuBox3x3(c, in, out); |
| 271 | + }; |
| 272 | + |
| 273 | + ChainStage median; |
| 274 | + median.kernel_name = "Median3x3"; |
| 275 | + median.make_node = [](vx_graph g, vx_image in, vx_image out) { |
| 276 | + return vxMedian3x3Node(g, in, out); |
| 277 | + }; |
| 278 | + median.immediate = [](vx_context c, vx_image in, vx_image out) { |
| 279 | + return vxuMedian3x3(c, in, out); |
| 280 | + }; |
| 281 | + |
| 282 | + ChainStage erode; |
| 283 | + erode.kernel_name = "Erode3x3"; |
| 284 | + erode.make_node = [](vx_graph g, vx_image in, vx_image out) { |
| 285 | + return vxErode3x3Node(g, in, out); |
| 286 | + }; |
| 287 | + erode.immediate = [](vx_context c, vx_image in, vx_image out) { |
| 288 | + return vxuErode3x3(c, in, out); |
| 289 | + }; |
| 290 | + |
| 291 | + return {gauss, box, median, erode}; |
| 292 | +} |
| 293 | + |
| 294 | +} // namespace |
| 295 | + |
| 296 | +std::vector<BenchmarkCase> registerFrameworkBenchmarks() { |
| 297 | + std::vector<BenchmarkCase> cases; |
| 298 | + |
| 299 | + { |
| 300 | + BenchmarkCase bc; |
| 301 | + bc.name = "GraphDividend_Box3x3_x4"; |
| 302 | + bc.category = "framework_dividend"; |
| 303 | + bc.feature_set = "framework"; |
| 304 | + bc.kernel_enum = VX_KERNEL_BOX_3x3; |
| 305 | + bc.required_kernels = {VX_KERNEL_BOX_3x3}; |
| 306 | + bc.framework_run = [](vx_context ctx, const Resolution& res, |
| 307 | + const BenchmarkConfig& cfg) -> BenchmarkResult { |
| 308 | + return runGraphDividend(makeBox3x3Chain(), ctx, res, cfg); |
| 309 | + }; |
| 310 | + cases.push_back(bc); |
| 311 | + } |
| 312 | + |
| 313 | + { |
| 314 | + BenchmarkCase bc; |
| 315 | + bc.name = "GraphDividend_MixedFilters"; |
| 316 | + bc.category = "framework_dividend"; |
| 317 | + bc.feature_set = "framework"; |
| 318 | + bc.kernel_enum = VX_KERNEL_GAUSSIAN_3x3; |
| 319 | + bc.required_kernels = {VX_KERNEL_GAUSSIAN_3x3, VX_KERNEL_BOX_3x3, |
| 320 | + VX_KERNEL_MEDIAN_3x3, VX_KERNEL_ERODE_3x3}; |
| 321 | + bc.framework_run = [](vx_context ctx, const Resolution& res, |
| 322 | + const BenchmarkConfig& cfg) -> BenchmarkResult { |
| 323 | + return runGraphDividend(makeMixedFilterChain(), ctx, res, cfg); |
| 324 | + }; |
| 325 | + cases.push_back(bc); |
| 326 | + } |
| 327 | + |
| 328 | + return cases; |
| 329 | +} |
0 commit comments