# openvx-mark — .github/workflows/ci.yml
name: CI

on:
  # Pull requests are built whatever their base branch is, not only when
  # they target main. That keeps stacked-PR workflows covered, where a PR's
  # base may be an umbrella branch or an earlier PR in the stack.
  pull_request:
  push:
    branches:
      - main

# Auto-cancel superseded runs on the same ref so a rapid push series
# (e.g. force-push during PR review) doesn't queue 3+ stale runs and
# starve the GitHub Actions runner pool. main pushes are exempt — we
# always want a clean signal on main.
#
# Exempting main needs BOTH settings below:
#   * `cancel-in-progress` is false on main, so a running main build is
#     never killed by a newer push.
#   * The group key switches to the unique run id on main. GitHub keeps at
#     most one PENDING run per concurrency group, so with a shared group a
#     queued main run would still be cancelled as soon as another main
#     push queued up behind it. A per-run group gives every main push its
#     own run.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# ============================================================================
# Architecture
#
# Phase 1 (parallel) — four independent build jobs:
#   * Three OpenVX-impl jobs (MIVisionX, Khronos sample, rustVX). Each:
#       1. Builds the implementation from source.
#       2. Stages a self-contained artifact: <impl>-stage/lib + <impl>-stage/include.
#       3. Builds openvx-mark against the just-built impl.
#       4. Runs a quick smoke benchmark as a "local unit test" — catches
#          build-link breakage and missing-symbol issues immediately,
#          scoped to the specific impl, without waiting for the slower
#          comparison job downstream.
#       5. Uploads the staged artifact for the comparison job to consume.
#   * One OpenCV-baseline job (opencv-mark companion binary). Differs from
#     the OpenVX jobs because OpenCV is apt-installable and opencv-mark has
#     no OpenVX dependency — see the build-opencv job below for the shape.
#     Stages its smoke JSON directly (no impl tarball needed).
#
# Per-impl feature-set policy
# ---------------------------
# Not every impl ships the full OpenVX 1.3.1 conformance surface, so each
# bench is scoped to the feature sets that impl actually implements:
#
#   * MIVisionX     — `vision,framework`. AMD's runtime exports the 42
#                     Vision Conformance kernels but **does NOT export
#                     most of the 19 Enhanced Vision APIs** (Bilateral-
#                     Filter, HOG*, Tensor*, Select, ScalarOperation,
#                     etc.). With `enhanced_vision` enabled, the per-
#                     benchmark dlsym shim in openvx_optional_apis.h
#                     would dutifully report 19 SKIPPED rows, which is
#                     accurate but uninformative noise on every run —
#                     so we omit it.
#   * Khronos sample — `vision,framework` + `enhanced_vision` split
#                     across TWO invocations. CTS-conformant reference
#                     impl, ships both profiles on paper, but the
#                     sample-impl's enhanced_vision tensor kernels
#                     (TensorAdd, TensorSub, ...) SIGSEGV inside
#                     vxProcessGraph and take the whole bench process
#                     down. The split (rock-solid set first, crash-
#                     prone set second, merge via merge_reports.py)
#                     guarantees we ALWAYS get vision+framework data
#                     even when the enhanced_vision invocation dies.
#   * rustVX        — `vision,enhanced_vision,framework`. CTS-conformant
#                     for Vision (5923/5923) and Enhanced Vision
#                     (1235/1235) per the rustVX README.
#   * opencv-mark   — `vision,enhanced_vision` (no `framework`; cv:: has
#                     no graph runtime to measure). All 79 + 19 = 98
#                     OpenCV-side benchmarks run.
#
# Phase 2 (single job, depends on all four Phase-1 jobs) — comparison.
#   1. Downloads all three OpenVX impl artifacts onto a single runner;
#      apt-installs OpenCV on that same runner.
#   2. Builds openvx-mark × 3 (one per OpenVX impl) so all binaries link
#      against the same openvx-mark source tree at the same commit.
#      Builds opencv-mark from the same source tree.
#   3. Runs the full benchmark against each impl using that impl's
#      feature-set policy (above). Same hardware = fair cross-vendor
#      comparison. `compare_reports.py` joins by (name, mode, resolution)
#      and silently drops rows not on both sides, so enhanced_vision
#      rows naturally appear in pairs where both impls produced them
#      (Khronos↔OpenCV, rustVX↔OpenCV, Khronos↔rustVX) and are absent
#      from MIVisionX↔* pairs.
#   4. Generates six pairwise comparison reports:
#        OpenVX-vs-OpenVX:
#          * MIVisionX vs Khronos sample
#          * MIVisionX vs rustVX
#          * Khronos sample vs rustVX
#        OpenVX-vs-OpenCV (the "does adopting OpenVX pay off?" trio):
#          * MIVisionX vs OpenCV
#          * Khronos sample vs OpenCV
#          * rustVX vs OpenCV
#   5. Posts each report to the job summary and uploads as an artifact.
#
# Inspired by the layered build/perf-gate design in rustVX's conformance CI:
# https://github.com/kiritigowda/rustVX/blob/main/.github/workflows/conformance.yml
# ============================================================================

jobs:
  # --------------------------------------------------------------------------
  # Phase 1 — MIVisionX (AMD OpenVX, CPU backend)
  # --------------------------------------------------------------------------
  build-mivisionx:
    runs-on: ubuntu-22.04
    name: Build MIVisionX (CPU) + smoke test
    steps:
      - name: Checkout openvx-mark
        uses: actions/checkout@v4

      # Compiler toolchain, cmake and git for the from-source MIVisionX
      # build; python3 for the report-checking script run after the smoke.
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y \
            build-essential \
            cmake \
            git \
            python3

      # Why CMAKE_CXX_FLAGS_RELEASE is overridden (the "optimized kernels" knob)
      #
      # The ago_haf_cpu_*.cpp sources under amd_openvx/openvx/ago/ carry
      # MIVisionX's hand-written AVX2 intrinsics (_mm256_*) — the CPU-side
      # "Hardware Acceleration Functions", i.e. the OPTIMIZED kernel paths.
      # MIVisionX's own top-level CMakeLists.txt, however, appends ONLY
      # `-msse4.2` to CMAKE_CXX_FLAGS: no -mavx2/-mfma, and no function
      # carries __attribute__((target("avx2"))). With just -msse4.2 the
      # compiler can still emit the AVX2 intrinsics at those specific call
      # sites, but everything around them — per-kernel dispatch glue,
      # address arithmetic, any kernel code that is not hand-vectorised —
      # is capped at SSE4.2 throughput: no auto-vectorisation beyond
      # SSE4.2, no FMA, no BMI/BMI2. That is the "base kernel" path the
      # umbrella PR description points at.
      #
      # Overriding CMAKE_CXX_FLAGS_RELEASE yields -O3 -DNDEBUG plus
      # -march=x86-64-v3 (= SSE4.2 + AVX + AVX2 + BMI + BMI2 + FMA + LZCNT +
      # POPCNT), a conservative, portable AMD64 baseline that compilers have
      # supported since gcc 11. GitHub Actions Ubuntu 22.04 runners use
      # Intel Xeon or AMD EPYC CPUs, all of which support x86-64-v3.
      #
      # Only the per-config Release variable is overridden, not
      # CMAKE_CXX_FLAGS itself, so MIVisionX still appends its `-msse4.2`
      # and the final compile line is "-O3 -DNDEBUG -march=x86-64-v3
      # -msse4.2". -march wins for the code-gen ceiling; the duplicate
      # -msse4.2 is redundant but harmless.
      - name: Build MIVisionX (CPU backend, optimized)
        run: |
          set -euo pipefail
          src_dir=/tmp/mivisionx-src
          install_prefix=/tmp/mivisionx-install
          release_flags="-O3 -DNDEBUG -march=x86-64-v3"

          git clone --depth 1 --branch develop \
            https://github.com/ROCm/MIVisionX.git "$src_dir"
          mkdir -p "$src_dir/build"
          cd "$src_dir/build"
          cmake \
            -DBACKEND=CPU \
            -DNEURAL_NET=OFF \
            -DLOOM=OFF \
            -DMIGRAPHX=OFF \
            -DCMAKE_BUILD_TYPE=Release \
            -DCMAKE_CXX_FLAGS_RELEASE="$release_flags" \
            -DCMAKE_INSTALL_PREFIX="$install_prefix" \
            ..
          make -j"$(nproc)"
          make install

          # Echo the compile flags the generated make rules actually use so
          # a reviewer can confirm from the CI log that AVX2 made it into
          # the build (look for `-march=x86-64-v3`). Best-effort: a missing
          # flags.make must not fail the step.
          grep -h 'CXX_FLAGS' CMakeFiles/openvx.dir/flags.make 2>/dev/null \
            | head -2 || true

      # Collects the freshly installed MIVisionX libraries and headers into
      # mivisionx-stage/{lib,include} and publishes both absolute paths as
      # step outputs (`lib_dir`, `include_dir`) for the later steps.
      - name: Stage MIVisionX artifact
        id: stage
        run: |
          set -euo pipefail
          mkdir -p mivisionx-stage/lib mivisionx-stage/include
          # Locate the installed libopenvx.so. `-print -quit` makes find stop
          # at the first hit by itself; piping into `head -1` instead risks
          # find being killed by SIGPIPE once head exits, which `pipefail`
          # reports as a failure of the whole step.
          LIB_PATH=$(find /tmp/mivisionx-install -name 'libopenvx.so' -print -quit)
          # Fail right here with a clear message. Without this guard an
          # empty result turns into LIB_SRC="." and an empty staged lib dir,
          # and the real problem only surfaces later at link time.
          if [ -z "$LIB_PATH" ]; then
            echo "ERROR: libopenvx.so not found under /tmp/mivisionx-install" >&2
            exit 1
          fi
          LIB_SRC=$(dirname "$LIB_PATH")
          echo "MIVisionX libraries discovered in: $LIB_SRC"
          # Copy ALL libopenvx* / libvxu* entries (libopenvx.so symlink,
          # libopenvx.so.1 SONAME symlink, libopenvx.so.X.Y.Z real file)
          # preserving symlinks (-P) so ld.so can follow the SONAME chain.
          # Without versioned files the linker reports
          # "libopenvx.so.1: cannot open shared object file".
          find "$LIB_SRC" -maxdepth 1 -name 'libopenvx*' -exec cp -P {} mivisionx-stage/lib/ \;
          find "$LIB_SRC" -maxdepth 1 -name 'libvxu*'    -exec cp -P {} mivisionx-stage/lib/ \;
          cp -r /tmp/mivisionx-install/include/mivisionx/. mivisionx-stage/include/
          echo "--- staged lib ---"
          ls -la mivisionx-stage/lib
          echo "--- staged include (top-level) ---"
          ls -la mivisionx-stage/include
          {
            echo "lib_dir=$(pwd)/mivisionx-stage/lib"
            echo "include_dir=$(pwd)/mivisionx-stage/include"
          } >> "$GITHUB_OUTPUT"

      # Configures and builds openvx-mark in build-smoke/ against the
      # headers and libraries staged by the `stage` step.
      - name: Build openvx-mark (smoke)
        env:
          STAGE_INCLUDE_DIR: ${{ steps.stage.outputs.include_dir }}
          STAGE_LIB_DIR: ${{ steps.stage.outputs.lib_dir }}
        run: |
          set -euo pipefail
          mkdir -p build-smoke
          cd build-smoke
          cmake \
            -DCMAKE_BUILD_TYPE=Release \
            -DOPENVX_INCLUDES="$STAGE_INCLUDE_DIR" \
            -DOPENVX_LIB_DIR="$STAGE_LIB_DIR" \
            ..
          cmake --build . -j"$(nproc)"

      # Smoke covers the `vision` + `framework` feature sets only.
      # MIVisionX's runtime exports the 42 Vision Conformance kernels
      # but does NOT export most of the 19 Enhanced Vision APIs
      # (BilateralFilter, HOG*, Tensor*, Select, ScalarOperation, etc.).
      # With `enhanced_vision` enabled, the per-benchmark dlsym shim in
      # openvx_optional_apis.h would dutifully report 19 SKIPPED rows
      # on every run — accurate but uninformative noise. The Khronos
      # sample, rustVX, and opencv-mark smoke jobs DO exercise
      # `enhanced_vision` because those impls actually ship it.
      - name: Run smoke benchmark (vision + framework, VGA × 5 iters, single-threaded)
        # Smoke is advisory — if a specific impl crashes inside a
        # specific kernel the artifact upload (which the compare job
        # depends on) must still happen so vendor-vs-vendor signal
        # isn't lost.
        continue-on-error: true
        run: |
          set -euo pipefail
          cd build-smoke
          # Prepend the staged lib dir to the loader path. The
          # `${VAR:+:$VAR}` form adds the `:` separator only when
          # LD_LIBRARY_PATH is already non-empty; a trailing `:` would be
          # an empty entry, which ld.so treats as the current directory.
          stage_lib="${{ steps.stage.outputs.lib_dir }}"
          export LD_LIBRARY_PATH="${stage_lib}${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
          # Timer self-test up front so a sloppy runner clock fails
          # loud before we trust a smoke timing number.
          ./openvx-mark --validate-timing
          # `--threads 1` matches the Phase 2 compare config — same
          # apples-to-apples threading policy on smoke and full bench
          # so smoke timings are interpretable as a coarse preview.
          ./openvx-mark --feature-set vision,framework \
            --resolution VGA --iterations 5 --warmup 1 --threads 1 \
            --output-dir smoke-results

      # The three steps below all use `if: always()`, so they run whatever
      # the outcome of the earlier steps was.
      - name: Verify MIVisionX smoke report
        if: always()
        working-directory: build-smoke
        run: |
          set -euo pipefail
          python3 ../scripts/check_report.py \
            smoke-results/benchmark_results.json \
            --allow-feature-set vision,framework

      # Staged lib + include tree; kept for one day only.
      - name: Upload MIVisionX artifact
        uses: actions/upload-artifact@v4
        if: always()
        with:
          path: mivisionx-stage/
          name: impl-mivisionx
          retention-days: 1

      # Smoke output directory; an absent directory is tolerated.
      - name: Upload MIVisionX smoke results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          path: build-smoke/smoke-results/
          name: smoke-results-mivisionx
          if-no-files-found: ignore

  # --------------------------------------------------------------------------
  # Phase 1 — Khronos OpenVX sample implementation
  # --------------------------------------------------------------------------
  build-khronos-sample:
    runs-on: ubuntu-22.04
    name: Build Khronos sample + smoke test
    steps:
      - name: Checkout openvx-mark
        uses: actions/checkout@v4

      # Compiler toolchain, cmake and git for the from-source build;
      # python3 drives Build.py and the report scripts used later.
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y \
            build-essential \
            cmake \
            git \
            python3

      # The Khronos sample is a reference impl with no SIMD intrinsics, so
      # most of its performance comes from whatever auto-vectorisation the
      # compiler applies. Build.py honours CFLAGS / CXXFLAGS from the
      # environment, so those are used to raise the compile baseline to
      # x86-64-v3 (= AVX2 + FMA + BMI2 + LZCNT + POPCNT) — the same baseline
      # the MIVisionX build gets. This is not a claim that the sample
      # becomes "competitive" (it is a reference); the point is only that
      # both impls are measured at the SAME compile baseline, so the
      # cross-impl comparison isn't skewed by one side getting better
      # auto-vectorisation than the other.
      - name: Build Khronos OpenVX sample (Release, x86-64-v3)
        run: |
          set -euo pipefail
          src_dir=/tmp/khronos-src
          opt_flags="-O3 -march=x86-64-v3"

          git clone --recursive --depth 1 \
            https://github.com/KhronosGroup/OpenVX-sample-impl.git "$src_dir"
          cd "$src_dir"

          # Prepend the optimisation flags to whatever the environment
          # already carries, then export both for Build.py.
          CFLAGS="$opt_flags ${CFLAGS:-}"
          CXXFLAGS="$opt_flags ${CXXFLAGS:-}"
          export CFLAGS CXXFLAGS
          echo "CFLAGS  = ${CFLAGS}"
          echo "CXXFLAGS= ${CXXFLAGS}"
          python3 Build.py --os=Linux --arch=64 --conf=Release

      # Collects the freshly built Khronos sample libraries and the API
      # headers into khronos-stage/{lib,include} and publishes both absolute
      # paths as step outputs (`lib_dir`, `include_dir`).
      - name: Stage Khronos sample artifact
        id: stage
        run: |
          set -euo pipefail
          mkdir -p khronos-stage/lib khronos-stage/include
          # Locate the built libopenvx.so, ignoring anything under a build/
          # directory. `-print -quit` makes find stop at the first hit by
          # itself; piping into `head -1` instead risks find being killed
          # by SIGPIPE once head exits, which `pipefail` reports as a
          # failure of the whole step.
          LIB_PATH=$(find /tmp/khronos-src -name 'libopenvx.so' -not -path '*/build/*' -print -quit)
          # Fail right here with a clear message. Without this guard an
          # empty result turns into LIB_SRC="." and an empty staged lib dir,
          # and the real problem only surfaces later at link time.
          if [ -z "$LIB_PATH" ]; then
            echo "ERROR: libopenvx.so not found under /tmp/khronos-src" >&2
            exit 1
          fi
          LIB_SRC=$(dirname "$LIB_PATH")
          echo "Khronos libraries discovered in: $LIB_SRC"
          # Copy all libopenvx* / libvxu* entries, preserving symlinks (-P)
          # so ld.so can follow the SONAME chain.
          find "$LIB_SRC" -maxdepth 1 -name 'libopenvx*' -exec cp -P {} khronos-stage/lib/ \;
          find "$LIB_SRC" -maxdepth 1 -name 'libvxu*'    -exec cp -P {} khronos-stage/lib/ \;
          cp -r /tmp/khronos-src/api-docs/include/. khronos-stage/include/
          echo "--- staged lib ---"
          ls -la khronos-stage/lib
          echo "--- staged include (top-level) ---"
          ls -la khronos-stage/include
          {
            echo "lib_dir=$(pwd)/khronos-stage/lib"
            echo "include_dir=$(pwd)/khronos-stage/include"
          } >> "$GITHUB_OUTPUT"

      # Configures and builds openvx-mark in build-smoke/ against the
      # headers and libraries staged by the `stage` step.
      - name: Build openvx-mark (smoke)
        env:
          STAGE_INCLUDE_DIR: ${{ steps.stage.outputs.include_dir }}
          STAGE_LIB_DIR: ${{ steps.stage.outputs.lib_dir }}
        run: |
          set -euo pipefail
          mkdir -p build-smoke
          cd build-smoke
          cmake \
            -DCMAKE_BUILD_TYPE=Release \
            -DOPENVX_INCLUDES="$STAGE_INCLUDE_DIR" \
            -DOPENVX_LIB_DIR="$STAGE_LIB_DIR" \
            ..
          cmake --build . -j"$(nproc)"

      # Khronos sample is a CTS-conformant reference impl that ships
      # both the Vision (42 kernels) and Enhanced Vision (19 kernels)
      # profiles. In practice the enhanced_vision tensor kernels in
      # the sample-impl are buggy at runtime — `TensorAdd` SIGSEGVs
      # the moment we invoke `vxProcessGraph`, taking the whole bench
      # process down with it and losing JSON output for every kernel
      # that hadn't run yet (openvx-mark writes its report only at
      # end-of-run).
      #
      # Workaround: split the smoke into TWO invocations along
      # feature-set lines, each writing to its own output dir, then
      # merge with scripts/merge_reports.py. The first invocation
      # (vision + framework) is rock-solid and always produces a
      # JSON. The second invocation (enhanced_vision) is the one that
      # might crash — its `|| echo ...` fallback keeps the step alive,
      # and if it crashed, the merger silently skips its missing JSON.
      # End result: we ALWAYS get the vision+framework smoke data, and
      # we get enhanced_vision data when the sample impl cooperates.
      #
      # The step name is kept quoted defensively: a `: ` inside an
      # unquoted name (as in a `(split: ...)` wording) would be parsed by
      # YAML as a mapping separator. The current wording uses an em dash,
      # so the quotes are not strictly required.
      - name: "Run smoke benchmark (split — vision+framework, then enhanced_vision, VGA × 5 iters)"
        continue-on-error: true
        run: |
          set -euo pipefail
          cd build-smoke
          # Prepend the staged lib dir to the loader path. The
          # `${VAR:+:$VAR}` form adds the `:` separator only when
          # LD_LIBRARY_PATH is already non-empty; a trailing `:` would be
          # an empty entry, which ld.so treats as the current directory.
          stage_lib="${{ steps.stage.outputs.lib_dir }}"
          export LD_LIBRARY_PATH="${stage_lib}${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
          ./openvx-mark --validate-timing
          # 1. Rock-solid set first — always produces a JSON.
          ./openvx-mark --feature-set vision,framework \
            --resolution VGA --iterations 5 --warmup 1 --threads 1 \
            --output-dir smoke-results-base
          # 2. Crash-prone set — the `|| echo` fallback lets the step
          #    survive a SIGSEGV inside e.g. the Khronos sample's
          #    TensorAdd and records why in the log.
          ./openvx-mark --feature-set enhanced_vision \
            --resolution VGA --iterations 5 --warmup 1 --threads 1 \
            --output-dir smoke-results-extra \
            || echo "enhanced_vision smoke crashed (Khronos sample known issue) — vision results still saved"
          # 3. Merge whichever JSONs survived into the final smoke
          #    report. merge_reports.py handles the missing-input
          #    case silently.
          mkdir -p smoke-results
          python3 ../scripts/merge_reports.py \
            smoke-results-base/benchmark_results.json \
            smoke-results-extra/benchmark_results.json \
            --output smoke-results/benchmark_results.json

      # The three steps below all use `if: always()`, so they run whatever
      # the outcome of the earlier steps was.
      - name: Verify Khronos smoke report (vision + framework only)
        if: always()
        working-directory: build-smoke
        run: |
          set -euo pipefail
          # Unsupported rows are expected here: the Khronos reference
          # sample has known vision-profile gaps (e.g. S16 LaplacianPyramid
          # is rejected at vxVerifyGraph with
          # VX_ERROR_INVALID_PARAMETERS). --warn-only still surfaces the
          # count without failing the job for impl-side limitations.
          python3 ../scripts/check_report.py \
            smoke-results/benchmark_results.json \
            --allow-feature-set vision,framework \
            --warn-only

      # Staged lib + include tree; kept for one day only.
      - name: Upload Khronos sample artifact
        uses: actions/upload-artifact@v4
        if: always()
        with:
          path: khronos-stage/
          name: impl-khronos-sample
          retention-days: 1

      # Merged smoke output directory; an absent directory is tolerated.
      - name: Upload Khronos sample smoke results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          path: build-smoke/smoke-results/
          name: smoke-results-khronos-sample
          if-no-files-found: ignore

  # --------------------------------------------------------------------------
  # Phase 1 — rustVX (Rust OpenVX implementation)
  #
  # rustVX ships a single libopenvx_ffi.so that exports the full vx*/vxu*
  # symbol set. openvx-mark's CMake uses find_library(NAMES openvx) and
  # find_library(NAMES vxu) — so we symlink the two classic Khronos lib
  # names to the FFI .so during staging, without modifying rustVX's own
  # build output.
  #
  # SIMD config: AVX2 + `-C target-cpu=x86-64-v3`, matching what rustVX's
  # own CI ships. We deliberately skip the alignment-pad RUSTFLAGS used in
  # rustVX's PR-vs-main perf gate — those exist to make rustVX-vs-rustVX
  # bench numbers invariant to .text shifts, which is irrelevant for the
  # vendor-vs-vendor comparison this workflow runs.
  # --------------------------------------------------------------------------
  build-rustvx:
    name: Build rustVX + smoke test
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout openvx-mark
        uses: actions/checkout@v4

      # python3 is listed explicitly, as in the other build jobs: this
      # job's "Verify rustVX smoke report" step runs
      # scripts/check_report.py with it, so the job should not depend on
      # the runner image happening to ship an interpreter.
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential cmake git python3

      # Installs the stable Rust toolchain non-interactively through the
      # rustup installer script, then prints the rustc / cargo versions into
      # the log. `source "$HOME/.cargo/env"` only affects this step's shell,
      # which is why the build step sources it again.
      - name: Install Rust toolchain
        run: |
          set -euo pipefail
          curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \
            | sh -s -- -y --default-toolchain stable
          source "$HOME/.cargo/env"
          rustc --version
          cargo --version

      # Clones rustVX and builds the `openvx-ffi` crate in release mode,
      # picking the SIMD cargo features and RUSTFLAGS from the host
      # architecture reported by `uname -m`.
      - name: Build rustVX (release, AVX2)
        run: |
          set -euo pipefail
          source "$HOME/.cargo/env"
          git clone --depth 1 \
            https://github.com/kiritigowda/rustVX.git /tmp/rustvx-src
          cd /tmp/rustvx-src

          # Start from the "unknown architecture" defaults (no SIMD features,
          # empty RUSTFLAGS) and let the known architectures override them.
          FEATURES=""
          export RUSTFLAGS=""
          case "$(uname -m)" in
            x86_64|amd64)
              FEATURES="openvx-core/sse2 openvx-core/avx2 openvx-vision/sse2 openvx-vision/avx2"
              RUSTFLAGS="-C target-cpu=x86-64-v3"
              ;;
            aarch64|arm64)
              FEATURES="openvx-core/neon openvx-vision/neon"
              ;;
          esac

          echo "Architecture : $(uname -m)"
          echo "Cargo features: ${FEATURES:-<none>}"
          echo "RUSTFLAGS     : ${RUSTFLAGS:-<none>}"

          # --features is passed only when there is something to enable.
          cargo_args=(build --release -p openvx-ffi)
          if [ -n "$FEATURES" ]; then
            cargo_args+=(--features "$FEATURES")
          fi
          cargo "${cargo_args[@]}"

      # Copies libopenvx_ffi.so and the rustVX headers into
      # rustvx-stage/{lib,include}, adds the classic library-name symlinks,
      # and publishes both absolute paths as step outputs.
      - name: Stage rustVX artifact (with libopenvx / libvxu symlinks)
        id: stage
        run: |
          set -euo pipefail
          stage_lib=rustvx-stage/lib
          stage_inc=rustvx-stage/include
          mkdir -p "$stage_lib" "$stage_inc"
          cp /tmp/rustvx-src/target/release/libopenvx_ffi.so "$stage_lib/"
          # Classic Khronos library names, so openvx-mark's find_library
          # picks them up. The link target is relative, so each symlink
          # resolves to the FFI .so sitting next to it.
          # NOTE(review): the original comment here stated that
          # upload-artifact@v4 preserves these symlinks — confirm against
          # the action's documentation.
          for alias in libopenvx.so libvxu.so; do
            ln -sf libopenvx_ffi.so "$stage_lib/$alias"
          done
          cp -r /tmp/rustvx-src/include/. "$stage_inc/"
          echo "--- staged lib ---"
          ls -la rustvx-stage/lib
          echo "--- staged include (top-level) ---"
          ls -la rustvx-stage/include
          {
            echo "lib_dir=$(pwd)/rustvx-stage/lib"
            echo "include_dir=$(pwd)/rustvx-stage/include"
          } >> "$GITHUB_OUTPUT"

      # Configures and builds openvx-mark in build-smoke/ against the
      # headers and libraries staged by the `stage` step.
      - name: Build openvx-mark (smoke)
        env:
          STAGE_INCLUDE_DIR: ${{ steps.stage.outputs.include_dir }}
          STAGE_LIB_DIR: ${{ steps.stage.outputs.lib_dir }}
        run: |
          set -euo pipefail
          mkdir -p build-smoke
          cd build-smoke
          cmake \
            -DCMAKE_BUILD_TYPE=Release \
            -DOPENVX_INCLUDES="$STAGE_INCLUDE_DIR" \
            -DOPENVX_LIB_DIR="$STAGE_LIB_DIR" \
            ..
          cmake --build . -j"$(nproc)"

      # rustVX is CTS-conformant for both Vision (5923/5923) and
      # Enhanced Vision (1235/1235), so we exercise the full
      # `vision,enhanced_vision,framework` surface at smoke time. This
      # is the impl that gives the headline "all 19 enhanced_vision
      # kernels produce real measurements" cell in the comparison
      # table — every other OpenVX backend either omits the profile
      # (MIVisionX) or has known per-kernel quirks.
      - name: Run smoke benchmark (vision + enhanced_vision + framework, VGA × 5 iters)
        # Advisory, like the other OpenVX smokes: a failure here must not
        # stop the steps that follow.
        continue-on-error: true
        run: |
          set -euo pipefail
          cd build-smoke
          # Prepend the staged lib dir to the loader path. The
          # `${VAR:+:$VAR}` form adds the `:` separator only when
          # LD_LIBRARY_PATH is already non-empty; a trailing `:` would be
          # an empty entry, which ld.so treats as the current directory.
          stage_lib="${{ steps.stage.outputs.lib_dir }}"
          export LD_LIBRARY_PATH="${stage_lib}${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
          ./openvx-mark --validate-timing
          ./openvx-mark --feature-set vision,enhanced_vision,framework \
            --resolution VGA --iterations 5 --warmup 1 --threads 1 \
            --output-dir smoke-results

      # The three steps below all use `if: always()`, so they run whatever
      # the outcome of the earlier steps was.
      - name: Verify rustVX smoke report
        if: always()
        working-directory: build-smoke
        run: |
          set -euo pipefail
          python3 ../scripts/check_report.py \
            smoke-results/benchmark_results.json \
            --allow-feature-set vision,enhanced_vision,framework

      # Staged lib + include tree; kept for one day only.
      - name: Upload rustVX artifact
        uses: actions/upload-artifact@v4
        if: always()
        with:
          path: rustvx-stage/
          name: impl-rustvx
          retention-days: 1

      # Smoke output directory; an absent directory is tolerated.
      - name: Upload rustVX smoke results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          path: build-smoke/smoke-results/
          name: smoke-results-rustvx
          if-no-files-found: ignore

  # --------------------------------------------------------------------------
  # Phase 1 — OpenCV baseline (companion binary `opencv-mark`)
  #
  # OpenCV is the de facto vision baseline. This job exists so we can answer
  # "does adopting OpenVX actually pay off vs the cv:: code I already have?"
  # at the per-kernel level, on the same CI hardware as every OpenVX impl.
  #
  # Differs from the OpenVX impl jobs in two ways:
  #   1. OpenCV is apt-installable (no from-source build), so this job is
  #      much shorter — install, configure parent CMake, build, smoke.
  #   2. There is no impl-tarball staging step. opencv-mark IS the binary
  #      that runs the OpenCV-side measurements; there is no separate
  #      "link openvx-mark against this libopenvx.so" rebuild downstream.
  #      The Phase 2 comparison job re-runs opencv-mark itself (after a
  #      fresh apt-install of OpenCV) for strict same-runner fairness vs
  #      the per-impl benches — see compare job's `Build & bench
  #      opencv-mark` step.
  #
  # The smoke run here is fast feedback only (catches build/link breakage
  # in <1 min on every PR); the comparison-grade FHD × 20 iter benchmark
  # lives in Phase 2 alongside the OpenVX impl benches.
  # --------------------------------------------------------------------------
  build-opencv:
    runs-on: ubuntu-22.04
    name: Build opencv-mark (OpenCV baseline) + smoke test
    steps:
      - name: Checkout openvx-mark
        uses: actions/checkout@v4

      - name: Install dependencies (OpenCV 4 from apt)
        run: |
          sudo apt-get update
          sudo apt-get install -y \
            build-essential \
            cmake \
            git \
            python3 \
            libopencv-dev
          # Log the OpenCV version pkg-config reports, so later comparison
          # reports can be cross-referenced against exactly this version
          # string. Informational only — a pkg-config failure is ignored.
          pkg-config --modversion opencv4 || true

      # Configures the parent CMake project, builds only the `opencv-mark`
      # target, checks that the binary exists, and prints the first lines
      # of its `--help` output as a version probe.
      - name: Configure & build opencv-mark
        run: |
          set -euo pipefail
          mkdir -p build-opencv
          cd build-opencv
          # Parent CMake auto-includes opencv-mark/ when OpenCV is found.
          # No OPENVX_* flags needed — opencv-mark has no OpenVX dep.
          cmake -DCMAKE_BUILD_TYPE=Release ..
          cmake --build . --target opencv-mark -j"$(nproc)"
          # Fail loudly if the binary somehow didn't get produced (e.g.
          # OpenCV detection silently no-op'd). This is the exact failure
          # mode that PR #1's first CI run was missing.
          test -x opencv-mark/opencv-mark \
            || { echo "ERROR: opencv-mark binary not built — OpenCV likely not detected by CMake"; exit 1; }
          # `--help` doubles as a version probe — it prints the opencv-mark
          # version line and the linked OpenCV version up top. PR1's CLI
          # does not implement a dedicated `--version` flag yet.
          #
          # `sed -n '1,3p'` is used instead of `head -3` because sed reads
          # its input to the end. head exits after three lines, so a later
          # write by opencv-mark could hit SIGPIPE, and under `pipefail`
          # that would fail this step even though the build succeeded.
          ./opencv-mark/opencv-mark --help | sed -n '1,3p'

      # Same shape as the OpenVX-impl smokes (VGA × 5 iters, 1 warmup) so
      # timing noise stays comparable. Deliberately NOT continue-on-error:
      # opencv-mark has no impl-side quirks to tolerate, so a kernel that
      # breaks here is our bug.
      #
      # Feature-set: `vision,enhanced_vision`. opencv-mark has 1:1
      # coverage of both profiles (42 vision + 19 enhanced = 61 kernels) —
      # the entire OpenCV-side surface this CI exercises. `framework` is
      # intentionally omitted: OpenCV has no graph runtime to measure, and
      # the framework benches that depend on `vxProcessGraph` semantics
      # are OpenVX-only.
      - name: Run smoke benchmark (vision + enhanced_vision, VGA × 5 iters)
        run: |
          set -eo pipefail
          cd build-opencv
          bench=./opencv-mark/opencv-mark
          # Timer self-test first — the same gate the Phase 2 compare job
          # runs. A broken runner clock is caught at smoke time instead of
          # wasting a full FHD bench cycle.
          "$bench" --validate-timing
          # `--threads 1` for symmetry with the smokes that run against
          # single-threaded OpenVX impls. It keeps this smoke comparable
          # in shape to the cross-impl ones, even though it is only a
          # "did it build & did it run?" check, not a perf claim.
          "$bench" --feature-set vision,enhanced_vision \
            --resolution VGA --iterations 5 --warmup 1 --threads 1 \
            --output-dir smoke-results

      - name: Verify OpenCV smoke report
        if: always()
        run: |
          set -euo pipefail
          cd build-opencv
          python3 ../scripts/check_report.py \
            smoke-results/benchmark_results.json

      - name: Upload opencv-mark smoke results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: smoke-results-opencv
          path: build-opencv/smoke-results/
          if-no-files-found: ignore

  # --------------------------------------------------------------------------
  # Phase 2 — Pairwise comparison
  #
  # Pulls all three OpenVX implementation artifacts onto the same runner,
  # plus apt-installs OpenCV, so every benchmark is exercised on identical
  # hardware. Builds openvx-mark once per OpenVX impl (against this commit's
  # source tree, not pre-built artifacts — keeps the comparison binary
  # identical apart from the linked OpenVX lib), builds opencv-mark from
  # the same source tree, runs the full feature-set bench against each,
  # and emits six pairwise comparison reports:
  #
  #   OpenVX-vs-OpenVX (3):
  #     * MIVisionX over Khronos sample          — AMD over reference
  #     * MIVisionX over rustVX                  — AMD over Rust impl
  #     * rustVX     over Khronos sample         — Rust impl over reference
  #
  #   OpenVX-vs-OpenCV (3) — "does adopting OpenVX pay off?":
  #     * MIVisionX over OpenCV                  — best-tuned OpenVX vs cv::
  #     * Khronos sample over OpenCV             — reference OpenVX vs cv::
  #     * rustVX     over OpenCV                 — Rust OpenVX vs cv::
  #
  # The job uses `if: always()`, each download uses `continue-on-error`,
  # and each per-impl bench uses `if: always() && steps.detect...`, so a
  # single failed build still surfaces the comparison signal for
  # whichever other impls are available, instead of losing all visibility.
  # --------------------------------------------------------------------------
  compare:
    name: Pairwise comparison (MIVisionX, Khronos, rustVX, OpenCV)
    runs-on: ubuntu-22.04
    # Wait for all four build jobs before starting, whatever their outcome
    # (see `if: always()` below).
    needs:
      - build-mivisionx
      - build-khronos-sample
      - build-rustvx
      - build-opencv
    # Run this job even when one of the needed jobs failed, so the
    # implementations that did build can still be compared.
    if: always()
    steps:
      # Source tree for this commit; the build steps below configure
      # against it via `..` from their per-impl build directories.
      - name: Checkout openvx-mark
        uses: actions/checkout@v4

      - name: Install dependencies
        run: |
          # Build toolchain plus libopencv-dev. The latter is required so
          # the Phase 2 `Build & bench opencv-mark` step can re-link
          # opencv-mark on this runner — strictly same-hardware fairness
          # vs the per-impl benches.
          sudo apt-get update
          sudo apt-get install -y \
            build-essential cmake git python3 libopencv-dev
          # Best-effort log of the OpenCV version pkg-config resolves.
          pkg-config --modversion opencv4 || true

      # One download per OpenVX implementation, each unpacked under
      # impl/<impl>. Every download is `continue-on-error` so a missing
      # artifact does not stop the job.
      - name: Download MIVisionX artifact
        continue-on-error: true
        uses: actions/download-artifact@v4
        with:
          path: ${{ github.workspace }}/impl/mivisionx
          name: impl-mivisionx

      - name: Download Khronos sample artifact
        continue-on-error: true
        uses: actions/download-artifact@v4
        with:
          path: ${{ github.workspace }}/impl/khronos
          name: impl-khronos-sample

      - name: Download rustVX artifact
        continue-on-error: true
        uses: actions/download-artifact@v4
        with:
          path: ${{ github.workspace }}/impl/rustvx
          name: impl-rustvx

      - name: Detect available implementations
        id: detect
        run: |
          set -euo pipefail
          # Publishes one step output per implementation (`<impl>=true` or
          # `<impl>=false`) depending on whether its libopenvx.so exists
          # under impl/<impl>/lib.
          impl_root="${{ github.workspace }}/impl"
          for impl in mivisionx khronos rustvx; do
            lib="${impl_root}/${impl}/lib/libopenvx.so"
            if [ ! -e "$lib" ]; then
              echo "$impl: MISSING (artifact download failed or build job did not produce it)"
              echo "${impl}=false" >> "$GITHUB_OUTPUT"
              continue
            fi
            echo "$impl: AVAILABLE ($lib)"
            # Give the current user read/write (and execute where
            # applicable) on everything in the downloaded lib directory.
            chmod -R u+rwX "${impl_root}/${impl}/lib"
            echo "${impl}=true" >> "$GITHUB_OUTPUT"
          done

      # ----- Per-impl build + benchmark (FHD, 20 iter, 5 warmup) -----
      #
      # Each per-impl bench uses `if: always() && steps.detect...` because
      # GitHub Actions treats any explicit `if:` without `always()` as
      # implicit `success()` — meaning a crash in MIVisionX bench would
      # skip the Khronos / rustVX bench steps entirely and we'd lose all
      # comparison signal. With `always()` the three benches stay
      # independent and the comparison job downstream handles whichever
      # JSON files actually got produced.
      #
      # `--threads 1` is passed EXPLICITLY (it's also the default — but
      # we want the CI compare config to be self-documenting). Rationale:
      #
      #   * MIVisionX CPU backend, Khronos sample, and rustVX are all
      #     fundamentally single-threaded per kernel — none of them have
      #     an internal thread pool on the CPU path.
      #   * OpenCV, by contrast, will happily spawn nproc threads via
      #     TBB/OpenMP if left at its default. Without the `--threads 1`
      #     pin, the OpenCV side would get an unfair (nproc)x parallelism
      #     boost just from defaults — the comparison would no longer be
      #     "OpenVX kernel vs OpenCV kernel" but "1-thread OpenVX vs
      #     n-thread OpenCV". `--threads 1` calls cv::setNumThreads(1)
      #     for opencv-mark and sets OMP_NUM_THREADS=1 in the env for
      #     anything OpenMP-using downstream.
      #
      # Feature set is per-impl (see the architecture comment block
      # at the top of this file for the full policy):
      #   * MIVisionX     — `vision,framework` (no enhanced_vision;
      #                     AMD's runtime doesn't export the APIs)
      #   * Khronos sample — `vision,enhanced_vision,framework`
      #   * rustVX        — `vision,enhanced_vision,framework`
      #   * opencv-mark   — `vision,enhanced_vision` (no framework;
      #                     OpenCV has no graph runtime to measure)
      # `compare_reports.py` joins by (name, mode, resolution) and
      # silently drops rows not on both sides, so enhanced_vision
      # rows naturally appear in pairs where both impls produced them
      # (Khronos↔OpenCV, rustVX↔OpenCV, Khronos↔rustVX) and are absent
      # from MIVisionX↔* pairs.
      - name: Build & bench against MIVisionX (single-threaded, FHD × 20)
        if: always() && steps.detect.outputs.mivisionx == 'true'
        run: |
          set -euo pipefail
          # Root of the downloaded MIVisionX artifact (headers + libs).
          impl_dir="${{ github.workspace }}/impl/mivisionx"
          mkdir -p build-mivisionx
          cd build-mivisionx
          cmake \
            -DCMAKE_BUILD_TYPE=Release \
            -DOPENVX_INCLUDES="${impl_dir}/include" \
            -DOPENVX_LIB_DIR="${impl_dir}/lib" \
            ..
          cmake --build . -j"$(nproc)"
          # Put the artifact's lib directory first on the loader path.
          export LD_LIBRARY_PATH="${impl_dir}/lib:${LD_LIBRARY_PATH:-}"
          # Gate the bench on the timer self-test: with a sloppy runner
          # clock the timing numbers would be meaningless, and it is
          # better to find out now than to ship bad data.
          ./openvx-mark --validate-timing
          ./openvx-mark --feature-set vision,framework \
            --resolution FHD --iterations 20 --warmup 5 --threads 1 \
            --output-dir results
          # Sentinel-set dump for cross-impl numerical verification
          # (see scripts/cross_verify_outputs.py). The kernel set runs
          # ONCE with no timing and no warmup, so it is cheap; the
          # downstream verify step compares this dump against the
          # OpenCV dump for correctness.
          ./openvx-mark --dump-outputs dump-mivisionx --seed 42

      - name: Build & bench against Khronos sample (single-threaded, FHD × 20)
        if: always() && steps.detect.outputs.khronos == 'true'
        # Khronos sample's enhanced_vision tensor kernels
        # (TensorAdd, TensorSub, ...) SIGSEGV inside vxProcessGraph
        # and take the whole bench process down — losing JSON output
        # for every kernel that hadn't run yet (openvx-mark writes
        # its report only at end-of-run). Same architecture as the
        # split smoke step above: bench the rock-solid feature sets
        # in their own invocation first (always produces a JSON),
        # then bench enhanced_vision in a second invocation that's
        # allowed to crash (`|| echo …`), and merge whichever JSONs
        # survived into the final per-impl report consumed by the
        # comparison phase downstream.
        #
        # `continue-on-error: true` is belt-and-suspenders — even if
        # the merge step itself fails for some reason, the
        # comparison job continues with whatever Khronos data it has.
        continue-on-error: true
        run: |
          # `-u` added for parity with the MIVisionX and opencv-mark
          # bench steps; the only variable read here is guarded
          # (`${LD_LIBRARY_PATH:-}`), so it cannot trip.
          set -euo pipefail
          mkdir -p build-khronos
          cd build-khronos
          cmake \
            -DCMAKE_BUILD_TYPE=Release \
            -DOPENVX_INCLUDES=${{ github.workspace }}/impl/khronos/include \
            -DOPENVX_LIB_DIR=${{ github.workspace }}/impl/khronos/lib \
            ..
          cmake --build . -j"$(nproc)"
          export LD_LIBRARY_PATH=${{ github.workspace }}/impl/khronos/lib:${LD_LIBRARY_PATH:-}
          # Timer self-test gates the bench runs below.
          ./openvx-mark --validate-timing
          # 1. Rock-solid: vision (42 kernels) + framework benchmarks.
          ./openvx-mark --feature-set vision,framework \
            --resolution FHD --iterations 20 --warmup 5 --threads 1 \
            --output-dir results-base
          # 2. Crash-prone: enhanced_vision (19 kernels). Note the
          #    Khronos sample's HOG and tensor support is patchy —
          #    HOGCells / HOGFeatures graph_setup tends to fail
          #    cleanly (SKIPPED) but tensor kernels often SIGSEGV.
          #    The `|| echo` swallows a crash so the script carries on.
          ./openvx-mark --feature-set enhanced_vision \
            --resolution FHD --iterations 20 --warmup 5 --threads 1 \
            --output-dir results-extra \
            || echo "enhanced_vision FHD bench crashed (Khronos sample known issue) — vision+framework results still saved"
          # 3. Merge into the canonical `results/` dir the downstream
          #    comparison phase expects.
          #    NOTE(review): results-extra/benchmark_results.json will
          #    not exist if run 2 crashed — presumably merge_reports.py
          #    tolerates a missing input; confirm against the script.
          mkdir -p results
          python3 ../scripts/merge_reports.py \
            results-base/benchmark_results.json \
            results-extra/benchmark_results.json \
            --output results/benchmark_results.json
          # Best-effort sentinel dump for cross-impl verification.
          ./openvx-mark --dump-outputs dump-khronos --seed 42 || true

      - name: Build & bench against rustVX (single-threaded, FHD × 20)
        if: always() && steps.detect.outputs.rustvx == 'true'
        # rustVX is CTS-conformant for both Vision (5923/5923) and
        # Enhanced Vision (1235/1235), so all 42 + 19 kernels should
        # actually produce real measurements here. This row is the
        # headline cell for "what does a fully-conformant OpenVX impl
        # look like vs OpenCV on the same hardware?".
        # `continue-on-error: true` is a belt-and-suspenders safety
        # in case any one kernel surfaces a regression mid-bench —
        # the artifact upload (which downstream comparisons depend
        # on) must still happen.
        continue-on-error: true
        run: |
          # `-u` added for parity with the MIVisionX and opencv-mark
          # bench steps; the only variable read here is guarded
          # (`${LD_LIBRARY_PATH:-}`), so it cannot trip.
          set -euo pipefail
          mkdir -p build-rustvx
          cd build-rustvx
          cmake \
            -DCMAKE_BUILD_TYPE=Release \
            -DOPENVX_INCLUDES=${{ github.workspace }}/impl/rustvx/include \
            -DOPENVX_LIB_DIR=${{ github.workspace }}/impl/rustvx/lib \
            ..
          cmake --build . -j"$(nproc)"
          export LD_LIBRARY_PATH=${{ github.workspace }}/impl/rustvx/lib:${LD_LIBRARY_PATH:-}
          # Timer self-test gates the bench run below.
          ./openvx-mark --validate-timing
          ./openvx-mark --feature-set vision,enhanced_vision,framework \
            --resolution FHD --iterations 20 --warmup 5 --threads 1 \
            --output-dir results
          # Best-effort sentinel dump for cross-impl verification.
          ./openvx-mark --dump-outputs dump-rustvx --seed 42 || true

      # opencv-mark has no OpenVX dependency, so no OPENVX_* flags and no
      # detect-step gate — it only needs `libopencv-dev` (already installed
      # above). Same FHD × 20 iter × 5 warmup × --threads 1 shape as the
      # OpenVX benches so per-kernel speedups are directly comparable.
      #
      # Feature-set is `vision,enhanced_vision` — opencv-mark has 1:1
      # coverage of both profiles (79 + 19 = 98 OpenCV-side benchmarks
      # total). `framework` is intentionally omitted because OpenCV has
      # no graph runtime to measure (the framework benches that depend
      # on `vxProcessGraph` / virtual-image fusion semantics are
      # OpenVX-only by design). `compare_reports.py` ignores rows that
      # only exist on one side, so framework rows naturally don't
      # appear in OpenCV pairwise tables.
      - name: Build & bench opencv-mark (single-threaded, FHD × 20)
        id: bench_opencv
        if: always()
        run: |
          set -euo pipefail
          mkdir -p build-opencv-bench
          cd build-opencv-bench
          cmake -DCMAKE_BUILD_TYPE=Release ..
          cmake --build . --target opencv-mark -j"$(nproc)"
          # Abort with an explicit message when the build produced no
          # executable at the expected path.
          if [ ! -x opencv-mark/opencv-mark ]; then
            echo "ERROR: opencv-mark not built — OpenCV detection failed in compare job"
            exit 1
          fi
          bench=./opencv-mark/opencv-mark
          # Timer self-test first, then the FHD bench, then the
          # sentinel dump that the cross-impl verification step reads.
          "$bench" --validate-timing
          "$bench" --feature-set vision,enhanced_vision \
            --resolution FHD --iterations 20 --warmup 5 --threads 1 \
            --output-dir results
          "$bench" --dump-outputs dump-opencv --seed 42

      # ----- Cross-impl numerical verification -----
      #
      # We have one dump-* directory per impl that produced a build.
      # Run scripts/cross_verify_outputs.py for each (opencv, openvx)
      # pair so a reviewer can see at a glance whether MIVisionX,
      # Khronos sample, and rustVX agree with OpenCV at the pixel
      # level — proves the timing comparison rows below are honest
      # apples-to-apples and not "OpenCV is faster because it's
      # silently computing the wrong thing".
      #
      # The verifier exits non-zero on any kernel exceeding its
      # per-kernel tolerance; we collect all three reports into the
      # step summary first, then emit a `::warning::` annotation at
      # the end if any report failed (the step itself stays green).
      # That way a single divergence on one impl doesn't hide the
      # other two impls' results.
      - name: Cross-impl output verification (OpenCV ↔ each OpenVX impl)
        if: always()
        run: |
          set -euo pipefail
          # The verifier's only Python dependency is numpy (array
          # compare + PSNR). The apt package on ubuntu-22.04 is fine
          # and avoids a pip wheel download.
          sudo apt-get install -y python3-numpy
          mkdir -p comparisons

          OPENCV_DUMP=build-opencv-bench/dump-opencv

          # Section header for the job summary.
          {
            echo ""
            echo "---"
            echo ""
            echo "## Cross-impl numerical verification"
            echo ""
            echo "Sentinel kernel suite (VGA × 1 run, no timing) dumped by"
            echo "\`--dump-outputs\` on each binary; \`scripts/cross_verify_outputs.py\`"
            echo "loads both dumps and computes max-abs-diff + PSNR + exact-%"
            echo "per kernel. Tolerances are tuned per kernel (see \`RULES\` in"
            echo "the script). Numbers prove inputs are byte-identical (the"
            echo "\`_input_u8\` row) and kernels are semantically equivalent."
            echo ""
          } >> "$GITHUB_STEP_SUMMARY"

          # Becomes 1 as soon as any verifier run exits non-zero.
          any_divergence=0
          for impl in mivisionx khronos rustvx; do
            VX_DUMP="build-${impl}/dump-${impl}"
            if [ -d "$OPENCV_DUMP" ] && [ -d "$VX_DUMP" ]; then
              # Running the verifier as an `if` condition keeps a
              # non-zero exit from aborting the script under `set -e`,
              # so the remaining impls are still verified.
              if ! python3 scripts/cross_verify_outputs.py \
                  "$OPENCV_DUMP" "$VX_DUMP" \
                  --left-label "OpenCV" --right-label "${impl}" \
                  --json comparisons/cross-verify-${impl}.json \
                  >> "$GITHUB_STEP_SUMMARY"; then
                any_divergence=1
              fi
              echo "" >> "$GITHUB_STEP_SUMMARY"
            else
              echo "skipping verify for $impl: missing dump dir ($VX_DUMP or $OPENCV_DUMP)"
              echo "_Skipped \`$impl\` verify — dump directory missing._" >> "$GITHUB_STEP_SUMMARY"
            fi
          done

          # A divergence only annotates the run with a warning; the job
          # stays green so reviewers still see the timing comparison.
          if [ "$any_divergence" -ne 0 ]; then
            echo "::warning::Cross-impl verification flagged ≥1 divergence — see job summary"
          fi

      # ----- Pairwise comparisons -----
      #
      # Each comparison is oriented as "<candidate> over <baseline>" so
      # the speedup column reads as `candidate / baseline` (>1.00x =
      # candidate is faster). The orientation is deliberate:
      #
      #   OpenVX-vs-OpenVX trio — "how much faster is the more-tuned
      #   impl than the reference":
      #     * MIVisionX over Khronos sample   (AMD over reference)
      #     * MIVisionX over rustVX           (AMD over Rust impl)
      #     * rustVX     over Khronos sample  (Rust impl over reference)
      #
      #   OpenVX-vs-OpenCV trio — "does adopting OpenVX pay off vs cv::":
      #     * MIVisionX over OpenCV
      #     * Khronos sample over OpenCV
      #     * rustVX over OpenCV
      #
      # Mechanically, `scripts/compare_reports.py` computes
      #     speedup = throughput(arg2) / throughput(arg1)
      # so the candidate is passed as the SECOND positional arg.
      #
      # The step does two things:
      #   1. Runs `compare_reports.py` once per pair to produce a
      #      per-kernel detail .md in comparisons/. These also become