qvac/.github/workflows/benchmark-llm-llamacpp.yml at 5cdc65df6b5a02f895e5c6f0500efa4739a8345d · tetherto/qvac · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
name: Benchmark VLM (LLM)

# Manually-triggered VLM benchmark. Runs
# packages/llm-llamacpp/benchmarks/vlm-performance against Qwen3.5-VL
# on a fixed object-listing task and uploads a consolidated report.
#
# 3-source comparison: addon (JS binding) vs fabric-cli (fork CLI) vs
# upstream-cli (upstream llama.cpp CLI). Measures JS binding overhead
# and fork divergence using the same model across all sources.

on:
  # Manual trigger only: every knob below is a workflow_dispatch input.
  # Inputs are grouped as sources → platforms → run settings → matrix mode.
  workflow_dispatch:
    inputs:
      # ── Sources (what to compare) ─────────────────────────────
      run_addon:
        description: "── SOURCE 1 ── addon (@qvac/llm-llamacpp JS binding)"
        required: false
        type: boolean
        default: true
      # Left empty, the context job falls back to the ref the workflow was
      # dispatched from (github.ref_name).
      ref:
        description: "  addon ref — qvac branch / tag / SHA (default: current branch)"
        required: false
        type: string
      # NOTE(review): not referenced by any job visible in this part of the
      # file — confirm it is consumed further down or remove it.
      addon_from_source:
        description: "  build addon from source (slow, but uses latest fabric)"
        required: false
        type: boolean
        default: false
      # Adds the opt-in "addon-source" source; the context job only emits it
      # for the linux-cpu and windows-cpu cells.
      run_addon_source:
        description: "  A/B: also build addon from source with the vcpkg overlay applied, run alongside npm addon in one cell (same runner). x86-CPU cells only."
        required: false
        type: boolean
        default: false
      # When false, the desktop job deletes the qvac-fabric overlay port
      # before the from-source build.
      addon_source_overlay:
        description: "  apply the vcpkg overlay during the addon-source build (default true). Set false to rule out 'is it the overlay or something else?'"
        required: false
        type: boolean
        default: true
      run_fabric_cli:
        description: "── SOURCE 2 ── fabric (qvac fork, native CLI)"
        required: false
        type: boolean
        default: true
      fabric_ref:
        description: "  fabric ref (default: v8189.0.2)"
        required: false
        type: string
        default: "v8189.0.2"
      run_upstream_cli:
        description: "── SOURCE 3 ── upstream (vanilla llama.cpp, native CLI)"
        required: false
        type: boolean
        default: true
      upstream_ref:
        description: "  upstream ref (default: b8189)"
        required: false
        type: string
        default: "b8189"
      # ── Platforms × backends ──────────────────────────────────
      # Comma-separated selection. Tokens: linux-cpu, linux-gpu,
      # windows-cpu, windows-gpu, macos. "all" expands to every desktop
      # cell, but only when it is the entire value (it is not recognised
      # as one token inside a list). GPU rows for Linux/Windows go to
      # self-hosted Vulkan runners; macOS uses Metal on GitHub-hosted
      # macos-15-xlarge.
      platforms:
        description: "── PLATFORMS ── e.g. linux-cpu,linux-gpu,windows-cpu,windows-gpu,macos (or 'all')"
        required: false
        type: string
        default: "linux-cpu,linux-gpu,windows-cpu,windows-gpu,macos"
      run_android:
        description: "── PLATFORM ── Android (stub)"
        required: false
        type: boolean
        default: false
      # ── Run settings ──────────────────────────────────────────
      # Declared as strings and forwarded unchanged to the benchmark's
      # --warmup-runs / --measured-runs flags.
      warmup_runs:
        description: "Warmup iterations (discarded)"
        required: false
        type: string
        default: "1"
      measured_runs:
        description: "Measured iterations (median reported)"
        required: false
        type: string
        default: "3"
      # ── Matrix mode (config-driven quality+speed matrix) ──────
      # Orthogonal to the source-engines benchmark above. When on, runs
      # the @qvac/llm-llamacpp addon over the vlm-matrix fixture (lmms-eval
      # quality + vision-encode speed) on Linux (and S25), driven by
      # packages/llm-llamacpp/test/integration/vlm-matrix.config.cjs.
      run_matrix:
        description: "── MATRIX ── run the config-driven VLM quality+speed matrix (addon, Linux + S25)"
        required: false
        type: boolean
        default: false
      # "several-sources" also switches the Linux CPU leg to ubuntu-latest
      # (see the lmatrix step in the context job).
      matrix_mode:
        description: "  matrix mode: two-models (f16 vs q8, addon) or several-sources (addon+fabric-cli+upstream-cli, Linux-only)"
        required: false
        type: string
        default: "two-models"
      matrix_preset:
        description: "  matrix preset: compare (two-models), sources (several-sources), smoke, or full. Overrides config.defaultPreset on Linux."
        required: false
        type: string
        default: "compare"
      matrix_engine:
        description: "  inference engine (two-models mode): addon | fabric-cli | upstream-cli. CLI engines are desktop-only; addon runs everywhere."
        required: false
        type: string
        default: "addon"
      # Same comma-separated token format as `platforms`, limited to the
      # two Linux tokens.
      matrix_linux:
        description: "  Linux matrix legs, comma-sep: linux-cpu,linux-gpu"
        required: false
        type: string
        default: "linux-cpu,linux-gpu"
      run_matrix_s25:
        description: "  also run the matrix on Samsung S25 (AWS Device Farm)"
        required: false
        type: boolean
        default: false
# Scopes granted to the workflow token for every job in this file.
# NOTE(review): the jobs visible in this part of the file only check out
# code, install packages and upload/download artifacts. Confirm that
# `pull-requests: write` and `id-token: write` are required by jobs further
# down before keeping them at workflow level.
permissions:
  contents: read
  packages: read
  pull-requests: write
  id-token: write

jobs:
  # ── Context ────────────────────────────────────────────────────────
  # Resolves the repo + ref so downstream jobs check out the right
  # commit even when the workflow_dispatch is invoked without `ref`,
  # and builds the desktop matrix from the per-platform input toggles.
  # Matrix is computed here (instead of via job-level `if:`) because
  # GitHub Actions doesn't allow `matrix.*` references in job-level
  # conditions — they're evaluated before the matrix is expanded.
  context:
    runs-on: ubuntu-latest
    outputs:
      # Checkout target used by the downstream jobs.
      repository: ${{ steps.ctx.outputs.repository }}
      ref: ${{ steps.ctx.outputs.ref }}
      # JSON array of desktop cells plus its length. The desktop job feeds
      # the array to fromJSON() and compares the count against the string '0'.
      desktop_matrix: ${{ steps.matrix.outputs.value }}
      desktop_count: ${{ steps.matrix.outputs.count }}
      # Same pair for the matrix-mode Linux legs (empty array unless
      # run_matrix is true).
      linux_matrix: ${{ steps.lmatrix.outputs.value }}
      linux_count: ${{ steps.lmatrix.outputs.count }}
      # Commit metadata for the report: checked-out HEAD and its merge-base
      # with origin/main. merge_base/base_* are empty when no merge-base
      # could be computed.
      merge_base: ${{ steps.commits.outputs.merge_base }}
      head_sha: ${{ steps.commits.outputs.head_sha }}
      head_title: ${{ steps.commits.outputs.head_title }}
      head_date: ${{ steps.commits.outputs.head_date }}
      base_title: ${{ steps.commits.outputs.base_title }}
      base_date: ${{ steps.commits.outputs.base_date }}
    steps:
      # Resolve the checkout target: this repository, at the `ref` input when
      # one was given, otherwise at the ref the dispatch was started from.
      - id: ctx
        shell: bash
        env:
          REPO: ${{ github.repository }}
          REF_NAME: ${{ github.ref_name }}
          INPUT_REF: ${{ inputs.ref }}
        run: |
          {
            printf 'repository=%s\n' "$REPO"
            printf 'ref=%s\n' "${INPUT_REF:-$REF_NAME}"
          } >> "$GITHUB_OUTPUT"
      # Build the desktop job matrix: one cell per (platform token, source)
      # pair that survives the gating rules below. Outputs:
      #   value — JSON array of {platform, arch, backend, runner, source}
      #   count — number of cells in that array
      - id: matrix
        shell: bash
        env:
          RUN_ADDON: ${{ inputs.run_addon }}
          RUN_FABRIC: ${{ inputs.run_fabric_cli }}
          RUN_UPSTREAM: ${{ inputs.run_upstream_cli }}
          RUN_ADDON_SOURCE: ${{ inputs.run_addon_source }}
          PLATFORMS: ${{ inputs.platforms }}
        run: |
          # Sources
          sources='[]'
          if [[ "$RUN_ADDON" == "true" ]]; then
            sources=$(echo "$sources" | jq -c '. + ["addon"]')
          fi
          if [[ "$RUN_FABRIC" == "true" ]]; then
            sources=$(echo "$sources" | jq -c '. + ["fabric"]')
          fi
          if [[ "$RUN_UPSTREAM" == "true" ]]; then
            sources=$(echo "$sources" | jq -c '. + ["upstream"]')
          fi
          # addon-source is an opt-in 4th source that builds the addon
          # from local sources (with vcpkg overlay applied) and runs it
          # in the same cell as the npm addon. Gated to *-cpu cells
          # because llamafile is the primary thing this exists to A/B,
          # and that's x86-CPU-specific.
          if [[ "$RUN_ADDON_SOURCE" == "true" ]]; then
            sources=$(echo "$sources" | jq -c '. + ["addon-source"]')
          fi
          # Selected platform×backend tokens
          selected="${PLATFORMS:-linux-cpu}"
          if [[ "$selected" == "all" ]]; then
            selected="linux-cpu,linux-gpu,windows-cpu,windows-gpu,macos"
          fi
          # Build cells from tokens. Each token maps to a fixed
          # (platform, arch, backend, runner) tuple.
          cells='[]'
          # Tokens already expanded into cells, so a repeated token
          # (e.g. "linux-cpu,linux-cpu") is only processed once.
          declare -A seen=()
          IFS=',' read -ra tokens <<< "$selected"
          for raw in "${tokens[@]}"; do
            sel=$(echo "$raw" | xargs)
            case "$sel" in
              linux-cpu)    plat=linux-x64;   arch=x64;   backend=cpu; runner=ubuntu-latest ;;
              linux-gpu)    plat=linux-x64;   arch=x64;   backend=gpu; runner=qvac-ubuntu2404-x64-gpu ;;
              windows-cpu)  plat=windows-x64; arch=x64;   backend=cpu; runner=windows-latest ;;
              windows-gpu)  plat=windows-x64; arch=x64;   backend=gpu; runner=qvac-win25-x64-gpu ;;
              macos)        plat=macos-arm64; arch=arm64; backend=gpu; runner=macos-15-xlarge ;;
              "")           continue ;;
              *) echo "::warning::Unknown platform token '$sel' (known: linux-cpu, linux-gpu, windows-cpu, windows-gpu, macos)"; continue ;;
            esac
            # Identical cells would produce duplicate job names and
            # artifacts with the same name, and a second upload under an
            # existing artifact name is rejected — keep the first
            # occurrence only.
            if [[ -n "${seen[$sel]:-}" ]]; then
              echo "::warning::Duplicate platform token '$sel' ignored"
              continue
            fi
            seen[$sel]=1
            for src in $(echo "$sources" | jq -r '.[]'); do
              # windows-gpu currently only supports the addon leg: the
              # self-hosted qvac-win25-x64-gpu runner has Vulkan and
              # chocolatey but no MSVC and chocolatey can't install
              # LLVM at job time (lock/permission errors), so the
              # fabric/upstream CLI builds aren't viable there yet.
              # Re-enable once the runner image ships LLVM+Ninja.
              if [[ "$sel" == "windows-gpu" && "$src" != "addon" ]]; then
                continue
              fi
              # addon-source: llamafile is x86-CPU specific, so only
              # emit the from-source A/B cell on CPU cells (linux-cpu,
              # windows-cpu). On GPU cells the matmul path goes through
              # Vulkan/Metal shaders that don't change with llamafile.
              if [[ "$src" == "addon-source" ]]; then
                case "$sel" in
                  linux-cpu|windows-cpu) ;;
                  *) continue ;;
                esac
              fi
              # When the addon-source A/B is enabled on this cell, the
              # addon-source leg already runs --sources=addon,addon-source
              # in one process. The dedicated 'addon' cell would
              # produce a duplicate row on a different runner — skip
              # it so the consolidated report stays clean.
              if [[ "$src" == "addon" && "$RUN_ADDON_SOURCE" == "true" ]]; then
                case "$sel" in
                  linux-cpu|windows-cpu) continue ;;
                esac
              fi
              cells=$(echo "$cells" | jq -c \
                --arg p "$plat" --arg a "$arch" --arg b "$backend" --arg r "$runner" --arg s "$src" \
                '. + [{"platform":$p,"arch":$a,"backend":$b,"runner":$r,"source":$s}]')
            done
          done
          count=$(echo "$cells" | jq 'length')
          echo "value=$cells" >> "$GITHUB_OUTPUT"
          echo "count=$count" >> "$GITHUB_OUTPUT"
          echo "Desktop matrix ($count entries): $cells"
      # Matrix-mode Linux legs (addon over the vlm-matrix fixture). Built
      # separately from the source-engines matrix above; stays an empty
      # array unless run_matrix is true. Each leg is
      # {backend, runner, no_gpu}, with no_gpu carried as a string.
      - id: lmatrix
        shell: bash
        env:
          RUN_MATRIX: ${{ inputs.run_matrix }}
          MATRIX_MODE: ${{ inputs.matrix_mode }}
          MATRIX_LINUX: ${{ inputs.matrix_linux }}
        run: |
          legs='[]'
          if [[ "$RUN_MATRIX" == "true" ]]; then
            # several-sources builds native fabric/upstream CLIs, so its CPU
            # leg needs a runner with cmake+toolchain → GitHub-hosted
            # ubuntu-latest. Every other mode uses the self-hosted CPU runner.
            if [[ "$MATRIX_MODE" == "several-sources" ]]; then
              cpu_host=ubuntu-latest
            else
              cpu_host=qvac-ubuntu2204-x64
            fi
            IFS=',' read -ra requested <<< "${MATRIX_LINUX:-linux-cpu}"
            for item in "${requested[@]}"; do
              leg=$(echo "$item" | xargs)
              case "$leg" in
                "")
                  continue
                  ;;
                linux-cpu)
                  backend=cpu
                  runner=$cpu_host
                  no_gpu=true
                  ;;
                linux-gpu)
                  backend=gpu
                  runner=qvac-ubuntu2404-x64-gpu
                  no_gpu=false
                  ;;
                *)
                  echo "::warning::Unknown matrix_linux token '$leg' (known: linux-cpu, linux-gpu)"
                  continue
                  ;;
              esac
              legs=$(jq -c \
                --arg b "$backend" --arg r "$runner" --arg n "$no_gpu" \
                '. + [{"backend":$b,"runner":$r,"no_gpu":$n}]' <<< "$legs")
            done
          fi
          count=$(jq 'length' <<< "$legs")
          {
            echo "value=$legs"
            echo "count=$count"
          } >> "$GITHUB_OUTPUT"
          echo "Linux matrix ($count entries): $legs"
      # Resolve commit metadata so the consolidated report can show what
      # the candidate ref + merge-base actually point at (hash, title,
      # date). Needs a real clone — sparse checkout doesn't give us git
      # history. `fetch-depth: 0` fetches the full history, which the
      # merge-base lookup in the next step relies on.
      - name: Checkout for commit lookup
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
        with:
          repository: ${{ steps.ctx.outputs.repository }}
          ref: ${{ steps.ctx.outputs.ref }}
          fetch-depth: 0
      # Publish SHA, subject line and committer date (ISO 8601) for the
      # checked-out HEAD and for its merge-base with origin/main. The
      # merge-base fields are left empty when git cannot find one.
      - id: commits
        shell: bash
        run: |
          # Print one `git log` format field ($1) for a single commit ($2).
          commit_field() { git log -1 --pretty="$1" "$2"; }

          git fetch origin main --quiet
          head_sha=$(git rev-parse HEAD)
          head_title=$(commit_field %s "$head_sha")
          head_date=$(commit_field %cI "$head_sha")

          base_sha=$(git merge-base HEAD origin/main || true)
          base_title=""
          base_date=""
          if [[ -n "$base_sha" ]]; then
            base_title=$(commit_field %s "$base_sha")
            base_date=$(commit_field %cI "$base_sha")
          fi

          {
            printf 'head_sha=%s\n' "$head_sha"
            printf 'merge_base=%s\n' "$base_sha"
            printf 'head_title=%s\n' "$head_title"
            printf 'head_date=%s\n' "$head_date"
            printf 'base_title=%s\n' "$base_title"
            printf 'base_date=%s\n' "$base_date"
          } >> "$GITHUB_OUTPUT"
          echo "HEAD:       $head_sha - $head_title ($head_date)"
          echo "merge-base: $base_sha - $base_title ($base_date)"

  # ── Desktop benchmark matrix ───────────────────────────────────────
  # Each leg is the same shape — pick the runner via matrix.runner.
  # GPU rows (linux-x64 / windows-x64) target self-hosted Vulkan
  # runners pre-provisioned with the Vulkan SDK; macOS arm64 uses
  # Metal on the GitHub-hosted macos-15-xlarge runner. The matrix
  # itself is built dynamically by the context job above from the
  # `platforms` input.
  desktop:
    needs: context
    # Job outputs are strings, hence the comparison against '0'. Skips the
    # job when the context job produced no cells.
    if: needs.context.outputs.desktop_count != '0'
    name: vlm-${{ matrix.platform }}-${{ matrix.backend }}-${{ matrix.source }}
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 30
    strategy:
      # Let the remaining cells finish when one cell fails.
      fail-fast: false
      matrix:
        # Each entry carries platform, arch, backend, runner and source.
        include: ${{ fromJSON(needs.context.outputs.desktop_matrix) }}
    env:
      HF_TOKEN: ${{ secrets.HF_TOKEN }}
      # Benchmark package directory; most steps below run from here.
      WORKDIR: packages/llm-llamacpp/benchmarks/vlm-performance
    steps:
      # Check out the repo/ref resolved by the context job.
      - name: Checkout repository
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
        with:
          repository: ${{ needs.context.outputs.repository }}
          ref: ${{ needs.context.outputs.ref }}

      # The addon-source cell needs the full native toolchain (LLVM,
      # vcpkg, bare-make, Vulkan SDK) because it builds the addon from
      # local sources with the vcpkg overlay applied. The 'addon',
      # 'fabric', and 'upstream' cells stay on the lighter npm path.
      # Exactly one of the two Node.js setup steps below runs per cell.
      - name: Setup Node.js and Bare tooling
        if: matrix.source == 'addon-source'
        uses: ./.github/actions/setup-bare-tooling

      - name: Setup Node.js
        if: matrix.source != 'addon-source'
        uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # 6.3.0
        with:
          node-version: 22

      - name: Setup LLVM
        if: matrix.source == 'addon-source'
        uses: ./.github/actions/setup-llvm

      # vcpkg is wired up inline instead of through the repo's setup-vcpkg
      # composite action: that action hard-requires MODEL_S3_BUCKET for the
      # prebuilds-shared S3 binary cache, a secret this bench workflow does
      # not plumb through. Point VCPKG_ROOT at the runner's pre-installed
      # vcpkg and leave VCPKG_BINARY_SOURCES at its default (per-runner disk
      # cache), which is fine for a one-off bench build.
      - name: Configure vcpkg (addon-source)
        if: matrix.source == 'addon-source' && runner.os == 'Linux'
        shell: bash
        run: |
          {
            echo "VCPKG_ROOT=$VCPKG_INSTALLATION_ROOT"
            echo "VCPKG_BUILD_TYPE=release"
            echo "VCPKG_CMAKE_CONFIGURE_OPTIONS=--no-parallel-configure"
          } >> "$GITHUB_ENV"

      # qvac-fabric defaults to the gpu-backends feature, which transitively
      # needs the Vulkan SDK at build time, and ubuntu-latest doesn't ship
      # one. Install upstream's prebuilt SDK and export the env cmake's
      # FindVulkan looks at (same install pattern as
      # benchmark-embed-llamacpp.yml). libvulkan-dev supplies libvulkan.so +
      # libvulkan1 from the distro: the LunarG SDK 1.4.x no longer bundles
      # the loader, so FindVulkan can't see Vulkan_LIBRARY without it.
      - name: Install Vulkan SDK (addon-source on Linux)
        if: matrix.source == 'addon-source' && runner.os == 'Linux'
        shell: bash
        run: |
          sudo apt-get update
          sudo apt-get install -y libxi-dev libxtst-dev libxrandr-dev xz-utils libvulkan-dev

          sdk_archive=/tmp/vulkansdk.tar.xz
          sdk_home="$HOME/vulkan"
          wget -q -O "$sdk_archive" https://sdk.lunarg.com/sdk/download/latest/linux/vulkan_sdk.tar.xz
          mkdir -p "$sdk_home"
          tar -xf "$sdk_archive" -C "$sdk_home" --strip-components=1

          # Make the SDK visible to the steps that follow.
          sdk_root="$sdk_home/x86_64"
          echo "$sdk_root/bin" >> "$GITHUB_PATH"
          {
            echo "VULKAN_SDK=$sdk_root"
            echo "LD_LIBRARY_PATH=$sdk_root/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
            echo "PKG_CONFIG_PATH=$sdk_root/share/pkgconfig:$sdk_root/lib/pkgconfig${PKG_CONFIG_PATH:+:$PKG_CONFIG_PATH}"
          } >> "$GITHUB_ENV"

      # Install npm @qvac/llm-llamacpp first so we have the published
      # addon (no llamafile) on disk, then snapshot it to /tmp before
      # the from-source build overwrites prebuilds/ in the workspace.
      # The bench then runs both addon variants in this cell on the
      # same runner: addon → the /tmp/npm-snapshot copy, addon-source →
      # the workspace's freshly-built artifact.
      - name: "Install benchmark deps (addon-source: npm baseline)"
        if: matrix.source == 'addon-source'
        shell: bash
        working-directory: ${{ env.WORKDIR }}
        run: npm install --no-audit --no-fund

      # Copy the WHOLE node_modules tree so the snapshotted addon at
      # /tmp/npm-snapshot/node_modules/@qvac/llm-llamacpp can still
      # resolve its sibling deps (bare-fs, bare-path, …) via standard
      # require-walking. A naked copy of just @qvac/llm-llamacpp leaves
      # those siblings unreachable and the snapshot fails to load.
      # The benchmark step later passes this path as --addon-path.
      - name: Snapshot npm addon (addon-source A/B)
        if: matrix.source == 'addon-source'
        shell: bash
        working-directory: ${{ env.WORKDIR }}
        run: |
          mkdir -p /tmp/npm-snapshot
          cp -r node_modules /tmp/npm-snapshot/

      # Optional: drop the overlay before bare-make so the from-source
      # build matches the npm-published binary as closely as possible.
      # Used to test "is the addon-vs-fabric gap caused by the overlay
      # (llamafile) or by something else in the source build path?"
      # Runs only when the addon_source_overlay input is false.
      - name: Disable vcpkg overlay for addon-source A/B
        if: matrix.source == 'addon-source' && !inputs.addon_source_overlay
        shell: bash
        run: |
          rm -rf packages/llm-llamacpp/vcpkg/ports/qvac-fabric
          echo "::notice::Overlay removed — addon-source will build with the registry version of qvac-fabric (llamafile OFF, same as npm)"

      # Build the addon inside the package directory: install its npm
      # deps, then run the bare-make generate → build → install sequence.
      - name: Build addon from source
        if: matrix.source == 'addon-source'
        shell: bash
        working-directory: packages/llm-llamacpp
        env:
          # vcpkg needs to clone the private qvac-registry-vcpkg repo
          # at configure time. Workflow-level token is enough for read.
          # The GH_TOKEN secret wins when set; otherwise the job token
          # is used for both variables.
          GH_TOKEN: ${{ secrets.GH_TOKEN || github.token }}
          GITHUB_TOKEN: ${{ secrets.GH_TOKEN || github.token }}
        run: |
          npm install --no-audit --no-fund
          bare-make generate
          bare-make build
          bare-make install

      # addon-source cells: reinstall the benchmark's deps with the package
      # two directories up (packages/llm-llamacpp, just built) installed via
      # --install-links, so the workspace resolves the from-source addon.
      - name: Re-link workspace addon (addon-source A/B)
        if: matrix.source == 'addon-source'
        shell: bash
        working-directory: ${{ env.WORKDIR }}
        run: npm install --no-audit --no-fund --install-links ../../

      # Every other cell (addon, fabric, upstream) installs the benchmark
      # deps straight from npm.
      - name: Install benchmark deps (addon from npm)
        if: matrix.source != 'addon-source'
        shell: bash
        working-directory: ${{ env.WORKDIR }}
        run: npm install --no-audit --no-fund

      # Vulkan SDK is pre-installed on the self-hosted GPU runners
      # (qvac-ubuntu2404-x64-gpu, qvac-win25-x64-gpu) at the well-known
      # paths shown below — same convention reusable-prebuilds.yml uses.
      # macOS GPU goes through Metal and needs no SDK.
      # Linux: export VULKAN_SDK and put the SDK's bin directory on PATH.
      - name: Configure Vulkan SDK env (Linux GPU)
        if: matrix.backend == 'gpu' && matrix.platform == 'linux-x64'
        shell: bash
        run: |
          echo "VULKAN_SDK=/opt/vulkansdk/x86_64" >> "$GITHUB_ENV"
          echo "/opt/vulkansdk/x86_64/bin" >> "$GITHUB_PATH"
      # Windows: only VULKAN_SDK is exported; PATH is left untouched.
      - name: Configure Vulkan SDK env (Windows GPU)
        if: matrix.backend == 'gpu' && matrix.platform == 'windows-x64'
        shell: bash
        run: |
          # Single-quoted to preserve the backslash literally.
          echo 'VULKAN_SDK=C:\VulkanSDK' >> "$GITHUB_ENV"

      # cmake is on PATH on windows-latest (via the bundled VS install) but
      # not on the self-hosted qvac-win25-x64-gpu runner. Unpack a Kitware
      # release into the runner temp dir and prepend it to PATH so both
      # Windows runners look the same to build-cli-sources.js (same
      # bootstrap pattern as pr-test-inference-addon-cpp-js.yml). Addon legs
      # skip this — they use the npm prebuild and never call cmake.
      - name: Setup CMake (Windows CLI builds)
        if: matrix.platform == 'windows-x64' && matrix.source != 'addon'
        shell: bash
        working-directory: ${{ runner.temp }}
        run: |
          cmake_version=3.31.6
          cmake_dir="cmake-${cmake_version}-windows-x86_64"
          curl -L "https://github.com/Kitware/CMake/releases/download/v${cmake_version}/${cmake_dir}.zip" -o cmake.zip
          unzip -q cmake.zip
          echo "$PWD/${cmake_dir}/bin" >> "$GITHUB_PATH"

      # Cache and build the native CLI for the two CLI sources only. The
      # gate names fabric/upstream explicitly: a `!= 'addon'` test would also
      # match the addon-source cell, which has no CLI to build.
      # The ref part of the key falls back to the documented default when
      # the input is blank; a bare `a && b || c` would hand the upstream ref
      # to a fabric build whenever fabric_ref is empty.
      # NOTE(review): every other action in this file is pinned to a commit
      # SHA — consider pinning actions/cache the same way.
      - name: Cache CLI builds
        if: matrix.source == 'fabric' || matrix.source == 'upstream'
        uses: actions/cache@v4
        with:
          path: ${{ env.WORKDIR }}/cli-builds
          key: vlm-cli-v3-${{ matrix.platform }}-${{ matrix.backend }}-${{ matrix.source }}-${{ matrix.source == 'fabric' && (inputs.fabric_ref || 'v8189.0.2') || (inputs.upstream_ref || 'b8189') }}
          restore-keys: |
            vlm-cli-v3-${{ matrix.platform }}-${{ matrix.backend }}-${{ matrix.source }}-

      - name: Build CLI source
        if: matrix.source == 'fabric' || matrix.source == 'upstream'
        shell: bash
        working-directory: ${{ env.WORKDIR }}
        env:
          # Windows-CPU runs on windows-latest where the bundled VS
          # install provides MSVC; force the VS multi-config generator
          # so cmake doesn't fall back to MinGW. Matches cpp-tests-*.yml.
          CMAKE_GENERATOR: ${{ matrix.platform == 'windows-x64' && 'Visual Studio 17 2022' || '' }}
          # User-controlled values reach the script through the environment
          # instead of being interpolated into the shell source, so a crafted
          # ref cannot inject commands. The ref uses the same expression as
          # the cache key above.
          CLI_SOURCE: ${{ matrix.source }}
          CLI_BACKEND: ${{ matrix.backend }}
          CLI_SOURCE_REF: ${{ matrix.source == 'fabric' && (inputs.fabric_ref || 'v8189.0.2') || (inputs.upstream_ref || 'b8189') }}
        run: |
          node scripts/build-cli-sources.js \
            --sources="$CLI_SOURCE" \
            "--${CLI_SOURCE}-ref=${CLI_SOURCE_REF}" \
            --backend="$CLI_BACKEND"

      # Runs the benchmark package's `prepare:models` npm script for every
      # cell. NOTE(review): presumably downloads the model files using the
      # job-level HF_TOKEN — confirm against the script.
      - name: Prepare models
        shell: bash
        working-directory: ${{ env.WORKDIR }}
        run: npm run prepare:models

      # Run the benchmark for this cell. Only the source selection differs
      # between cell kinds; the remaining flags are shared.
      - name: Run VLM benchmark
        shell: bash
        working-directory: ${{ env.WORKDIR }}
        env:
          # Matrix and input values reach the script through the environment
          # instead of being interpolated into the shell source, so a crafted
          # input value cannot inject commands.
          VLM_BENCH_SOURCE: ${{ matrix.source }}
          VLM_BENCH_BACKEND: ${{ matrix.backend }}
          VLM_BENCH_WARMUP_RUNS: ${{ inputs.warmup_runs }}
          VLM_BENCH_MEASURED_RUNS: ${{ inputs.measured_runs }}
        run: |
          if [[ "$VLM_BENCH_SOURCE" == "addon-source" ]]; then
            # Same-runner A/B: run npm addon (from the snapshot) AND
            # the freshly-built source addon (workspace via @qvac/...)
            # back-to-back in one process so they hit identical hardware.
            source_args=(
              --sources=addon,addon-source
              --addon-path=/tmp/npm-snapshot/node_modules/@qvac/llm-llamacpp
            )
          else
            source_args=(--sources="$VLM_BENCH_SOURCE")
          fi
          npm run run:vlm-bench -- \
            "${source_args[@]}" \
            --backend="$VLM_BENCH_BACKEND" \
            --force-gpu-row \
            --warmup-runs="$VLM_BENCH_WARMUP_RUNS" \
            --measured-runs="$VLM_BENCH_MEASURED_RUNS"

      # Runs even when an earlier step failed (`always()`) so partial
      # results and stderr logs are still collected. The artifact name is
      # unique per cell and carries the run number, which the summarize
      # job's download pattern keys on. Missing files only produce a warning.
      - name: Upload per-platform results
        if: always()
        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
        with:
          name: vlm-perf-${{ matrix.platform }}-${{ matrix.backend }}-${{ matrix.source }}-${{ github.run_number }}
          path: |
            ${{ env.WORKDIR }}/results/vlm-perf-*.md
            ${{ env.WORKDIR }}/results/vlm-perf-*.json
            ${{ env.WORKDIR }}/results/cell-*-stderr.log
          retention-days: 14
          if-no-files-found: warn

  # ── Android (stub) ─────────────────────────────────────────────────
  # Earlier iteration tried to reuse integration-mobile-test-llm-
  # llamacpp.yml in perf-only mode, but the existing mobile workflow
  # is built for breadth (Android + iOS matrix, 3+12 Device-Farm
  # sessions covering many tests) — one full invocation took ~20 min
  # of mostly-irrelevant work for our use case. Until we either land a
  # leaner mobile workflow that runs just our benchmark, or bundle our
  # benchmark logic into the existing mobile test app, this job is a
  # placeholder so the workflow shape covers Android.
  #
  # Default is OFF — flip run_android to true to see the marker
  # artifact and confirm wiring.
  android:
    needs: context
    # Opt-in: runs only when the run_android input is true.
    if: inputs.run_android
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      # Write a README explaining that the Android leg is a placeholder and
      # echo it to the log. The quoted heredoc delimiter ('EOF') keeps the
      # text literal — no shell expansion happens inside it.
      - name: Stub notice
        shell: bash
        run: |
          mkdir -p android-stub
          cat > android-stub/README.txt <<'EOF'
          Android VLM benchmark - placeholder
          ===================================
          The full Android benchmark is not yet wired. The existing
          mobile workflow (integration-mobile-test-llm-llamacpp.yml)
          runs the broader integration test suite and is too heavy
          for the one-cell VLM benchmark; a dedicated leaner mobile
          path is planned.

          For Android perf numbers right now, run
            Actions -> Benchmark Performance (LLM) -> Run workflow
          (workflow file: benchmark-performance-infer-llm-llamacpp.yml)
          EOF
          echo "Android benchmark is a stub in this iteration."
          cat android-stub/README.txt
      # Publish the README as a marker artifact so the wiring can be
      # verified from the run page.
      - name: Upload Android stub marker
        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
        with:
          name: vlm-perf-android-${{ github.run_number }}
          path: android-stub/
          retention-days: 14

  # ── Summarize ──────────────────────────────────────────────────────
  # Downloads every desktop artifact and renders a consolidated table.
  # `if: always()` keeps the summary going when one matrix leg fails.
  summarize:
    needs:
      - context
      - desktop
      - android
    # Runs even when desktop/android failed or were skipped, as long as the
    # context job succeeded and at least one desktop cell was scheduled.
    if: always() && needs.context.result == 'success' && needs.context.outputs.desktop_count != '0'
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
      # Only the benchmark scripts directory is needed here, so the checkout
      # is sparse (non-cone mode, single path).
      - name: Checkout repository
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
        with:
          repository: ${{ needs.context.outputs.repository }}
          ref: ${{ needs.context.outputs.ref }}
          sparse-checkout: |
            packages/llm-llamacpp/benchmarks/vlm-performance/scripts
          sparse-checkout-cone-mode: false

      - name: Setup Node.js
        uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # 6.3.0
        with:
          node-version: 22

      # Pull every artifact of this run whose name matches the pattern into
      # per-platform/.
      # NOTE(review): the pattern also matches the Android stub artifact
      # (vlm-perf-android-<run_number>) — confirm the aggregation script
      # tolerates a directory that holds only README.txt.
      - name: Download desktop per-platform artifacts
        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
        with:
          pattern: vlm-perf-*-${{ github.run_number }}
          path: per-platform

      - name: Build commit-info JSON
        shell: bash
        run: |
          cat > commit-info.json <<EOF
          {
            "head": {
              "sha": "${{ needs.context.outputs.head_sha }}",
              "title": ${{ toJSON(needs.context.outputs.head_title) }},
              "date": "${{ needs.context.outputs.head_date }}"
            },
            "merge_base": {
              "sha": "${{ needs.context.outputs.merge_base }}",
              "title": ${{ toJSON(needs.context.outputs.base_title) }},
              "date": "${{ needs.context.outputs.base_date }}"
            },
            "comparison_mode": "source-engines"
          }
          EOF
          cat commit-info.json

      - name: Aggregate into one report
        shell: bash
        run: |
          mkdir -p consolidated
          node packages/llm-llamacpp/benchmarks/vlm-performance/scripts/aggregate-platforms.js \
            --inputs=per-platform \
            --commit-info=commit-info.json \
            --output-md=consolidated/vlm-perf-consolidated.md \
            --output-json=consolidated/vlm-perf-consolidated.json

      - name: Post step summary
        if: always()
        shell: bash
        run: |
          {
            echo "## VLM Benchmark - Consolidated"
            echo ""
            if [ -f consolidated/vlm-perf-consolidated.md ]; then
              tail -n +2 consolidated/vlm-perf-consolidated.md
            else
              echo "No consolidated report generated."
            fi
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Post PR comment (View 2 summary)
        if: always() && hashFiles('consolidated/vlm-perf-consolidated.md') != ''
        shell: bash
        env:
          GH_TOKEN: ${{ github.token }}
          REF: ${{ needs.context.outputs.ref }}
        run: |
          PR_NUMBER=$(gh pr list --head "$REF" --state open --json number --jq '.[0].number' 2>/dev/null || echo "")
          if [[ -z "$PR_NUMBER" ]]; then
            echo "No open PR found for ref $REF — skipping PR comment."
            exit 0
          fi
          echo "Posting VLM benchmark summary to PR #$PR_NUMBER"
          {
            echo "## VLM Benchmark Summary"
            echo ""
            echo "_Run #${{ github.run_number }} — [full report](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})_"
            echo ""
            tail -n +2 consolidated/vlm-perf-consolidated.md
          } > /tmp/pr-comment-body.md
          gh pr comment "$PR_NUMBER" --body-file /tmp/pr-comment-body.md

      - name: Upload consolidated report
        if: always()
        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
        with:
          name: vlm-perf-consolidated-${{ github.run_number }}
          path: consolidated/
          retention-days: 30
          if-no-files-found: warn

  # ── Matrix mode: Linux legs ────────────────────────────────────────
  # Runs the @qvac/llm-llamacpp addon (published linux-x64 prebuild, which
  # is CPU + Vulkan-GPU capable) over the vlm-matrix fixture. Branch JS
  # (harness/config/fixture) + published native prebuild. One leg per
  # backend; each emits [VLMROW]/[VLMSEG]/[VLMMETA] markers to a log that
  # matrix-combine aggregates with vlm-matrix/aggregate.js.
  matrix-linux:
    # One leg per entry of the context job's linux_matrix output; skipped
    # entirely when the context job reports zero Linux legs.
    needs: context
    if: needs.context.outputs.linux_count != '0'
    name: vlm-matrix-linux-${{ matrix.backend }}
    runs-on: ${{ matrix.runner }}
    # several-sources builds two CLIs from source (first run, pre-cache) + per-image
    # CLI model reloads, so allow more wall-clock than the addon-only two-models path.
    timeout-minutes: 120
    strategy:
      fail-fast: false
      matrix:
        include: ${{ fromJSON(needs.context.outputs.linux_matrix) }}
    env:
      HF_TOKEN: ${{ secrets.HF_TOKEN }}
      WORKDIR: packages/llm-llamacpp
    steps:
      # Runners whose label starts with 'qvac-' get their workspace wiped first.
      - name: Manual Workspace Cleanup
        if: startsWith(matrix.runner, 'qvac-')
        shell: bash
        run: rm -rf "$GITHUB_WORKSPACE" && mkdir -p "$GITHUB_WORKSPACE"

      - name: Checkout repository
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
        with:
          repository: ${{ needs.context.outputs.repository }}
          ref: ${{ needs.context.outputs.ref }}

      - name: Setup Node.js
        uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # 6.3.0
        with:
          node-version: 22

      - name: Install addon deps
        working-directory: ${{ env.WORKDIR }}
        shell: bash
        run: npm install --no-audit --no-fund

      - name: Install bare tooling
        shell: bash
        run: npm install -g --force bare bare-make bare-runtime bare-https brittle

      # Branch JS + published native prebuild: pull the linux-x64 prebuild
      # from the public npm package into the workspace so the harness's
      # require('../../index.js') loads. The published prebuild handles both
      # cpu and (Vulkan) gpu via runtime device selection.
      # The resolved package version is exported as ADDON_VERSION for the
      # provenance step below.
      - name: Fetch published prebuilds
        working-directory: ${{ env.WORKDIR }}
        shell: bash
        run: |
          npm pack @qvac/llm-llamacpp@latest
          tar -xzf *.tgz
          ADDON_VER=$(node -e "console.log(require('./package/package.json').version)")
          echo "ADDON_VERSION=$ADDON_VER" >> "$GITHUB_ENV"
          rm -rf prebuilds
          mv package/prebuilds ./prebuilds
          rm -rf package *.tgz
          echo "addon @qvac/llm-llamacpp@$ADDON_VER"
          ls -la prebuilds/

      # GPU runners ship the Vulkan SDK at the well-known path; stamp the
      # env so the addon's loader finds it. No-op on the cpu leg.
      - name: Configure Vulkan SDK env (Linux GPU)
        if: matrix.backend == 'gpu'
        shell: bash
        run: |
          echo "VULKAN_SDK=/opt/vulkansdk/x86_64" >> "$GITHUB_ENV"
          echo "/opt/vulkansdk/x86_64/bin" >> "$GITHUB_PATH"

      - name: Run VLM matrix
        working-directory: ${{ env.WORKDIR }}
        shell: bash
        env:
          QVAC_VLM_MATRIX: "1"
          QVAC_VLM_MODE: ${{ inputs.matrix_mode }}
          QVAC_VLM_PRESET: ${{ inputs.matrix_preset }}
          QVAC_VLM_ENGINE: addon # this leg is always the 'addon' source
          QVAC_VLM_DEVICES: ${{ matrix.backend }}
          NO_GPU: ${{ matrix.no_gpu }}
        run: |
          # Regenerate the brittle runner to include ONLY the matrix test,
          # then run it under bare (same pattern as the perf-only path).
          # In several-sources mode this is the addon leg; fabric/upstream CLIs
          # are appended to the same log by the next step.
          npx brittle -r test/integration/all.js test/integration/vlm-matrix.test.js
          # tee would otherwise mask the harness status; report bare's own exit code.
          bare test/integration/all.js --exit 2>&1 | tee "vlm-matrix-linux-${{ matrix.backend }}.log"
          exit ${PIPESTATUS[0]}

      # several-sources only: build native fabric/upstream llama-mtmd-cli and run
      # them over the SAME fixture, appending [VLMROW]/[VLMSEG] markers to the addon
      # log so aggregate.js renders a 3-source comparison. Linux-only.
      # NOTE(review): the other actions used by these jobs are pinned to a
      # commit SHA; pin actions/cache the same way.
      - name: Cache CLI builds (several-sources)
        if: inputs.matrix_mode == 'several-sources'
        uses: actions/cache@v4
        with:
          path: ${{ env.WORKDIR }}/benchmarks/vlm-performance/cli-builds
          key: vlm-cli-v3-linux-${{ matrix.backend }}-${{ inputs.fabric_ref }}-${{ inputs.upstream_ref }}

      # User-supplied refs and the backend reach the script through env vars
      # and are quoted at use, so a ref containing spaces or shell
      # metacharacters is passed as one literal argument, not interpreted.
      - name: Build + run fabric/upstream CLIs over the fixture (several-sources)
        if: inputs.matrix_mode == 'several-sources'
        working-directory: ${{ env.WORKDIR }}/benchmarks/vlm-performance
        shell: bash
        env:
          LOG: ${{ github.workspace }}/${{ env.WORKDIR }}/vlm-matrix-linux-${{ matrix.backend }}.log
          MODEL_DIR: ${{ github.workspace }}/${{ env.WORKDIR }}/test/model
          FABRIC_REF: ${{ inputs.fabric_ref }}
          UPSTREAM_REF: ${{ inputs.upstream_ref }}
          BACKEND: ${{ matrix.backend }}
        run: |
          npm install --no-audit --no-fund
          node scripts/build-cli-sources.js --sources=fabric,upstream \
            --fabric-ref="$FABRIC_REF" --upstream-ref="$UPSTREAM_REF" \
            --backend="$BACKEND"
          FABRIC_BIN=$(node -e "console.log(require('./cli-sources-resolved.json').fabric.binaryPath)")
          UPSTREAM_BIN=$(node -e "console.log(require('./cli-sources-resolved.json').upstream.binaryPath)")
          # Model files were downloaded by the addon leg (names from vlm-matrix.config.cjs).
          LLM="$MODEL_DIR/reg-qwen-unsloth-Q8_0.gguf"
          MMPROJ="$MODEL_DIR/reg-qwen-mradermacher-mmproj-Q8_0.gguf"
          # run_src <source-tag> <label> <binary>: runs one CLI over the fixture
          # and appends its output to the addon log. A failing run only raises a
          # workflow warning so the other source still gets its turn.
          run_src () {
            echo ">> $1 over fixture ($2)"
            node ../vlm-matrix/cli-fixture-runner.cjs \
              --binary "$3" --source "$1" --llm "$LLM" --mmproj "$MMPROJ" \
              --backend "$BACKEND" --samples 3 \
              --tasks textvqa,vizwiz,gqa,docvqa,ai2d \
              --main-origin "Qwen3.5-0.8B-Q8_0 (Registry)" \
              --mmproj-origin "Qwen3.5-0.8B mmproj-Q8_0 (Registry)" >> "$LOG" 2>&1 || echo "::warning::$1 run had errors"
          }
          run_src fabric-cli fabric "$FABRIC_BIN"
          run_src upstream-cli upstream "$UPSTREAM_BIN"
          echo "appended fabric-cli + upstream-cli to $LOG"

      # HW/SW provenance so a reader can reproduce the numbers. Rendered in the
      # report's Details section (passed to aggregate.js via --provenance).
      # Context values (ref, sha, runner, backend) are read from env vars rather
      # than spliced into the script: a ref name may contain shell
      # metacharacters, and inside double quotes $(...) would be executed.
      - name: Gather provenance
        if: always()
        working-directory: ${{ env.WORKDIR }}
        shell: bash
        env:
          MATRIX_BACKEND: ${{ matrix.backend }}
          MATRIX_RUNNER: ${{ matrix.runner }}
          HEAD_SHA: ${{ needs.context.outputs.head_sha }}
          REF: ${{ needs.context.outputs.ref }}
        run: |
          F="prov-linux-${MATRIX_BACKEND}.md"
          {
            echo "**linux · ${MATRIX_BACKEND}** (runner \`${MATRIX_RUNNER}\`)"
            echo "- addon: \`@qvac/llm-llamacpp@${ADDON_VERSION:-?}\` (published prebuild)"
            echo "- git: \`${HEAD_SHA}\` (ref \`${REF}\`)"
            echo "- node: $(node -v 2>/dev/null) · bare: $(bare --version 2>/dev/null || echo n/a)"
            echo "- os: $(. /etc/os-release 2>/dev/null; echo "$PRETTY_NAME") $(uname -m)"
            echo "- cpu: $(lscpu 2>/dev/null | sed -n 's/^Model name:[[:space:]]*//p' | head -1) ($(nproc) cores)"
            echo "- ram: $(free -h 2>/dev/null | awk '/^Mem:/{print $2}')"
            if [ "$MATRIX_BACKEND" = "gpu" ]; then
              echo "- gpu: $(vulkaninfo --summary 2>/dev/null | sed -n 's/.*deviceName[[:space:]]*=[[:space:]]*//p' | head -1 || echo '?')"
            fi
          } > "$F"
          cat "$F"

      # Uploaded even on failure so a partial log is still available to the
      # combine job; a missing file only produces a warning.
      - name: Upload matrix log
        if: always()
        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
        with:
          name: vlm-matrix-log-linux-${{ matrix.backend }}-${{ github.run_number }}
          path: |
            ${{ env.WORKDIR }}/vlm-matrix-linux-${{ matrix.backend }}.log
            ${{ env.WORKDIR }}/prov-linux-${{ matrix.backend }}.md
          retention-days: 14
          if-no-files-found: warn

  # ── Matrix mode: Samsung S25 (AWS Device Farm) ─────────────────────
  # Reuses the Android-only mobile workflow to run the SAME matrix harness
  # on-device. qvac_perf_only restricts the run to perf-tests.json
  # (runVlmMatrixTest) → only the vlmMatrix group is scheduled. The active
  # preset on-device is config.defaultPreset (Device Farm forwards no custom
  # env), so vlm-matrix.config.cjs defaultPreset governs the S25 set. The
  # raw on-device log (bare_console.log) carries the [VLMROW] markers and is
  # uploaded by collect-and-upload-logs as console-logs-llamacpp-llm-Android.
  matrix-s25:
    # The on-device leg exercises the addon only, so it is skipped for
    # several-sources (the native-CLI comparison runs on Linux alone) and
    # whenever matrix mode or the S25 toggle is off.
    if: ${{ inputs.run_matrix && inputs.run_matrix_s25 && inputs.matrix_mode != 'several-sources' }}
    needs: context
    # Delegates to the mobile integration workflow as a reusable workflow,
    # pointed at the same repository/ref the other jobs check out.
    uses: ./.github/workflows/integration-mobile-test-llm-llamacpp.yml
    with:
      repository: ${{ needs.context.outputs.repository }}
      ref: ${{ needs.context.outputs.ref }}
      qvac_perf_only: true
    secrets: inherit

  # ── Matrix mode: combine ───────────────────────────────────────────
  # Aggregates [VLMROW]/[VLMSEG]/[VLMMETA] markers from every matrix log
  # (Linux .log + S25 bare_console.log) into one quality+speed report via
  # vlm-matrix/aggregate.js, surfaced to the step summary + PR comment.
  # This is the mechanism that makes mobile (Device Farm) results visible.
  matrix-combine:
    needs:
      - context
      - matrix-linux
      - matrix-s25
    if: always() && inputs.run_matrix && needs.context.result == 'success'
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
      - name: Checkout repository
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
        with:
          repository: ${{ needs.context.outputs.repository }}
          ref: ${{ needs.context.outputs.ref }}
          sparse-checkout: |
            packages/llm-llamacpp/benchmarks/vlm-matrix
          sparse-checkout-cone-mode: false

      - name: Setup Node.js
        uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # 6.3.0
        with:
          node-version: 22

      - name: Download Linux matrix logs
        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
        with:
          pattern: vlm-matrix-log-*-${{ github.run_number }}
          path: matrix-logs

      # S25 raw device logs (bare_console.log holds the [VLMROW] markers).
      # continue-on-error so a Linux-only run (no S25 artifact) still combines.
      - name: Download S25 device logs
        continue-on-error: true
        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
        with:
          pattern: console-logs-*
          path: matrix-logs

      # Builds the aggregate.js command line from whatever logs were downloaded
      # into matrix-logs/ and writes consolidated/vlm-matrix-consolidated.md.
      # When no log is found, a one-line placeholder report is written instead.
      # Workflow inputs are read from env vars so they are never re-parsed by
      # the shell.
      - name: Aggregate matrix logs
        shell: bash
        env:
          MATRIX_MODE: ${{ inputs.matrix_mode }}
          MATRIX_PRESET: ${{ inputs.matrix_preset }}
          MATRIX_ENGINE: ${{ inputs.matrix_engine }}
          RUN_NUMBER: ${{ github.run_number }}
        run: |
          # matrix-logs may be absent when no artifact was downloaded; create it
          # so the find calls below cannot fail and the "no logs" branch stays
          # reachable.
          mkdir -p consolidated matrix-logs

          # Arguments are collected in arrays so paths containing spaces survive
          # intact all the way to aggregate.js.
          # Tag each input with its platform host so S25 rows don't collapse
          # onto the Linux rows ([VLMROW].device is only cpu/gpu).
          ARGS=()
          # Linux legs: one log per backend (device field carries cpu/gpu).
          while IFS= read -r f; do
            ARGS+=(--in linux "$f")
          done < <(find matrix-logs -name 'vlm-matrix-linux-*.log' | sort)
          # S25: the Samsung device's full logcat carries the [VLMROW] markers
          # (the Android pool may also include a Pixel; we surface S25 here).
          S25_LOGS=()
          while IFS= read -r f; do
            S25_LOGS+=("$f")
            ARGS+=(--in s25 "$f")
          done < <(find matrix-logs -name '*Galaxy_S25*logcat_full*' | sort)
          # HW/SW provenance: Linux legs ship prov-linux-*.md; synthesize one for S25.
          PROV=()
          while IFS= read -r p; do
            PROV+=(--provenance "$p")
          done < <(find matrix-logs -name 'prov-*.md' | sort)

          # S25 hardware provenance, parsed from the device's own logcat (model /
          # Android / ABI from the Play-store UA line, RAM from the JS totalMemory
          # line, GPU from the Adreno-Vulkan driver load).
          S25F="${S25_LOGS[0]:-}"
          if [ -n "$S25F" ]; then
            # The step runs under `bash -eo pipefail`, so an unmatched grep inside
            # a plain VAR=$(grep … | head …) would abort the whole step. This
            # helper yields an empty string instead so the ${VAR:-default}
            # fallbacks below apply.
            first_match () {
              grep -oE "$1" "$S25F" | head -n 1 || true
            }
            MODEL=$(first_match 'model=SM-[A-Z0-9]+');               MODEL=${MODEL#*=}
            ANDROID=$(first_match 'platformVersionRelease=[0-9]+'); ANDROID=${ANDROID#*=}
            ABI=$(first_match 'supportedAbis=[a-z0-9-]+');          ABI=${ABI#*=}
            RAMB=$(first_match 'totalMemory: [0-9]+');              RAMB=${RAMB##*: }
            RAMGB=$(awk -v b="${RAMB:-0}" 'BEGIN{ if (b>0) printf "%.1f GB", b/1073741824; else printf "?" }')
            GPU=$(grep -qiE 'AdrenoVK|vulkan\.adreno' "$S25F" && echo 'Adreno (Vulkan)' || echo '?')
            {
              echo "**s25** — Samsung Galaxy S25 Ultra (AWS Device Farm)"
              echo "- device: ${MODEL:-SM-?} · Android ${ANDROID:-?} · ${ABI:-arm64-v8a}"
              echo "- ram: ${RAMGB} · gpu: ${GPU}"
              echo "- engine: \`@qvac/llm-llamacpp\` addon (published prebuild)"
            } > prov-s25.md
            PROV+=(--provenance prov-s25.md)
          fi

          echo "aggregate inputs: ${ARGS[*]}"
          echo "provenance: ${PROV[*]}"
          if [ "${#ARGS[@]}" -eq 0 ]; then
            echo "> No VLM matrix logs found for run #${RUN_NUMBER}." > consolidated/vlm-matrix-consolidated.md
          else
            node packages/llm-llamacpp/benchmarks/vlm-matrix/aggregate.js \
              --title "VLM Matrix — ${MATRIX_MODE} / ${MATRIX_PRESET} (run #${RUN_NUMBER})" \
              --mode "$MATRIX_MODE" --engine "$MATRIX_ENGINE" --base f16 --candidate q8 \
              --out consolidated/vlm-matrix-consolidated.md \
              "${PROV[@]}" "${ARGS[@]}"
          fi

      - name: Post step summary
        if: always()
        shell: bash
        run: |
          {
            echo "# VLM Matrix — Consolidated"
            echo ""
            cat consolidated/vlm-matrix-consolidated.md 2>/dev/null || echo "No consolidated matrix report generated."
          } >> "$GITHUB_STEP_SUMMARY"