Benchmark (LLM) #48
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: Benchmark VLM (LLM) | |
| # Manually-triggered VLM benchmark. Runs | |
| # packages/llm-llamacpp/benchmarks/vlm-performance against Qwen3.5-VL | |
| # on a fixed object-listing task and uploads a consolidated report. | |
| # | |
| # 3-source comparison: addon (JS binding) vs fabric-cli (fork CLI) vs | |
| # upstream-cli (upstream llama.cpp CLI). Measures JS binding overhead | |
| # and fork divergence using the same model across all sources. | |
| on: | |
| workflow_dispatch: | |
| inputs: | |
| # ── Sources (what to compare) ───────────────────────────── | |
| run_addon: | |
| description: "── SOURCE 1 ── addon (@qvac/llm-llamacpp JS binding)" | |
| required: false | |
| type: boolean | |
| default: true | |
| ref: | |
| description: " addon ref — qvac branch / tag / SHA (default: current branch)" | |
| required: false | |
| type: string | |
| addon_from_source: | |
| description: " build addon from source (slow, but uses latest fabric)" | |
| required: false | |
| type: boolean | |
| default: false | |
| run_addon_source: | |
| description: " A/B: also build addon from source with the vcpkg overlay applied, run alongside npm addon in one cell (same runner). x86-CPU cells only." | |
| required: false | |
| type: boolean | |
| default: false | |
| addon_source_overlay: | |
| description: " apply the vcpkg overlay during the addon-source build (default true). Set false to rule out 'is it the overlay or something else?'" | |
| required: false | |
| type: boolean | |
| default: true | |
| run_fabric_cli: | |
| description: "── SOURCE 2 ── fabric (qvac fork, native CLI)" | |
| required: false | |
| type: boolean | |
| default: true | |
| fabric_ref: | |
| description: " fabric ref (default: v8189.0.2)" | |
| required: false | |
| type: string | |
| default: "v8189.0.2" | |
| run_upstream_cli: | |
| description: "── SOURCE 3 ── upstream (vanilla llama.cpp, native CLI)" | |
| required: false | |
| type: boolean | |
| default: true | |
| upstream_ref: | |
| description: " upstream ref (default: b8189)" | |
| required: false | |
| type: string | |
| default: "b8189" | |
| # ── Platforms × backends ────────────────────────────────── | |
| # Comma-separated selection. Tokens: linux-cpu, linux-gpu, | |
| # windows-cpu, windows-gpu, macos. "all" expands to every desktop | |
| # cell. GPU rows for Linux/Windows go to self-hosted Vulkan runners; | |
| # macOS uses Metal on GitHub-hosted macos-15-xlarge. | |
| platforms: | |
| description: "── PLATFORMS ── e.g. linux-cpu,linux-gpu,windows-cpu,windows-gpu,macos (or 'all')" | |
| required: false | |
| type: string | |
| default: "linux-cpu,linux-gpu,windows-cpu,windows-gpu,macos" | |
| run_android: | |
| description: "── PLATFORM ── Android (stub)" | |
| required: false | |
| type: boolean | |
| default: false | |
| # ── Run settings ────────────────────────────────────────── | |
| warmup_runs: | |
| description: "Warmup iterations (discarded)" | |
| required: false | |
| type: string | |
| default: "1" | |
| measured_runs: | |
| description: "Measured iterations (median reported)" | |
| required: false | |
| type: string | |
| default: "3" | |
| # ── Matrix mode (config-driven quality+speed matrix) ────── | |
| # Orthogonal to the source-engines benchmark above. When on, runs | |
| # the @qvac/llm-llamacpp addon over the vlm-matrix fixture (lmms-eval | |
| # quality + vision-encode speed) on Linux (and S25), driven by | |
| # packages/llm-llamacpp/test/integration/vlm-matrix.config.cjs. | |
| run_matrix: | |
| description: "── MATRIX ── run the config-driven VLM quality+speed matrix (addon, Linux + S25)" | |
| required: false | |
| type: boolean | |
| default: false | |
| matrix_mode: | |
| description: " matrix mode: two-models (f16 vs q8, addon) or several-sources (addon+fabric-cli+upstream-cli, Linux-only)" | |
| required: false | |
| type: string | |
| default: "two-models" | |
| matrix_preset: | |
| description: " matrix preset: compare (two-models), sources (several-sources), smoke, or full. Overrides config.defaultPreset on Linux." | |
| required: false | |
| type: string | |
| default: "compare" | |
| matrix_engine: | |
| description: " inference engine (two-models mode): addon | fabric-cli | upstream-cli. CLI engines are desktop-only; addon runs everywhere." | |
| required: false | |
| type: string | |
| default: "addon" | |
| matrix_linux: | |
| description: " Linux matrix legs, comma-sep: linux-cpu,linux-gpu" | |
| required: false | |
| type: string | |
| default: "linux-cpu,linux-gpu" | |
| run_matrix_s25: | |
| description: " also run the matrix on Samsung S25 (AWS Device Farm)" | |
| required: false | |
| type: boolean | |
| default: false | |
| permissions: | |
| contents: read | |
| packages: read | |
| pull-requests: write | |
| id-token: write | |
| jobs: | |
| # ── Context ──────────────────────────────────────────────────────── | |
| # Resolves the repo + ref so downstream jobs check out the right | |
| # commit even when the workflow_dispatch is invoked without `ref`, | |
| # and builds the desktop matrix from the per-platform input toggles. | |
| # Matrix is computed here (instead of via job-level `if:`) because | |
| # GitHub Actions doesn't allow `matrix.*` references in job-level | |
| # conditions — they're evaluated before the matrix is expanded. | |
| context: | |
| runs-on: ubuntu-latest | |
| outputs: | |
| repository: ${{ steps.ctx.outputs.repository }} | |
| ref: ${{ steps.ctx.outputs.ref }} | |
| desktop_matrix: ${{ steps.matrix.outputs.value }} | |
| desktop_count: ${{ steps.matrix.outputs.count }} | |
| linux_matrix: ${{ steps.lmatrix.outputs.value }} | |
| linux_count: ${{ steps.lmatrix.outputs.count }} | |
| merge_base: ${{ steps.commits.outputs.merge_base }} | |
| head_sha: ${{ steps.commits.outputs.head_sha }} | |
| head_title: ${{ steps.commits.outputs.head_title }} | |
| head_date: ${{ steps.commits.outputs.head_date }} | |
| base_title: ${{ steps.commits.outputs.base_title }} | |
| base_date: ${{ steps.commits.outputs.base_date }} | |
| steps: | |
# Resolve the repository and ref that every downstream job checks out.
- id: ctx
  shell: bash
  env:
    INPUT_REF: ${{ inputs.ref }}
    REPO: ${{ github.repository }}
    REF_NAME: ${{ github.ref_name }}
  run: |
    # An empty `ref` input falls back to the ref the workflow was
    # dispatched on (github.ref_name).
    {
      printf 'repository=%s\n' "$REPO"
      printf 'ref=%s\n' "${INPUT_REF:-$REF_NAME}"
    } >> "$GITHUB_OUTPUT"
# Build the desktop job matrix: one cell per (platform token x source),
# minus the combinations filtered out by keep_cell below. Emits the cell
# list as compact JSON (`value`) plus its length (`count`).
- id: matrix
  shell: bash
  env:
    RUN_ADDON: ${{ inputs.run_addon }}
    RUN_FABRIC: ${{ inputs.run_fabric_cli }}
    RUN_UPSTREAM: ${{ inputs.run_upstream_cli }}
    RUN_ADDON_SOURCE: ${{ inputs.run_addon_source }}
    PLATFORMS: ${{ inputs.platforms }}
  run: |
    # ---- Sources --------------------------------------------------
    # Ordered list of enabled sources. addon-source is the opt-in 4th
    # source: the addon built from local sources (vcpkg overlay
    # applied), run in the same cell as the npm addon.
    sources=()
    for pair in \
      "addon=$RUN_ADDON" \
      "fabric=$RUN_FABRIC" \
      "upstream=$RUN_UPSTREAM" \
      "addon-source=$RUN_ADDON_SOURCE"; do
      if [[ "${pair#*=}" == "true" ]]; then
        sources+=("${pair%%=*}")
      fi
    done

    # ---- Cell filter ----------------------------------------------
    # keep_cell TOKEN SOURCE -> 0 when that pair should become a cell.
    keep_cell() {
      local token="$1" engine="$2"
      # windows-gpu currently only supports the addon leg: the
      # self-hosted qvac-win25-x64-gpu runner has Vulkan and chocolatey
      # but no MSVC, and chocolatey can't install LLVM at job time
      # (lock/permission errors), so the fabric/upstream CLI builds
      # aren't viable there yet. Re-enable once the runner image ships
      # LLVM+Ninja.
      if [[ "$token" == "windows-gpu" && "$engine" != "addon" ]]; then
        return 1
      fi
      case "$token" in
        linux-cpu|windows-cpu)
          # With the addon-source A/B enabled, the addon-source leg
          # already runs --sources=addon,addon-source in one process.
          # A dedicated 'addon' cell would produce a duplicate row on a
          # different runner, so drop it to keep the report clean.
          if [[ "$engine" == "addon" && "$RUN_ADDON_SOURCE" == "true" ]]; then
            return 1
          fi
          ;;
        *)
          # llamafile is x86-CPU specific, so the from-source A/B cell
          # is only emitted on CPU cells. On GPU cells the matmul path
          # goes through Vulkan/Metal shaders that don't change with
          # llamafile.
          if [[ "$engine" == "addon-source" ]]; then
            return 1
          fi
          ;;
      esac
      return 0
    }

    # ---- Platform x backend tokens --------------------------------
    selected="${PLATFORMS:-linux-cpu}"
    if [[ "$selected" == "all" ]]; then
      selected="linux-cpu,linux-gpu,windows-cpu,windows-gpu,macos"
    fi

    cells='[]'
    IFS=',' read -ra tokens <<< "$selected"
    for raw in "${tokens[@]}"; do
      sel=$(echo "$raw" | xargs)
      # Each token maps to a fixed "platform arch backend runner" tuple.
      case "$sel" in
        linux-cpu)   tuple="linux-x64 x64 cpu ubuntu-latest" ;;
        linux-gpu)   tuple="linux-x64 x64 gpu qvac-ubuntu2404-x64-gpu" ;;
        windows-cpu) tuple="windows-x64 x64 cpu windows-latest" ;;
        windows-gpu) tuple="windows-x64 x64 gpu qvac-win25-x64-gpu" ;;
        macos)       tuple="macos-arm64 arm64 gpu macos-15-xlarge" ;;
        "")          continue ;;
        *)
          echo "::warning::Unknown platform token '$sel' (known: linux-cpu, linux-gpu, windows-cpu, windows-gpu, macos)"
          continue
          ;;
      esac
      read -r plat arch backend runner <<< "$tuple"

      for src in "${sources[@]}"; do
        if ! keep_cell "$sel" "$src"; then
          continue
        fi
        cells=$(jq -c \
          --arg p "$plat" --arg a "$arch" --arg b "$backend" --arg r "$runner" --arg s "$src" \
          '. + [{platform: $p, arch: $a, backend: $b, runner: $r, source: $s}]' \
          <<< "$cells")
      done
    done

    count=$(jq 'length' <<< "$cells")
    {
      echo "value=$cells"
      echo "count=$count"
    } >> "$GITHUB_OUTPUT"
    echo "Desktop matrix ($count entries): $cells"
| # Linux legs for the config-driven matrix mode (addon over the | |
| # vlm-matrix fixture). Independent of the source-engines matrix above. | |
# Build the Linux leg list for matrix mode. Produces an empty list
# (count=0) unless run_matrix is on, which in turn skips matrix-linux.
- id: lmatrix
  shell: bash
  env:
    RUN_MATRIX: ${{ inputs.run_matrix }}
    MATRIX_LINUX: ${{ inputs.matrix_linux }}
    MATRIX_MODE: ${{ inputs.matrix_mode }}
  run: |
    cells='[]'
    if [[ "$RUN_MATRIX" == "true" ]]; then
      # several-sources builds native fabric/upstream CLIs, so the CPU
      # leg needs a runner with cmake+toolchain -> GitHub-hosted
      # ubuntu-latest. Every other mode uses the self-hosted CPU runner.
      if [[ "$MATRIX_MODE" == "several-sources" ]]; then
        cpu_runner=ubuntu-latest
      else
        cpu_runner=qvac-ubuntu2204-x64
      fi

      IFS=',' read -ra tokens <<< "${MATRIX_LINUX:-linux-cpu}"
      for raw in "${tokens[@]}"; do
        sel=$(echo "$raw" | xargs)
        case "$sel" in
          linux-cpu)
            backend=cpu
            runner=$cpu_runner
            no_gpu=true
            ;;
          linux-gpu)
            backend=gpu
            runner=qvac-ubuntu2404-x64-gpu
            no_gpu=false
            ;;
          "")
            continue
            ;;
          *)
            echo "::warning::Unknown matrix_linux token '$sel' (known: linux-cpu, linux-gpu)"
            continue
            ;;
        esac
        # no_gpu is passed with --arg, so it lands in the JSON as the
        # string "true"/"false", not a JSON boolean.
        cells=$(jq -c \
          --arg b "$backend" --arg r "$runner" --arg n "$no_gpu" \
          '. + [{backend: $b, runner: $r, no_gpu: $n}]' \
          <<< "$cells")
      done
    fi

    count=$(jq 'length' <<< "$cells")
    {
      echo "value=$cells"
      echo "count=$count"
    } >> "$GITHUB_OUTPUT"
    echo "Linux matrix ($count entries): $cells"
| # Resolve commit metadata so the consolidated report can show what | |
| # the candidate ref + merge-base actually point at (hash, title, | |
| # date). Needs a real clone — sparse checkout doesn't give us git | |
| # history. | |
| - name: Checkout for commit lookup | |
| uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2 | |
| with: | |
| repository: ${{ steps.ctx.outputs.repository }} | |
| ref: ${{ steps.ctx.outputs.ref }} | |
| fetch-depth: 0 | |
# Export hash / subject / committer date for the checked-out HEAD and
# for its merge-base with origin/main (left empty when none exists).
- id: commits
  shell: bash
  run: |
    # commit_field FORMAT REV -> one `git log --pretty` field of REV.
    commit_field() { git log -1 --pretty="$1" "$2"; }

    git fetch origin main --quiet

    HEAD_SHA=$(git rev-parse HEAD)
    HEAD_TITLE=$(commit_field %s "$HEAD_SHA")
    HEAD_DATE=$(commit_field %cI "$HEAD_SHA")

    # merge-base exits non-zero when there is no common ancestor; treat
    # that as "no base" instead of failing the step.
    MERGE_BASE=$(git merge-base HEAD origin/main || echo "")
    BASE_TITLE=""
    BASE_DATE=""
    if [[ -n "$MERGE_BASE" ]]; then
      BASE_TITLE=$(commit_field %s "$MERGE_BASE")
      BASE_DATE=$(commit_field %cI "$MERGE_BASE")
    fi

    {
      printf 'head_sha=%s\n' "$HEAD_SHA"
      printf 'merge_base=%s\n' "$MERGE_BASE"
      printf 'head_title=%s\n' "$HEAD_TITLE"
      printf 'head_date=%s\n' "$HEAD_DATE"
      printf 'base_title=%s\n' "$BASE_TITLE"
      printf 'base_date=%s\n' "$BASE_DATE"
    } >> "$GITHUB_OUTPUT"

    echo "HEAD: $HEAD_SHA - $HEAD_TITLE ($HEAD_DATE)"
    echo "merge-base: $MERGE_BASE - $BASE_TITLE ($BASE_DATE)"
| # ── Desktop benchmark matrix ─────────────────────────────────────── | |
| # Each leg is the same shape — pick the runner via matrix.runner. | |
| # GPU rows (linux-x64 / windows-x64) target self-hosted Vulkan | |
| # runners pre-provisioned with the Vulkan SDK; macOS arm64 uses | |
| # Metal on the GitHub-hosted macos-15-xlarge runner. The matrix | |
| # itself is built dynamically by the context job above from the | |
| # `platforms` input. | |
| desktop: | |
| needs: context | |
| if: needs.context.outputs.desktop_count != '0' | |
| name: vlm-${{ matrix.platform }}-${{ matrix.backend }}-${{ matrix.source }} | |
| runs-on: ${{ matrix.runner }} | |
| timeout-minutes: 30 | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| include: ${{ fromJSON(needs.context.outputs.desktop_matrix) }} | |
| env: | |
| HF_TOKEN: ${{ secrets.HF_TOKEN }} | |
| WORKDIR: packages/llm-llamacpp/benchmarks/vlm-performance | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2 | |
| with: | |
| repository: ${{ needs.context.outputs.repository }} | |
| ref: ${{ needs.context.outputs.ref }} | |
| # The addon-source cell needs the full native toolchain (LLVM, | |
| # vcpkg, bare-make, Vulkan SDK) because it builds the addon from | |
| # local sources with the vcpkg overlay applied. The 'addon', | |
| # 'fabric', and 'upstream' cells stay on the lighter npm path. | |
| - name: Setup Node.js and Bare tooling | |
| if: matrix.source == 'addon-source' | |
| uses: ./.github/actions/setup-bare-tooling | |
| - name: Setup Node.js | |
| if: matrix.source != 'addon-source' | |
| uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # 6.3.0 | |
| with: | |
| node-version: 22 | |
| - name: Setup LLVM | |
| if: matrix.source == 'addon-source' | |
| uses: ./.github/actions/setup-llvm | |
| # Inline vcpkg bootstrap. We can't use the repo's setup-vcpkg | |
| # composite action because it hard-requires MODEL_S3_BUCKET for | |
| # the prebuilds-shared S3 binary cache — a secret we don't | |
| # plumb into the bench workflow. Set VCPKG_ROOT to the runner's | |
| # pre-installed vcpkg and leave VCPKG_BINARY_SOURCES at the | |
| # default (per-runner disk cache), which is fine for a one-off | |
| # bench build. | |
| - name: Configure vcpkg (addon-source) | |
| if: matrix.source == 'addon-source' && runner.os == 'Linux' | |
| shell: bash | |
| run: | | |
| echo "VCPKG_ROOT=$VCPKG_INSTALLATION_ROOT" >> "$GITHUB_ENV" | |
| echo "VCPKG_BUILD_TYPE=release" >> "$GITHUB_ENV" | |
| echo "VCPKG_CMAKE_CONFIGURE_OPTIONS=--no-parallel-configure" >> "$GITHUB_ENV" | |
| # qvac-fabric defaults to the gpu-backends feature, which | |
| # transitively requires the Vulkan SDK at build time. ubuntu-latest | |
| # doesn't ship one; install upstream's prebuilt SDK and stamp the | |
| # env so cmake's FindVulkan picks it up. Same install pattern | |
| # benchmark-embed-llamacpp.yml uses. libvulkan-dev pulls | |
| # libvulkan.so + libvulkan1 from the distro — the LunarG SDK | |
| # 1.4.x no longer bundles the loader, so cmake's FindVulkan | |
| # can't see Vulkan_LIBRARY without it. | |
| - name: Install Vulkan SDK (addon-source on Linux) | |
| if: matrix.source == 'addon-source' && runner.os == 'Linux' | |
| shell: bash | |
| run: | | |
| sudo apt-get update | |
| sudo apt-get install -y libxi-dev libxtst-dev libxrandr-dev xz-utils libvulkan-dev | |
| wget -q -O /tmp/vulkansdk.tar.xz https://sdk.lunarg.com/sdk/download/latest/linux/vulkan_sdk.tar.xz | |
| mkdir -p "$HOME/vulkan" | |
| tar -xf /tmp/vulkansdk.tar.xz -C "$HOME/vulkan" --strip-components=1 | |
| VULKAN_SDK="$HOME/vulkan/x86_64" | |
| echo "VULKAN_SDK=$VULKAN_SDK" >> "$GITHUB_ENV" | |
| echo "$VULKAN_SDK/bin" >> "$GITHUB_PATH" | |
| echo "LD_LIBRARY_PATH=$VULKAN_SDK/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> "$GITHUB_ENV" | |
| echo "PKG_CONFIG_PATH=$VULKAN_SDK/share/pkgconfig:$VULKAN_SDK/lib/pkgconfig${PKG_CONFIG_PATH:+:$PKG_CONFIG_PATH}" >> "$GITHUB_ENV" | |
| # Install npm @qvac/llm-llamacpp first so we have the published | |
| # addon (no llamafile) on disk, then snapshot it to /tmp before | |
| # the from-source build overwrites prebuilds/ in the workspace. | |
| # The bench then runs both addon variants in this cell on the | |
| # same runner: addon → /tmp/npm-addon-snapshot, addon-source → | |
| # the workspace's freshly-built artifact. | |
| - name: "Install benchmark deps (addon-source: npm baseline)" | |
| if: matrix.source == 'addon-source' | |
| shell: bash | |
| working-directory: ${{ env.WORKDIR }} | |
| run: npm install --no-audit --no-fund | |
| # Copy the WHOLE node_modules tree so the snapshotted addon at | |
| # /tmp/npm-snapshot/node_modules/@qvac/llm-llamacpp can still | |
| # resolve its sibling deps (bare-fs, bare-path, …) via standard | |
| # require-walking. A naked copy of just @qvac/llm-llamacpp leaves | |
| # those siblings unreachable and the snapshot fails to load. | |
| - name: Snapshot npm addon (addon-source A/B) | |
| if: matrix.source == 'addon-source' | |
| shell: bash | |
| working-directory: ${{ env.WORKDIR }} | |
| run: | | |
| mkdir -p /tmp/npm-snapshot | |
| cp -r node_modules /tmp/npm-snapshot/ | |
| # Optional: drop the overlay before bare-make so the from-source | |
| # build matches the npm-published binary as closely as possible. | |
| # Used to test "is the addon-vs-fabric gap caused by the overlay | |
| # (llamafile) or by something else in the source build path?" | |
| - name: Disable vcpkg overlay for addon-source A/B | |
| if: matrix.source == 'addon-source' && !inputs.addon_source_overlay | |
| shell: bash | |
| run: | | |
| rm -rf packages/llm-llamacpp/vcpkg/ports/qvac-fabric | |
| echo "::notice::Overlay removed — addon-source will build with the registry version of qvac-fabric (llamafile OFF, same as npm)" | |
| - name: Build addon from source | |
| if: matrix.source == 'addon-source' | |
| shell: bash | |
| working-directory: packages/llm-llamacpp | |
| env: | |
| # vcpkg needs to clone the private qvac-registry-vcpkg repo | |
| # at configure time. Workflow-level token is enough for read. | |
| GH_TOKEN: ${{ secrets.GH_TOKEN || github.token }} | |
| GITHUB_TOKEN: ${{ secrets.GH_TOKEN || github.token }} | |
| run: | | |
| npm install --no-audit --no-fund | |
| bare-make generate | |
| bare-make build | |
| bare-make install | |
| - name: Re-link workspace addon (addon-source A/B) | |
| if: matrix.source == 'addon-source' | |
| shell: bash | |
| working-directory: ${{ env.WORKDIR }} | |
| run: npm install --no-audit --no-fund --install-links ../../ | |
| - name: Install benchmark deps (addon from npm) | |
| if: matrix.source != 'addon-source' | |
| shell: bash | |
| working-directory: ${{ env.WORKDIR }} | |
| run: npm install --no-audit --no-fund | |
| # Vulkan SDK is pre-installed on the self-hosted GPU runners | |
| # (qvac-ubuntu2404-x64-gpu, qvac-win25-x64-gpu) at the well-known | |
| # paths shown below — same convention reusable-prebuilds.yml uses. | |
| # macOS GPU goes through Metal and needs no SDK. | |
| - name: Configure Vulkan SDK env (Linux GPU) | |
| if: matrix.backend == 'gpu' && matrix.platform == 'linux-x64' | |
| shell: bash | |
| run: | | |
| echo "VULKAN_SDK=/opt/vulkansdk/x86_64" >> "$GITHUB_ENV" | |
| echo "/opt/vulkansdk/x86_64/bin" >> "$GITHUB_PATH" | |
| - name: Configure Vulkan SDK env (Windows GPU) | |
| if: matrix.backend == 'gpu' && matrix.platform == 'windows-x64' | |
| shell: bash | |
| run: | | |
| # Single-quoted to preserve the backslash literally. | |
| echo 'VULKAN_SDK=C:\VulkanSDK' >> "$GITHUB_ENV" | |
| # The self-hosted qvac-win25-x64-gpu runner doesn't ship cmake on | |
| # PATH (windows-latest does, via the bundled VS install). Drop a | |
| # Kitware build in front of PATH so both Windows runners look the | |
| # same to build-cli-sources.js. Matches the bootstrap pattern in | |
| # pr-test-inference-addon-cpp-js.yml. Addon legs skip this — they | |
| # use the npm prebuild and never call cmake. | |
| - name: Setup CMake (Windows CLI builds) | |
| if: matrix.platform == 'windows-x64' && matrix.source != 'addon' | |
| shell: bash | |
| working-directory: ${{ runner.temp }} | |
| run: | | |
| curl -L https://github.com/Kitware/CMake/releases/download/v3.31.6/cmake-3.31.6-windows-x86_64.zip -o cmake.zip | |
| unzip -q cmake.zip | |
| echo "$PWD/cmake-3.31.6-windows-x86_64/bin" >> "$GITHUB_PATH" | |
# CLI cache + build apply only to the two native-CLI sources. The former
# guard (`matrix.source != 'addon'`) also matched the 'addon-source'
# cell, which then invoked build-cli-sources.js with
# --sources=addon-source --addon-source-ref=<upstream_ref>; that cell
# builds the addon via bare-make and has no CLI to build.
#
# NOTE(review): actions/cache is the only action in this workflow that is
# referenced by a mutable tag; pin it to a commit SHA like the others.
- name: Cache CLI builds
  if: matrix.source == 'fabric' || matrix.source == 'upstream'
  uses: actions/cache@v4
  with:
    path: ${{ env.WORKDIR }}/cli-builds
    key: vlm-cli-v3-${{ matrix.platform }}-${{ matrix.backend }}-${{ matrix.source }}-${{ matrix.source == 'fabric' && inputs.fabric_ref || inputs.upstream_ref }}
    restore-keys: |
      vlm-cli-v3-${{ matrix.platform }}-${{ matrix.backend }}-${{ matrix.source }}-
- name: Build CLI source
  if: matrix.source == 'fabric' || matrix.source == 'upstream'
  shell: bash
  working-directory: ${{ env.WORKDIR }}
  env:
    # Windows-CPU runs on windows-latest where the bundled VS
    # install provides MSVC; force the VS multi-config generator
    # so cmake doesn't fall back to MinGW. Matches cpp-tests-*.yml.
    CMAKE_GENERATOR: ${{ matrix.platform == 'windows-x64' && 'Visual Studio 17 2022' || '' }}
    # User-supplied refs are handed over as environment variables, never
    # spliced into the script text, so a ref containing shell
    # metacharacters cannot inject commands or be word-split.
    # Note: the `a && b || c` form falls through to upstream_ref when
    # fabric_ref is blank.
    CLI_SOURCE: ${{ matrix.source }}
    CLI_BACKEND: ${{ matrix.backend }}
    CLI_REF: ${{ matrix.source == 'fabric' && inputs.fabric_ref || inputs.upstream_ref }}
  run: |
    node scripts/build-cli-sources.js \
      "--sources=${CLI_SOURCE}" \
      "--${CLI_SOURCE}-ref=${CLI_REF}" \
      "--backend=${CLI_BACKEND}"
| - name: Prepare models | |
| shell: bash | |
| working-directory: ${{ env.WORKDIR }} | |
| run: npm run prepare:models | |
# Run the benchmark for this cell. Free-form workflow inputs
# (warmup_runs / measured_runs) reach the script through env vars and are
# quoted, so their contents cannot alter the command line.
- name: Run VLM benchmark
  shell: bash
  working-directory: ${{ env.WORKDIR }}
  env:
    CELL_SOURCE: ${{ matrix.source }}
    CELL_BACKEND: ${{ matrix.backend }}
    INPUT_WARMUP_RUNS: ${{ inputs.warmup_runs }}
    INPUT_MEASURED_RUNS: ${{ inputs.measured_runs }}
  run: |
    if [[ "$CELL_SOURCE" == "addon-source" ]]; then
      # Same-runner A/B: run npm addon (from the snapshot) AND
      # the freshly-built source addon (workspace via @qvac/...)
      # back-to-back in one process so they hit identical hardware.
      source_args=(
        --sources=addon,addon-source
        --addon-path=/tmp/npm-snapshot/node_modules/@qvac/llm-llamacpp
      )
    else
      source_args=("--sources=${CELL_SOURCE}")
    fi

    npm run run:vlm-bench -- \
      "${source_args[@]}" \
      "--backend=${CELL_BACKEND}" \
      --force-gpu-row \
      "--warmup-runs=${INPUT_WARMUP_RUNS}" \
      "--measured-runs=${INPUT_MEASURED_RUNS}"
| - name: Upload per-platform results | |
| if: always() | |
| uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0 | |
| with: | |
| name: vlm-perf-${{ matrix.platform }}-${{ matrix.backend }}-${{ matrix.source }}-${{ github.run_number }} | |
| path: | | |
| ${{ env.WORKDIR }}/results/vlm-perf-*.md | |
| ${{ env.WORKDIR }}/results/vlm-perf-*.json | |
| ${{ env.WORKDIR }}/results/cell-*-stderr.log | |
| retention-days: 14 | |
| if-no-files-found: warn | |
| # ── Android (stub) ───────────────────────────────────────────────── | |
| # Earlier iteration tried to reuse integration-mobile-test-llm- | |
| # llamacpp.yml in perf-only mode, but the existing mobile workflow | |
| # is built for breadth (Android + iOS matrix, 3+12 Device-Farm | |
| # sessions covering many tests) — one full invocation took ~20 min | |
| # of mostly-irrelevant work for our use case. Until we either land a | |
| # leaner mobile workflow that runs just our benchmark, or bundle our | |
| # benchmark logic into the existing mobile test app, this job is a | |
| # placeholder so the workflow shape covers Android. | |
| # | |
| # Default is OFF — flip run_android to true to see the marker | |
| # artifact and confirm wiring. | |
| android: | |
| needs: context | |
| if: inputs.run_android | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 5 | |
| steps: | |
| - name: Stub notice | |
| shell: bash | |
| run: | | |
| mkdir -p android-stub | |
| cat > android-stub/README.txt <<'EOF' | |
| Android VLM benchmark - placeholder | |
| =================================== | |
| The full Android benchmark is not yet wired. The existing | |
| mobile workflow (integration-mobile-test-llm-llamacpp.yml) | |
| runs the broader integration test suite and is too heavy | |
| for the one-cell VLM benchmark; a dedicated leaner mobile | |
| path is planned. | |
| For Android perf numbers right now, run | |
| Actions -> Benchmark Performance (LLM) -> Run workflow | |
| (workflow file: benchmark-performance-infer-llm-llamacpp.yml) | |
| EOF | |
| echo "Android benchmark is a stub in this iteration." | |
| cat android-stub/README.txt | |
| - name: Upload Android stub marker | |
| uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0 | |
| with: | |
| name: vlm-perf-android-${{ github.run_number }} | |
| path: android-stub/ | |
| retention-days: 14 | |
| # ── Summarize ────────────────────────────────────────────────────── | |
| # Downloads every desktop artifact and renders a consolidated table. | |
| # `if: always()` keeps the summary going when one matrix leg fails. | |
| summarize: | |
| needs: | |
| - context | |
| - desktop | |
| - android | |
| if: always() && needs.context.result == 'success' && needs.context.outputs.desktop_count != '0' | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 10 | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2 | |
| with: | |
| repository: ${{ needs.context.outputs.repository }} | |
| ref: ${{ needs.context.outputs.ref }} | |
| sparse-checkout: | | |
| packages/llm-llamacpp/benchmarks/vlm-performance/scripts | |
| sparse-checkout-cone-mode: false | |
| - name: Setup Node.js | |
| uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # 6.3.0 | |
| with: | |
| node-version: 22 | |
| - name: Download desktop per-platform artifacts | |
| uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1 | |
| with: | |
| pattern: vlm-perf-*-${{ github.run_number }} | |
| path: per-platform | |
# Write commit-info.json for aggregate-platforms.js.
# Commit subjects are arbitrary text. Templating them into an unquoted
# heredoc lets the shell expand `$(...)`, backticks and `$VAR` inside the
# title. Passing every value through env + `jq --arg` keeps them as data
# and lets jq do the JSON escaping.
- name: Build commit-info JSON
  shell: bash
  env:
    HEAD_SHA: ${{ needs.context.outputs.head_sha }}
    HEAD_TITLE: ${{ needs.context.outputs.head_title }}
    HEAD_DATE: ${{ needs.context.outputs.head_date }}
    BASE_SHA: ${{ needs.context.outputs.merge_base }}
    BASE_TITLE: ${{ needs.context.outputs.base_title }}
    BASE_DATE: ${{ needs.context.outputs.base_date }}
  run: |
    jq -n \
      --arg head_sha "$HEAD_SHA" \
      --arg head_title "$HEAD_TITLE" \
      --arg head_date "$HEAD_DATE" \
      --arg base_sha "$BASE_SHA" \
      --arg base_title "$BASE_TITLE" \
      --arg base_date "$BASE_DATE" \
      '{
        head: {sha: $head_sha, title: $head_title, date: $head_date},
        merge_base: {sha: $base_sha, title: $base_title, date: $base_date},
        comparison_mode: "source-engines"
      }' > commit-info.json
    cat commit-info.json
| - name: Aggregate into one report | |
| shell: bash | |
| run: | | |
| mkdir -p consolidated | |
| node packages/llm-llamacpp/benchmarks/vlm-performance/scripts/aggregate-platforms.js \ | |
| --inputs=per-platform \ | |
| --commit-info=commit-info.json \ | |
| --output-md=consolidated/vlm-perf-consolidated.md \ | |
| --output-json=consolidated/vlm-perf-consolidated.json | |
# Append the consolidated report (minus its first line) to the job
# summary, or a placeholder line when aggregation produced nothing.
- name: Post step summary
  if: always()
  shell: bash
  run: |
    report=consolidated/vlm-perf-consolidated.md
    {
      printf '## VLM Benchmark - Consolidated\n\n'
      if [[ -f "$report" ]]; then
        tail -n +2 "$report"
      else
        echo "No consolidated report generated."
      fi
    } >> "$GITHUB_STEP_SUMMARY"
# If an open PR exists whose head branch is the benchmarked ref, post the
# consolidated report (minus its first line) there as a comment.
- name: Post PR comment (View 2 summary)
  if: always() && hashFiles('consolidated/vlm-perf-consolidated.md') != ''
  shell: bash
  env:
    GH_TOKEN: ${{ github.token }}
    REF: ${{ needs.context.outputs.ref }}
  run: |
    report=consolidated/vlm-perf-consolidated.md
    body=/tmp/pr-comment-body.md
    run_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"

    # A failed lookup is treated the same as "no PR": best-effort only.
    PR_NUMBER=$(gh pr list --head "$REF" --state open --json number --jq '.[0].number' 2>/dev/null || echo "")

    if [[ -n "$PR_NUMBER" ]]; then
      echo "Posting VLM benchmark summary to PR #$PR_NUMBER"
      {
        printf '## VLM Benchmark Summary\n\n'
        printf '_Run #%s — [full report](%s)_\n\n' "${{ github.run_number }}" "$run_url"
        tail -n +2 "$report"
      } > "$body"
      gh pr comment "$PR_NUMBER" --body-file "$body"
    else
      echo "No open PR found for ref $REF — skipping PR comment."
    fi
| - name: Upload consolidated report | |
| if: always() | |
| uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0 | |
| with: | |
| name: vlm-perf-consolidated-${{ github.run_number }} | |
| path: consolidated/ | |
| retention-days: 30 | |
| if-no-files-found: warn | |
| # ── Matrix mode: Linux legs ──────────────────────────────────────── | |
| # Runs the @qvac/llm-llamacpp addon (published linux-x64 prebuild, which | |
| # is CPU + Vulkan-GPU capable) over the vlm-matrix fixture. Branch JS | |
| # (harness/config/fixture) + published native prebuild. One leg per | |
| # backend; each emits [VLMROW]/[VLMSEG]/[VLMMETA] markers to a log that | |
| # matrix-combine aggregates with vlm-matrix/aggregate.js. | |
| matrix-linux: | |
| needs: context | |
| if: needs.context.outputs.linux_count != '0' | |
| name: vlm-matrix-linux-${{ matrix.backend }} | |
| runs-on: ${{ matrix.runner }} | |
| # several-sources builds two CLIs from source (first run, pre-cache) + per-image | |
| # CLI model reloads, so allow more wall-clock than the addon-only two-models path. | |
| timeout-minutes: 120 | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| include: ${{ fromJSON(needs.context.outputs.linux_matrix) }} | |
| env: | |
| HF_TOKEN: ${{ secrets.HF_TOKEN }} | |
| WORKDIR: packages/llm-llamacpp | |
| steps: | |
| - name: Manual Workspace Cleanup | |
| if: startsWith(matrix.runner, 'qvac-') | |
| shell: bash | |
| run: rm -rf "$GITHUB_WORKSPACE" && mkdir -p "$GITHUB_WORKSPACE" | |
| - name: Checkout repository | |
| uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2 | |
| with: | |
| repository: ${{ needs.context.outputs.repository }} | |
| ref: ${{ needs.context.outputs.ref }} | |
| - name: Setup Node.js | |
| uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # 6.3.0 | |
| with: | |
| node-version: 22 | |
| - name: Install addon deps | |
| working-directory: ${{ env.WORKDIR }} | |
| shell: bash | |
| run: npm install --no-audit --no-fund | |
| - name: Install bare tooling | |
| shell: bash | |
| run: npm install -g --force bare bare-make bare-runtime bare-https brittle | |
| # Branch JS + published native prebuild: pull the linux-x64 prebuild | |
| # from the public npm package into the workspace so the harness's | |
| # require('../../index.js') loads. The published prebuild handles both | |
| # cpu and (Vulkan) gpu via runtime device selection. | |
| # Also exports ADDON_VERSION (the published package version) for later steps. | |
| - name: Fetch published prebuilds | |
| working-directory: ${{ env.WORKDIR }} | |
| shell: bash | |
| run: | | |
| # npm pack prints the tarball's filename as the last line of stdout. Use that | |
| # exact name instead of a *.tgz glob, so any other tarball sitting in WORKDIR | |
| # is neither handed to tar as a member name nor deleted by the cleanup below. | |
| TARBALL=$(npm pack @qvac/llm-llamacpp@latest | tail -n 1) | |
| tar -xzf "$TARBALL" | |
| ADDON_VER=$(node -e "console.log(require('./package/package.json').version)") | |
| echo "ADDON_VERSION=$ADDON_VER" >> "$GITHUB_ENV" | |
| # Replace any prebuilds already in the workspace with the published ones. | |
| rm -rf prebuilds | |
| mv package/prebuilds ./prebuilds | |
| rm -rf package "$TARBALL" | |
| echo "addon @qvac/llm-llamacpp@$ADDON_VER" | |
| ls -la prebuilds/ | |
| # GPU runners ship the Vulkan SDK at the well-known path; stamp the | |
| # env so the addon's loader finds it. Skipped on non-gpu legs. | |
| # Writes to GITHUB_ENV / GITHUB_PATH apply to the steps that follow this one. | |
| - name: Configure Vulkan SDK env (Linux GPU) | |
| if: matrix.backend == 'gpu' | |
| shell: bash | |
| run: | | |
| echo "VULKAN_SDK=/opt/vulkansdk/x86_64" >> "$GITHUB_ENV" | |
| echo "/opt/vulkansdk/x86_64/bin" >> "$GITHUB_PATH" | |
| # Runs the matrix test under bare and tees its combined stdout/stderr into | |
| # vlm-matrix-linux-<backend>.log (in WORKDIR), the file uploaded and aggregated later. | |
| # The harness is configured purely through the QVAC_VLM_* / NO_GPU variables below. | |
| - name: Run VLM matrix | |
| working-directory: ${{ env.WORKDIR }} | |
| shell: bash | |
| env: | |
| QVAC_VLM_MATRIX: "1" | |
| QVAC_VLM_MODE: ${{ inputs.matrix_mode }} | |
| QVAC_VLM_PRESET: ${{ inputs.matrix_preset }} | |
| QVAC_VLM_ENGINE: addon # this leg is always the 'addon' source | |
| QVAC_VLM_DEVICES: ${{ matrix.backend }} | |
| NO_GPU: ${{ matrix.no_gpu }} | |
| run: | | |
| # Regenerate the brittle runner to include ONLY the matrix test, | |
| # then run it under bare (same pattern as the perf-only path). | |
| # In several-sources mode this is the addon leg; fabric/upstream CLIs | |
| # are appended to the same log by a later step. | |
| npx brittle -r test/integration/all.js test/integration/vlm-matrix.test.js | |
| bare test/integration/all.js --exit 2>&1 | tee "vlm-matrix-linux-${{ matrix.backend }}.log" | |
| # Report bare's exit code (first pipeline member), not tee's. | |
| exit ${PIPESTATUS[0]} | |
| # several-sources only: build native fabric/upstream llama-mtmd-cli and run | |
| # them over the SAME fixture, appending [VLMROW]/[VLMSEG] markers to the addon | |
| # log so aggregate.js renders a 3-source comparison. Linux-only. | |
| # This step caches the CLI build directory; the key changes with the backend | |
| # and with either ref input, so a different ref gets its own cache entry. | |
| # Pinned to a full commit SHA like every other action in this workflow | |
| # (a mutable tag such as @v4 can be re-pointed upstream). | |
| - name: Cache CLI builds (several-sources) | |
| if: inputs.matrix_mode == 'several-sources' | |
| uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # 4.2.3 | |
| with: | |
| path: ${{ env.WORKDIR }}/benchmarks/vlm-performance/cli-builds | |
| key: vlm-cli-v3-linux-${{ matrix.backend }}-${{ inputs.fabric_ref }}-${{ inputs.upstream_ref }} | |
| # Builds the fabric and upstream CLIs, then runs each over the fixture and | |
| # appends its output to the addon leg's log ($LOG). A failing CLI run only | |
| # raises a workflow warning; it does not fail the step. | |
| - name: Build + run fabric/upstream CLIs over the fixture (several-sources) | |
| if: inputs.matrix_mode == 'several-sources' | |
| working-directory: ${{ env.WORKDIR }}/benchmarks/vlm-performance | |
| shell: bash | |
| env: | |
| LOG: ${{ github.workspace }}/${{ env.WORKDIR }}/vlm-matrix-linux-${{ matrix.backend }}.log | |
| MODEL_DIR: ${{ github.workspace }}/${{ env.WORKDIR }}/test/model | |
| # User-supplied refs and the matrix backend reach the script as environment | |
| # variables and are expanded quoted, so their text can neither be word-split | |
| # nor be interpreted as shell syntax. | |
| CLI_FABRIC_REF: ${{ inputs.fabric_ref }} | |
| CLI_UPSTREAM_REF: ${{ inputs.upstream_ref }} | |
| CLI_BACKEND: ${{ matrix.backend }} | |
| run: | | |
| npm install --no-audit --no-fund | |
| node scripts/build-cli-sources.js --sources=fabric,upstream \ | |
| --fabric-ref="$CLI_FABRIC_REF" --upstream-ref="$CLI_UPSTREAM_REF" \ | |
| --backend="$CLI_BACKEND" | |
| # The build script records the resulting binary paths in cli-sources-resolved.json. | |
| FABRIC_BIN=$(node -e "console.log(require('./cli-sources-resolved.json').fabric.binaryPath)") | |
| UPSTREAM_BIN=$(node -e "console.log(require('./cli-sources-resolved.json').upstream.binaryPath)") | |
| # Model files were downloaded by the addon leg (names from vlm-matrix.config.cjs). | |
| LLM="$MODEL_DIR/reg-qwen-unsloth-Q8_0.gguf" | |
| MMPROJ="$MODEL_DIR/reg-qwen-mradermacher-mmproj-Q8_0.gguf" | |
| # run_src <source-label> <display-name> <binary-path> | |
| # Runs one CLI over the fixture; stdout+stderr are appended to $LOG. | |
| run_src () { | |
| echo ">> $1 over fixture ($2)" | |
| node ../vlm-matrix/cli-fixture-runner.cjs \ | |
| --binary "$3" --source "$1" --llm "$LLM" --mmproj "$MMPROJ" \ | |
| --backend "$CLI_BACKEND" --samples 3 \ | |
| --tasks textvqa,vizwiz,gqa,docvqa,ai2d \ | |
| --main-origin "Qwen3.5-0.8B-Q8_0 (Registry)" \ | |
| --mmproj-origin "Qwen3.5-0.8B mmproj-Q8_0 (Registry)" >> "$LOG" 2>&1 || echo "::warning::$1 run had errors" | |
| } | |
| run_src fabric-cli fabric "$FABRIC_BIN" | |
| run_src upstream-cli upstream "$UPSTREAM_BIN" | |
| echo "appended fabric-cli + upstream-cli to $LOG" | |
| # HW/SW provenance so a reader can reproduce the numbers. Rendered in the | |
| # report's Details section (passed to aggregate.js via --provenance). | |
| # Runs with always() so the file is written even when an earlier step failed. | |
| # ADDON_VERSION comes from the 'Fetch published prebuilds' step; '?' is | |
| # printed when it is unset. Every probe silences its own stderr, so a missing | |
| # tool yields a blank or fallback field instead of an error message. | |
| - name: Gather provenance | |
| if: always() | |
| working-directory: ${{ env.WORKDIR }} | |
| shell: bash | |
| run: | | |
| F="prov-linux-${{ matrix.backend }}.md" | |
| { | |
| echo "**linux · ${{ matrix.backend }}** (runner \`${{ matrix.runner }}\`)" | |
| echo "- addon: \`@qvac/llm-llamacpp@${ADDON_VERSION:-?}\` (published prebuild)" | |
| echo "- git: \`${{ needs.context.outputs.head_sha }}\` (ref \`${{ needs.context.outputs.ref }}\`)" | |
| echo "- node: $(node -v 2>/dev/null) · bare: $(bare --version 2>/dev/null || echo n/a)" | |
| echo "- os: $(. /etc/os-release 2>/dev/null; echo "$PRETTY_NAME") $(uname -m)" | |
| echo "- cpu: $(lscpu 2>/dev/null | sed -n 's/^Model name:[[:space:]]*//p' | head -1) ($(nproc) cores)" | |
| echo "- ram: $(free -h 2>/dev/null | awk '/^Mem:/{print $2}')" | |
| # The GPU name is only probed on the gpu leg. | |
| if [ "${{ matrix.backend }}" = "gpu" ]; then | |
| echo "- gpu: $(vulkaninfo --summary 2>/dev/null | sed -n 's/.*deviceName[[:space:]]*=[[:space:]]*//p' | head -1 || echo '?')" | |
| fi | |
| } > "$F" | |
| cat "$F" | |
| # Publish this leg's log and provenance file as one artifact. The name carries | |
| # the backend and run number; matrix-combine downloads it by that pattern. | |
| # always() + if-no-files-found: warn keep a failed leg from failing here too. | |
| - name: Upload matrix log | |
| if: always() | |
| uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0 | |
| with: | |
| name: vlm-matrix-log-linux-${{ matrix.backend }}-${{ github.run_number }} | |
| path: | | |
| ${{ env.WORKDIR }}/vlm-matrix-linux-${{ matrix.backend }}.log | |
| ${{ env.WORKDIR }}/prov-linux-${{ matrix.backend }}.md | |
| retention-days: 14 | |
| if-no-files-found: warn | |
| # ── Matrix mode: Samsung S25 (AWS Device Farm) ───────────────────── | |
| # Reuses the Android-only mobile workflow to run the SAME matrix harness | |
| # on-device. qvac_perf_only restricts the run to perf-tests.json | |
| # (runVlmMatrixTest) → only the vlmMatrix group is scheduled. The active | |
| # preset on-device is config.defaultPreset (Device Farm forwards no custom | |
| # env), so vlm-matrix.config.cjs defaultPreset governs the S25 set. The | |
| # raw on-device log (bare_console.log) carries the [VLMROW] markers and is | |
| # uploaded by collect-and-upload-logs as console-logs-llamacpp-llm-Android. | |
| matrix-s25: | |
| needs: context | |
| # S25 runs the addon only; several-sources (native CLIs) is Linux-only. | |
| if: inputs.run_matrix && inputs.run_matrix_s25 && inputs.matrix_mode != 'several-sources' | |
| # Reusable workflow call: the job has no steps of its own. | |
| uses: ./.github/workflows/integration-mobile-test-llm-llamacpp.yml | |
| secrets: inherit | |
| with: | |
| # Same repository/ref as the Linux legs, resolved by the context job. | |
| ref: ${{ needs.context.outputs.ref }} | |
| repository: ${{ needs.context.outputs.repository }} | |
| qvac_perf_only: true | |
| # ── Matrix mode: combine ─────────────────────────────────────────── | |
| # Aggregates [VLMROW]/[VLMSEG]/[VLMMETA] markers from every matrix log | |
| # (Linux .log + S25 device log) into one quality+speed report via | |
| # vlm-matrix/aggregate.js, surfaced to the step summary + PR comment. | |
| # This is the mechanism that makes mobile (Device Farm) results visible. | |
| matrix-combine: | |
| needs: | |
| - context | |
| - matrix-linux | |
| - matrix-s25 | |
| # always() lets this job run even when matrix-linux or matrix-s25 failed or was | |
| # skipped; it still requires matrix mode to be enabled and the context job to | |
| # have succeeded, because its outputs (repository/ref) are used below. | |
| if: always() && inputs.run_matrix && needs.context.result == 'success' | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 10 | |
| steps: | |
| # Sparse checkout: only the vlm-matrix benchmark directory is needed here, | |
| # since it holds the aggregate.js that the aggregation step runs. | |
| - name: Checkout repository | |
| uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2 | |
| with: | |
| repository: ${{ needs.context.outputs.repository }} | |
| ref: ${{ needs.context.outputs.ref }} | |
| sparse-checkout: | | |
| packages/llm-llamacpp/benchmarks/vlm-matrix | |
| sparse-checkout-cone-mode: false | |
| # Node 22 runs aggregate.js in the aggregation step. | |
| - name: Setup Node.js | |
| uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # 6.3.0 | |
| with: | |
| node-version: 22 | |
| # Fetch every vlm-matrix-log-*-<run_number> artifact (uploaded by the | |
| # matrix-linux legs) into matrix-logs/; the aggregation step locates the | |
| # individual files with find. | |
| - name: Download Linux matrix logs | |
| uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1 | |
| with: | |
| pattern: vlm-matrix-log-*-${{ github.run_number }} | |
| path: matrix-logs | |
| # S25 raw device logs (these hold the [VLMROW] markers). The pattern matches | |
| # the console-logs-llamacpp-llm-Android artifact produced by the mobile workflow. | |
| # continue-on-error so a Linux-only run (no S25 artifact) still combines. | |
| - name: Download S25 device logs | |
| continue-on-error: true | |
| uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1 | |
| with: | |
| pattern: console-logs-* | |
| path: matrix-logs | |
| # Collects every downloaded matrix log plus provenance file and renders one | |
| # consolidated report with aggregate.js. When no log is found it writes a | |
| # one-line placeholder report instead. | |
| - name: Aggregate matrix logs | |
| shell: bash | |
| env: | |
| # Workflow inputs reach the script as environment variables (expanded quoted | |
| # below) so their text cannot be interpreted as shell syntax. | |
| MATRIX_MODE: ${{ inputs.matrix_mode }} | |
| MATRIX_PRESET: ${{ inputs.matrix_preset }} | |
| MATRIX_ENGINE: ${{ inputs.matrix_engine }} | |
| run: | | |
| mkdir -p consolidated | |
| # Tag each input with its platform host so S25 rows don't collapse | |
| # onto the Linux rows ([VLMROW].device is only cpu/gpu). | |
| # Arguments are collected in arrays so a path containing whitespace stays | |
| # a single argument when passed to aggregate.js. | |
| ARGS=() | |
| # Linux legs: one log per backend (device field carries cpu/gpu). | |
| while IFS= read -r f; do | |
| ARGS+=(--in linux "$f") | |
| done < <(find matrix-logs -name 'vlm-matrix-linux-*.log' 2>/dev/null | sort) | |
| # S25: the Samsung device's full logcat carries the [VLMROW] markers | |
| # (the Android pool may also include a Pixel; we surface S25 here). | |
| while IFS= read -r f; do | |
| ARGS+=(--in s25 "$f") | |
| done < <(find matrix-logs -name '*Galaxy_S25*logcat_full*' 2>/dev/null | sort) | |
| # HW/SW provenance: Linux legs ship prov-linux-*.md; synthesize one for S25. | |
| PROV=() | |
| while IFS= read -r p; do | |
| PROV+=(--provenance "$p") | |
| done < <(find matrix-logs -name 'prov-*.md' 2>/dev/null | sort) | |
| # S25 hardware provenance, parsed from the device's own logcat (model / | |
| # Android / ABI from the Play-store UA line, RAM from the JS totalMemory | |
| # line, GPU from the Adreno-Vulkan driver load). | |
| # This shell runs with errexit + pipefail, so every best-effort lookup ends | |
| # in `|| true`: a missing matrix-logs directory or a grep with no match must | |
| # fall through to the defaults below rather than abort the step. | |
| S25F=$(find matrix-logs -name '*Galaxy_S25*logcat_full*' 2>/dev/null | head -1 || true) | |
| if [ -n "$S25F" ]; then | |
| MODEL=$(grep -oE 'model=SM-[A-Z0-9]+' "$S25F" | head -1 | cut -d= -f2 || true) | |
| ANDROID=$(grep -oE 'platformVersionRelease=[0-9]+' "$S25F" | head -1 | cut -d= -f2 || true) | |
| ABI=$(grep -oE 'supportedAbis=[a-z0-9-]+' "$S25F" | head -1 | cut -d= -f2 || true) | |
| RAMB=$(grep -oE 'totalMemory: [0-9]+' "$S25F" | head -1 | grep -oE '[0-9]+$' || true) | |
| RAMGB=$(awk -v b="${RAMB:-0}" 'BEGIN{ if (b>0) printf "%.1f GB", b/1073741824; else printf "?" }') | |
| GPU=$(grep -qiE 'AdrenoVK|vulkan\.adreno' "$S25F" && echo 'Adreno (Vulkan)' || echo '?') | |
| { | |
| echo "**s25** — Samsung Galaxy S25 Ultra (AWS Device Farm)" | |
| echo "- device: ${MODEL:-SM-?} · Android ${ANDROID:-?} · ${ABI:-arm64-v8a}" | |
| echo "- ram: ${RAMGB} · gpu: ${GPU}" | |
| echo "- engine: \`@qvac/llm-llamacpp\` addon (published prebuild)" | |
| } > prov-s25.md | |
| PROV+=(--provenance prov-s25.md) | |
| fi | |
| echo "aggregate inputs: ${ARGS[*]}" | |
| echo "provenance: ${PROV[*]}" | |
| if [ "${#ARGS[@]}" -eq 0 ]; then | |
| echo "> No VLM matrix logs found for run #${{ github.run_number }}." > consolidated/vlm-matrix-consolidated.md | |
| else | |
| node packages/llm-llamacpp/benchmarks/vlm-matrix/aggregate.js \ | |
| --title "VLM Matrix — ${MATRIX_MODE} / ${MATRIX_PRESET} (run #${{ github.run_number }})" \ | |
| --mode "$MATRIX_MODE" --engine "$MATRIX_ENGINE" --base f16 --candidate q8 \ | |
| --out consolidated/vlm-matrix-consolidated.md \ | |
| "${PROV[@]}" "${ARGS[@]}" | |
| fi | |
| # Append the consolidated report to the job's step summary. Runs with | |
| # always(); when the report file is missing, a placeholder line is written instead. | |
| - name: Post step summary | |
| if: always() | |
| shell: bash | |
| run: | | |
| { | |
| echo "# VLM Matrix — Consolidated" | |
| echo "" | |
| cat consolidated/vlm-matrix-consolidated.md 2>/dev/null || echo "No consolidated matrix report generated." | |
| } >> "$GITHUB_STEP_SUMMARY" | |
| # Post the consolidated report as a comment on the open PR whose head matches | |
| # REF. Runs only when the report file exists (hashFiles returns '' otherwise). | |
| # A failed or empty PR lookup ends the step successfully without commenting. | |
| - name: Post PR comment | |
| if: always() && hashFiles('consolidated/vlm-matrix-consolidated.md') != '' | |
| shell: bash | |
| env: | |
| GH_TOKEN: ${{ github.token }} | |
| REF: ${{ needs.context.outputs.ref }} | |
| run: | | |
| # Any gh error is mapped to an empty PR number so the guard below skips. | |
| PR_NUMBER=$(gh pr list --head "$REF" --state open --json number --jq '.[0].number' 2>/dev/null || echo "") | |
| if [[ -z "$PR_NUMBER" ]]; then | |
| echo "No open PR found for ref $REF — skipping PR comment." | |
| exit 0 | |
| fi | |
| # Comment body: heading, link back to this workflow run, then the report. | |
| { | |
| echo "## VLM Matrix Benchmark" | |
| echo "" | |
| echo "_Run #${{ github.run_number }} — [full report](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})_" | |
| echo "" | |
| cat consolidated/vlm-matrix-consolidated.md | |
| } > /tmp/matrix-comment.md | |
| gh pr comment "$PR_NUMBER" --body-file /tmp/matrix-comment.md | |
| # Keep the whole consolidated/ directory as an artifact for 30 days; an empty | |
| # directory only produces a warning. | |
| - name: Upload consolidated matrix report | |
| if: always() | |
| uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0 | |
| with: | |
| name: vlm-matrix-consolidated-${{ github.run_number }} | |
| path: consolidated/ | |
| retention-days: 30 | |
| if-no-files-found: warn | |