Skip to content

Benchmark (LLM)

Benchmark (LLM) #48

name: Benchmark VLM (LLM)
# Manually-triggered VLM benchmark. Runs
# packages/llm-llamacpp/benchmarks/vlm-performance against Qwen3.5-VL
# on a fixed object-listing task and uploads a consolidated report.
#
# 3-source comparison: addon (JS binding) vs fabric-cli (fork CLI) vs
# upstream-cli (upstream llama.cpp CLI). Measures JS binding overhead
# and fork divergence using the same model across all sources.
on:
  workflow_dispatch:
    inputs:
      # ---- Sources: which engines take part in the comparison ----------
      run_addon:
        type: boolean
        default: true
        required: false
        description: "── SOURCE 1 ── addon (@qvac/llm-llamacpp JS binding)"
      ref:
        type: string
        required: false
        description: " addon ref — qvac branch / tag / SHA (default: current branch)"
      addon_from_source:
        type: boolean
        default: false
        required: false
        description: " build addon from source (slow, but uses latest fabric)"
      run_addon_source:
        type: boolean
        default: false
        required: false
        description: " A/B: also build addon from source with the vcpkg overlay applied, run alongside npm addon in one cell (same runner). x86-CPU cells only."
      addon_source_overlay:
        type: boolean
        default: true
        required: false
        description: " apply the vcpkg overlay during the addon-source build (default true). Set false to rule out 'is it the overlay or something else?'"
      run_fabric_cli:
        type: boolean
        default: true
        required: false
        description: "── SOURCE 2 ── fabric (qvac fork, native CLI)"
      fabric_ref:
        type: string
        default: "v8189.0.2"
        required: false
        description: " fabric ref (default: v8189.0.2)"
      run_upstream_cli:
        type: boolean
        default: true
        required: false
        description: "── SOURCE 3 ── upstream (vanilla llama.cpp, native CLI)"
      upstream_ref:
        type: string
        default: "b8189"
        required: false
        description: " upstream ref (default: b8189)"
      # ---- Platforms x backends ----------------------------------------
      # A comma-separated list of cell tokens. Recognised tokens are
      # linux-cpu, linux-gpu, windows-cpu, windows-gpu and macos; the
      # single word "all" selects every desktop cell. Linux/Windows GPU
      # cells run on self-hosted Vulkan runners, macOS runs Metal on the
      # GitHub-hosted macos-15-xlarge runner.
      platforms:
        type: string
        default: "linux-cpu,linux-gpu,windows-cpu,windows-gpu,macos"
        required: false
        description: "── PLATFORMS ── e.g. linux-cpu,linux-gpu,windows-cpu,windows-gpu,macos (or 'all')"
      run_android:
        type: boolean
        default: false
        required: false
        description: "── PLATFORM ── Android (stub)"
      # ---- Run settings ------------------------------------------------
      # Iteration counts are strings; they are forwarded verbatim to the
      # benchmark's --warmup-runs / --measured-runs flags.
      warmup_runs:
        type: string
        default: "1"
        required: false
        description: "Warmup iterations (discarded)"
      measured_runs:
        type: string
        default: "3"
        required: false
        description: "Measured iterations (median reported)"
      # ---- Matrix mode (config-driven quality + speed matrix) ----------
      # Independent of the source-engines benchmark configured above.
      # When enabled, the @qvac/llm-llamacpp addon is run over the
      # vlm-matrix fixture (lmms-eval quality + vision-encode speed) on
      # Linux (and optionally S25), driven by
      # packages/llm-llamacpp/test/integration/vlm-matrix.config.cjs.
      run_matrix:
        type: boolean
        default: false
        required: false
        description: "── MATRIX ── run the config-driven VLM quality+speed matrix (addon, Linux + S25)"
      matrix_mode:
        type: string
        default: "two-models"
        required: false
        description: " matrix mode: two-models (f16 vs q8, addon) or several-sources (addon+fabric-cli+upstream-cli, Linux-only)"
      matrix_preset:
        type: string
        default: "compare"
        required: false
        description: " matrix preset: compare (two-models), sources (several-sources), smoke, or full. Overrides config.defaultPreset on Linux."
      matrix_engine:
        type: string
        default: "addon"
        required: false
        description: " inference engine (two-models mode): addon | fabric-cli | upstream-cli. CLI engines are desktop-only; addon runs everywhere."
      matrix_linux:
        type: string
        default: "linux-cpu,linux-gpu"
        required: false
        description: " Linux matrix legs, comma-sep: linux-cpu,linux-gpu"
      run_matrix_s25:
        type: boolean
        default: false
        required: false
        description: " also run the matrix on Samsung S25 (AWS Device Farm)"
# Token scopes granted to every job in this workflow.
permissions:
  # The summarize job posts the consolidated report with `gh pr comment`.
  pull-requests: write
  # NOTE(review): presumably needed by the reusable mobile workflow
  # (cloud credentials via OIDC) — confirm against that workflow.
  id-token: write
  # Everything else is read-only.
  packages: read
  contents: read
jobs:
# ── Context ────────────────────────────────────────────────────────
# Resolves the repo + ref so downstream jobs check out the right
# commit even when the workflow_dispatch is invoked without `ref`,
# and builds the desktop matrix from the per-platform input toggles.
# Matrix is computed here (instead of via job-level `if:`) because
# GitHub Actions doesn't allow `matrix.*` references in job-level
# conditions — they're evaluated before the matrix is expanded.
context:
  runs-on: ubuntu-latest
  outputs:
    # Checkout target shared by every downstream job.
    repository: ${{ steps.ctx.outputs.repository }}
    ref: ${{ steps.ctx.outputs.ref }}
    # Dynamic matrices (JSON arrays) plus their lengths; the lengths
    # are what the job-level `if:` guards compare against '0'.
    desktop_matrix: ${{ steps.matrix.outputs.value }}
    desktop_count: ${{ steps.matrix.outputs.count }}
    linux_matrix: ${{ steps.lmatrix.outputs.value }}
    linux_count: ${{ steps.lmatrix.outputs.count }}
    # Commit metadata for the candidate ref ...
    head_sha: ${{ steps.commits.outputs.head_sha }}
    head_title: ${{ steps.commits.outputs.head_title }}
    head_date: ${{ steps.commits.outputs.head_date }}
    # ... and for its merge-base with origin/main.
    merge_base: ${{ steps.commits.outputs.merge_base }}
    base_title: ${{ steps.commits.outputs.base_title }}
    base_date: ${{ steps.commits.outputs.base_date }}
  steps:
# Publishes the repository and the ref to check out. The `ref` input
# wins when it is non-empty; otherwise the ref the workflow was
# dispatched on is used.
- id: ctx
  shell: bash
  env:
    REPO: ${{ github.repository }}
    REF_NAME: ${{ github.ref_name }}
    INPUT_REF: ${{ inputs.ref }}
  run: |
    {
      echo "repository=$REPO"
      echo "ref=${INPUT_REF:-$REF_NAME}"
    } >> "$GITHUB_OUTPUT"
# Builds the desktop job matrix: one cell per (platform token, source)
# pair that survives the filters below. Emits the JSON array as `value`
# and its length as `count`.
- id: matrix
  shell: bash
  env:
    PLATFORMS: ${{ inputs.platforms }}
    RUN_ADDON: ${{ inputs.run_addon }}
    RUN_FABRIC: ${{ inputs.run_fabric_cli }}
    RUN_UPSTREAM: ${{ inputs.run_upstream_cli }}
    RUN_ADDON_SOURCE: ${{ inputs.run_addon_source }}
  run: |
    # Enabled sources, in a fixed order. addon-source is the opt-in
    # fourth source: the addon built from local sources (vcpkg overlay
    # applied) and benchmarked in the same cell as the npm addon.
    enabled_sources=()
    if [[ "$RUN_ADDON" == "true" ]]; then enabled_sources+=("addon"); fi
    if [[ "$RUN_FABRIC" == "true" ]]; then enabled_sources+=("fabric"); fi
    if [[ "$RUN_UPSTREAM" == "true" ]]; then enabled_sources+=("upstream"); fi
    if [[ "$RUN_ADDON_SOURCE" == "true" ]]; then enabled_sources+=("addon-source"); fi

    # Succeeds for the x86 CPU cells, the only ones where the
    # addon-source A/B is emitted (llamafile is x86-CPU specific).
    is_cpu_cell() {
      [[ "$1" == "linux-cpu" || "$1" == "windows-cpu" ]]
    }

    # Platform selection; "all" is shorthand for every desktop token.
    wanted="${PLATFORMS:-linux-cpu}"
    if [[ "$wanted" == "all" ]]; then
      wanted="linux-cpu,linux-gpu,windows-cpu,windows-gpu,macos"
    fi

    matrix_json='[]'
    IFS=',' read -ra requested <<< "$wanted"
    for item in "${requested[@]}"; do
      # xargs strips the surrounding whitespace from the token.
      token=$(echo "$item" | xargs)

      # Every token maps to a fixed (platform, arch, backend, runner).
      case "$token" in
        linux-cpu)
          plat=linux-x64 arch=x64 backend=cpu runner=ubuntu-latest ;;
        linux-gpu)
          plat=linux-x64 arch=x64 backend=gpu runner=qvac-ubuntu2404-x64-gpu ;;
        windows-cpu)
          plat=windows-x64 arch=x64 backend=cpu runner=windows-latest ;;
        windows-gpu)
          plat=windows-x64 arch=x64 backend=gpu runner=qvac-win25-x64-gpu ;;
        macos)
          plat=macos-arm64 arch=arm64 backend=gpu runner=macos-15-xlarge ;;
        "")
          continue ;;
        *)
          echo "::warning::Unknown platform token '$token' (known: linux-cpu, linux-gpu, windows-cpu, windows-gpu, macos)"
          continue ;;
      esac

      for src in "${enabled_sources[@]}"; do
        # windows-gpu: addon only. The self-hosted runner has no MSVC
        # and LLVM cannot be installed at job time, so the native CLI
        # builds are not viable there yet.
        if [[ "$token" == "windows-gpu" && "$src" != "addon" ]]; then
          continue
        fi
        # addon-source: CPU cells only; GPU cells run the matmul path
        # through Vulkan/Metal shaders that llamafile does not affect.
        if [[ "$src" == "addon-source" ]] && ! is_cpu_cell "$token"; then
          continue
        fi
        # With the A/B enabled, the addon-source cell already runs
        # --sources=addon,addon-source in one process, so a separate
        # 'addon' cell would only add a duplicate row.
        if [[ "$src" == "addon" && "$RUN_ADDON_SOURCE" == "true" ]] && is_cpu_cell "$token"; then
          continue
        fi

        matrix_json=$(echo "$matrix_json" | jq -c \
          --arg p "$plat" --arg a "$arch" --arg b "$backend" --arg r "$runner" --arg s "$src" \
          '. + [{"platform":$p,"arch":$a,"backend":$b,"runner":$r,"source":$s}]')
      done
    done

    total=$(echo "$matrix_json" | jq 'length')
    {
      echo "value=$matrix_json"
      echo "count=$total"
    } >> "$GITHUB_OUTPUT"
    echo "Desktop matrix ($total entries): $matrix_json"
# Linux legs for the config-driven matrix mode (addon over the
# vlm-matrix fixture). Independent of the source-engines matrix above.
# Builds the Linux leg list for matrix mode. The result stays an empty
# array (count 0) unless run_matrix is true.
- id: lmatrix
  shell: bash
  env:
    RUN_MATRIX: ${{ inputs.run_matrix }}
    MATRIX_MODE: ${{ inputs.matrix_mode }}
    MATRIX_LINUX: ${{ inputs.matrix_linux }}
  run: |
    legs='[]'
    if [[ "$RUN_MATRIX" == "true" ]]; then
      # several-sources builds the native fabric/upstream CLIs, so its
      # CPU leg needs a runner with cmake + toolchain: the GitHub-hosted
      # ubuntu-latest rather than the self-hosted CPU runner.
      if [[ "$MATRIX_MODE" == "several-sources" ]]; then
        cpu_runner=ubuntu-latest
      else
        cpu_runner=qvac-ubuntu2204-x64
      fi

      IFS=',' read -ra requested <<< "${MATRIX_LINUX:-linux-cpu}"
      for item in "${requested[@]}"; do
        token=$(echo "$item" | xargs)
        case "$token" in
          linux-cpu)
            backend=cpu
            runner=$cpu_runner
            no_gpu=true
            ;;
          linux-gpu)
            backend=gpu
            runner=qvac-ubuntu2404-x64-gpu
            no_gpu=false
            ;;
          "")
            continue
            ;;
          *)
            echo "::warning::Unknown matrix_linux token '$token' (known: linux-cpu, linux-gpu)"
            continue
            ;;
        esac
        # no_gpu is passed with --arg, so it lands in the JSON as a string.
        legs=$(echo "$legs" | jq -c \
          --arg b "$backend" --arg r "$runner" --arg n "$no_gpu" \
          '. + [{"backend":$b,"runner":$r,"no_gpu":$n}]')
      done
    fi

    total=$(echo "$legs" | jq 'length')
    {
      echo "value=$legs"
      echo "count=$total"
    } >> "$GITHUB_OUTPUT"
    echo "Linux matrix ($total entries): $legs"
# Resolve commit metadata so the consolidated report can show what
# the candidate ref + merge-base actually point at (hash, title,
# date). Needs a real clone — sparse checkout doesn't give us git
# history.
# Full-history clone (fetch-depth: 0) so merge-base can be computed.
- name: Checkout for commit lookup
  uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # 6.0.2
  with:
    fetch-depth: 0
    ref: ${{ steps.ctx.outputs.ref }}
    repository: ${{ steps.ctx.outputs.repository }}
# Publishes sha / subject / committer date for HEAD and for its
# merge-base with origin/main. The base_* fields are empty when no
# merge-base can be found.
- id: commits
  shell: bash
  run: |
    git fetch --quiet origin main

    head_sha=$(git rev-parse HEAD)
    head_title=$(git log -1 --pretty=%s "$head_sha")
    head_date=$(git log -1 --pretty=%cI "$head_sha")

    # A missing merge-base is tolerated; it just leaves the fields blank.
    merge_base=$(git merge-base HEAD origin/main || true)
    base_title=""
    base_date=""
    if [[ -n "$merge_base" ]]; then
      base_title=$(git log -1 --pretty=%s "$merge_base")
      base_date=$(git log -1 --pretty=%cI "$merge_base")
    fi

    {
      echo "head_sha=$head_sha"
      echo "merge_base=$merge_base"
      echo "head_title=$head_title"
      echo "head_date=$head_date"
      echo "base_title=$base_title"
      echo "base_date=$base_date"
    } >> "$GITHUB_OUTPUT"

    echo "HEAD: $head_sha - $head_title ($head_date)"
    echo "merge-base: $merge_base - $base_title ($base_date)"
# ── Desktop benchmark matrix ───────────────────────────────────────
# Each leg is the same shape — pick the runner via matrix.runner.
# GPU rows (linux-x64 / windows-x64) target self-hosted Vulkan
# runners pre-provisioned with the Vulkan SDK; macOS arm64 uses
# Metal on the GitHub-hosted macos-15-xlarge runner. The matrix
# itself is built dynamically by the context job above from the
# `platforms` input.
desktop:
  name: vlm-${{ matrix.platform }}-${{ matrix.backend }}-${{ matrix.source }}
  needs: context
  # An empty matrix would be an error, so skip the job when the context
  # job produced no cells.
  if: needs.context.outputs.desktop_count != '0'
  runs-on: ${{ matrix.runner }}
  timeout-minutes: 30
  strategy:
    # One failing cell must not cancel the rest of the comparison.
    fail-fast: false
    matrix:
      include: ${{ fromJSON(needs.context.outputs.desktop_matrix) }}
  env:
    # Benchmark package; most steps below run from this directory.
    WORKDIR: packages/llm-llamacpp/benchmarks/vlm-performance
    HF_TOKEN: ${{ secrets.HF_TOKEN }}
  steps:
- name: Checkout repository
  uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # 6.0.2
  with:
    ref: ${{ needs.context.outputs.ref }}
    repository: ${{ needs.context.outputs.repository }}
# Toolchain split: only the addon-source cell builds the addon from
# local sources (vcpkg overlay applied), so only it gets the full
# native toolchain (LLVM, vcpkg, bare-make, Vulkan SDK). The 'addon',
# 'fabric' and 'upstream' cells take the lighter plain-Node path.
- name: Setup Node.js and Bare tooling
  uses: ./.github/actions/setup-bare-tooling
  if: matrix.source == 'addon-source'
- name: Setup Node.js
  uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f  # 6.3.0
  if: matrix.source != 'addon-source'
  with:
    node-version: 22
- name: Setup LLVM
  uses: ./.github/actions/setup-llvm
  if: matrix.source == 'addon-source'
# Inline vcpkg bootstrap. We can't use the repo's setup-vcpkg
# composite action because it hard-requires MODEL_S3_BUCKET for
# the prebuilds-shared S3 binary cache — a secret we don't
# plumb into the bench workflow. Set VCPKG_ROOT to the runner's
# pre-installed vcpkg and leave VCPKG_BINARY_SOURCES at the
# default (per-runner disk cache), which is fine for a one-off
# bench build.
# Points later steps at the runner's pre-installed vcpkg and restricts
# it to release builds. Linux addon-source cells only.
- name: Configure vcpkg (addon-source)
  if: matrix.source == 'addon-source' && runner.os == 'Linux'
  shell: bash
  run: |
    {
      echo "VCPKG_ROOT=$VCPKG_INSTALLATION_ROOT"
      echo "VCPKG_BUILD_TYPE=release"
      echo "VCPKG_CMAKE_CONFIGURE_OPTIONS=--no-parallel-configure"
    } >> "$GITHUB_ENV"
# qvac-fabric defaults to the gpu-backends feature, which
# transitively requires the Vulkan SDK at build time. ubuntu-latest
# doesn't ship one; install upstream's prebuilt SDK and stamp the
# env so cmake's FindVulkan picks it up. Same install pattern
# benchmark-embed-llamacpp.yml uses. libvulkan-dev pulls
# libvulkan.so + libvulkan1 from the distro — the LunarG SDK
# 1.4.x no longer bundles the loader, so cmake's FindVulkan
# can't see Vulkan_LIBRARY without it.
# Installs distro build deps plus the LunarG SDK tarball under
# $HOME/vulkan, then exports the paths cmake / pkg-config look at.
# NOTE(review): the download URL tracks "latest", so the SDK version is
# not pinned and can change between runs.
- name: Install Vulkan SDK (addon-source on Linux)
  if: matrix.source == 'addon-source' && runner.os == 'Linux'
  shell: bash
  run: |
    sdk_archive=/tmp/vulkansdk.tar.xz
    sdk_home="$HOME/vulkan"
    sdk_root="$sdk_home/x86_64"

    sudo apt-get update
    sudo apt-get install -y libxi-dev libxtst-dev libxrandr-dev xz-utils libvulkan-dev

    wget -q -O "$sdk_archive" https://sdk.lunarg.com/sdk/download/latest/linux/vulkan_sdk.tar.xz
    mkdir -p "$sdk_home"
    tar -xf "$sdk_archive" -C "$sdk_home" --strip-components=1

    echo "$sdk_root/bin" >> "$GITHUB_PATH"
    {
      echo "VULKAN_SDK=$sdk_root"
      # Prepend to any existing value; the ${VAR:+:$VAR} form avoids a
      # dangling ':' when the variable is unset.
      echo "LD_LIBRARY_PATH=$sdk_root/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
      echo "PKG_CONFIG_PATH=$sdk_root/share/pkgconfig:$sdk_root/lib/pkgconfig${PKG_CONFIG_PATH:+:$PKG_CONFIG_PATH}"
    } >> "$GITHUB_ENV"
# Install npm @qvac/llm-llamacpp first so we have the published
# addon (no llamafile) on disk, then snapshot it to /tmp before
# the from-source build overwrites prebuilds/ in the workspace.
# The bench then runs both addon variants in this cell on the
# same runner: addon → /tmp/npm-addon-snapshot, addon-source →
# the workspace's freshly-built artifact.
- name: "Install benchmark deps (addon-source: npm baseline)"
  if: matrix.source == 'addon-source'
  working-directory: ${{ env.WORKDIR }}
  shell: bash
  run: npm install --no-audit --no-fund
# The entire node_modules tree is copied, not just @qvac/llm-llamacpp:
# the snapshotted addon resolves its sibling deps (bare-fs, bare-path,
# ...) by ordinary require-walking, and a copy of the addon alone would
# leave them unreachable so the snapshot would fail to load.
- name: Snapshot npm addon (addon-source A/B)
  if: matrix.source == 'addon-source'
  working-directory: ${{ env.WORKDIR }}
  shell: bash
  run: |
    snapshot_dir=/tmp/npm-snapshot
    mkdir -p "$snapshot_dir"
    cp -r node_modules "$snapshot_dir/"
# Optional: drop the overlay before bare-make so the from-source
# build matches the npm-published binary as closely as possible.
# Used to test "is the addon-vs-fabric gap caused by the overlay
# (llamafile) or by something else in the source build path?"
# Runs only when addon_source_overlay is false: deleting the overlay
# port makes the from-source build use the registry qvac-fabric.
- name: Disable vcpkg overlay for addon-source A/B
  if: matrix.source == 'addon-source' && !inputs.addon_source_overlay
  shell: bash
  run: |
    overlay_port=packages/llm-llamacpp/vcpkg/ports/qvac-fabric
    rm -rf "$overlay_port"
    echo "::notice::Overlay removed — addon-source will build with the registry version of qvac-fabric (llamafile OFF, same as npm)"
# generate -> build -> install via bare-make, from the addon package.
- name: Build addon from source
  if: matrix.source == 'addon-source'
  working-directory: packages/llm-llamacpp
  shell: bash
  env:
    # At configure time vcpkg clones the private qvac-registry-vcpkg
    # repo; read access is enough, so the workflow token is the
    # fallback when no GH_TOKEN secret is configured.
    GITHUB_TOKEN: ${{ secrets.GH_TOKEN || github.token }}
    GH_TOKEN: ${{ secrets.GH_TOKEN || github.token }}
  run: |
    npm install --no-audit --no-fund
    for target in generate build install; do
      bare-make "$target"
    done
# addon-source cells: reinstall with the workspace addon (two levels
# up, i.e. packages/llm-llamacpp) installed as a package.
- name: Re-link workspace addon (addon-source A/B)
  if: matrix.source == 'addon-source'
  working-directory: ${{ env.WORKDIR }}
  shell: bash
  run: |
    npm install --no-audit --no-fund --install-links ../../
# Every other cell: plain install of the benchmark's dependencies.
- name: Install benchmark deps (addon from npm)
  if: matrix.source != 'addon-source'
  working-directory: ${{ env.WORKDIR }}
  shell: bash
  run: |
    npm install --no-audit --no-fund
# Vulkan SDK is pre-installed on the self-hosted GPU runners
# (qvac-ubuntu2404-x64-gpu, qvac-win25-x64-gpu) at the well-known
# paths shown below — same convention reusable-prebuilds.yml uses.
# macOS GPU goes through Metal and needs no SDK.
- name: Configure Vulkan SDK env (Linux GPU)
  if: matrix.backend == 'gpu' && matrix.platform == 'linux-x64'
  shell: bash
  run: |
    sdk_root=/opt/vulkansdk/x86_64
    echo "VULKAN_SDK=$sdk_root" >> "$GITHUB_ENV"
    echo "$sdk_root/bin" >> "$GITHUB_PATH"
- name: Configure Vulkan SDK env (Windows GPU)
  if: matrix.backend == 'gpu' && matrix.platform == 'windows-x64'
  shell: bash
  run: |
    # Single quotes keep the backslash in the Windows path literal.
    printf '%s\n' 'VULKAN_SDK=C:\VulkanSDK' >> "$GITHUB_ENV"
# The self-hosted qvac-win25-x64-gpu runner doesn't ship cmake on
# PATH (windows-latest does, via the bundled VS install). Drop a
# Kitware build in front of PATH so both Windows runners look the
# same to build-cli-sources.js. Matches the bootstrap pattern in
# pr-test-inference-addon-cpp-js.yml. Addon legs skip this — they
# use the npm prebuild and never call cmake.
# Downloads a pinned Kitware CMake build into the runner temp dir and
# puts its bin/ on PATH for the remaining steps. Runs for every Windows
# cell whose source is not 'addon'.
- name: Setup CMake (Windows CLI builds)
  if: matrix.platform == 'windows-x64' && matrix.source != 'addon'
  shell: bash
  working-directory: ${{ runner.temp }}
  env:
    CMAKE_VERSION: "3.31.6"
  run: |
    cmake_dir="cmake-${CMAKE_VERSION}-windows-x86_64"
    # --fail makes an HTTP error abort here instead of saving the error
    # page as cmake.zip and failing later inside unzip; --retry covers
    # transient network errors.
    curl --fail --location --retry 3 --silent --show-error \
      --output cmake.zip \
      "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${cmake_dir}.zip"
    unzip -q cmake.zip
    echo "$PWD/${cmake_dir}/bin" >> "$GITHUB_PATH"
# Caches the built CLI binaries per platform / backend / source / ref.
# The restore-keys prefix drops the ref, so a cache built for a
# different ref of the same source can be restored as a fallback.
# NOTE(review): actions/cache is referenced by tag while the other
# actions in this workflow are pinned to commit SHAs — consider pinning.
- name: Cache CLI builds
  uses: actions/cache@v4
  if: matrix.source != 'addon'
  with:
    path: ${{ env.WORKDIR }}/cli-builds
    restore-keys: |
      vlm-cli-v3-${{ matrix.platform }}-${{ matrix.backend }}-${{ matrix.source }}-
    key: vlm-cli-v3-${{ matrix.platform }}-${{ matrix.backend }}-${{ matrix.source }}-${{ matrix.source == 'fabric' && inputs.fabric_ref || inputs.upstream_ref }}
# Builds the native CLI for this cell's source at the requested ref.
# The ref is the fabric ref for 'fabric' cells and the upstream ref for
# every other source.
# NOTE(review): the condition also matches addon-source cells, which
# therefore call the script with --sources=addon-source and the
# upstream ref — confirm that is intended.
- name: Build CLI source
  if: matrix.source != 'addon'
  shell: bash
  working-directory: ${{ env.WORKDIR }}
  env:
    # Windows-CPU runs on windows-latest, where the bundled VS install
    # provides MSVC; forcing the VS multi-config generator keeps cmake
    # from falling back to MinGW. Empty on every other platform.
    CMAKE_GENERATOR: ${{ matrix.platform == 'windows-x64' && 'Visual Studio 17 2022' || '' }}
    # User-supplied values reach the script as environment variables,
    # never by textual interpolation, so a crafted ref cannot inject
    # shell syntax and a ref containing whitespace stays one argument.
    MATRIX_SOURCE: ${{ matrix.source }}
    MATRIX_BACKEND: ${{ matrix.backend }}
    SOURCE_REF: ${{ matrix.source == 'fabric' && inputs.fabric_ref || inputs.upstream_ref }}
  run: |
    node scripts/build-cli-sources.js \
      "--sources=${MATRIX_SOURCE}" \
      "--${MATRIX_SOURCE}-ref=${SOURCE_REF}" \
      "--backend=${MATRIX_BACKEND}"
- name: Prepare models
  shell: bash
  working-directory: ${{ env.WORKDIR }}
  run: npm run prepare:models
# Runs the benchmark for this cell. addon-source cells run two sources
# in one process (npm addon from the snapshot + freshly built workspace
# addon) so both are measured on identical hardware; every other cell
# runs just its own source.
- name: Run VLM benchmark
  shell: bash
  working-directory: ${{ env.WORKDIR }}
  env:
    # Dispatch inputs are free-form strings; handing them over through
    # the environment keeps them out of the script text, so they cannot
    # be parsed as shell syntax.
    MATRIX_SOURCE: ${{ matrix.source }}
    MATRIX_BACKEND: ${{ matrix.backend }}
    INPUT_WARMUP_RUNS: ${{ inputs.warmup_runs }}
    INPUT_MEASURED_RUNS: ${{ inputs.measured_runs }}
  run: |
    if [[ "$MATRIX_SOURCE" == "addon-source" ]]; then
      source_args=(
        --sources=addon,addon-source
        --addon-path=/tmp/npm-snapshot/node_modules/@qvac/llm-llamacpp
      )
    else
      source_args=("--sources=${MATRIX_SOURCE}")
    fi

    npm run run:vlm-bench -- \
      "${source_args[@]}" \
      "--backend=${MATRIX_BACKEND}" \
      --force-gpu-row \
      "--warmup-runs=${INPUT_WARMUP_RUNS}" \
      "--measured-runs=${INPUT_MEASURED_RUNS}"
# Uploaded even when an earlier step failed, so partial results and the
# per-cell stderr logs are still available; missing files only warn.
- name: Upload per-platform results
  if: always()
  uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f  # 7.0.0
  with:
    name: vlm-perf-${{ matrix.platform }}-${{ matrix.backend }}-${{ matrix.source }}-${{ github.run_number }}
    if-no-files-found: warn
    retention-days: 14
    path: |
      ${{ env.WORKDIR }}/results/vlm-perf-*.md
      ${{ env.WORKDIR }}/results/vlm-perf-*.json
      ${{ env.WORKDIR }}/results/cell-*-stderr.log
# ── Android (stub) ─────────────────────────────────────────────────
# Earlier iteration tried to reuse integration-mobile-test-llm-
# llamacpp.yml in perf-only mode, but the existing mobile workflow
# is built for breadth (Android + iOS matrix, 3+12 Device-Farm
# sessions covering many tests) — one full invocation took ~20 min
# of mostly-irrelevant work for our use case. Until we either land a
# leaner mobile workflow that runs just our benchmark, or bundle our
# benchmark logic into the existing mobile test app, this job is a
# placeholder so the workflow shape covers Android.
#
# Default is OFF — flip run_android to true to see the marker
# artifact and confirm wiring.
android:
  needs: context
  # Placeholder job: only runs when run_android is switched on.
  if: inputs.run_android
  runs-on: ubuntu-latest
  timeout-minutes: 5
  steps:
    # Writes a README explaining that the Android leg is not wired yet
    # and echoes it to the job log.
    - name: Stub notice
      shell: bash
      run: |
        stub_dir=android-stub
        readme="$stub_dir/README.txt"
        mkdir -p "$stub_dir"
        cat <<'EOF' > "$readme"
        Android VLM benchmark - placeholder
        ===================================
        The full Android benchmark is not yet wired. The existing
        mobile workflow (integration-mobile-test-llm-llamacpp.yml)
        runs the broader integration test suite and is too heavy
        for the one-cell VLM benchmark; a dedicated leaner mobile
        path is planned.
        For Android perf numbers right now, run
        Actions -> Benchmark Performance (LLM) -> Run workflow
        (workflow file: benchmark-performance-infer-llm-llamacpp.yml)
        EOF
        echo "Android benchmark is a stub in this iteration."
        cat "$readme"
    # The marker artifact makes the stub visible on the run page.
    - name: Upload Android stub marker
      uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f  # 7.0.0
      with:
        name: vlm-perf-android-${{ github.run_number }}
        retention-days: 14
        path: android-stub/
# ── Summarize ──────────────────────────────────────────────────────
# Downloads every desktop artifact and renders a consolidated table.
# `if: always()` keeps the summary going when one matrix leg fails.
summarize:
  needs: [context, desktop, android]
  # always(): still runs when a desktop cell failed or android was
  # skipped. It does require that the context job succeeded and that
  # there was at least one desktop cell to summarise.
  if: always() && needs.context.result == 'success' && needs.context.outputs.desktop_count != '0'
  runs-on: ubuntu-latest
  timeout-minutes: 10
  steps:
# Only the aggregation scripts are needed, hence the sparse checkout.
- name: Checkout repository
  uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # 6.0.2
  with:
    ref: ${{ needs.context.outputs.ref }}
    repository: ${{ needs.context.outputs.repository }}
    sparse-checkout-cone-mode: false
    sparse-checkout: |
      packages/llm-llamacpp/benchmarks/vlm-performance/scripts
- name: Setup Node.js
  uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f  # 6.3.0
  with:
    node-version: 22
# Pulls every artifact of this run whose name matches the vlm-perf-*
# pattern into per-platform/.
- name: Download desktop per-platform artifacts
  uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c  # 8.0.1
  with:
    path: per-platform
    pattern: vlm-perf-*-${{ github.run_number }}
# Writes commit-info.json (head + merge-base metadata) for the
# aggregation script.
#
# The values are passed through the environment and encoded by jq. The
# previous unquoted heredoc let the shell expand backticks, $(...) and
# $VAR inside commit titles, which could corrupt the JSON or execute
# text taken from a commit message.
- name: Build commit-info JSON
  shell: bash
  env:
    HEAD_SHA: ${{ needs.context.outputs.head_sha }}
    HEAD_TITLE: ${{ needs.context.outputs.head_title }}
    HEAD_DATE: ${{ needs.context.outputs.head_date }}
    BASE_SHA: ${{ needs.context.outputs.merge_base }}
    BASE_TITLE: ${{ needs.context.outputs.base_title }}
    BASE_DATE: ${{ needs.context.outputs.base_date }}
  run: |
    # --arg always yields JSON strings, so empty values become "".
    jq -n \
      --arg head_sha "$HEAD_SHA" \
      --arg head_title "$HEAD_TITLE" \
      --arg head_date "$HEAD_DATE" \
      --arg base_sha "$BASE_SHA" \
      --arg base_title "$BASE_TITLE" \
      --arg base_date "$BASE_DATE" \
      '{
        head: {sha: $head_sha, title: $head_title, date: $head_date},
        merge_base: {sha: $base_sha, title: $base_title, date: $base_date},
        comparison_mode: "source-engines"
      }' > commit-info.json
    cat commit-info.json
# Folds the downloaded per-platform results into one Markdown + JSON
# report under consolidated/.
- name: Aggregate into one report
  shell: bash
  run: |
    scripts_dir=packages/llm-llamacpp/benchmarks/vlm-performance/scripts
    out_dir=consolidated
    mkdir -p "$out_dir"
    node "$scripts_dir/aggregate-platforms.js" \
      --inputs=per-platform \
      --commit-info=commit-info.json \
      --output-md="$out_dir/vlm-perf-consolidated.md" \
      --output-json="$out_dir/vlm-perf-consolidated.json"
# Appends the report to the run's step summary. The report's first
# line is dropped because this step prints its own heading.
- name: Post step summary
  if: always()
  shell: bash
  run: |
    report=consolidated/vlm-perf-consolidated.md
    {
      echo "## VLM Benchmark - Consolidated"
      echo ""
      if [[ -f "$report" ]]; then
        tail -n +2 "$report"
      else
        echo "No consolidated report generated."
      fi
    } >> "$GITHUB_STEP_SUMMARY"
# Posts the report as a comment on the open PR whose head branch is the
# benchmarked ref. Exits successfully without posting when no such PR
# exists. Skipped entirely when no consolidated report was produced.
- name: Post PR comment (View 2 summary)
  if: always() && hashFiles('consolidated/vlm-perf-consolidated.md') != ''
  shell: bash
  env:
    REF: ${{ needs.context.outputs.ref }}
    GH_TOKEN: ${{ github.token }}
  run: |
    # A failed lookup is treated the same as "no PR".
    pr_number=$(gh pr list --head "$REF" --state open --json number --jq '.[0].number' 2>/dev/null || echo "")
    if [[ -z "$pr_number" ]]; then
      echo "No open PR found for ref $REF — skipping PR comment."
      exit 0
    fi

    echo "Posting VLM benchmark summary to PR #$pr_number"
    body_file=/tmp/pr-comment-body.md
    run_url="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"
    {
      echo "## VLM Benchmark Summary"
      echo ""
      echo "_Run #${GITHUB_RUN_NUMBER} — [full report](${run_url})_"
      echo ""
      tail -n +2 consolidated/vlm-perf-consolidated.md
    } > "$body_file"
    gh pr comment "$pr_number" --body-file "$body_file"
# Kept for 30 days; an empty consolidated/ directory only warns.
- name: Upload consolidated report
  if: always()
  uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f  # 7.0.0
  with:
    name: vlm-perf-consolidated-${{ github.run_number }}
    if-no-files-found: warn
    retention-days: 30
    path: consolidated/
# ── Matrix mode: Linux legs ────────────────────────────────────────
# Runs the @qvac/llm-llamacpp addon (published linux-x64 prebuild, which
# is CPU + Vulkan-GPU capable) over the vlm-matrix fixture. Branch JS
# (harness/config/fixture) + published native prebuild. One leg per
# backend; each emits [VLMROW]/[VLMSEG]/[VLMMETA] markers to a log that
# matrix-combine aggregates with vlm-matrix/aggregate.js.
matrix-linux:
  name: vlm-matrix-linux-${{ matrix.backend }}
  needs: context
  # No legs means matrix mode is off or no valid token was selected.
  if: needs.context.outputs.linux_count != '0'
  runs-on: ${{ matrix.runner }}
  # Generous wall-clock budget: several-sources compiles two CLIs from
  # source on a cold cache and reloads the model per image, which takes
  # longer than the addon-only two-models path.
  timeout-minutes: 120
  strategy:
    fail-fast: false
    matrix:
      include: ${{ fromJSON(needs.context.outputs.linux_matrix) }}
  env:
    WORKDIR: packages/llm-llamacpp
    HF_TOKEN: ${{ secrets.HF_TOKEN }}
  steps:
# Runners whose label starts with "qvac-" get their workspace wiped and
# recreated before checkout.
- name: Manual Workspace Cleanup
  if: startsWith(matrix.runner, 'qvac-')
  shell: bash
  run: |
    rm -rf "$GITHUB_WORKSPACE"
    mkdir -p "$GITHUB_WORKSPACE"
- name: Checkout repository
  uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # 6.0.2
  with:
    ref: ${{ needs.context.outputs.ref }}
    repository: ${{ needs.context.outputs.repository }}
- name: Setup Node.js
  uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f  # 6.3.0
  with:
    node-version: 22
- name: Install addon deps
  shell: bash
  working-directory: ${{ env.WORKDIR }}
  run: |
    npm install --no-audit --no-fund
# Global install of the bare runtime and the brittle test runner used
# by the matrix step below.
- name: Install bare tooling
  shell: bash
  run: |
    npm install -g --force bare bare-make bare-runtime bare-https brittle
# Branch JS + published native prebuild: pull the linux-x64 prebuild
# from the public npm package into the workspace so the harness's
# require('../../index.js') loads. The published prebuild handles both
# cpu and (Vulkan) gpu via runtime device selection.
# Downloads the latest published package tarball, records its version
# as ADDON_VERSION for later steps, and replaces the workspace's
# prebuilds/ with the published one.
- name: Fetch published prebuilds
  shell: bash
  working-directory: ${{ env.WORKDIR }}
  run: |
    npm pack @qvac/llm-llamacpp@latest
    tar -xzf *.tgz

    published_version=$(node -p "require('./package/package.json').version")
    echo "ADDON_VERSION=$published_version" >> "$GITHUB_ENV"

    # Swap in the published prebuilds, then remove the unpacked package
    # and the tarball.
    rm -rf prebuilds
    mv package/prebuilds ./prebuilds
    rm -rf package *.tgz

    echo "addon @qvac/llm-llamacpp@$published_version"
    ls -la prebuilds/
# GPU runners ship the Vulkan SDK at the well-known path; stamp the
# env so the addon's loader finds it. No-op on the cpu leg.
- name: Configure Vulkan SDK env (Linux GPU)
  if: matrix.backend == 'gpu'
  shell: bash
  run: |
    sdk_root=/opt/vulkansdk/x86_64
    echo "VULKAN_SDK=$sdk_root" >> "$GITHUB_ENV"
    echo "$sdk_root/bin" >> "$GITHUB_PATH"
# Runs the matrix test under bare and tees its output to the
# per-backend log that is uploaded later.
# NOTE(review): inputs.matrix_engine is declared but not referenced in
# the part of the workflow visible here; the engine is fixed to 'addon'.
- name: Run VLM matrix
  shell: bash
  working-directory: ${{ env.WORKDIR }}
  env:
    QVAC_VLM_MATRIX: "1"
    QVAC_VLM_ENGINE: addon  # this leg is always the 'addon' source
    QVAC_VLM_MODE: ${{ inputs.matrix_mode }}
    QVAC_VLM_PRESET: ${{ inputs.matrix_preset }}
    QVAC_VLM_DEVICES: ${{ matrix.backend }}
    NO_GPU: ${{ matrix.no_gpu }}
  run: |
    # QVAC_VLM_DEVICES carries matrix.backend, so it names the log too.
    log_file="vlm-matrix-linux-${QVAC_VLM_DEVICES}.log"

    # Regenerate the brittle runner so it contains ONLY the matrix test.
    # In several-sources mode this is the addon leg; the fabric/upstream
    # CLI results are appended to the same log by a later step.
    npx brittle -r test/integration/all.js test/integration/vlm-matrix.test.js

    # Report bare's exit status rather than tee's.
    bare test/integration/all.js --exit 2>&1 | tee "$log_file"
    exit "${PIPESTATUS[0]}"
# several-sources only: build native fabric/upstream llama-mtmd-cli and run
# them over the SAME fixture, appending [VLMROW]/[VLMSEG] markers to the addon
# log so aggregate.js renders a 3-source comparison. Linux-only.
# One cache entry per backend and (fabric ref, upstream ref) pair.
# NOTE(review): actions/cache is referenced by tag while the other
# actions in this workflow are pinned to commit SHAs — consider pinning.
- name: Cache CLI builds (several-sources)
  uses: actions/cache@v4
  if: inputs.matrix_mode == 'several-sources'
  with:
    key: vlm-cli-v3-linux-${{ matrix.backend }}-${{ inputs.fabric_ref }}-${{ inputs.upstream_ref }}
    path: ${{ env.WORKDIR }}/benchmarks/vlm-performance/cli-builds
# several-sources only: builds the fabric and upstream CLIs, then runs
# each over the fixture, appending their output to the addon leg's log.
- name: Build + run fabric/upstream CLIs over the fixture (several-sources)
  if: inputs.matrix_mode == 'several-sources'
  working-directory: ${{ env.WORKDIR }}/benchmarks/vlm-performance
  shell: bash
  env:
    LOG: ${{ github.workspace }}/${{ env.WORKDIR }}/vlm-matrix-linux-${{ matrix.backend }}.log
    MODEL_DIR: ${{ github.workspace }}/${{ env.WORKDIR }}/test/model
    # Dispatch inputs are handed over as environment variables rather
    # than interpolated into the script, so a crafted ref cannot inject
    # shell syntax and is always passed as a single argument.
    MATRIX_BACKEND: ${{ matrix.backend }}
    INPUT_FABRIC_REF: ${{ inputs.fabric_ref }}
    INPUT_UPSTREAM_REF: ${{ inputs.upstream_ref }}
  run: |
    npm install --no-audit --no-fund
    node scripts/build-cli-sources.js --sources=fabric,upstream \
      "--fabric-ref=${INPUT_FABRIC_REF}" "--upstream-ref=${INPUT_UPSTREAM_REF}" \
      "--backend=${MATRIX_BACKEND}"

    # The build script records where each binary ended up.
    FABRIC_BIN=$(node -e "console.log(require('./cli-sources-resolved.json').fabric.binaryPath)")
    UPSTREAM_BIN=$(node -e "console.log(require('./cli-sources-resolved.json').upstream.binaryPath)")

    # Model files were downloaded by the addon leg (names from vlm-matrix.config.cjs).
    LLM="$MODEL_DIR/reg-qwen-unsloth-Q8_0.gguf"
    MMPROJ="$MODEL_DIR/reg-qwen-mradermacher-mmproj-Q8_0.gguf"

    # run_src <source-label> <short-name> <binary>
    # Runs one CLI over the fixture and appends stdout+stderr to $LOG.
    # Best effort on purpose: a failing source raises a warning instead
    # of failing the step, so the other source still gets recorded.
    run_src () {
      echo ">> $1 over fixture ($2)"
      node ../vlm-matrix/cli-fixture-runner.cjs \
        --binary "$3" --source "$1" --llm "$LLM" --mmproj "$MMPROJ" \
        --backend "$MATRIX_BACKEND" --samples 3 \
        --tasks textvqa,vizwiz,gqa,docvqa,ai2d \
        --main-origin "Qwen3.5-0.8B-Q8_0 (Registry)" \
        --mmproj-origin "Qwen3.5-0.8B mmproj-Q8_0 (Registry)" >> "$LOG" 2>&1 || echo "::warning::$1 run had errors"
    }

    run_src fabric-cli fabric "$FABRIC_BIN"
    run_src upstream-cli upstream "$UPSTREAM_BIN"
    echo "appended fabric-cli + upstream-cli to $LOG"
# HW/SW provenance so a reader can reproduce the numbers. Rendered in the
# report's Details section (passed to aggregate.js via --provenance).
# Writes prov-linux-<backend>.md describing the hardware and software
# this leg ran on, then echoes it to the job log. Runs even after a
# failure so the provenance is uploaded alongside a partial log.
- name: Gather provenance
  if: always()
  working-directory: ${{ env.WORKDIR }}
  shell: bash
  env:
    # The ref can come from a free-form dispatch input. Reading it from
    # the environment keeps it out of the script text, where a value
    # containing $(...) would have been executed inside the
    # double-quoted echo.
    MATRIX_BACKEND: ${{ matrix.backend }}
    MATRIX_RUNNER: ${{ matrix.runner }}
    CONTEXT_HEAD_SHA: ${{ needs.context.outputs.head_sha }}
    CONTEXT_REF: ${{ needs.context.outputs.ref }}
  run: |
    F="prov-linux-${MATRIX_BACKEND}.md"
    {
      echo "**linux · ${MATRIX_BACKEND}** (runner \`${MATRIX_RUNNER}\`)"
      # ADDON_VERSION is exported by the prebuild-fetch step; '?' when
      # that step did not run.
      echo "- addon: \`@qvac/llm-llamacpp@${ADDON_VERSION:-?}\` (published prebuild)"
      echo "- git: \`${CONTEXT_HEAD_SHA}\` (ref \`${CONTEXT_REF}\`)"
      echo "- node: $(node -v 2>/dev/null) · bare: $(bare --version 2>/dev/null || echo n/a)"
      echo "- os: $(. /etc/os-release 2>/dev/null; echo "$PRETTY_NAME") $(uname -m)"
      echo "- cpu: $(lscpu 2>/dev/null | sed -n 's/^Model name:[[:space:]]*//p' | head -1) ($(nproc) cores)"
      echo "- ram: $(free -h 2>/dev/null | awk '/^Mem:/{print $2}')"
      if [ "$MATRIX_BACKEND" = "gpu" ]; then
        echo "- gpu: $(vulkaninfo --summary 2>/dev/null | sed -n 's/.*deviceName[[:space:]]*=[[:space:]]*//p' | head -1 || echo '?')"
      fi
    } > "$F"
    cat "$F"
# Publish this leg's raw matrix log together with its provenance note. The
# artifact name fits the vlm-matrix-log-*-<run_number> pattern that the
# matrix-combine job downloads. Runs even when an earlier step failed.
- name: Upload matrix log
  uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
  if: always()
  with:
    name: vlm-matrix-log-linux-${{ matrix.backend }}-${{ github.run_number }}
    # A missing file only produces a warning, never a failed step.
    if-no-files-found: warn
    retention-days: 14
    path: |
      ${{ env.WORKDIR }}/vlm-matrix-linux-${{ matrix.backend }}.log
      ${{ env.WORKDIR }}/prov-linux-${{ matrix.backend }}.md
# ── Matrix mode: Samsung S25 (AWS Device Farm) ─────────────────────
# Reuses the Android-only mobile workflow to run the SAME matrix harness
# on-device. qvac_perf_only restricts the run to perf-tests.json
# (runVlmMatrixTest) → only the vlmMatrix group is scheduled. The active
# preset on-device is config.defaultPreset (Device Farm forwards no custom
# env), so vlm-matrix.config.cjs defaultPreset governs the S25 set. The
# raw on-device log (bare_console.log) carries the [VLMROW] markers and is
# uploaded by collect-and-upload-logs as console-logs-llamacpp-llm-Android.
matrix-s25:
  # Scheduled only when the matrix and its S25 leg are both requested.
  # S25 runs the addon only; several-sources (native CLIs) is Linux-only.
  if: inputs.run_matrix && inputs.run_matrix_s25 && inputs.matrix_mode != 'several-sources'
  needs: context
  # Delegates to the reusable mobile integration workflow.
  uses: ./.github/workflows/integration-mobile-test-llm-llamacpp.yml
  with:
    # Restrict the mobile run to the perf tests.
    qvac_perf_only: true
    repository: ${{ needs.context.outputs.repository }}
    ref: ${{ needs.context.outputs.ref }}
  secrets: inherit
# ── Matrix mode: combine ───────────────────────────────────────────
# Aggregates [VLMROW]/[VLMSEG]/[VLMMETA] markers from every matrix log
# (Linux .log + S25 bare_console.log) into one quality+speed report via
# vlm-matrix/aggregate.js, surfaced to the step summary + PR comment.
# This is the mechanism that makes mobile (Device Farm) results visible.
matrix-combine:
# Waits for both matrix legs so their artifacts exist before aggregation.
needs:
- context
- matrix-linux
- matrix-s25
# always(): still combine when a matrix leg failed or was skipped, provided
# the matrix was requested and the context job (which supplies the ref and
# repository used below) succeeded.
if: always() && inputs.run_matrix && needs.context.result == 'success'
runs-on: ubuntu-latest
timeout-minutes: 10
steps:
# Only the aggregator sources are needed in this job, so the checkout is
# limited to the vlm-matrix benchmark directory (non-cone sparse checkout).
- name: Checkout repository
  uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
  with:
    ref: ${{ needs.context.outputs.ref }}
    repository: ${{ needs.context.outputs.repository }}
    sparse-checkout-cone-mode: false
    sparse-checkout: |
      packages/llm-llamacpp/benchmarks/vlm-matrix
# Node runtime for the aggregate.js invocation later in this job.
- name: Setup Node.js
  uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # 6.3.0
  with:
    # Quoted so the version is unambiguously a string.
    node-version: '22'
# Fetch every vlm-matrix-log-* artifact produced by this run into matrix-logs/.
- name: Download Linux matrix logs
  uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
  with:
    path: matrix-logs
    pattern: vlm-matrix-log-*-${{ github.run_number }}
# S25 raw device logs (bare_console.log holds the [VLMROW] markers).
# continue-on-error so a Linux-only run (no S25 artifact) still combines.
- name: Download S25 device logs
  uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
  # Best-effort: a failure here must not block the combine.
  continue-on-error: true
  with:
    # Same destination as the Linux logs; the Aggregate step searches it.
    path: matrix-logs
    pattern: console-logs-*
- name: Aggregate matrix logs
  shell: bash
  env:
    # Workflow inputs are handed to the script as environment variables
    # rather than being pasted into the script text, so quotes or shell
    # metacharacters in an input cannot alter what the script executes.
    MATRIX_MODE: ${{ inputs.matrix_mode }}
    MATRIX_PRESET: ${{ inputs.matrix_preset }}
    MATRIX_ENGINE: ${{ inputs.matrix_engine }}
  run: |
    # Collects every matrix log and provenance note under matrix-logs/ and
    # feeds them to vlm-matrix/aggregate.js, producing
    # consolidated/vlm-matrix-consolidated.md. When no log is found a short
    # placeholder report is written instead.
    #
    # `shell: bash` runs the script with `-eo pipefail`: a bare
    # `VAR=$(grep ... | head -1)` aborts the whole step as soon as grep finds
    # nothing. Every optional lookup below is therefore guarded so that the
    # "?" fallbacks and the no-logs placeholder stay reachable.

    # matrix-logs/ may not exist when no artifact was downloaded; create it so
    # the find calls below simply return nothing.
    mkdir -p consolidated matrix-logs

    # find_sorted <name-pattern>: sorted, newline-separated paths under matrix-logs/.
    find_sorted () {
      find matrix-logs -name "$1" 2>/dev/null | sort || true
    }

    # first_match <ERE> <file>: first occurrence of the pattern, or nothing.
    first_match () {
      grep -oE "$1" "$2" 2>/dev/null | head -1 || true
    }

    # Paths are kept in arrays (one element per line of find output) so a
    # path containing whitespace is passed to aggregate.js as one argument.
    mapfile -t LINUX_LOGS < <(find_sorted 'vlm-matrix-linux-*.log')
    # S25: the Samsung device's full logcat carries the [VLMROW] markers
    # (the Android pool may also include a Pixel; we surface S25 here).
    # NOTE(review): the job-level comments name bare_console.log as the marker
    # carrier, while only *logcat_full* is scanned here. Confirm which file is
    # authoritative.
    mapfile -t S25_LOGS < <(find_sorted '*Galaxy_S25*logcat_full*')
    mapfile -t PROV_FILES < <(find_sorted 'prov-*.md')

    # Tag each input with its platform host so S25 rows don't collapse
    # onto the Linux rows ([VLMROW].device is only cpu/gpu).
    ARGS=()
    for f in "${LINUX_LOGS[@]}"; do
      ARGS+=(--in linux "$f")
    done
    for f in "${S25_LOGS[@]}"; do
      ARGS+=(--in s25 "$f")
    done

    # HW/SW provenance: Linux legs ship prov-linux-*.md; synthesize one for S25.
    PROV=()
    for p in "${PROV_FILES[@]}"; do
      PROV+=(--provenance "$p")
    done

    # S25 hardware provenance, parsed from the device's own logcat (model /
    # Android / ABI from the Play-store UA line, RAM from the JS totalMemory
    # line, GPU from the Adreno-Vulkan driver load). The first logcat in
    # sorted order is used.
    S25F="${S25_LOGS[0]:-}"
    if [ -n "$S25F" ]; then
      MODEL=$(first_match 'model=SM-[A-Z0-9]+' "$S25F" | cut -d= -f2)
      ANDROID=$(first_match 'platformVersionRelease=[0-9]+' "$S25F" | cut -d= -f2)
      ABI=$(first_match 'supportedAbis=[a-z0-9-]+' "$S25F" | cut -d= -f2)
      RAMB=$(first_match 'totalMemory: [0-9]+' "$S25F" | grep -oE '[0-9]+$' || true)
      RAMGB=$(awk -v b="${RAMB:-0}" 'BEGIN{ if (b>0) printf "%.1f GB", b/1073741824; else printf "?" }')
      if grep -qiE 'AdrenoVK|vulkan\.adreno' "$S25F"; then
        GPU='Adreno (Vulkan)'
      else
        GPU='?'
      fi
      {
        echo "**s25** — Samsung Galaxy S25 Ultra (AWS Device Farm)"
        echo "- device: ${MODEL:-SM-?} · Android ${ANDROID:-?} · ${ABI:-arm64-v8a}"
        echo "- ram: ${RAMGB} · gpu: ${GPU}"
        echo "- engine: \`@qvac/llm-llamacpp\` addon (published prebuild)"
      } > prov-s25.md
      PROV+=(--provenance prov-s25.md)
    fi

    echo "aggregate inputs: ${ARGS[*]}"
    echo "provenance: ${PROV[*]}"
    if [ "${#ARGS[@]}" -eq 0 ]; then
      echo "> No VLM matrix logs found for run #${GITHUB_RUN_NUMBER}." > consolidated/vlm-matrix-consolidated.md
    else
      node packages/llm-llamacpp/benchmarks/vlm-matrix/aggregate.js \
        --title "VLM Matrix — ${MATRIX_MODE} / ${MATRIX_PRESET} (run #${GITHUB_RUN_NUMBER})" \
        --mode "${MATRIX_MODE}" --engine "${MATRIX_ENGINE}" --base f16 --candidate q8 \
        --out consolidated/vlm-matrix-consolidated.md \
        "${PROV[@]}" "${ARGS[@]}"
    fi
# Append the consolidated report (or a one-line notice when it is missing)
# to the run's step summary, even if an earlier step failed.
- name: Post step summary
  shell: bash
  if: always()
  run: |
    report=consolidated/vlm-matrix-consolidated.md
    {
      # Heading followed by one blank line.
      printf '%s\n\n' "# VLM Matrix — Consolidated"
      if ! cat "$report" 2>/dev/null; then
        echo "No consolidated matrix report generated."
      fi
    } >> "$GITHUB_STEP_SUMMARY"
# Comment the consolidated report on the open PR whose head branch is the
# benchmarked ref. Skipped silently when no such PR exists; not run at all
# when the report file was never produced.
- name: Post PR comment
  shell: bash
  if: always() && hashFiles('consolidated/vlm-matrix-consolidated.md') != ''
  env:
    REF: ${{ needs.context.outputs.ref }}
    GH_TOKEN: ${{ github.token }}
  run: |
    # A failed lookup is treated the same as "no PR".
    pr_number=$(gh pr list --head "$REF" --state open --json number --jq '.[0].number' 2>/dev/null || echo "")
    if [[ -n "$pr_number" ]]; then
      body_file=/tmp/matrix-comment.md
      {
        echo "## VLM Matrix Benchmark"
        echo ""
        echo "_Run #${{ github.run_number }} — [full report](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})_"
        echo ""
        cat consolidated/vlm-matrix-consolidated.md
      } > "$body_file"
      gh pr comment "$pr_number" --body-file "$body_file"
    else
      echo "No open PR found for ref $REF — skipping PR comment."
    fi
# Keep the whole consolidated/ directory as a run artifact, whatever the
# outcome of the previous steps.
- name: Upload consolidated matrix report
  uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
  if: always()
  with:
    name: vlm-matrix-consolidated-${{ github.run_number }}
    # An empty directory only produces a warning.
    if-no-files-found: warn
    retention-days: 30
    path: consolidated/