Benchmark (LLM) #48

Workflow file for this run

.github/workflows/benchmark-llm-llamacpp.yml at 5cdc65d

	name: "Benchmark VLM (LLM)"

	# Manual-dispatch VLM benchmark. It runs
	# packages/llm-llamacpp/benchmarks/vlm-performance against Qwen3.5-VL on a
	# fixed object-listing task and uploads one consolidated report.
	#
	# Three sources are compared on the same model: addon (JS binding),
	# fabric-cli (fork CLI) and upstream-cli (upstream llama.cpp CLI). The
	# spread between them measures JS binding overhead and fork divergence.

	on:
	  workflow_dispatch:
	    inputs:
	      # ── Sources (what to compare) ─────────────────────────────
	      run_addon:
	        description: "── SOURCE 1 ── addon (@qvac/llm-llamacpp JS binding)"
	        required: false
	        type: boolean
	        default: true
	      ref:
	        description: " addon ref — qvac branch / tag / SHA (default: current branch)"
	        required: false
	        type: string
	      addon_from_source:
	        description: " build addon from source (slow, but uses latest fabric)"
	        required: false
	        type: boolean
	        default: false
	      run_addon_source:
	        description: " A/B: also build addon from source with the vcpkg overlay applied, run alongside npm addon in one cell (same runner). x86-CPU cells only."
	        required: false
	        type: boolean
	        default: false
	      addon_source_overlay:
	        description: " apply the vcpkg overlay during the addon-source build (default true). Set false to rule out 'is it the overlay or something else?'"
	        required: false
	        type: boolean
	        default: true
	      run_fabric_cli:
	        description: "── SOURCE 2 ── fabric (qvac fork, native CLI)"
	        required: false
	        type: boolean
	        default: true
	      fabric_ref:
	        description: " fabric ref (default: v8189.0.2)"
	        required: false
	        type: string
	        default: "v8189.0.2"
	      run_upstream_cli:
	        description: "── SOURCE 3 ── upstream (vanilla llama.cpp, native CLI)"
	        required: false
	        type: boolean
	        default: true
	      upstream_ref:
	        description: " upstream ref (default: b8189)"
	        required: false
	        type: string
	        default: "b8189"
	      # ── Platforms × backends ──────────────────────────────────
	      # Comma-separated selection. Tokens: linux-cpu, linux-gpu,
	      # windows-cpu, windows-gpu, macos. "all" expands to every desktop
	      # cell. GPU rows for Linux/Windows go to self-hosted Vulkan runners;
	      # macOS uses Metal on GitHub-hosted macos-15-xlarge.
	      platforms:
	        description: "── PLATFORMS ── e.g. linux-cpu,linux-gpu,windows-cpu,windows-gpu,macos (or 'all')"
	        required: false
	        type: string
	        default: "linux-cpu,linux-gpu,windows-cpu,windows-gpu,macos"
	      run_android:
	        description: "── PLATFORM ── Android (stub)"
	        required: false
	        type: boolean
	        default: false
	      # ── Run settings ──────────────────────────────────────────
	      warmup_runs:
	        description: "Warmup iterations (discarded)"
	        required: false
	        type: string
	        default: "1"
	      measured_runs:
	        description: "Measured iterations (median reported)"
	        required: false
	        type: string
	        default: "3"
	      # ── Matrix mode (config-driven quality+speed matrix) ──────
	      # Orthogonal to the source-engines benchmark above. When on, runs
	      # the @qvac/llm-llamacpp addon over the vlm-matrix fixture (lmms-eval
	      # quality + vision-encode speed) on Linux (and S25), driven by
	      # packages/llm-llamacpp/test/integration/vlm-matrix.config.cjs.
	      run_matrix:
	        description: "── MATRIX ── run the config-driven VLM quality+speed matrix (addon, Linux + S25)"
	        required: false
	        type: boolean
	        default: false
	      matrix_mode:
	        description: " matrix mode: two-models (f16 vs q8, addon) or several-sources (addon+fabric-cli+upstream-cli, Linux-only)"
	        required: false
	        type: string
	        default: "two-models"
	      matrix_preset:
	        description: " matrix preset: compare (two-models), sources (several-sources), smoke, or full. Overrides config.defaultPreset on Linux."
	        required: false
	        type: string
	        default: "compare"
	      matrix_engine:
	        # A literal `|` needs no escaping inside a double-quoted scalar;
	        # `\|` is not a valid YAML escape sequence.
	        description: " inference engine (two-models mode): addon | fabric-cli | upstream-cli. CLI engines are desktop-only; addon runs everywhere."
	        required: false
	        type: string
	        default: "addon"
	      matrix_linux:
	        description: " Linux matrix legs, comma-sep: linux-cpu,linux-gpu"
	        required: false
	        type: string
	        default: "linux-cpu,linux-gpu"
	      run_matrix_s25:
	        description: " also run the matrix on Samsung S25 (AWS Device Farm)"
	        required: false
	        type: boolean
	        default: false
	permissions:
	  # Repository checkouts in every job.
	  contents: read
	  # NOTE(review): no visible job requests an OIDC token — presumably used by
	  # the Device Farm leg further down the file; confirm.
	  id-token: write
	  # NOTE(review): not exercised by the visible jobs — confirm it is needed.
	  packages: read
	  # The summarize job posts the report with `gh pr comment`.
	  pull-requests: write

	jobs:
	# ── Context ────────────────────────────────────────────────────────
	  # Resolves the repo + ref so downstream jobs check out the right
	  # commit even when the workflow_dispatch is invoked without `ref`,
	  # and builds the desktop matrix from the `platforms` input and the
	  # per-source toggles.
	  # Matrix is computed here (instead of via job-level `if:`) because
	  # GitHub Actions doesn't allow `matrix.*` references in job-level
	  # conditions — they're evaluated before the matrix is expanded.
	  context:
	    runs-on: ubuntu-latest
	    outputs:
	      repository: ${{ steps.ctx.outputs.repository }}
	      ref: ${{ steps.ctx.outputs.ref }}
	      desktop_matrix: ${{ steps.matrix.outputs.value }}
	      desktop_count: ${{ steps.matrix.outputs.count }}
	      linux_matrix: ${{ steps.lmatrix.outputs.value }}
	      linux_count: ${{ steps.lmatrix.outputs.count }}
	      merge_base: ${{ steps.commits.outputs.merge_base }}
	      head_sha: ${{ steps.commits.outputs.head_sha }}
	      head_title: ${{ steps.commits.outputs.head_title }}
	      head_date: ${{ steps.commits.outputs.head_date }}
	      base_title: ${{ steps.commits.outputs.base_title }}
	      base_date: ${{ steps.commits.outputs.base_date }}
	    steps:
	      # Falls back to the branch the dispatch ran on when `ref` is empty.
	      - id: ctx
	        name: Resolve repository and ref
	        shell: bash
	        env:
	          INPUT_REF: ${{ inputs.ref }}
	          REPO: ${{ github.repository }}
	          REF_NAME: ${{ github.ref_name }}
	        run: |
	          repo="$REPO"
	          ref="${INPUT_REF:-$REF_NAME}"
	          echo "repository=$repo" >> "$GITHUB_OUTPUT"
	          echo "ref=$ref" >> "$GITHUB_OUTPUT"
	      # Emits `value` (JSON array of matrix cells) and `count` (its length).
	      - id: matrix
	        name: Build desktop matrix
	        shell: bash
	        env:
	          RUN_ADDON: ${{ inputs.run_addon }}
	          RUN_FABRIC: ${{ inputs.run_fabric_cli }}
	          RUN_UPSTREAM: ${{ inputs.run_upstream_cli }}
	          RUN_ADDON_SOURCE: ${{ inputs.run_addon_source }}
	          PLATFORMS: ${{ inputs.platforms }}
	        run: |
	          # Sources: one entry per enabled toggle, in a fixed order.
	          sources='[]'
	          add_source() { sources=$(jq -c --arg s "$1" '. + [$s]' <<< "$sources"); }
	          if [[ "$RUN_ADDON" == "true" ]]; then add_source addon; fi
	          if [[ "$RUN_FABRIC" == "true" ]]; then add_source fabric; fi
	          if [[ "$RUN_UPSTREAM" == "true" ]]; then add_source upstream; fi
	          # addon-source is an opt-in 4th source that builds the addon
	          # from local sources (with vcpkg overlay applied) and runs it
	          # in the same cell as the npm addon. Gated to *-cpu cells
	          # because llamafile is the primary thing this exists to A/B,
	          # and that's x86-CPU-specific.
	          if [[ "$RUN_ADDON_SOURCE" == "true" ]]; then add_source addon-source; fi
	          mapfile -t srcs < <(jq -r '.[]' <<< "$sources")

	          # Selected platform×backend tokens
	          selected="${PLATFORMS:-linux-cpu}"
	          if [[ "$selected" == "all" ]]; then
	            selected="linux-cpu,linux-gpu,windows-cpu,windows-gpu,macos"
	          fi

	          # Build cells from tokens. Each token maps to a fixed
	          # (platform, arch, backend, runner) tuple.
	          cells='[]'
	          declare -A seen=()
	          IFS=',' read -ra tokens <<< "$selected"
	          for raw in "${tokens[@]}"; do
	            sel=$(echo "$raw" | xargs)  # trim surrounding whitespace
	            case "$sel" in
	              linux-cpu) plat=linux-x64; arch=x64; backend=cpu; runner=ubuntu-latest ;;
	              linux-gpu) plat=linux-x64; arch=x64; backend=gpu; runner=qvac-ubuntu2404-x64-gpu ;;
	              windows-cpu) plat=windows-x64; arch=x64; backend=cpu; runner=windows-latest ;;
	              windows-gpu) plat=windows-x64; arch=x64; backend=gpu; runner=qvac-win25-x64-gpu ;;
	              macos) plat=macos-arm64; arch=arm64; backend=gpu; runner=macos-15-xlarge ;;
	              "") continue ;;
	              *) echo "::warning::Unknown platform token '$sel' (known: linux-cpu, linux-gpu, windows-cpu, windows-gpu, macos)"; continue ;;
	            esac
	            # A repeated token would emit identical cells, and the identical
	            # per-cell artifact names would then collide on upload.
	            if [[ -n "${seen[$sel]:-}" ]]; then
	              echo "::warning::Duplicate platform token '$sel' ignored"
	              continue
	            fi
	            seen[$sel]=1
	            for src in "${srcs[@]}"; do
	              # windows-gpu currently only supports the addon leg: the
	              # self-hosted qvac-win25-x64-gpu runner has Vulkan and
	              # chocolatey but no MSVC and chocolatey can't install
	              # LLVM at job time (lock/permission errors), so the
	              # fabric/upstream CLI builds aren't viable there yet.
	              # Re-enable once the runner image ships LLVM+Ninja.
	              if [[ "$sel" == "windows-gpu" && "$src" != "addon" ]]; then
	                continue
	              fi
	              # addon-source: llamafile is x86-CPU specific, so only
	              # emit the from-source A/B cell on CPU cells (linux-cpu,
	              # windows-cpu). On GPU cells the matmul path goes through
	              # Vulkan/Metal shaders that don't change with llamafile.
	              if [[ "$src" == "addon-source" ]]; then
	                case "$sel" in
	                  linux-cpu|windows-cpu) ;;
	                  *) continue ;;
	                esac
	              fi
	              # When the addon-source A/B is enabled on this cell, the
	              # addon-source leg already runs --sources=addon,addon-source
	              # in one process. The dedicated 'addon' cell would
	              # produce a duplicate row on a different runner — skip
	              # it so the consolidated report stays clean.
	              if [[ "$src" == "addon" && "$RUN_ADDON_SOURCE" == "true" ]]; then
	                case "$sel" in
	                  linux-cpu|windows-cpu) continue ;;
	                esac
	              fi
	              cells=$(echo "$cells" | jq -c \
	                --arg p "$plat" --arg a "$arch" --arg b "$backend" --arg r "$runner" --arg s "$src" \
	                '. + [{"platform":$p,"arch":$a,"backend":$b,"runner":$r,"source":$s}]')
	            done
	          done
	          count=$(echo "$cells" | jq 'length')
	          echo "value=$cells" >> "$GITHUB_OUTPUT"
	          echo "count=$count" >> "$GITHUB_OUTPUT"
	          echo "Desktop matrix ($count entries): $cells"
	      # Linux legs for the config-driven matrix mode (addon over the
	      # vlm-matrix fixture). Independent of the source-engines matrix above.
	      - id: lmatrix
	        name: Build Linux matrix-mode legs
	        shell: bash
	        env:
	          RUN_MATRIX: ${{ inputs.run_matrix }}
	          MATRIX_LINUX: ${{ inputs.matrix_linux }}
	          MATRIX_MODE: ${{ inputs.matrix_mode }}
	        run: |
	          cells='[]'
	          if [[ "$RUN_MATRIX" == "true" ]]; then
	            # several-sources builds native fabric/upstream CLIs, so the CPU leg
	            # needs a runner with cmake+toolchain → GitHub-hosted ubuntu-latest.
	            cpu_runner=qvac-ubuntu2204-x64
	            if [[ "$MATRIX_MODE" == "several-sources" ]]; then cpu_runner=ubuntu-latest; fi
	            declare -A seen=()
	            IFS=',' read -ra tokens <<< "${MATRIX_LINUX:-linux-cpu}"
	            for raw in "${tokens[@]}"; do
	              sel=$(echo "$raw" | xargs)  # trim surrounding whitespace
	              case "$sel" in
	                linux-cpu) backend=cpu; runner=$cpu_runner; no_gpu=true ;;
	                linux-gpu) backend=gpu; runner=qvac-ubuntu2404-x64-gpu; no_gpu=false ;;
	                "") continue ;;
	                *) echo "::warning::Unknown matrix_linux token '$sel' (known: linux-cpu, linux-gpu)"; continue ;;
	              esac
	              # Same reasoning as the desktop matrix: one leg per token.
	              if [[ -n "${seen[$sel]:-}" ]]; then
	                echo "::warning::Duplicate matrix_linux token '$sel' ignored"
	                continue
	              fi
	              seen[$sel]=1
	              cells=$(echo "$cells" | jq -c \
	                --arg b "$backend" --arg r "$runner" --arg n "$no_gpu" \
	                '. + [{"backend":$b,"runner":$r,"no_gpu":$n}]')
	            done
	          fi
	          count=$(echo "$cells" | jq 'length')
	          echo "value=$cells" >> "$GITHUB_OUTPUT"
	          echo "count=$count" >> "$GITHUB_OUTPUT"
	          echo "Linux matrix ($count entries): $cells"
	      # Resolve commit metadata so the consolidated report can show what
	      # the candidate ref + merge-base actually point at (hash, title,
	      # date). Needs a real clone — sparse checkout doesn't give us git
	      # history.
	      - name: Checkout for commit lookup
	        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
	        with:
	          repository: ${{ steps.ctx.outputs.repository }}
	          ref: ${{ steps.ctx.outputs.ref }}
	          fetch-depth: 0
	      - id: commits
	        name: Resolve commit metadata
	        shell: bash
	        run: |
	          git fetch origin main --quiet
	          HEAD_SHA=$(git rev-parse HEAD)
	          # An empty merge-base (unrelated histories) is tolerated: the base
	          # title/date outputs are simply left blank.
	          MERGE_BASE=$(git merge-base HEAD origin/main || echo "")
	          HEAD_TITLE=$(git log -1 --pretty=%s "$HEAD_SHA")
	          HEAD_DATE=$(git log -1 --pretty=%cI "$HEAD_SHA")
	          if [[ -n "$MERGE_BASE" ]]; then
	            BASE_TITLE=$(git log -1 --pretty=%s "$MERGE_BASE")
	            BASE_DATE=$(git log -1 --pretty=%cI "$MERGE_BASE")
	          else
	            BASE_TITLE=""
	            BASE_DATE=""
	          fi
	          {
	            echo "head_sha=$HEAD_SHA"
	            echo "merge_base=$MERGE_BASE"
	            echo "head_title=$HEAD_TITLE"
	            echo "head_date=$HEAD_DATE"
	            echo "base_title=$BASE_TITLE"
	            echo "base_date=$BASE_DATE"
	          } >> "$GITHUB_OUTPUT"
	          echo "HEAD: $HEAD_SHA - $HEAD_TITLE ($HEAD_DATE)"
	          echo "merge-base: $MERGE_BASE - $BASE_TITLE ($BASE_DATE)"

	# ── Desktop benchmark matrix ───────────────────────────────────────
	  # Each leg is the same shape — pick the runner via matrix.runner.
	  # GPU rows (linux-x64 / windows-x64) target self-hosted Vulkan
	  # runners pre-provisioned with the Vulkan SDK; macOS arm64 uses
	  # Metal on the GitHub-hosted macos-15-xlarge runner. The matrix
	  # itself is built dynamically by the context job above from the
	  # `platforms` input.
	  desktop:
	    needs: context
	    if: needs.context.outputs.desktop_count != '0'
	    name: vlm-${{ matrix.platform }}-${{ matrix.backend }}-${{ matrix.source }}
	    runs-on: ${{ matrix.runner }}
	    timeout-minutes: 30
	    strategy:
	      fail-fast: false
	      matrix:
	        include: ${{ fromJSON(needs.context.outputs.desktop_matrix) }}
	    env:
	      HF_TOKEN: ${{ secrets.HF_TOKEN }}
	      WORKDIR: packages/llm-llamacpp/benchmarks/vlm-performance
	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
	        with:
	          repository: ${{ needs.context.outputs.repository }}
	          ref: ${{ needs.context.outputs.ref }}

	      # The addon-source cell needs the full native toolchain (LLVM,
	      # vcpkg, bare-make, Vulkan SDK) because it builds the addon from
	      # local sources with the vcpkg overlay applied. The 'addon',
	      # 'fabric', and 'upstream' cells stay on the lighter npm path.
	      - name: Setup Node.js and Bare tooling
	        if: matrix.source == 'addon-source'
	        uses: ./.github/actions/setup-bare-tooling

	      - name: Setup Node.js
	        if: matrix.source != 'addon-source'
	        uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # 6.3.0
	        with:
	          node-version: 22

	      - name: Setup LLVM
	        if: matrix.source == 'addon-source'
	        uses: ./.github/actions/setup-llvm

	      # Inline vcpkg bootstrap. We can't use the repo's setup-vcpkg
	      # composite action because it hard-requires MODEL_S3_BUCKET for
	      # the prebuilds-shared S3 binary cache — a secret we don't
	      # plumb into the bench workflow. Set VCPKG_ROOT to the runner's
	      # pre-installed vcpkg and leave VCPKG_BINARY_SOURCES at the
	      # default (per-runner disk cache), which is fine for a one-off
	      # bench build.
	      - name: Configure vcpkg (addon-source)
	        if: matrix.source == 'addon-source' && runner.os == 'Linux'
	        shell: bash
	        run: |
	          echo "VCPKG_ROOT=$VCPKG_INSTALLATION_ROOT" >> "$GITHUB_ENV"
	          echo "VCPKG_BUILD_TYPE=release" >> "$GITHUB_ENV"
	          echo "VCPKG_CMAKE_CONFIGURE_OPTIONS=--no-parallel-configure" >> "$GITHUB_ENV"

	      # qvac-fabric defaults to the gpu-backends feature, which
	      # transitively requires the Vulkan SDK at build time. ubuntu-latest
	      # doesn't ship one; install upstream's prebuilt SDK and stamp the
	      # env so cmake's FindVulkan picks it up. Same install pattern
	      # benchmark-embed-llamacpp.yml uses. libvulkan-dev pulls
	      # libvulkan.so + libvulkan1 from the distro — the LunarG SDK
	      # 1.4.x no longer bundles the loader, so cmake's FindVulkan
	      # can't see Vulkan_LIBRARY without it.
	      - name: Install Vulkan SDK (addon-source on Linux)
	        if: matrix.source == 'addon-source' && runner.os == 'Linux'
	        shell: bash
	        run: |
	          sudo apt-get update
	          sudo apt-get install -y libxi-dev libxtst-dev libxrandr-dev xz-utils libvulkan-dev
	          wget -q -O /tmp/vulkansdk.tar.xz https://sdk.lunarg.com/sdk/download/latest/linux/vulkan_sdk.tar.xz
	          mkdir -p "$HOME/vulkan"
	          tar -xf /tmp/vulkansdk.tar.xz -C "$HOME/vulkan" --strip-components=1
	          VULKAN_SDK="$HOME/vulkan/x86_64"
	          echo "VULKAN_SDK=$VULKAN_SDK" >> "$GITHUB_ENV"
	          echo "$VULKAN_SDK/bin" >> "$GITHUB_PATH"
	          echo "LD_LIBRARY_PATH=$VULKAN_SDK/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> "$GITHUB_ENV"
	          echo "PKG_CONFIG_PATH=$VULKAN_SDK/share/pkgconfig:$VULKAN_SDK/lib/pkgconfig${PKG_CONFIG_PATH:+:$PKG_CONFIG_PATH}" >> "$GITHUB_ENV"

	      # Install npm @qvac/llm-llamacpp first so we have the published
	      # addon (no llamafile) on disk, then snapshot it to /tmp before
	      # the from-source build overwrites prebuilds/ in the workspace.
	      # The bench then runs both addon variants in this cell on the
	      # same runner: addon → /tmp/npm-snapshot, addon-source →
	      # the workspace's freshly-built artifact.
	      - name: "Install benchmark deps (addon-source: npm baseline)"
	        if: matrix.source == 'addon-source'
	        shell: bash
	        working-directory: ${{ env.WORKDIR }}
	        run: npm install --no-audit --no-fund

	      # Copy the WHOLE node_modules tree so the snapshotted addon at
	      # /tmp/npm-snapshot/node_modules/@qvac/llm-llamacpp can still
	      # resolve its sibling deps (bare-fs, bare-path, …) via standard
	      # require-walking. A naked copy of just @qvac/llm-llamacpp leaves
	      # those siblings unreachable and the snapshot fails to load.
	      - name: Snapshot npm addon (addon-source A/B)
	        if: matrix.source == 'addon-source'
	        shell: bash
	        working-directory: ${{ env.WORKDIR }}
	        run: |
	          mkdir -p /tmp/npm-snapshot
	          cp -r node_modules /tmp/npm-snapshot/

	      # Optional: drop the overlay before bare-make so the from-source
	      # build matches the npm-published binary as closely as possible.
	      # Used to test "is the addon-vs-fabric gap caused by the overlay
	      # (llamafile) or by something else in the source build path?"
	      - name: Disable vcpkg overlay for addon-source A/B
	        if: matrix.source == 'addon-source' && !inputs.addon_source_overlay
	        shell: bash
	        run: |
	          rm -rf packages/llm-llamacpp/vcpkg/ports/qvac-fabric
	          echo "::notice::Overlay removed — addon-source will build with the registry version of qvac-fabric (llamafile OFF, same as npm)"

	      - name: Build addon from source
	        if: matrix.source == 'addon-source'
	        shell: bash
	        working-directory: packages/llm-llamacpp
	        env:
	          # vcpkg needs to clone the private qvac-registry-vcpkg repo
	          # at configure time. Workflow-level token is enough for read.
	          GH_TOKEN: ${{ secrets.GH_TOKEN || github.token }}
	          GITHUB_TOKEN: ${{ secrets.GH_TOKEN || github.token }}
	        run: |
	          npm install --no-audit --no-fund
	          bare-make generate
	          bare-make build
	          bare-make install

	      - name: Re-link workspace addon (addon-source A/B)
	        if: matrix.source == 'addon-source'
	        shell: bash
	        working-directory: ${{ env.WORKDIR }}
	        run: npm install --no-audit --no-fund --install-links ../../

	      - name: Install benchmark deps (addon from npm)
	        if: matrix.source != 'addon-source'
	        shell: bash
	        working-directory: ${{ env.WORKDIR }}
	        run: npm install --no-audit --no-fund

	      # Vulkan SDK is pre-installed on the self-hosted GPU runners
	      # (qvac-ubuntu2404-x64-gpu, qvac-win25-x64-gpu) at the well-known
	      # paths shown below — same convention reusable-prebuilds.yml uses.
	      # macOS GPU goes through Metal and needs no SDK.
	      - name: Configure Vulkan SDK env (Linux GPU)
	        if: matrix.backend == 'gpu' && matrix.platform == 'linux-x64'
	        shell: bash
	        run: |
	          echo "VULKAN_SDK=/opt/vulkansdk/x86_64" >> "$GITHUB_ENV"
	          echo "/opt/vulkansdk/x86_64/bin" >> "$GITHUB_PATH"
	      - name: Configure Vulkan SDK env (Windows GPU)
	        if: matrix.backend == 'gpu' && matrix.platform == 'windows-x64'
	        shell: bash
	        run: |
	          # Single-quoted to preserve the backslash literally.
	          echo 'VULKAN_SDK=C:\VulkanSDK' >> "$GITHUB_ENV"

	      # The self-hosted qvac-win25-x64-gpu runner doesn't ship cmake on
	      # PATH (windows-latest does, via the bundled VS install). Drop a
	      # Kitware build in front of PATH so both Windows runners look the
	      # same to build-cli-sources.js. Matches the bootstrap pattern in
	      # pr-test-inference-addon-cpp-js.yml. Addon legs skip this — they
	      # use the npm prebuild and never call cmake.
	      - name: Setup CMake (Windows CLI builds)
	        if: matrix.platform == 'windows-x64' && matrix.source != 'addon'
	        shell: bash
	        working-directory: ${{ runner.temp }}
	        run: |
	          curl -L https://github.com/Kitware/CMake/releases/download/v3.31.6/cmake-3.31.6-windows-x86_64.zip -o cmake.zip
	          unzip -q cmake.zip
	          echo "$PWD/cmake-3.31.6-windows-x86_64/bin" >> "$GITHUB_PATH"

	      # Only the fabric/upstream cells build a native CLI. Testing for
	      # `!= 'addon'` would also match the 'addon-source' cell, which would
	      # then key the cache on the upstream ref and pass a meaningless
	      # --addon-source-ref to build-cli-sources.js.
	      - name: Cache CLI builds
	        if: matrix.source == 'fabric' || matrix.source == 'upstream'
	        uses: actions/cache@v4
	        with:
	          path: ${{ env.WORKDIR }}/cli-builds
	          key: vlm-cli-v3-${{ matrix.platform }}-${{ matrix.backend }}-${{ matrix.source }}-${{ matrix.source == 'fabric' && inputs.fabric_ref || inputs.upstream_ref }}
	          restore-keys: |
	            vlm-cli-v3-${{ matrix.platform }}-${{ matrix.backend }}-${{ matrix.source }}-

	      - name: Build CLI source
	        if: matrix.source == 'fabric' || matrix.source == 'upstream'
	        shell: bash
	        working-directory: ${{ env.WORKDIR }}
	        env:
	          # Windows-CPU runs on windows-latest where the bundled VS
	          # install provides MSVC; force the VS multi-config generator
	          # so cmake doesn't fall back to MinGW. Matches cpp-tests-*.yml.
	          CMAKE_GENERATOR: ${{ matrix.platform == 'windows-x64' && 'Visual Studio 17 2022' || '' }}
	          # Dispatch inputs reach the script as data through the
	          # environment rather than being spliced into the shell source.
	          CLI_SOURCE: ${{ matrix.source }}
	          CLI_REF: ${{ matrix.source == 'fabric' && inputs.fabric_ref || inputs.upstream_ref }}
	          CLI_BACKEND: ${{ matrix.backend }}
	        run: |
	          node scripts/build-cli-sources.js \
	            --sources="$CLI_SOURCE" \
	            "--${CLI_SOURCE}-ref=$CLI_REF" \
	            --backend="$CLI_BACKEND"

	      - name: Prepare models
	        shell: bash
	        working-directory: ${{ env.WORKDIR }}
	        run: npm run prepare:models

	      - name: Run VLM benchmark
	        shell: bash
	        working-directory: ${{ env.WORKDIR }}
	        env:
	          BENCH_SOURCE: ${{ matrix.source }}
	          BENCH_BACKEND: ${{ matrix.backend }}
	          WARMUP_RUNS: ${{ inputs.warmup_runs }}
	          MEASURED_RUNS: ${{ inputs.measured_runs }}
	        run: |
	          # Flags shared by both invocations below.
	          common=(
	            --backend="$BENCH_BACKEND"
	            --force-gpu-row
	            --warmup-runs="$WARMUP_RUNS"
	            --measured-runs="$MEASURED_RUNS"
	          )
	          if [[ "$BENCH_SOURCE" == "addon-source" ]]; then
	            # Same-runner A/B: run npm addon (from the snapshot) AND
	            # the freshly-built source addon (workspace via @qvac/...)
	            # back-to-back in one process so they hit identical hardware.
	            npm run run:vlm-bench -- \
	              --sources=addon,addon-source \
	              --addon-path=/tmp/npm-snapshot/node_modules/@qvac/llm-llamacpp \
	              "${common[@]}"
	          else
	            npm run run:vlm-bench -- \
	              --sources="$BENCH_SOURCE" \
	              "${common[@]}"
	          fi

	      - name: Upload per-platform results
	        if: always()
	        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
	        with:
	          name: vlm-perf-${{ matrix.platform }}-${{ matrix.backend }}-${{ matrix.source }}-${{ github.run_number }}
	          path: |
	            ${{ env.WORKDIR }}/results/vlm-perf-*.md
	            ${{ env.WORKDIR }}/results/vlm-perf-*.json
	            ${{ env.WORKDIR }}/results/cell-*-stderr.log
	          retention-days: 14
	          if-no-files-found: warn

	# ── Android (stub) ─────────────────────────────────────────────────
	  # Earlier iteration tried to reuse integration-mobile-test-llm-
	  # llamacpp.yml in perf-only mode, but the existing mobile workflow
	  # is built for breadth (Android + iOS matrix, 3+12 Device-Farm
	  # sessions covering many tests) — one full invocation took ~20 min
	  # of mostly-irrelevant work for our use case. Until we either land a
	  # leaner mobile workflow that runs just our benchmark, or bundle our
	  # benchmark logic into the existing mobile test app, this job is a
	  # placeholder so the workflow shape covers Android.
	  #
	  # Default is OFF — flip run_android to true to see the marker
	  # artifact and confirm wiring.
	  android:
	    needs: context
	    if: inputs.run_android
	    runs-on: ubuntu-latest
	    timeout-minutes: 5
	    steps:
	      # Writes a README explaining the stub, echoes it to the log, and
	      # leaves it in android-stub/ for the upload step below.
	      - name: Stub notice
	        shell: bash
	        run: |
	          mkdir -p android-stub
	          cat > android-stub/README.txt <<'EOF'
	          Android VLM benchmark - placeholder
	          ===================================
	          The full Android benchmark is not yet wired. The existing
	          mobile workflow (integration-mobile-test-llm-llamacpp.yml)
	          runs the broader integration test suite and is too heavy
	          for the one-cell VLM benchmark; a dedicated leaner mobile
	          path is planned.

	          For Android perf numbers right now, run
	          Actions -> Benchmark Performance (LLM) -> Run workflow
	          (workflow file: benchmark-performance-infer-llm-llamacpp.yml)
	          EOF
	          echo "Android benchmark is a stub in this iteration."
	          cat android-stub/README.txt
	      - name: Upload Android stub marker
	        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
	        with:
	          name: vlm-perf-android-${{ github.run_number }}
	          path: android-stub/
	          retention-days: 14

	# ── Summarize ──────────────────────────────────────────────────────
	  # Downloads every desktop artifact and renders a consolidated table.
	  # `if: always()` keeps the summary going when one matrix leg fails.
	  summarize:
	    needs:
	      - context
	      - desktop
	      - android
	    if: always() && needs.context.result == 'success' && needs.context.outputs.desktop_count != '0'
	    runs-on: ubuntu-latest
	    timeout-minutes: 10
	    steps:
	      # Only the aggregation scripts are needed here, hence the sparse checkout.
	      - name: Checkout repository
	        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
	        with:
	          repository: ${{ needs.context.outputs.repository }}
	          ref: ${{ needs.context.outputs.ref }}
	          sparse-checkout: |
	            packages/llm-llamacpp/benchmarks/vlm-performance/scripts
	          sparse-checkout-cone-mode: false

	      - name: Setup Node.js
	        uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # 6.3.0
	        with:
	          node-version: 22

	      - name: Download desktop per-platform artifacts
	        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
	        with:
	          pattern: vlm-perf-*-${{ github.run_number }}
	          path: per-platform

	      # Commit titles are free text. They are handed to jq through the
	      # environment so the shell never expands `$`, backticks or quotes
	      # inside them, and jq takes care of the JSON string escaping.
	      - name: Build commit-info JSON
	        shell: bash
	        env:
	          HEAD_SHA: ${{ needs.context.outputs.head_sha }}
	          HEAD_TITLE: ${{ needs.context.outputs.head_title }}
	          HEAD_DATE: ${{ needs.context.outputs.head_date }}
	          BASE_SHA: ${{ needs.context.outputs.merge_base }}
	          BASE_TITLE: ${{ needs.context.outputs.base_title }}
	          BASE_DATE: ${{ needs.context.outputs.base_date }}
	        run: |
	          jq -n \
	            --arg head_sha "$HEAD_SHA" \
	            --arg head_title "$HEAD_TITLE" \
	            --arg head_date "$HEAD_DATE" \
	            --arg base_sha "$BASE_SHA" \
	            --arg base_title "$BASE_TITLE" \
	            --arg base_date "$BASE_DATE" \
	            '{
	              head: {sha: $head_sha, title: $head_title, date: $head_date},
	              merge_base: {sha: $base_sha, title: $base_title, date: $base_date},
	              comparison_mode: "source-engines"
	            }' > commit-info.json
	          cat commit-info.json

	      - name: Aggregate into one report
	        shell: bash
	        run: |
	          mkdir -p consolidated
	          node packages/llm-llamacpp/benchmarks/vlm-performance/scripts/aggregate-platforms.js \
	            --inputs=per-platform \
	            --commit-info=commit-info.json \
	            --output-md=consolidated/vlm-perf-consolidated.md \
	            --output-json=consolidated/vlm-perf-consolidated.json

	      # `tail -n +2` drops the report's first line; this step writes its
	      # own heading instead.
	      - name: Post step summary
	        if: always()
	        shell: bash
	        run: |
	          {
	            echo "## VLM Benchmark - Consolidated"
	            echo ""
	            if [ -f consolidated/vlm-perf-consolidated.md ]; then
	              tail -n +2 consolidated/vlm-perf-consolidated.md
	            else
	              echo "No consolidated report generated."
	            fi
	          } >> "$GITHUB_STEP_SUMMARY"

	      # Best-effort: when no open PR has this ref as its head branch the
	      # step logs that and exits successfully.
	      - name: Post PR comment (View 2 summary)
	        if: always() && hashFiles('consolidated/vlm-perf-consolidated.md') != ''
	        shell: bash
	        env:
	          GH_TOKEN: ${{ github.token }}
	          REF: ${{ needs.context.outputs.ref }}
	        run: |
	          PR_NUMBER=$(gh pr list --head "$REF" --state open --json number --jq '.[0].number' 2>/dev/null || echo "")
	          if [[ -z "$PR_NUMBER" ]]; then
	            echo "No open PR found for ref $REF — skipping PR comment."
	            exit 0
	          fi
	          echo "Posting VLM benchmark summary to PR #$PR_NUMBER"
	          {
	            echo "## VLM Benchmark Summary"
	            echo ""
	            echo "_Run #${{ github.run_number }} — [full report](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})_"
	            echo ""
	            tail -n +2 consolidated/vlm-perf-consolidated.md
	          } > /tmp/pr-comment-body.md
	          gh pr comment "$PR_NUMBER" --body-file /tmp/pr-comment-body.md

	      - name: Upload consolidated report
	        if: always()
	        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
	        with:
	          name: vlm-perf-consolidated-${{ github.run_number }}
	          path: consolidated/
	          retention-days: 30
	          if-no-files-found: warn

	# ── Matrix mode: Linux legs ────────────────────────────────────────
	# Runs the @qvac/llm-llamacpp addon (published linux-x64 prebuild, which
	# is CPU + Vulkan-GPU capable) over the vlm-matrix fixture. Branch JS
	# (harness/config/fixture) + published native prebuild. One leg per
	# backend; each emits [VLMROW]/[VLMSEG]/[VLMMETA] markers to a log that
	# matrix-combine aggregates with vlm-matrix/aggregate.js.
	matrix-linux:
	needs: context
	if: needs.context.outputs.linux_count != '0'
	name: vlm-matrix-linux-${{ matrix.backend }}
	runs-on: ${{ matrix.runner }}
	# several-sources builds two CLIs from source (first run, pre-cache) + per-image
	# CLI model reloads, so allow more wall-clock than the addon-only two-models path.
	timeout-minutes: 120
	strategy:
	fail-fast: false
	matrix:
	include: ${{ fromJSON(needs.context.outputs.linux_matrix) }}
	env:
	HF_TOKEN: ${{ secrets.HF_TOKEN }}
	WORKDIR: packages/llm-llamacpp
	steps:
	- name: Manual Workspace Cleanup
	if: startsWith(matrix.runner, 'qvac-')
	shell: bash
	run: rm -rf "$GITHUB_WORKSPACE" && mkdir -p "$GITHUB_WORKSPACE"

	- name: Checkout repository
	uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
	with:
	repository: ${{ needs.context.outputs.repository }}
	ref: ${{ needs.context.outputs.ref }}

	- name: Setup Node.js
	uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # 6.3.0
	with:
	node-version: 22

	- name: Install addon deps
	working-directory: ${{ env.WORKDIR }}
	shell: bash
	run: npm install --no-audit --no-fund

	- name: Install bare tooling
	shell: bash
	run: npm install -g --force bare bare-make bare-runtime bare-https brittle

	# Branch JS + published native prebuild: pull the linux-x64 prebuild
	# from the public npm package into the workspace so the harness's
	# require('../../index.js') loads. The published prebuild handles both
	# cpu and (Vulkan) gpu via runtime device selection.
	- name: Fetch published prebuilds
	working-directory: ${{ env.WORKDIR }}
	shell: bash
	run: \|
	npm pack @qvac/llm-llamacpp@latest
	tar -xzf *.tgz
	ADDON_VER=$(node -e "console.log(require('./package/package.json').version)")
	echo "ADDON_VERSION=$ADDON_VER" >> "$GITHUB_ENV"
	rm -rf prebuilds
	mv package/prebuilds ./prebuilds
	rm -rf package *.tgz
	echo "addon @qvac/llm-llamacpp@$ADDON_VER"
	ls -la prebuilds/

	# GPU runners ship the Vulkan SDK at the well-known path; stamp the
	# env so the addon's loader finds it. No-op on the cpu leg.
	- name: Configure Vulkan SDK env (Linux GPU)
	if: matrix.backend == 'gpu'
	shell: bash
	run: \|
	echo "VULKAN_SDK=/opt/vulkansdk/x86_64" >> "$GITHUB_ENV"
	echo "/opt/vulkansdk/x86_64/bin" >> "$GITHUB_PATH"

	- name: Run VLM matrix
	working-directory: ${{ env.WORKDIR }}
	shell: bash
	env:
	QVAC_VLM_MATRIX: "1"
	QVAC_VLM_MODE: ${{ inputs.matrix_mode }}
	QVAC_VLM_PRESET: ${{ inputs.matrix_preset }}
	QVAC_VLM_ENGINE: addon # this leg is always the 'addon' source
	QVAC_VLM_DEVICES: ${{ matrix.backend }}
	NO_GPU: ${{ matrix.no_gpu }}
	run: \|
	# Regenerate the brittle runner to include ONLY the matrix test,
	# then run it under bare (same pattern as the perf-only path).
	# In several-sources mode this is the addon leg; fabric/upstream CLIs
	# are appended to the same log by the next step.
	npx brittle -r test/integration/all.js test/integration/vlm-matrix.test.js
	bare test/integration/all.js --exit 2>&1 \| tee "vlm-matrix-linux-${{ matrix.backend }}.log"
	exit ${PIPESTATUS[0]}

	# several-sources only: build native fabric/upstream llama-mtmd-cli and run
	# them over the SAME fixture, appending [VLMROW]/[VLMSEG] markers to the addon
	# log so aggregate.js renders a 3-source comparison. Linux-only.
	- name: Cache CLI builds (several-sources)
	if: inputs.matrix_mode == 'several-sources'
	uses: actions/cache@v4
	with:
	path: ${{ env.WORKDIR }}/benchmarks/vlm-performance/cli-builds
	key: vlm-cli-v3-linux-${{ matrix.backend }}-${{ inputs.fabric_ref }}-${{ inputs.upstream_ref }}

	# Builds the fabric and upstream native CLIs at the requested refs, then runs
	# each of them over the fixture and appends its output to the addon leg's log
	# ($LOG). Only runs in several-sources mode.
	- name: Build + run fabric/upstream CLIs over the fixture (several-sources)
	  if: inputs.matrix_mode == 'several-sources'
	  working-directory: ${{ env.WORKDIR }}/benchmarks/vlm-performance
	  shell: bash
	  env:
	    LOG: ${{ github.workspace }}/${{ env.WORKDIR }}/vlm-matrix-linux-${{ matrix.backend }}.log
	    MODEL_DIR: ${{ github.workspace }}/${{ env.WORKDIR }}/test/model
	    # Workflow inputs and matrix values reach the script through the
	    # environment, so bash treats them as data. Expanding ${{ ... }} inside
	    # the script body would paste the raw text into the script before bash
	    # parses it (quoting breakage / script injection).
	    CLI_FABRIC_REF: ${{ inputs.fabric_ref }}
	    CLI_UPSTREAM_REF: ${{ inputs.upstream_ref }}
	    MATRIX_BACKEND: ${{ matrix.backend }}
	  run: |
	    npm install --no-audit --no-fund
	    node scripts/build-cli-sources.js --sources=fabric,upstream \
	      "--fabric-ref=$CLI_FABRIC_REF" "--upstream-ref=$CLI_UPSTREAM_REF" \
	      "--backend=$MATRIX_BACKEND"
	    # build-cli-sources.js is expected to have written cli-sources-resolved.json;
	    # read the resolved binary paths from it.
	    FABRIC_BIN=$(node -e "console.log(require('./cli-sources-resolved.json').fabric.binaryPath)")
	    UPSTREAM_BIN=$(node -e "console.log(require('./cli-sources-resolved.json').upstream.binaryPath)")
	    # Model files were downloaded by the addon leg (names from vlm-matrix.config.cjs).
	    LLM="$MODEL_DIR/reg-qwen-unsloth-Q8_0.gguf"
	    MMPROJ="$MODEL_DIR/reg-qwen-mradermacher-mmproj-Q8_0.gguf"
	    # run_src <source-label> <short-name> <binary-path>
	    #   $1 is passed to the runner as --source, $2 only appears in the
	    #   progress line, $3 is the CLI binary to execute.
	    # Output is appended to $LOG. A failing run is reported as a workflow
	    # warning instead of failing the step, so the other source still runs.
	    run_src () {
	      echo ">> $1 over fixture ($2)"
	      node ../vlm-matrix/cli-fixture-runner.cjs \
	        --binary "$3" --source "$1" --llm "$LLM" --mmproj "$MMPROJ" \
	        --backend "$MATRIX_BACKEND" --samples 3 \
	        --tasks textvqa,vizwiz,gqa,docvqa,ai2d \
	        --main-origin "Qwen3.5-0.8B-Q8_0 (Registry)" \
	        --mmproj-origin "Qwen3.5-0.8B mmproj-Q8_0 (Registry)" >> "$LOG" 2>&1 || echo "::warning::$1 run had errors"
	    }
	    run_src fabric-cli fabric "$FABRIC_BIN"
	    run_src upstream-cli upstream "$UPSTREAM_BIN"
	    echo "appended fabric-cli + upstream-cli to $LOG"

	# HW/SW provenance so a reader can reproduce the numbers. Rendered in the
	# report's Details section (passed to aggregate.js via --provenance).
	# Writes prov-linux-<backend>.md in the working directory and echoes it to
	# the job log. Runs even when earlier steps failed (if: always()).
	- name: Gather provenance
	  if: always()
	  working-directory: ${{ env.WORKDIR }}
	  shell: bash
	  env:
	    # Expression values are handed over as environment variables so that
	    # bash treats them as data rather than as part of the script text.
	    MATRIX_BACKEND: ${{ matrix.backend }}
	    MATRIX_RUNNER: ${{ matrix.runner }}
	    HEAD_SHA: ${{ needs.context.outputs.head_sha }}
	    REF: ${{ needs.context.outputs.ref }}
	  run: |
	    F="prov-linux-${MATRIX_BACKEND}.md"
	    {
	      echo "linux · ${MATRIX_BACKEND} (runner \`${MATRIX_RUNNER}\`)"
	      # ADDON_VERSION is not set in this step; "?" is printed when it is absent.
	      echo "- addon: \`@qvac/llm-llamacpp@${ADDON_VERSION:-?}\` (published prebuild)"
	      echo "- git: \`${HEAD_SHA}\` (ref \`${REF}\`)"
	      echo "- node: $(node -v 2>/dev/null) · bare: $(bare --version 2>/dev/null || echo n/a)"
	      echo "- os: $(. /etc/os-release 2>/dev/null; echo "$PRETTY_NAME") $(uname -m)"
	      echo "- cpu: $(lscpu 2>/dev/null | sed -n 's/^Model name:[[:space:]]*//p' | head -1) ($(nproc) cores)"
	      echo "- ram: $(free -h 2>/dev/null | awk '/^Mem:/{print $2}')"
	      if [ "$MATRIX_BACKEND" = "gpu" ]; then
	        # vulkaninfo --summary prints indented "deviceName   = <name>" lines
	        # with padding before "=", so the pattern must tolerate any leading
	        # text and any amount of whitespace around "=".
	        gpu_name=$(vulkaninfo --summary 2>/dev/null | sed -n 's/.*deviceName[[:space:]]*=[[:space:]]*//p' | head -1 || true)
	        # Falls back to "?" when vulkaninfo is missing, fails, or lists no device.
	        echo "- gpu: ${gpu_name:-?}"
	      fi
	    } > "$F"
	    cat "$F"

	# Publishes this leg's matrix log and its provenance file as one artifact.
	# Runs even when earlier steps failed; a missing file only produces a warning.
	- name: Upload matrix log
	  if: always()
	  uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
	  with:
	    name: vlm-matrix-log-linux-${{ matrix.backend }}-${{ github.run_number }}
	    # Literal block scalar: one path per line.
	    path: |
	      ${{ env.WORKDIR }}/vlm-matrix-linux-${{ matrix.backend }}.log
	      ${{ env.WORKDIR }}/prov-linux-${{ matrix.backend }}.md
	    retention-days: 14
	    if-no-files-found: warn

	# ── Matrix mode: Samsung S25 (AWS Device Farm) ─────────────────────
	# Runs the SAME matrix harness on-device by calling the Android-only mobile
	# workflow as a reusable workflow.
	#   - qvac_perf_only limits the run to perf-tests.json (runVlmMatrixTest),
	#     so only the vlmMatrix group is scheduled.
	#   - Device Farm forwards no custom env, so the preset used on-device is
	#     config.defaultPreset: vlm-matrix.config.cjs defaultPreset governs the
	#     S25 set.
	#   - The raw on-device log (bare_console.log) carries the [VLMROW] markers
	#     and is uploaded by collect-and-upload-logs as
	#     console-logs-llamacpp-llm-Android.
	matrix-s25:
	  needs: [context]
	  # The S25 leg exercises the addon only; several-sources (native CLIs) is
	  # Linux-only, so this job is skipped in that mode.
	  if: ${{ inputs.run_matrix && inputs.run_matrix_s25 && inputs.matrix_mode != 'several-sources' }}
	  uses: ./.github/workflows/integration-mobile-test-llm-llamacpp.yml
	  with:
	    qvac_perf_only: true
	    repository: ${{ needs.context.outputs.repository }}
	    ref: ${{ needs.context.outputs.ref }}
	  secrets: inherit

	# ── Matrix mode: combine ───────────────────────────────────────────
	# Aggregates [VLMROW]/[VLMSEG]/[VLMMETA] markers from every matrix log
	# (Linux .log + S25 device log) into one quality+speed report via
	# vlm-matrix/aggregate.js, surfaced to the step summary + PR comment.
	# This is the mechanism that makes mobile (Device Farm) results visible.
	matrix-combine:
	  needs:
	    - context
	    - matrix-linux
	    - matrix-s25
	  # always(): combine whatever logs exist even if a matrix leg failed or was
	  # skipped, provided the context job itself succeeded.
	  if: always() && inputs.run_matrix && needs.context.result == 'success'
	  runs-on: ubuntu-latest
	  timeout-minutes: 10
	  steps:
	    - name: Checkout repository
	      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # 6.0.2
	      with:
	        repository: ${{ needs.context.outputs.repository }}
	        ref: ${{ needs.context.outputs.ref }}
	        # Only the aggregator directory is needed in this job.
	        sparse-checkout: |
	          packages/llm-llamacpp/benchmarks/vlm-matrix
	        sparse-checkout-cone-mode: false

	    - name: Setup Node.js
	      uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # 6.3.0
	      with:
	        node-version: "22"

	    - name: Download Linux matrix logs
	      uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
	      with:
	        pattern: vlm-matrix-log-*-${{ github.run_number }}
	        path: matrix-logs

	    # S25 raw device logs (these hold the [VLMROW] markers).
	    # continue-on-error so a Linux-only run (no S25 artifact) still combines.
	    - name: Download S25 device logs
	      continue-on-error: true
	      uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # 8.0.1
	      with:
	        pattern: console-logs-*
	        path: matrix-logs

	    # Collects every downloaded log and provenance file, synthesizes S25
	    # provenance from the device log, and runs aggregate.js. When no log is
	    # found, a placeholder report is written instead.
	    - name: Aggregate matrix logs
	      shell: bash
	      env:
	        # Dispatch inputs are passed as environment variables so bash treats
	        # them as data instead of as part of the script text.
	        MATRIX_MODE: ${{ inputs.matrix_mode }}
	        MATRIX_PRESET: ${{ inputs.matrix_preset }}
	        MATRIX_ENGINE: ${{ inputs.matrix_engine }}
	      run: |
	        # matrix-logs may not exist when no artifact was downloaded; create it
	        # so the find calls below cannot fail the step (bash runs with
	        # -e -o pipefail here).
	        mkdir -p consolidated matrix-logs

	        # Arguments are collected in arrays so paths containing whitespace
	        # survive intact.
	        args=()
	        prov=()

	        # Tag each input with its platform host so S25 rows don't collapse
	        # onto the Linux rows ([VLMROW].device is only cpu/gpu).
	        # Linux legs: one log per backend (device field carries cpu/gpu).
	        mapfile -t linux_logs < <(find matrix-logs -name 'vlm-matrix-linux-*.log' | sort)
	        for f in "${linux_logs[@]}"; do
	          args+=(--in linux "$f")
	        done

	        # S25: the Samsung device's full logcat carries the [VLMROW] markers
	        # (the Android pool may also include a Pixel; we surface S25 here).
	        # NOTE(review): the glob tolerates text before "Galaxy_S25" and between
	        # it and "logcat_full" — confirm against the real Device Farm file names.
	        mapfile -t s25_logs < <(find matrix-logs -name '*Galaxy_S25*logcat_full*' | sort)
	        for f in "${s25_logs[@]}"; do
	          args+=(--in s25 "$f")
	        done

	        # HW/SW provenance: Linux legs ship prov-linux-*.md; synthesize one for S25.
	        mapfile -t prov_files < <(find matrix-logs -name 'prov-*.md' | sort)
	        for p in "${prov_files[@]}"; do
	          prov+=(--provenance "$p")
	        done

	        # kv_value <ere> <file>: prints the value part of the first "key=value"
	        # token matching <ere>. Prints nothing and still succeeds when there is
	        # no match, so the ${VAR:-default} fallbacks below stay reachable under
	        # -e -o pipefail.
	        kv_value () {
	          { grep -oE "$1" "$2" || true; } | head -1 | cut -d= -f2
	        }

	        # S25 hardware provenance, parsed from the device's own logcat (model /
	        # Android / ABI from the Play-store UA line, RAM from the JS totalMemory
	        # line, GPU from the Adreno-Vulkan driver load).
	        if [ "${#s25_logs[@]}" -gt 0 ]; then
	          S25F="${s25_logs[0]}"
	          MODEL=$(kv_value 'model=SM-[A-Z0-9]+' "$S25F")
	          ANDROID=$(kv_value 'platformVersionRelease=[0-9]+' "$S25F")
	          ABI=$(kv_value 'supportedAbis=[a-z0-9-]+' "$S25F")
	          RAMB=$({ grep -oE 'totalMemory: [0-9]+' "$S25F" || true; } | head -1 | { grep -oE '[0-9]+$' || true; })
	          # totalMemory is treated as bytes: 1073741824 = 1 GiB.
	          RAMGB=$(awk -v b="${RAMB:-0}" 'BEGIN{ if (b>0) printf "%.1f GB", b/1073741824; else printf "?" }')
	          if grep -qiE 'AdrenoVK|vulkan\.adreno' "$S25F"; then
	            GPU='Adreno (Vulkan)'
	          else
	            GPU='?'
	          fi
	          {
	            echo "s25 — Samsung Galaxy S25 Ultra (AWS Device Farm)"
	            echo "- device: ${MODEL:-SM-?} · Android ${ANDROID:-?} · ${ABI:-arm64-v8a}"
	            echo "- ram: ${RAMGB} · gpu: ${GPU}"
	            echo "- engine: \`@qvac/llm-llamacpp\` addon (published prebuild)"
	          } > prov-s25.md
	          prov+=(--provenance prov-s25.md)
	        fi

	        echo "aggregate inputs: ${args[*]}"
	        echo "provenance: ${prov[*]}"
	        if [ "${#args[@]}" -eq 0 ]; then
	          echo "> No VLM matrix logs found for run #${{ github.run_number }}." > consolidated/vlm-matrix-consolidated.md
	        else
	          node packages/llm-llamacpp/benchmarks/vlm-matrix/aggregate.js \
	            --title "VLM Matrix — ${MATRIX_MODE} / ${MATRIX_PRESET} (run #${{ github.run_number }})" \
	            --mode "$MATRIX_MODE" --engine "$MATRIX_ENGINE" --base f16 --candidate q8 \
	            --out consolidated/vlm-matrix-consolidated.md \
	            "${prov[@]}" "${args[@]}"
	        fi

	    # Appends the consolidated report (or a fallback line) to the run summary.
	    - name: Post step summary
	      if: always()
	      shell: bash
	      run: |
	        {
	          echo "# VLM Matrix — Consolidated"
	          echo ""
	          cat consolidated/vlm-matrix-consolidated.md 2>/dev/null || echo "No consolidated matrix report generated."
	        } >> "$GITHUB_STEP_SUMMARY"

	    # Comments the report on the open PR whose head branch is $REF, if any.
	    # Skipped entirely when no consolidated report file exists.
	    - name: Post PR comment
	      if: always() && hashFiles('consolidated/vlm-matrix-consolidated.md') != ''
	      shell: bash
	      env:
	        GH_TOKEN: ${{ github.token }}
	        REF: ${{ needs.context.outputs.ref }}
	      run: |
	        # A lookup failure is treated the same as "no PR": the step exits 0.
	        PR_NUMBER=$(gh pr list --head "$REF" --state open --json number --jq '.[0].number' 2>/dev/null || echo "")
	        if [[ -z "$PR_NUMBER" ]]; then
	          echo "No open PR found for ref $REF — skipping PR comment."
	          exit 0
	        fi
	        {
	          echo "## VLM Matrix Benchmark"
	          echo ""
	          echo "_Run #${{ github.run_number }} — [full report](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})_"
	          echo ""
	          cat consolidated/vlm-matrix-consolidated.md
	        } > /tmp/matrix-comment.md
	        gh pr comment "$PR_NUMBER" --body-file /tmp/matrix-comment.md

	    - name: Upload consolidated matrix report
	      if: always()
	      uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
	      with:
	        name: vlm-matrix-consolidated-${{ github.run_number }}
	        path: consolidated/
	        retention-days: 30
	        if-no-files-found: warn

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Benchmark (LLM) #48

Workflow file

Benchmark (LLM) #48

Uh oh!

Workflow file for this run