feat: Enhanced Vision kernel implementations — Copy, NMS, HoughLinesP… #144

Workflow file for this run

.github/workflows/conformance.yml at 4258032

	name: OpenVX Conformance Tests

	# Run on pushes and pull requests targeting the long-lived branches.
	# Documentation-, metadata- and image-only changes are ignored; the two
	# `paths-ignore` lists are kept identical on purpose.
	on:
	  push:
	    branches: [master, main, develop]
	    paths-ignore:
	      - '**/*.md'
	      - 'docs/**'
	      - 'LICENSE'
	      - '.gitignore'
	      - '.gitattributes'
	      - '.editorconfig'
	      - '**/*.svg'
	      - '**/*.png'
	      - '**/*.jpg'
	      - '**/*.jpeg'
	      - '**/*.gif'
	      - '**/*.webp'
	  pull_request:
	    branches: [master, main, develop]
	    paths-ignore:
	      - '**/*.md'
	      - 'docs/**'
	      - 'LICENSE'
	      - '.gitignore'
	      - '.gitattributes'
	      - '.editorconfig'
	      - '**/*.svg'
	      - '**/*.png'
	      - '**/*.jpg'
	      - '**/*.jpeg'
	      - '**/*.gif'
	      - '**/*.webp'

	# Workflow-wide environment: coloured cargo output and Rust backtraces
	# on panic for every job.
	env:
	  CARGO_TERM_COLOR: always
	  RUST_BACKTRACE: "1"

	jobs:
	build:
	  # Builds rustVX (libopenvx_ffi.so) and the OpenVX CTS binary from the
	  # checked-out ref, then publishes both as the `build-artifacts`
	  # archive consumed by the downstream jobs.
	  runs-on: ubuntu-22.04
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        fetch-depth: 0
	        submodules: recursive
	    - name: Install system dependencies
	      run: |
	        sudo apt-get update
	        sudo apt-get install -y build-essential cmake
	    - name: Install Rust
	      run: |
	        curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
	        source $HOME/.cargo/env
	        rustc --version
	        cargo --version
	    - name: Build rustVX
	      # Explicit AVX2-targeted build for AMD Zen 2+ / Intel Haswell+.
	      #
	      # We deliberately do NOT auto-detect CPU features from
	      # /proc/cpuinfo any more — the previous auto-detection design
	      # produced subtly mismatched binaries between the parallel
	      # `build` (PR) and `build-main` jobs whenever the two jobs
	      # happened to land on different VM pools, which silently
	      # poisoned the perf-gate's PR-vs-main comparison (see
	      # `Magnitude` regression on a no-op PR in CI run 25615947512).
	      #
	      # Hardcoding the feature set guarantees both rustVX binaries
	      # the perf gate compares are produced with identical:
	      #   * Cargo `--features` flags (sse2 + avx2 on both crates)
	      #   * RUSTFLAGS (`-C target-cpu=x86-64-v3`)
	      #
	      # The `x86-64-v3` microarch level (SSE4.2 + AVX + AVX2 +
	      # BMI1+2 + FMA + F16C) is supported by every Linux runner in
	      # GitHub's Azure pool today (AMD EPYC Milan/Genoa, Intel
	      # Cascade Lake / Ice Lake) and is the right baseline for AMD
	      # Zen-tuned performance work — it lets rustc auto-vectorise
	      # the rest of the workspace using AVX2 / FMA / BMI2 on top of
	      # the hand-tuned `#[target_feature(enable = "avx2")]`
	      # intrinsic kernels gated by the `avx2` Cargo feature.
	      #
	      # Falls back to NEON-only on aarch64 and scalar on anything
	      # else — those paths exist for completeness and aren't the
	      # gated targets.
	      run: |
	        set -euo pipefail
	        source $HOME/.cargo/env
	        case "$(uname -m)" in
	          x86_64|amd64)
	            FEATURES="openvx-core/sse2 openvx-core/avx2 openvx-vision/sse2 openvx-vision/avx2"
	            # Pin code-layout alignment so the perf-gate's PR-vs-main
	            # bench comparison is invariant to link-order shifts caused
	            # by upstream additive code. Without these flags, a purely
	            # additive PR that grows `.text` by O(10 KB) shifts every
	            # downstream hot kernel forward by the same offset and
	            # lands it (or its hot inner loop) at less-favourable
	            # cache-line alignment — producing reproducible 20%+
	            # "regressions" on tight ~5 ms kernels whose compiled
	            # machine code is bit-identical between PR and main
	            # (verified by matching Rust mangling hash + objdump
	            # comparison).
	            #
	            #   * `-align-all-functions=6` — pad each function entry
	            #     to 2^6 = 64 bytes (cache-line aligned).
	            #   * `-align-all-nofallthru-blocks=4` — pad basic blocks
	            #     reached only via branches (loop headers and join
	            #     points) to 2^4 = 16 bytes. Catches inner-loop
	            #     alignment penalties that function-entry padding
	            #     alone misses.
	            #
	            # Cost: ~100–200 KB of NOP-pad bloat in libopenvx_ffi.so.
	            # Both bench binaries pay it equally so the gate is fair.
	            export RUSTFLAGS="-C target-cpu=x86-64-v3 -C llvm-args=-align-all-functions=6 -C llvm-args=-align-all-nofallthru-blocks=4"
	            ;;
	          aarch64|arm64)
	            FEATURES="openvx-core/neon openvx-vision/neon"
	            export RUSTFLAGS=""
	            ;;
	          *)
	            FEATURES=""
	            export RUSTFLAGS=""
	            ;;
	        esac
	        echo "Architecture: $(uname -m)"
	        echo "Cargo features: ${FEATURES:-<none>}"
	        echo "RUSTFLAGS : ${RUSTFLAGS:-<none>}"
	        if [ -n "$FEATURES" ]; then
	          cargo build --release -p openvx-ffi --features "$FEATURES"
	        else
	          cargo build --release -p openvx-ffi
	        fi
	    - name: Build OpenVX CTS
	      # Configures the CTS against the freshly built libopenvx_ffi.so.
	      # Headers from the repository's `include/` directory are copied
	      # into the CTS tree on a best-effort basis (copy errors ignored).
	      run: |
	        cd OpenVX-cts
	        mkdir -p include
	        if [ -d "../include" ]; then
	          cp -r ../include/* include/ 2>/dev/null || true
	        fi
	        mkdir -p build
	        cd build
	        cmake .. \
	          -DCMAKE_BUILD_TYPE=Release \
	          -DCMAKE_C_STANDARD_LIBRARIES="-lm" \
	          -DCMAKE_CXX_STANDARD_LIBRARIES="-lm" \
	          -DOPENVX_INCLUDES="${{ github.workspace }}/include;${{ github.workspace }}/OpenVX-cts/include" \
	          -DOPENVX_LIBRARIES="${{ github.workspace }}/target/release/libopenvx_ffi.so;m" \
	          -DOPENVX_CONFORMANCE_VISION=ON \
	          -DOPENVX_USE_ENHANCED_VISION=ON \
	          -DOPENVX_USE_USER_DATA_OBJECT=ON
	        make -j$(nproc)
	    - name: Upload build artifacts
	      uses: actions/upload-artifact@v4
	      with:
	        name: build-artifacts
	        # `include/` is bundled so the downstream benchmark job can build
	        # openvx-mark against rustVX without needing to check out the
	        # rustVX source tree.
	        path: |
	          target/release/libopenvx_ffi.so
	          OpenVX-cts/build/bin/vx_test_conformance
	          OpenVX-cts/test_data/
	          include/
	        retention-days: 1

	# Build rustVX from the merge-target ref (i.e. main, in practice) in
	# its own phase, in parallel with the PR's `build` job. The downstream
	# `perf-gate` job pulls both archives down onto a single runner so
	# the PR-vs-main bench comparison runs on identical hardware against
	# binaries that were each built with their own branch's source tree.
	#
	# Both this job and `build` run on the same `ubuntu-22.04` runner pool
	# with the same hardcoded build configuration (explicit Cargo features
	# and RUSTFLAGS, no per-VM CPU auto-detection), so the resulting
	# libopenvx_ffi.so binaries have matching compile-time SIMD feature
	# sets regardless of which VM each job happens to land on.
	#
	# Skipped on push events to main (no merge target to diff against).
	build-main:
	  # Builds rustVX from the pull request's base ref and publishes it as
	  # `build-artifacts-main`. Only runs for pull_request events.
	  name: Build rustVX (main)
	  if: github.event_name == 'pull_request'
	  runs-on: ubuntu-22.04
	  steps:
	    - name: Checkout merge target ref
	      uses: actions/checkout@v4
	      with:
	        ref: ${{ github.base_ref }}
	        fetch-depth: 0
	        submodules: recursive
	    - name: Install system dependencies
	      run: |
	        sudo apt-get update
	        sudo apt-get install -y build-essential cmake
	    - name: Install Rust
	      run: |
	        curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
	        source $HOME/.cargo/env
	        rustc --version
	        cargo --version
	    - name: Build rustVX (main)
	      # Identical hardcoded build configuration as the `build` job —
	      # see that step for the full rationale. The whole point of
	      # making both builds explicit (rather than per-VM auto-
	      # detected) is that the `perf-gate` job downstream is
	      # guaranteed to compare two binaries with matching
	      # compile-time SIMD features and RUSTFLAGS.
	      run: |
	        set -euo pipefail
	        source $HOME/.cargo/env
	        case "$(uname -m)" in
	          x86_64|amd64)
	            FEATURES="openvx-core/sse2 openvx-core/avx2 openvx-vision/sse2 openvx-vision/avx2"
	            # Pin code-layout alignment so the perf-gate's PR-vs-main
	            # bench comparison is invariant to link-order shifts caused
	            # by upstream additive code. Without these flags, a purely
	            # additive PR that grows `.text` by O(10 KB) shifts every
	            # downstream hot kernel forward by the same offset and
	            # lands it (or its hot inner loop) at less-favourable
	            # cache-line alignment — producing reproducible 20%+
	            # "regressions" on tight ~5 ms kernels whose compiled
	            # machine code is bit-identical between PR and main
	            # (verified by matching Rust mangling hash + objdump
	            # comparison).
	            #
	            #   * `-align-all-functions=6` — pad each function entry
	            #     to 2^6 = 64 bytes (cache-line aligned).
	            #   * `-align-all-nofallthru-blocks=4` — pad basic blocks
	            #     reached only via branches (loop headers and join
	            #     points) to 2^4 = 16 bytes. Catches inner-loop
	            #     alignment penalties that function-entry padding
	            #     alone misses.
	            #
	            # Cost: ~100–200 KB of NOP-pad bloat in libopenvx_ffi.so.
	            # Both bench binaries pay it equally so the gate is fair.
	            export RUSTFLAGS="-C target-cpu=x86-64-v3 -C llvm-args=-align-all-functions=6 -C llvm-args=-align-all-nofallthru-blocks=4"
	            ;;
	          aarch64|arm64)
	            FEATURES="openvx-core/neon openvx-vision/neon"
	            export RUSTFLAGS=""
	            ;;
	          *)
	            FEATURES=""
	            export RUSTFLAGS=""
	            ;;
	        esac
	        echo "Architecture: $(uname -m)"
	        echo "Cargo features: ${FEATURES:-<none>}"
	        echo "RUSTFLAGS : ${RUSTFLAGS:-<none>}"
	        if [ -n "$FEATURES" ]; then
	          cargo build --release -p openvx-ffi --features "$FEATURES"
	        else
	          cargo build --release -p openvx-ffi
	        fi
	    - name: Upload main build artifacts
	      uses: actions/upload-artifact@v4
	      with:
	        name: build-artifacts-main
	        # No CTS payload here — only the perf-gate job consumes this
	        # artifact, and it only needs the libopenvx_ffi.so + the
	        # standard headers (for openvx-mark to compile against).
	        path: |
	          target/release/libopenvx_ffi.so
	          include/
	        retention-days: 1

	# Build the Khronos OpenVX sample implementation in its own phase, in
	# parallel with the rustVX `build` job, and upload the resulting library
	# + headers as a self-contained archive. The benchmark job below pulls
	# both archives down onto a single runner so rustVX and the Khronos
	# sample are exercised on identical hardware.
	build-khronos-sample:
	  # Clones and builds the Khronos sample implementation, then stages
	  # libopenvx.so, libvxu.so and the API headers into `khronos-stage/`
	  # for upload as `khronos-sample-artifacts`.
	  name: Build Khronos OpenVX sample
	  runs-on: ubuntu-22.04
	  steps:
	    - name: Install system dependencies
	      run: |
	        sudo apt-get update
	        sudo apt-get install -y build-essential cmake git python3
	    - name: Build Khronos OpenVX sample
	      run: |
	        git clone --recursive --depth 1 \
	          https://github.com/KhronosGroup/OpenVX-sample-impl.git khronos-sample
	        cd khronos-sample
	        python3 Build.py --os=Linux --arch=64 --conf=Release
	    - name: Stage Khronos sample archive
	      run: |
	        set -euo pipefail
	        # Take the first libopenvx.so found outside any `build/`
	        # directory; libvxu.so is copied from that same directory.
	        LIB_SRC=$(dirname "$(find khronos-sample -name "libopenvx.so" -not -path "*/build/*" | head -1)")
	        echo "Khronos libraries discovered in: $LIB_SRC"
	        mkdir -p khronos-stage/lib
	        cp "$LIB_SRC"/libopenvx.so "$LIB_SRC"/libvxu.so khronos-stage/lib/
	        cp -r khronos-sample/api-docs/include khronos-stage/include
	        ls -R khronos-stage
	    - name: Upload Khronos sample artifacts
	      uses: actions/upload-artifact@v4
	      with:
	        name: khronos-sample-artifacts
	        path: khronos-stage/
	        retention-days: 1

	baseline:
	  # Core framework suites. This job has no `continue-on-error`, so a
	  # failure here fails the workflow run.
	  runs-on: ubuntu-22.04
	  needs: build
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        submodules: recursive
	    - name: Download build artifacts
	      uses: actions/download-artifact@v4
	      with:
	        name: build-artifacts
	    - name: Run baseline tests
	      run: |
	        # The artifact round-trip drops the executable bit; restore it.
	        chmod +x OpenVX-cts/build/bin/vx_test_conformance
	        cd OpenVX-cts/build
	        export LD_LIBRARY_PATH=${{ github.workspace }}/target/release
	        export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/
	        timeout 300 ./bin/vx_test_conformance --filter="GraphBase.*:Logging.*:SmokeTestBase.*:SmokeTest.*:TargetBase.*:Target.*"

	graph:
	  # Graph construction / execution suites. `continue-on-error: true`
	  # means failures are reported but do not fail the workflow run.
	  runs-on: ubuntu-22.04
	  needs: build
	  continue-on-error: true
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        submodules: recursive
	    - name: Download build artifacts
	      uses: actions/download-artifact@v4
	      with:
	        name: build-artifacts
	    - name: Run graph tests
	      run: |
	        # The artifact round-trip drops the executable bit; restore it.
	        chmod +x OpenVX-cts/build/bin/vx_test_conformance
	        cd OpenVX-cts/build
	        export LD_LIBRARY_PATH=${{ github.workspace }}/target/release
	        export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/
	        timeout 600 ./bin/vx_test_conformance --filter="Graph.*:GraphCallback.*:GraphDelay.*:GraphROI.*:UserNode.*"

	data-objects:
	  # Data-object suites (scalars, arrays, matrices, LUTs, ...).
	  # `continue-on-error: true` means failures are reported but do not
	  # fail the workflow run.
	  runs-on: ubuntu-22.04
	  needs: build
	  continue-on-error: true
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        submodules: recursive
	    - name: Download build artifacts
	      uses: actions/download-artifact@v4
	      with:
	        name: build-artifacts
	    - name: Run data object tests
	      run: |
	        # The artifact round-trip drops the executable bit; restore it.
	        chmod +x OpenVX-cts/build/bin/vx_test_conformance
	        cd OpenVX-cts/build
	        export LD_LIBRARY_PATH=${{ github.workspace }}/target/release
	        export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/
	        timeout 300 ./bin/vx_test_conformance --filter="Scalar.*:Array.*:ObjectArray.*:Matrix.*:Convolution.*:Distribution.*:LUT.*:Histogram.*"

	# User Data Object KHR extension — runs the upstream Khronos
	# `test_user_data_object.c` suite, gated at build time by
	# `OPENVX_USE_USER_DATA_OBJECT=ON` (set in the `Build OpenVX CTS`
	# step above). Covers all 7 functions from
	# `include/VX/vx_khr_user_data_object.h`:
	#
	# * vxCreateUserDataObject / vxCreateVirtualUserDataObject / vxReleaseUserDataObject
	# * vxQueryUserDataObject
	# * vxCopyUserDataObject
	# * vxMapUserDataObject / vxUnmapUserDataObject
	#
	# plus the user-kernel-with-UDO graph integration paths
	# (`UserKernel/`, `UserKernelObjectArray/`, `RemoveKernel`,
	# `OutDelay`). Split out of `data-objects` so the extension's status
	# is visible at a glance in the PR check rollup, matching the
	# treatment of `enhanced-vision` for the Enhanced Vision feature set.
	user-data-object:
	  # Runs only the `UserDataObject.*` suites. `continue-on-error: true`
	  # means failures are reported but do not fail the workflow run.
	  name: "KHR extension: user-data-object"
	  runs-on: ubuntu-22.04
	  needs: build
	  continue-on-error: true
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        submodules: recursive
	    - name: Download build artifacts
	      uses: actions/download-artifact@v4
	      with:
	        name: build-artifacts
	    - name: Run User Data Object KHR extension tests
	      run: |
	        # The artifact round-trip drops the executable bit; restore it.
	        chmod +x OpenVX-cts/build/bin/vx_test_conformance
	        cd OpenVX-cts/build
	        export LD_LIBRARY_PATH=${{ github.workspace }}/target/release
	        export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/
	        timeout 120 ./bin/vx_test_conformance --filter="UserDataObject.*"

	image-ops:
	  # Image object and patch copy/map suites. `continue-on-error: true`
	  # means failures are reported but do not fail the workflow run.
	  runs-on: ubuntu-22.04
	  needs: build
	  continue-on-error: true
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        submodules: recursive
	    - name: Download build artifacts
	      uses: actions/download-artifact@v4
	      with:
	        name: build-artifacts
	    - name: Run image operation tests
	      run: |
	        # The artifact round-trip drops the executable bit; restore it.
	        chmod +x OpenVX-cts/build/bin/vx_test_conformance
	        cd OpenVX-cts/build
	        export LD_LIBRARY_PATH=${{ github.workspace }}/target/release
	        export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/
	        timeout 600 ./bin/vx_test_conformance --filter="Image.*:vxCopyImagePatch.*:vxMapImagePatch.*:vxCreateImageFromChannel.*:vxCopyRemapPatch.*:vxMapRemapPatch.*"

	vision-color:
	  # Colour conversion, channel and depth-conversion suites.
	  # `continue-on-error: true` means failures are reported but do not
	  # fail the workflow run.
	  runs-on: ubuntu-22.04
	  needs: build
	  continue-on-error: true
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        submodules: recursive
	    - name: Download build artifacts
	      uses: actions/download-artifact@v4
	      with:
	        name: build-artifacts
	    - name: Run color and channel tests
	      run: |
	        # The artifact round-trip drops the executable bit; restore it.
	        chmod +x OpenVX-cts/build/bin/vx_test_conformance
	        cd OpenVX-cts/build
	        export LD_LIBRARY_PATH=${{ github.workspace }}/target/release
	        export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/
	        timeout 300 ./bin/vx_test_conformance --filter="ColorConvert.*:ChannelExtract.*:ChannelCombine.*:vxConvertDepth.*:vxuConvertDepth.*"

	vision-filters:
	  # Filter, morphology and gradient suites. `continue-on-error: true`
	  # means failures are reported but do not fail the workflow run.
	  runs-on: ubuntu-22.04
	  needs: build
	  continue-on-error: true
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        submodules: recursive
	    - name: Download build artifacts
	      uses: actions/download-artifact@v4
	      with:
	        name: build-artifacts
	    - name: Run filter and morphology tests
	      run: |
	        # The artifact round-trip drops the executable bit; restore it.
	        chmod +x OpenVX-cts/build/bin/vx_test_conformance
	        cd OpenVX-cts/build
	        export LD_LIBRARY_PATH=${{ github.workspace }}/target/release
	        export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/
	        timeout 600 ./bin/vx_test_conformance --filter="Box3x3.*:Gaussian3x3.*:Median3x3.*:Dilate3x3.*:Erode3x3.*:Sobel3x3.*:Magnitude.*:Phase.*:NonLinearFilter.*:Convolve.*:EqualizeHistogram.*"

	vision-arithmetic:
	  # Arithmetic, bitwise and threshold suites. `continue-on-error: true`
	  # means failures are reported but do not fail the workflow run.
	  runs-on: ubuntu-22.04
	  needs: build
	  continue-on-error: true
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        submodules: recursive
	    - name: Download build artifacts
	      uses: actions/download-artifact@v4
	      with:
	        name: build-artifacts
	    - name: Run arithmetic and bitwise tests
	      run: |
	        # The artifact round-trip drops the executable bit; restore it.
	        chmod +x OpenVX-cts/build/bin/vx_test_conformance
	        cd OpenVX-cts/build
	        export LD_LIBRARY_PATH=${{ github.workspace }}/target/release
	        export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/
	        timeout 600 ./bin/vx_test_conformance --filter="vxAddSub.*:vxuAddSub.*:vxMultiply.*:vxuMultiply.*:vxBinOp8u.*:vxuBinOp8u.*:vxBinOp16s.*:vxuBinOp16s.*:vxNot.*:vxuNot.*:WeightedAverage.*:Threshold.*"

	vision-geometric:
	  # Scaling, warping and remap suites. `continue-on-error: true`
	  # means failures are reported but do not fail the workflow run.
	  runs-on: ubuntu-22.04
	  needs: build
	  continue-on-error: true
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        submodules: recursive
	    - name: Download build artifacts
	      uses: actions/download-artifact@v4
	      with:
	        name: build-artifacts
	    - name: Run geometric transform tests
	      run: |
	        # The artifact round-trip drops the executable bit; restore it.
	        chmod +x OpenVX-cts/build/bin/vx_test_conformance
	        cd OpenVX-cts/build
	        export LD_LIBRARY_PATH=${{ github.workspace }}/target/release
	        export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/
	        timeout 600 ./bin/vx_test_conformance --filter="Scale.*:WarpAffine.*:WarpPerspective.*:Remap.*:HalfScaleGaussian.*"

	vision-features:
	  # Corner and edge detection suites. `continue-on-error: true`
	  # means failures are reported but do not fail the workflow run.
	  runs-on: ubuntu-22.04
	  needs: build
	  continue-on-error: true
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        submodules: recursive
	    - name: Download build artifacts
	      uses: actions/download-artifact@v4
	      with:
	        name: build-artifacts
	    - name: Run feature and edge detection tests
	      run: |
	        # The artifact round-trip drops the executable bit; restore it.
	        chmod +x OpenVX-cts/build/bin/vx_test_conformance
	        cd OpenVX-cts/build
	        export LD_LIBRARY_PATH=${{ github.workspace }}/target/release
	        export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/
	        timeout 600 ./bin/vx_test_conformance --filter="HarrisCorners.*:FastCorners.*:vxCanny.*:vxuCanny.*"

	vision-statistics:
	  # Statistics and integral-image suites. `continue-on-error: true`
	  # means failures are reported but do not fail the workflow run.
	  runs-on: ubuntu-22.04
	  needs: build
	  continue-on-error: true
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        submodules: recursive
	    - name: Download build artifacts
	      uses: actions/download-artifact@v4
	      with:
	        name: build-artifacts
	    - name: Run statistics and analysis tests
	      run: |
	        # The artifact round-trip drops the executable bit; restore it.
	        chmod +x OpenVX-cts/build/bin/vx_test_conformance
	        cd OpenVX-cts/build
	        export LD_LIBRARY_PATH=${{ github.workspace }}/target/release
	        export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/
	        timeout 300 ./bin/vx_test_conformance --filter="MeanStdDev.*:MinMaxLoc.*:Integral.*"

	vision-pyramid:
	  # Pyramid and optical-flow suites. `continue-on-error: true`
	  # means failures are reported but do not fail the workflow run.
	  runs-on: ubuntu-22.04
	  needs: build
	  continue-on-error: true
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        submodules: recursive
	    - name: Download build artifacts
	      uses: actions/download-artifact@v4
	      with:
	        name: build-artifacts
	    - name: Run pyramid and optical flow tests
	      run: |
	        # The artifact round-trip drops the executable bit; restore it.
	        chmod +x OpenVX-cts/build/bin/vx_test_conformance
	        cd OpenVX-cts/build
	        export LD_LIBRARY_PATH=${{ github.workspace }}/target/release
	        export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/
	        timeout 300 ./bin/vx_test_conformance --filter="GaussianPyramid.*:LaplacianPyramid.*:LaplacianReconstruct.*:OptFlowPyrLK.*"

	# Enhanced Vision — all kernels rustVX has actually implemented from
	# the OpenVX 1.3 Enhanced Vision feature set. The CTS binary is built
	# with `OPENVX_USE_ENHANCED_VISION=ON`, and this job filters strictly
	# to the implemented kernels. (Phase 1 shipped only vxMin / vxMax; the
	# kernels listed below were added in later phases.) Any Enhanced
	# Vision symbols that are still unimplemented are exposed as link
	# stubs in rustVX so the binary can build; they are not exercised here
	# and will be replaced by real kernels in subsequent phases.
	#
	# Currently covers: Copy, NonMaxSuppression, HoughLinesP,
	# MatchTemplate, LBP, plus the baseline Min/Max.
	#
	# 106 tests pass across the five newer kernels (36 Copy + 33 NMS +
	# 15 HoughLinesP + 11 MatchTemplate + 11 LBP); the Min/Max suites are
	# run by the same filter.
	# NOTE(review): the itemised counts already sum to 106 without
	# Min/Max — confirm whether the "106/106" headline includes them.
	enhanced-vision:
	  # Runs the Enhanced Vision suites selected by the filter below. This
	  # job has no `continue-on-error`, so a failure here fails the
	  # workflow run.
	  name: "enhanced-vision (106/106 passing)"
	  runs-on: ubuntu-22.04
	  needs: build
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        submodules: recursive
	    - name: Download build artifacts
	      uses: actions/download-artifact@v4
	      with:
	        name: build-artifacts
	    - name: Run Enhanced Vision tests
	      run: |
	        # The artifact round-trip drops the executable bit; restore it.
	        chmod +x OpenVX-cts/build/bin/vx_test_conformance
	        cd OpenVX-cts/build
	        export LD_LIBRARY_PATH=${{ github.workspace }}/target/release
	        export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/
	        timeout 300 ./bin/vx_test_conformance --filter="Copy.*:Nonmaxsuppression.*:Houghlinesp.*:MatchTemplate.*:LBP.*:Min.*:Max.*"

	# Performance benchmark using openvx-mark, comparing rustVX against the
	# Khronos OpenVX sample implementation on the SAME runner so the two
	# numbers come from identical hardware. This job does NOT rebuild either
	# implementation — it just downloads the archives produced by the
	# `build` and `build-khronos-sample` phases above, builds the openvx-mark
	# tool against each, runs the same workload, and compares the JSON
	# reports. The CTS jobs above use `continue-on-error: true`, so this
	# job effectively gates on `build`, `build-khronos-sample`, and
	# `baseline` succeeding (matching the existing CTS gate).
	benchmark:
	  # Downloads the rustVX and Khronos sample archives onto one runner,
	  # builds openvx-mark against each, runs the same workload twice and
	  # publishes a comparison to the job summary and as artifacts.
	  # `continue-on-error: true` keeps a benchmark failure from failing
	  # the workflow run.
	  name: Benchmark & compare (rustVX vs Khronos sample)
	  runs-on: ubuntu-22.04
	  needs:
	    - build
	    - build-khronos-sample
	    - baseline
	    - graph
	    - data-objects
	    - image-ops
	    - vision-color
	    - vision-filters
	    - vision-arithmetic
	    - vision-geometric
	    - vision-features
	    - vision-statistics
	    - vision-pyramid
	  continue-on-error: true
	  steps:
	    - name: Install system dependencies
	      run: |
	        sudo apt-get update
	        sudo apt-get install -y build-essential cmake git python3

	    - name: Download rustVX archive
	      uses: actions/download-artifact@v4
	      with:
	        name: build-artifacts
	        path: ${{ github.workspace }}/rustvx-pkg

	    - name: Download Khronos sample archive
	      uses: actions/download-artifact@v4
	      with:
	        name: khronos-sample-artifacts
	        path: ${{ github.workspace }}/khronos-pkg

	    - name: Expose rustVX as libopenvx / libvxu
	      id: rustvx
	      # openvx-mark uses `find_library(NAMES openvx)` and
	      # `find_library(NAMES vxu)`. rustVX ships a single
	      # `libopenvx_ffi.so` that exports the full set of `vx`/`vxu`
	      # symbols, so symlink the two classic Khronos library names to
	      # it without changing rustVX's own build output.
	      run: |
	        set -euo pipefail
	        LIB_DIR=${{ github.workspace }}/rustvx-pkg/target/release
	        chmod -R u+rwX "$LIB_DIR"
	        cd "$LIB_DIR"
	        ln -sf libopenvx_ffi.so libopenvx.so
	        ln -sf libopenvx_ffi.so libvxu.so
	        ls -la libopenvx.so libvxu.so
	        echo "lib_dir=$LIB_DIR" >> "$GITHUB_OUTPUT"
	        echo "include_dir=${{ github.workspace }}/rustvx-pkg/include" >> "$GITHUB_OUTPUT"

	    - name: Inspect Khronos sample archive
	      id: khronos
	      run: |
	        set -euo pipefail
	        LIB_DIR=${{ github.workspace }}/khronos-pkg/lib
	        INCLUDE_DIR=${{ github.workspace }}/khronos-pkg/include
	        ls -la "$LIB_DIR"
	        echo "lib_dir=$LIB_DIR" >> "$GITHUB_OUTPUT"
	        echo "include_dir=$INCLUDE_DIR" >> "$GITHUB_OUTPUT"

	    - name: Clone openvx-mark
	      run: |
	        git clone --depth 1 https://github.com/kiritigowda/openvx-mark.git \
	          ${{ github.workspace }}/openvx-mark

	    # ---------------------------------------------------------------------
	    # rustVX benchmark
	    # ---------------------------------------------------------------------
	    - name: Build openvx-mark against rustVX
	      run: |
	        mkdir -p ${{ github.workspace }}/openvx-mark/build-rustvx
	        cd ${{ github.workspace }}/openvx-mark/build-rustvx
	        cmake \
	          -DCMAKE_BUILD_TYPE=Release \
	          -DOPENVX_INCLUDES=${{ steps.rustvx.outputs.include_dir }} \
	          -DOPENVX_LIB_DIR=${{ steps.rustvx.outputs.lib_dir }} \
	          ..
	        cmake --build . -j$(nproc)

	    - name: Run benchmark (rustVX)
	      run: |
	        cd ${{ github.workspace }}/openvx-mark/build-rustvx
	        export LD_LIBRARY_PATH=${{ steps.rustvx.outputs.lib_dir }}:$LD_LIBRARY_PATH
	        ./openvx-mark --resolution FHD --iterations 20 --warmup 5

	    # ---------------------------------------------------------------------
	    # Khronos sample benchmark
	    # ---------------------------------------------------------------------
	    - name: Build openvx-mark against Khronos sample
	      run: |
	        mkdir -p ${{ github.workspace }}/openvx-mark/build-khronos
	        cd ${{ github.workspace }}/openvx-mark/build-khronos
	        cmake \
	          -DCMAKE_BUILD_TYPE=Release \
	          -DOPENVX_INCLUDES=${{ steps.khronos.outputs.include_dir }} \
	          -DOPENVX_LIB_DIR=${{ steps.khronos.outputs.lib_dir }} \
	          ..
	        cmake --build . -j$(nproc)

	    - name: Run benchmark (Khronos sample)
	      run: |
	        cd ${{ github.workspace }}/openvx-mark/build-khronos
	        export LD_LIBRARY_PATH=${{ steps.khronos.outputs.lib_dir }}:$LD_LIBRARY_PATH
	        ./openvx-mark --resolution FHD --iterations 20 --warmup 5

	    # ---------------------------------------------------------------------
	    # Compare results
	    # ---------------------------------------------------------------------
	    - name: Compare benchmark results (rustVX vs Khronos)
	      run: |
	        RUSTVX=${{ github.workspace }}/openvx-mark/build-rustvx/benchmark_results/benchmark_results.json
	        KHRONOS=${{ github.workspace }}/openvx-mark/build-khronos/benchmark_results/benchmark_results.json

	        if [ ! -f "$RUSTVX" ] || [ ! -f "$KHRONOS" ]; then
	          echo "Skipping comparison — one or both benchmark results missing"
	          ls -la "$(dirname $RUSTVX)" 2>/dev/null || true
	          ls -la "$(dirname $KHRONOS)" 2>/dev/null || true
	          exit 0
	        fi

	        # `compare_reports.py` defines Speedup as
	        #   speedup = throughput(report_b) / throughput(report_a)
	        # i.e. ">1.00 means report_b is faster". To make the Speedup
	        # column read as "rustVX over Khronos" (>1.00x = rustVX wins),
	        # pass Khronos first (baseline / report_a) and rustVX second
	        # (candidate / report_b).
	        python3 ${{ github.workspace }}/openvx-mark/scripts/compare_reports.py \
	          "$KHRONOS" "$RUSTVX" \
	          --output ${{ github.workspace }}/openvx-mark/comparison

	    - name: Post comparison to job summary
	      if: always()
	      run: |
	        COMPARISON=${{ github.workspace }}/openvx-mark/comparison.md
	        RUSTVX=${{ github.workspace }}/openvx-mark/build-rustvx/benchmark_results/benchmark_results.json
	        KHRONOS=${{ github.workspace }}/openvx-mark/build-khronos/benchmark_results/benchmark_results.json

	        # ----- Headline: aggregate speedup of rustVX over Khronos sample -----
	        # The heredoc body and its `PY` terminator sit at the script's
	        # base indentation: the terminator must start in column 0
	        # after YAML strips the block scalar's common indent.
	        if [ -f "$RUSTVX" ] && [ -f "$KHRONOS" ]; then
	          python3 - "$RUSTVX" "$KHRONOS" >> "$GITHUB_STEP_SUMMARY" <<'PY'
	        import json, math, sys

	        rustvx_path, khronos_path = sys.argv[1], sys.argv[2]
	        with open(rustvx_path) as f: rustvx = json.load(f)
	        with open(khronos_path) as f: khronos = json.load(f)

	        def by_key(report):
	            return {(r['name'], r['mode'], r['resolution']): r
	                    for r in report.get('results', [])}

	        a = by_key(rustvx)
	        b = by_key(khronos)
	        shared = sorted(set(a) & set(b))

	        speedups = []
	        wins, losses = 0, 0
	        best = (None, 0.0)
	        worst = (None, math.inf)

	        for key in shared:
	            ra, rb = a[key], b[key]
	            if not (ra.get('verified', True) and rb.get('verified', True)):
	                continue
	            mps_r = ra.get('megapixels_per_sec', 0)
	            mps_k = rb.get('megapixels_per_sec', 0)
	            if mps_r <= 0 or mps_k <= 0:
	                continue
	            s = mps_r / mps_k  # >1.0 = rustVX faster than Khronos
	            speedups.append(s)
	            if s > 1.0: wins += 1
	            elif s < 1.0: losses += 1
	            if s > best[1]: best = (key, s)
	            if s < worst[1]: worst = (key, s)

	        print('# rustVX vs Khronos sample — headline')
	        print()
	        if not speedups:
	            print('_No verified benchmarks were directly comparable._')
	        else:
	            geomean = math.exp(sum(math.log(s) for s in speedups) / len(speedups))
	            median = sorted(speedups)[len(speedups) // 2]
	            print('| Metric | Value |')
	            print('|:---|---:|')
	            print(f'| Geomean speedup (rustVX / Khronos) | {geomean:.2f}x |')
	            print(f'| Median speedup (rustVX / Khronos) | {median:.2f}x |')
	            print(f'| Benchmarks compared | {len(speedups)} |')
	            print(f'| rustVX faster | {wins} |')
	            print(f'| Khronos sample faster | {losses} |')
	            if best[0]:
	                bk, bv = best
	                print(f'| Best rustVX speedup | {bv:.2f}x ({bk[0]} / {bk[1]} / {bk[2]}) |')
	            if worst[0] and worst[1] != math.inf:
	                wk, wv = worst
	                print(f'| Worst rustVX speedup | {wv:.2f}x ({wk[0]} / {wk[1]} / {wk[2]}) |')
	            print()
	            if geomean >= 1.0:
	                print(f'> rustVX is {geomean:.2f}x faster than the Khronos sample on average (geomean across {len(speedups)} verified benchmarks).')
	            else:
	                print(f'> rustVX is {1.0/geomean:.2f}x slower than the Khronos sample on average (geomean across {len(speedups)} verified benchmarks).')
	        print()
	        PY
	        fi

	        # ----- Detailed comparison table from compare_reports.py -----
	        if [ -f "$COMPARISON" ]; then
	          cat "$COMPARISON" >> "$GITHUB_STEP_SUMMARY"
	        else
	          echo "_No comparison report was produced._" >> "$GITHUB_STEP_SUMMARY"
	        fi

	    - name: Upload rustVX benchmark results
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: benchmark-results-rustvx
	        path: ${{ github.workspace }}/openvx-mark/build-rustvx/benchmark_results/
	        if-no-files-found: ignore

	    - name: Upload Khronos sample benchmark results
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: benchmark-results-khronos-sample
	        path: ${{ github.workspace }}/openvx-mark/build-khronos/benchmark_results/
	        if-no-files-found: ignore

	    - name: Upload comparison report
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: benchmark-comparison
	        path: ${{ github.workspace }}/openvx-mark/comparison.*
	        if-no-files-found: ignore

	# ---------------------------------------------------------------------------
	# Perf gate (PR rustVX vs main rustVX)
	#
	# This job is self-contained and runs in parallel with the existing
	# `benchmark` job (which still does the rustVX-vs-Khronos comparison
	# report). It pulls down the two rustVX `libopenvx_ffi.so` archives
	# produced in Phase 1 (`build` for the PR, `build-main` for the merge
	# target), builds openvx-mark twice (once per library), runs both
	# benches back-to-back on this single runner VM, and compares the
	# results with `.github/scripts/perf_gate.py`. Same-VM bench is the
	# whole point — hardware variance between separate runs would swamp
	# any real regression.
	#
	# Both rustVX binaries are now built with explicit AVX2 features +
	# `-C target-cpu=x86-64-v3` in Phase 1 (no per-VM auto-detection),
	# so any noise that remains is genuine same-VM jitter rather than
	# divergent compile-time configuration. This lets us run the
	# per-kernel floor much tighter than the previous 25% threshold.
	#
	# Threshold rationale (see `.github/scripts/perf_gate.py` for full
	# docstring and per-flag semantics):
	#
	# * --geomean-floor 0.97 -> aggregate move > 3% slower fails;
	# the real signal for actual perf
	# bugs that touch multiple kernels.
	# * --kernel-floor 0.90 -> a SINGLE-kernel hard fail requires
	# > 10% regression. With explicit-
	# AVX2 binaries the same-VM noise
	# floor sits well below this, so
	# anything tripping the gate is a
	# real regression worth investigating.
	# * --warn-floor 0.95 -> soft-warn band [0.90, 0.95) — kernels
	# in the 5-10% slower range get an
	# advisory annotation but don't block
	# merge.
	# * --max-cv 5.0 -> auto-skip kernels above this within-
	# run CV%.
	#
	# Trigger:
	# * pull_request only — push events to main do not gate against
	# themselves (there's no merge target to diff against).
	# ---------------------------------------------------------------------------
	perf-gate:
	name: Perf gate (PR vs main)
	if: github.event_name == 'pull_request'
	runs-on: ubuntu-22.04
	needs:
	- build
	- build-main
	steps:
	- name: Checkout repo (for the perf_gate script)
	uses: actions/checkout@v4
	with:
	fetch-depth: 1

	- name: Install system dependencies
	run: \|
	sudo apt-get update
	sudo apt-get install -y build-essential cmake git python3

	- name: Download PR rustVX archive
	uses: actions/download-artifact@v4
	with:
	name: build-artifacts
	path: ${{ github.workspace }}/rustvx-pr-pkg

	- name: Download main rustVX archive
	uses: actions/download-artifact@v4
	with:
	name: build-artifacts-main
	path: ${{ github.workspace }}/rustvx-main-pkg

	- name: Stage PR rustVX (libopenvx / libvxu symlinks for openvx-mark)
	id: pr_rustvx
	run: \|
	set -euo pipefail
	LIB_DIR=${{ github.workspace }}/rustvx-pr-pkg/target/release
	chmod -R u+rwX "$LIB_DIR"
	cd "$LIB_DIR"
	ln -sf libopenvx_ffi.so libopenvx.so
	ln -sf libopenvx_ffi.so libvxu.so
	ls -la libopenvx.so libvxu.so
	echo "lib_dir=$LIB_DIR" >> "$GITHUB_OUTPUT"
	echo "include_dir=${{ github.workspace }}/rustvx-pr-pkg/include" >> "$GITHUB_OUTPUT"

	# Expose the main build's libopenvx_ffi.so under the libopenvx.so and
	# libvxu.so names, then publish the library and header directories as the
	# step outputs `lib_dir` and `include_dir`.
	- name: Stage main rustVX (libopenvx / libvxu symlinks for openvx-mark)
	  id: main_rustvx
	  run: |
	    set -euo pipefail
	    PKG_DIR="${{ github.workspace }}/rustvx-main-pkg"
	    LIB_DIR="$PKG_DIR/target/release"
	    chmod -R u+rwX "$LIB_DIR"
	    cd "$LIB_DIR"
	    # `ln -sf` creates a dangling link without complaint and `ls -la` on a
	    # dangling link still exits 0, so check the real library up front.
	    if [ ! -f libopenvx_ffi.so ]; then
	      echo "::error::libopenvx_ffi.so not found in $LIB_DIR"
	      exit 1
	    fi
	    ln -sf libopenvx_ffi.so libopenvx.so
	    ln -sf libopenvx_ffi.so libvxu.so
	    ls -la libopenvx.so libvxu.so
	    echo "lib_dir=$LIB_DIR" >> "$GITHUB_OUTPUT"
	    echo "include_dir=$PKG_DIR/include" >> "$GITHUB_OUTPUT"

	# Shallow-clone the benchmark harness into openvx-mark/ under the
	# workspace; the build-pr and build-main trees are both created inside it.
	# NOTE(review): no ref is pinned, so this follows whatever the upstream
	# default branch points at when the job runs — consider pinning a commit.
	- name: Clone openvx-mark
	  run: |
	    git clone --depth 1 https://github.com/kiritigowda/openvx-mark.git \
	      "${{ github.workspace }}/openvx-mark"

	# Build openvx-mark once per library. The two CMake configs differ
	# only in the include / lib paths, so we keep them in separate
	# build trees to avoid any re-link confusion.
	#
	# This step configures and builds the PR-side tree (build-pr) using the
	# `include_dir` / `lib_dir` outputs of the `pr_rustvx` step.
	- name: Build openvx-mark against rustVX-PR
	  run: |
	    mkdir -p "${{ github.workspace }}/openvx-mark/build-pr"
	    cd "${{ github.workspace }}/openvx-mark/build-pr"
	    cmake \
	      -DCMAKE_BUILD_TYPE=Release \
	      -DOPENVX_INCLUDES="${{ steps.pr_rustvx.outputs.include_dir }}" \
	      -DOPENVX_LIB_DIR="${{ steps.pr_rustvx.outputs.lib_dir }}" \
	      ..
	    cmake --build . -j"$(nproc)"

	# Configure and build the main-side tree (build-main) using the
	# `include_dir` / `lib_dir` outputs of the `main_rustvx` step.
	- name: Build openvx-mark against rustVX-main
	  run: |
	    mkdir -p "${{ github.workspace }}/openvx-mark/build-main"
	    cd "${{ github.workspace }}/openvx-mark/build-main"
	    cmake \
	      -DCMAKE_BUILD_TYPE=Release \
	      -DOPENVX_INCLUDES="${{ steps.main_rustvx.outputs.include_dir }}" \
	      -DOPENVX_LIB_DIR="${{ steps.main_rustvx.outputs.lib_dir }}" \
	      ..
	    cmake --build . -j"$(nproc)"

	# Per-library "warmup + measure" cycles, back-to-back for each lib.
	#
	# The previous design did all warmups first (PR warmup, main
	# warmup) and then both real measurements (PR measure, main
	# measure). That sequence is asymmetric: by the time the PR's
	# real measurement runs, the main warmup has just evicted the
	# PR-side instruction/data caches; by the time main's real
	# measurement runs, the PR measurement has been thrashing for
	# ~30s and the system is "warm" overall. The result was tight
	# kernels like Box3x3 / Gaussian3x3 (~1.8 ms) consistently
	# showing 25-30% slower numbers on the PR side on no-op PRs
	# — a methodology artefact, not a real regression.
	#
	# New design: run each lib's throwaway warmup IMMEDIATELY
	# before its measurement, in the same step. Both
	# warmup-then-measure cycles run back-to-back with no other
	# lib's bench process in between, so each measurement sees a
	# comparable warm-VM state.
	- name: Bench rustVX-PR (warmup + measure)
	  run: |
	    # Same strict mode as the staging and gate steps. `-u` is safe here:
	    # LD_LIBRARY_PATH is assigned outright and the script expands no
	    # shell variable that could be unset.
	    set -euo pipefail
	    cd "${{ github.workspace }}/openvx-mark/build-pr"
	    export LD_LIBRARY_PATH="${{ steps.pr_rustvx.outputs.lib_dir }}"
	    # Throwaway: prime instruction/data caches and any
	    # is_x86_feature_detected! one-time-dispatch overhead.
	    # Best-effort on purpose: output is discarded and a failure of this
	    # run must not abort the step.
	    ./openvx-mark --resolution FHD --iterations 5 --warmup 0 \
	      --output /tmp/warmup-pr-throwaway >/dev/null 2>&1 || true
	    # Real measurement.
	    ./openvx-mark --resolution FHD --iterations 20 --warmup 5

	# Main-side counterpart of the PR bench step: a throwaway warmup run
	# immediately followed by the real measurement, both from build-main
	# with LD_LIBRARY_PATH pointing at the staged main library directory.
	- name: Bench rustVX-main (warmup + measure)
	  run: |
	    # Same strict mode as the staging and gate steps. `-u` is safe here:
	    # LD_LIBRARY_PATH is assigned outright and the script expands no
	    # shell variable that could be unset.
	    set -euo pipefail
	    cd "${{ github.workspace }}/openvx-mark/build-main"
	    export LD_LIBRARY_PATH="${{ steps.main_rustvx.outputs.lib_dir }}"
	    # Throwaway warmup; best-effort on purpose, so its output is
	    # discarded and a failure here must not abort the step.
	    ./openvx-mark --resolution FHD --iterations 5 --warmup 0 \
	      --output /tmp/warmup-main-throwaway >/dev/null 2>&1 || true
	    # Real measurement.
	    ./openvx-mark --resolution FHD --iterations 20 --warmup 5

	# Feed both benchmark_results.json files (main first, PR second) to
	# perf_gate.py with the configured floors, writing its summary to the
	# step summary file. Fails early with an error annotation if either
	# JSON is missing.
	- name: Run perf gate
	  run: |
	    set -euo pipefail
	    PR="${{ github.workspace }}/openvx-mark/build-pr/benchmark_results/benchmark_results.json"
	    MAIN="${{ github.workspace }}/openvx-mark/build-main/benchmark_results/benchmark_results.json"
	    if [ ! -f "$PR" ] || [ ! -f "$MAIN" ]; then
	      echo "::error::Missing benchmark JSONs (PR=$PR, MAIN=$MAIN)."
	      # Diagnostic listing only; the directories may not exist either.
	      ls -la "$(dirname "$PR")" "$(dirname "$MAIN")" 2>/dev/null || true
	      exit 1
	    fi

	    python3 "${{ github.workspace }}/.github/scripts/perf_gate.py" \
	      "$MAIN" "$PR" \
	      --geomean-floor 0.97 \
	      --kernel-floor 0.90 \
	      --warn-floor 0.95 \
	      --max-cv 5.0 \
	      --summary-out "$GITHUB_STEP_SUMMARY"

	# Publish the PR-side benchmark_results/ directory as an artifact.
	# `always()` keeps this step running after an earlier failure, and a
	# missing directory is ignored rather than reported.
	- name: Upload PR rustVX benchmark results (perf-gate)
	  uses: actions/upload-artifact@v4
	  if: always()
	  with:
	    path: ${{ github.workspace }}/openvx-mark/build-pr/benchmark_results/
	    name: perf-gate-results-pr
	    if-no-files-found: ignore

	# Publish the main-side benchmark_results/ directory as an artifact.
	# `always()` keeps this step running after an earlier failure, and a
	# missing directory is ignored rather than reported.
	- name: Upload main rustVX benchmark results (perf-gate)
	  uses: actions/upload-artifact@v4
	  if: always()
	  with:
	    path: ${{ github.workspace }}/openvx-mark/build-main/benchmark_results/
	    name: perf-gate-results-main
	    if-no-files-found: ignore

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

feat: Enhanced Vision kernel implementations — Copy, NMS, HoughLinesP… #144

Workflow file

feat: Enhanced Vision kernel implementations — Copy, NMS, HoughLinesP… #144

Uh oh!

Workflow file for this run