perf(filters): optimize IntegralImage, ChannelCombine, CustomConvolution #122
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: OpenVX Conformance Tests | |
| on: | |
| push: | |
| branches: [master, main, develop] | |
| paths-ignore: | |
| - '**/*.md' | |
| - 'docs/**' | |
| - 'LICENSE' | |
| - '.gitignore' | |
| - '.gitattributes' | |
| - '.editorconfig' | |
| - '**/*.svg' | |
| - '**/*.png' | |
| - '**/*.jpg' | |
| - '**/*.jpeg' | |
| - '**/*.gif' | |
| - '**/*.webp' | |
| pull_request: | |
| branches: [master, main, develop] | |
| paths-ignore: | |
| - '**/*.md' | |
| - 'docs/**' | |
| - 'LICENSE' | |
| - '.gitignore' | |
| - '.gitattributes' | |
| - '.editorconfig' | |
| - '**/*.svg' | |
| - '**/*.png' | |
| - '**/*.jpg' | |
| - '**/*.jpeg' | |
| - '**/*.gif' | |
| - '**/*.webp' | |
| env: | |
| CARGO_TERM_COLOR: always | |
| RUST_BACKTRACE: 1 | |
| jobs: | |
| build: | |
| runs-on: ubuntu-22.04 | |
| steps: | |
| - uses: actions/checkout@v4 | |
| with: | |
| fetch-depth: 0 | |
| submodules: recursive | |
| - name: Install system dependencies | |
| run: | | |
| sudo apt-get update | |
| sudo apt-get install -y build-essential cmake | |
| - name: Install Rust | |
| # A `run:` step with no explicit `shell:` executes under `bash -e` | |
| # without `pipefail`, so a failed rustup download would otherwise be | |
| # masked by the exit status of the `sh` it is piped into. Enable | |
| # strict mode so a broken download fails this step immediately. | |
| run: | | |
| set -euo pipefail | |
| curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable | |
| source "$HOME/.cargo/env" | |
| rustc --version | |
| cargo --version | |
| - name: Build rustVX | |
| # Explicit AVX2-targeted build for AMD Zen 2+ / Intel Haswell+. | |
| # | |
| # We deliberately do NOT auto-detect CPU features from | |
| # /proc/cpuinfo any more — the previous auto-detection design | |
| # produced subtly mismatched binaries between the parallel | |
| # `build` (PR) and `build-main` jobs whenever the two jobs | |
| # happened to land on different VM pools, which silently | |
| # poisoned the perf-gate's PR-vs-main comparison (see | |
| # `Magnitude` regression on a no-op PR in CI run 25615947512). | |
| # | |
| # Hardcoding the feature set guarantees both rustVX binaries | |
| # the perf gate compares are produced with identical: | |
| # * Cargo `--features` flags (sse2 + avx2 on both crates) | |
| # * RUSTFLAGS (`-C target-cpu=x86-64-v3`) | |
| # | |
| # The `x86-64-v3` microarch level (SSE4.2 + AVX + AVX2 + | |
| # BMI1+2 + FMA + F16C) is supported by every Linux runner in | |
| # GitHub's Azure pool today (AMD EPYC Milan/Genoa, Intel | |
| # Cascade Lake / Ice Lake) and is the right baseline for AMD | |
| # Zen-tuned performance work — it lets rustc auto-vectorise | |
| # the rest of the workspace using AVX2 / FMA / BMI2 on top of | |
| # the hand-tuned `#[target_feature(enable = "avx2")]` | |
| # intrinsic kernels gated by the `avx2` Cargo feature. | |
| # | |
| # Falls back to NEON-only on aarch64 and scalar on anything | |
| # else — those paths exist for completeness and aren't the | |
| # gated targets. | |
| run: | | |
| set -euo pipefail | |
| source $HOME/.cargo/env | |
| case "$(uname -m)" in | |
| x86_64|amd64) | |
| FEATURES="openvx-core/sse2 openvx-core/avx2 openvx-vision/sse2 openvx-vision/avx2" | |
| # Pin code-layout alignment so the perf-gate's PR-vs-main | |
| # bench comparison is invariant to link-order shifts caused | |
| # by upstream additive code. Without these flags, a purely | |
| # additive PR that grows `.text` by O(10 KB) shifts every | |
| # downstream hot kernel forward by the same offset and | |
| # lands it (or its hot inner loop) at less-favourable | |
| # cache-line alignment — producing reproducible 20%+ | |
| # "regressions" on tight ~5 ms kernels whose compiled | |
| # machine code is bit-identical between PR and main | |
| # (verified by matching Rust mangling hash + objdump | |
| # comparison). | |
| # | |
| # * `-align-all-functions=6` — pad each function entry | |
| # to 2^6 = 64 bytes (cache-line aligned). | |
| # * `-align-all-nofallthru-blocks=4` — pad basic blocks | |
| # reached only via branches (loop headers and join | |
| # points) to 2^4 = 16 bytes. Catches inner-loop | |
| # alignment penalties that function-entry padding | |
| # alone misses. | |
| # | |
| # Cost: ~100–200 KB of NOP-pad bloat in libopenvx_ffi.so. | |
| # Both bench binaries pay it equally so the gate is fair. | |
| export RUSTFLAGS="-C target-cpu=x86-64-v3 -C llvm-args=-align-all-functions=6 -C llvm-args=-align-all-nofallthru-blocks=4" | |
| ;; | |
| aarch64|arm64) | |
| FEATURES="openvx-core/neon openvx-vision/neon" | |
| export RUSTFLAGS="" | |
| ;; | |
| *) | |
| FEATURES="" | |
| export RUSTFLAGS="" | |
| ;; | |
| esac | |
| echo "Architecture: $(uname -m)" | |
| echo "Cargo features: ${FEATURES:-<none>}" | |
| echo "RUSTFLAGS : ${RUSTFLAGS:-<none>}" | |
| if [ -n "$FEATURES" ]; then | |
| cargo build --release -p openvx-ffi --features "$FEATURES" | |
| else | |
| cargo build --release -p openvx-ffi | |
| fi | |
| - name: Build OpenVX CTS | |
| run: | | |
| cd OpenVX-cts | |
| mkdir -p include | |
| if [ -d "../include" ]; then | |
| cp -r ../include/* include/ 2>/dev/null || true | |
| fi | |
| mkdir -p build | |
| cd build | |
| cmake .. \ | |
| -DCMAKE_BUILD_TYPE=Release \ | |
| -DCMAKE_C_STANDARD_LIBRARIES="-lm" \ | |
| -DCMAKE_CXX_STANDARD_LIBRARIES="-lm" \ | |
| -DOPENVX_INCLUDES="${{ github.workspace }}/include;${{ github.workspace }}/OpenVX-cts/include" \ | |
| -DOPENVX_LIBRARIES="${{ github.workspace }}/target/release/libopenvx_ffi.so;m" \ | |
| -DOPENVX_CONFORMANCE_VISION=ON \ | |
| -DOPENVX_USE_ENHANCED_VISION=ON \ | |
| -DOPENVX_USE_USER_DATA_OBJECT=ON | |
| make -j$(nproc) | |
| - name: Upload build artifacts | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: build-artifacts | |
| # `include/` is bundled so the downstream benchmark job can build | |
| # openvx-mark against rustVX without needing to check out the | |
| # rustVX source tree. | |
| path: | | |
| target/release/libopenvx_ffi.so | |
| OpenVX-cts/build/bin/vx_test_conformance | |
| OpenVX-cts/test_data/ | |
| include/ | |
| retention-days: 1 | |
| # Build rustVX from the merge-target ref (i.e. main, in practice) in | |
| # its own phase, in parallel with the PR's `build` job. The downstream | |
| # `perf-gate` job pulls *both* archives down onto a single runner so | |
| # the PR-vs-main bench comparison runs on identical hardware against | |
| # binaries that were each built with their own branch's source tree. | |
| # | |
| # Both this job and `build` run on the same `ubuntu-22.04` runner pool | |
| # with the same hardcoded Cargo features and RUSTFLAGS (no per-VM CPU | |
| # auto-detection), so the resulting libopenvx_ffi.so binaries have | |
| # matching compile-time SIMD feature sets regardless of which VM each | |
| # job happens to land on. See the `Build rustVX` step in the `build` | |
| # job for the full rationale. | |
| # | |
| # Runs on pull_request events only; skipped on push events (no merge | |
| # target to diff against). | |
| build-main: | |
| name: Build rustVX (main) | |
| if: github.event_name == 'pull_request' | |
| runs-on: ubuntu-22.04 | |
| steps: | |
| - name: Checkout merge target ref | |
| uses: actions/checkout@v4 | |
| with: | |
| ref: ${{ github.base_ref }} | |
| fetch-depth: 0 | |
| submodules: recursive | |
| - name: Install system dependencies | |
| run: | | |
| sudo apt-get update | |
| sudo apt-get install -y build-essential cmake | |
| - name: Install Rust | |
| # A `run:` step with no explicit `shell:` executes under `bash -e` | |
| # without `pipefail`, so a failed rustup download would otherwise be | |
| # masked by the exit status of the `sh` it is piped into. Enable | |
| # strict mode so a broken download fails this step immediately. | |
| run: | | |
| set -euo pipefail | |
| curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable | |
| source "$HOME/.cargo/env" | |
| rustc --version | |
| cargo --version | |
| - name: Build rustVX (main) | |
| # Identical hardcoded build configuration as the `build` job — | |
| # see that step for the full rationale. The whole point of | |
| # making both builds explicit (rather than per-VM auto- | |
| # detected) is that the `perf-gate` job downstream is | |
| # guaranteed to compare two binaries with matching | |
| # compile-time SIMD features and RUSTFLAGS. | |
| run: | | |
| set -euo pipefail | |
| source $HOME/.cargo/env | |
| case "$(uname -m)" in | |
| x86_64|amd64) | |
| FEATURES="openvx-core/sse2 openvx-core/avx2 openvx-vision/sse2 openvx-vision/avx2" | |
| # Pin code-layout alignment so the perf-gate's PR-vs-main | |
| # bench comparison is invariant to link-order shifts caused | |
| # by upstream additive code. Without these flags, a purely | |
| # additive PR that grows `.text` by O(10 KB) shifts every | |
| # downstream hot kernel forward by the same offset and | |
| # lands it (or its hot inner loop) at less-favourable | |
| # cache-line alignment — producing reproducible 20%+ | |
| # "regressions" on tight ~5 ms kernels whose compiled | |
| # machine code is bit-identical between PR and main | |
| # (verified by matching Rust mangling hash + objdump | |
| # comparison). | |
| # | |
| # * `-align-all-functions=6` — pad each function entry | |
| # to 2^6 = 64 bytes (cache-line aligned). | |
| # * `-align-all-nofallthru-blocks=4` — pad basic blocks | |
| # reached only via branches (loop headers and join | |
| # points) to 2^4 = 16 bytes. Catches inner-loop | |
| # alignment penalties that function-entry padding | |
| # alone misses. | |
| # | |
| # Cost: ~100–200 KB of NOP-pad bloat in libopenvx_ffi.so. | |
| # Both bench binaries pay it equally so the gate is fair. | |
| export RUSTFLAGS="-C target-cpu=x86-64-v3 -C llvm-args=-align-all-functions=6 -C llvm-args=-align-all-nofallthru-blocks=4" | |
| ;; | |
| aarch64|arm64) | |
| FEATURES="openvx-core/neon openvx-vision/neon" | |
| export RUSTFLAGS="" | |
| ;; | |
| *) | |
| FEATURES="" | |
| export RUSTFLAGS="" | |
| ;; | |
| esac | |
| echo "Architecture: $(uname -m)" | |
| echo "Cargo features: ${FEATURES:-<none>}" | |
| echo "RUSTFLAGS : ${RUSTFLAGS:-<none>}" | |
| if [ -n "$FEATURES" ]; then | |
| cargo build --release -p openvx-ffi --features "$FEATURES" | |
| else | |
| cargo build --release -p openvx-ffi | |
| fi | |
| - name: Upload main build artifacts | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: build-artifacts-main | |
| # No CTS payload here — only the perf-gate job consumes this | |
| # artifact, and it only needs the libopenvx_ffi.so + the | |
| # standard headers (for openvx-mark to compile against). | |
| path: | | |
| target/release/libopenvx_ffi.so | |
| include/ | |
| retention-days: 1 | |
| # Build the Khronos OpenVX sample implementation in its own phase, in | |
| # parallel with the rustVX `build` job, and upload the resulting library | |
| # + headers as a self-contained archive. The benchmark job below pulls | |
| # both archives down onto a single runner so rustVX and the Khronos | |
| # sample are exercised on identical hardware. | |
| build-khronos-sample: | |
| name: Build Khronos OpenVX sample | |
| runs-on: ubuntu-22.04 | |
| steps: | |
| - name: Install system dependencies | |
| run: | | |
| sudo apt-get update | |
| sudo apt-get install -y build-essential cmake git python3 | |
| - name: Build Khronos OpenVX sample | |
| run: | | |
| git clone --recursive --depth 1 \ | |
| https://github.com/KhronosGroup/OpenVX-sample-impl.git khronos-sample | |
| cd khronos-sample | |
| python3 Build.py --os=Linux --arch=64 --conf=Release | |
| - name: Stage Khronos sample archive | |
| # Collect the built Khronos libraries and the API headers into | |
| # `khronos-stage/` so the next step can upload them as one artifact. | |
| run: | | |
| set -euo pipefail | |
| # `-print -quit` stops at the first match, so no `| head -1` | |
| # pipeline is needed (whose SIGPIPE could trip `pipefail`). | |
| LIB_PATH=$(find khronos-sample -name "libopenvx.so" -not -path "*/build/*" -print -quit) | |
| if [ -z "$LIB_PATH" ]; then | |
| echo "error: libopenvx.so not found under khronos-sample (outside */build/*)" >&2 | |
| exit 1 | |
| fi | |
| LIB_SRC=$(dirname "$LIB_PATH") | |
| echo "Khronos libraries discovered in: $LIB_SRC" | |
| mkdir -p khronos-stage/lib | |
| cp "$LIB_SRC"/libopenvx*.so "$LIB_SRC"/libvxu*.so khronos-stage/lib/ | |
| cp -r khronos-sample/api-docs/include khronos-stage/include | |
| ls -R khronos-stage | |
| - name: Upload Khronos sample artifacts | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: khronos-sample-artifacts | |
| path: khronos-stage/ | |
| retention-days: 1 | |
| baseline: | |
| runs-on: ubuntu-22.04 | |
| needs: build | |
| steps: | |
| - uses: actions/checkout@v4 | |
| with: | |
| submodules: recursive | |
| - name: Download build artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: build-artifacts | |
| - name: Run baseline tests | |
| run: | | |
| chmod +x OpenVX-cts/build/bin/vx_test_conformance | |
| cd OpenVX-cts/build | |
| export LD_LIBRARY_PATH=${{ github.workspace }}/target/release | |
| export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/ | |
| timeout 300 ./bin/vx_test_conformance --filter="GraphBase.*:Logging.*:SmokeTestBase.*:SmokeTest.*:TargetBase.*:Target.*" | |
| graph: | |
| runs-on: ubuntu-22.04 | |
| needs: build | |
| continue-on-error: true | |
| steps: | |
| - uses: actions/checkout@v4 | |
| with: | |
| submodules: recursive | |
| - name: Download build artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: build-artifacts | |
| - name: Run graph tests | |
| run: | | |
| chmod +x OpenVX-cts/build/bin/vx_test_conformance | |
| cd OpenVX-cts/build | |
| export LD_LIBRARY_PATH=${{ github.workspace }}/target/release | |
| export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/ | |
| timeout 600 ./bin/vx_test_conformance --filter="Graph.*:GraphCallback.*:GraphDelay.*:GraphROI.*:UserNode.*" | |
| data-objects: | |
| runs-on: ubuntu-22.04 | |
| needs: build | |
| continue-on-error: true | |
| steps: | |
| - uses: actions/checkout@v4 | |
| with: | |
| submodules: recursive | |
| - name: Download build artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: build-artifacts | |
| - name: Run data object tests | |
| run: | | |
| chmod +x OpenVX-cts/build/bin/vx_test_conformance | |
| cd OpenVX-cts/build | |
| export LD_LIBRARY_PATH=${{ github.workspace }}/target/release | |
| export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/ | |
| timeout 300 ./bin/vx_test_conformance --filter="Scalar.*:Array.*:ObjectArray.*:Matrix.*:Convolution.*:Distribution.*:LUT.*:Histogram.*" | |
| # User Data Object KHR extension — runs the upstream Khronos | |
| # `test_user_data_object.c` suite, gated at build time by | |
| # `OPENVX_USE_USER_DATA_OBJECT=ON` (set in the `Build OpenVX CTS` | |
| # step above). Covers all 7 functions from | |
| # `include/VX/vx_khr_user_data_object.h`: | |
| # | |
| # * vxCreateUserDataObject / vxCreateVirtualUserDataObject / vxReleaseUserDataObject | |
| # * vxQueryUserDataObject | |
| # * vxCopyUserDataObject | |
| # * vxMapUserDataObject / vxUnmapUserDataObject | |
| # | |
| # plus the user-kernel-with-UDO graph integration paths | |
| # (`UserKernel/*`, `UserKernelObjectArray/*`, `RemoveKernel`, | |
| # `OutDelay`). Split out of `data-objects` so the extension's status | |
| # is visible at a glance in the PR check rollup, matching the | |
| # treatment of `enhanced-vision` for the Enhanced Vision feature set. | |
| user-data-object: | |
| name: "KHR extension: user-data-object" | |
| runs-on: ubuntu-22.04 | |
| needs: build | |
| continue-on-error: true | |
| steps: | |
| - uses: actions/checkout@v4 | |
| with: | |
| submodules: recursive | |
| - name: Download build artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: build-artifacts | |
| - name: Run User Data Object KHR extension tests | |
| run: | | |
| chmod +x OpenVX-cts/build/bin/vx_test_conformance | |
| cd OpenVX-cts/build | |
| export LD_LIBRARY_PATH=${{ github.workspace }}/target/release | |
| export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/ | |
| timeout 120 ./bin/vx_test_conformance --filter="UserDataObject.*" | |
| image-ops: | |
| runs-on: ubuntu-22.04 | |
| needs: build | |
| continue-on-error: true | |
| steps: | |
| - uses: actions/checkout@v4 | |
| with: | |
| submodules: recursive | |
| - name: Download build artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: build-artifacts | |
| - name: Run image operation tests | |
| run: | | |
| chmod +x OpenVX-cts/build/bin/vx_test_conformance | |
| cd OpenVX-cts/build | |
| export LD_LIBRARY_PATH=${{ github.workspace }}/target/release | |
| export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/ | |
| timeout 600 ./bin/vx_test_conformance --filter="Image.*:vxCopyImagePatch.*:vxMapImagePatch.*:vxCreateImageFromChannel.*:vxCopyRemapPatch.*:vxMapRemapPatch.*" | |
| vision-color: | |
| runs-on: ubuntu-22.04 | |
| needs: build | |
| continue-on-error: true | |
| steps: | |
| - uses: actions/checkout@v4 | |
| with: | |
| submodules: recursive | |
| - name: Download build artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: build-artifacts | |
| - name: Run color and channel tests | |
| run: | | |
| chmod +x OpenVX-cts/build/bin/vx_test_conformance | |
| cd OpenVX-cts/build | |
| export LD_LIBRARY_PATH=${{ github.workspace }}/target/release | |
| export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/ | |
| timeout 300 ./bin/vx_test_conformance --filter="ColorConvert.*:ChannelExtract.*:ChannelCombine.*:vxConvertDepth.*:vxuConvertDepth.*" | |
| vision-filters: | |
| runs-on: ubuntu-22.04 | |
| needs: build | |
| continue-on-error: true | |
| steps: | |
| - uses: actions/checkout@v4 | |
| with: | |
| submodules: recursive | |
| - name: Download build artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: build-artifacts | |
| - name: Run filter and morphology tests | |
| run: | | |
| chmod +x OpenVX-cts/build/bin/vx_test_conformance | |
| cd OpenVX-cts/build | |
| export LD_LIBRARY_PATH=${{ github.workspace }}/target/release | |
| export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/ | |
| timeout 600 ./bin/vx_test_conformance --filter="Box3x3.*:Gaussian3x3.*:Median3x3.*:Dilate3x3.*:Erode3x3.*:Sobel3x3.*:Magnitude.*:Phase.*:NonLinearFilter.*:Convolve.*:EqualizeHistogram.*" | |
| vision-arithmetic: | |
| runs-on: ubuntu-22.04 | |
| needs: build | |
| continue-on-error: true | |
| steps: | |
| - uses: actions/checkout@v4 | |
| with: | |
| submodules: recursive | |
| - name: Download build artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: build-artifacts | |
| - name: Run arithmetic and bitwise tests | |
| run: | | |
| chmod +x OpenVX-cts/build/bin/vx_test_conformance | |
| cd OpenVX-cts/build | |
| export LD_LIBRARY_PATH=${{ github.workspace }}/target/release | |
| export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/ | |
| timeout 600 ./bin/vx_test_conformance --filter="vxAddSub.*:vxuAddSub.*:vxMultiply.*:vxuMultiply.*:vxBinOp8u.*:vxuBinOp8u.*:vxBinOp16s.*:vxuBinOp16s.*:vxNot.*:vxuNot.*:WeightedAverage.*:Threshold.*" | |
| vision-geometric: | |
| runs-on: ubuntu-22.04 | |
| needs: build | |
| continue-on-error: true | |
| steps: | |
| - uses: actions/checkout@v4 | |
| with: | |
| submodules: recursive | |
| - name: Download build artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: build-artifacts | |
| - name: Run geometric transform tests | |
| run: | | |
| chmod +x OpenVX-cts/build/bin/vx_test_conformance | |
| cd OpenVX-cts/build | |
| export LD_LIBRARY_PATH=${{ github.workspace }}/target/release | |
| export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/ | |
| timeout 600 ./bin/vx_test_conformance --filter="Scale.*:WarpAffine.*:WarpPerspective.*:Remap.*:HalfScaleGaussian.*" | |
| vision-features: | |
| runs-on: ubuntu-22.04 | |
| needs: build | |
| continue-on-error: true | |
| steps: | |
| - uses: actions/checkout@v4 | |
| with: | |
| submodules: recursive | |
| - name: Download build artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: build-artifacts | |
| - name: Run feature and edge detection tests | |
| run: | | |
| chmod +x OpenVX-cts/build/bin/vx_test_conformance | |
| cd OpenVX-cts/build | |
| export LD_LIBRARY_PATH=${{ github.workspace }}/target/release | |
| export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/ | |
| timeout 600 ./bin/vx_test_conformance --filter="HarrisCorners.*:FastCorners.*:vxCanny.*:vxuCanny.*" | |
| vision-statistics: | |
| runs-on: ubuntu-22.04 | |
| needs: build | |
| continue-on-error: true | |
| steps: | |
| - uses: actions/checkout@v4 | |
| with: | |
| submodules: recursive | |
| - name: Download build artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: build-artifacts | |
| - name: Run statistics and analysis tests | |
| run: | | |
| chmod +x OpenVX-cts/build/bin/vx_test_conformance | |
| cd OpenVX-cts/build | |
| export LD_LIBRARY_PATH=${{ github.workspace }}/target/release | |
| export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/ | |
| timeout 300 ./bin/vx_test_conformance --filter="MeanStdDev.*:MinMaxLoc.*:Integral.*" | |
| vision-pyramid: | |
| runs-on: ubuntu-22.04 | |
| needs: build | |
| continue-on-error: true | |
| steps: | |
| - uses: actions/checkout@v4 | |
| with: | |
| submodules: recursive | |
| - name: Download build artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: build-artifacts | |
| - name: Run pyramid and optical flow tests | |
| run: | | |
| chmod +x OpenVX-cts/build/bin/vx_test_conformance | |
| cd OpenVX-cts/build | |
| export LD_LIBRARY_PATH=${{ github.workspace }}/target/release | |
| export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/ | |
| timeout 300 ./bin/vx_test_conformance --filter="GaussianPyramid.*:LaplacianPyramid.*:LaplacianReconstruct.*:OptFlowPyrLK.*" | |
| # Enhanced Vision Phase 1 — only the kernels rustVX has actually | |
| # implemented from the OpenVX 1.3 Enhanced Vision feature set. The CTS | |
| # binary is built with `OPENVX_USE_ENHANCED_VISION=ON`, but this job | |
| # filters strictly to the kernels Phase 1 ships (vxMin / vxMax). The | |
| # remaining Enhanced Vision symbols are exposed as link stubs in | |
| # rustVX so the binary can build; they are not exercised here and will | |
| # be replaced by real kernels in subsequent phases. | |
| enhanced-vision: | |
| name: "enhanced-vision (Phase 1 — Min/Max)" | |
| runs-on: ubuntu-22.04 | |
| needs: build | |
| steps: | |
| - uses: actions/checkout@v4 | |
| with: | |
| submodules: recursive | |
| - name: Download build artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: build-artifacts | |
| - name: Run Enhanced Vision Phase 1 tests | |
| run: | | |
| chmod +x OpenVX-cts/build/bin/vx_test_conformance | |
| cd OpenVX-cts/build | |
| export LD_LIBRARY_PATH=${{ github.workspace }}/target/release | |
| export VX_TEST_DATA_PATH=${{ github.workspace }}/OpenVX-cts/test_data/ | |
| timeout 120 ./bin/vx_test_conformance --filter="Min.*:Max.*" | |
| # Performance benchmark using openvx-mark, comparing rustVX against the | |
| # Khronos OpenVX sample implementation on the SAME runner so the two | |
| # numbers come from identical hardware. This job does NOT rebuild either | |
| # implementation — it just downloads the archives produced by the | |
| # `build` and `build-khronos-sample` phases above, builds the openvx-mark | |
| # tool against each, runs the same workload, and compares the JSON | |
| # reports. The CTS jobs above use `continue-on-error: true`, so this | |
| # job effectively gates on `build`, `build-khronos-sample`, and | |
| # `baseline` succeeding (matching the existing CTS gate). | |
| benchmark: | |
| name: Benchmark & compare (rustVX vs Khronos sample) | |
| runs-on: ubuntu-22.04 | |
| needs: | |
| - build | |
| - build-khronos-sample | |
| - baseline | |
| - graph | |
| - data-objects | |
| - image-ops | |
| - vision-color | |
| - vision-filters | |
| - vision-arithmetic | |
| - vision-geometric | |
| - vision-features | |
| - vision-statistics | |
| - vision-pyramid | |
| continue-on-error: true | |
| steps: | |
| - name: Install system dependencies | |
| run: | | |
| sudo apt-get update | |
| sudo apt-get install -y build-essential cmake git python3 | |
| - name: Download rustVX archive | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: build-artifacts | |
| path: ${{ github.workspace }}/rustvx-pkg | |
| - name: Download Khronos sample archive | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: khronos-sample-artifacts | |
| path: ${{ github.workspace }}/khronos-pkg | |
| - name: Expose rustVX as libopenvx / libvxu | |
| id: rustvx | |
| # openvx-mark uses `find_library(NAMES openvx)` and | |
| # `find_library(NAMES vxu)`. rustVX ships a single | |
| # `libopenvx_ffi.so` that exports the full set of `vx*`/`vxu*` | |
| # symbols, so symlink the two classic Khronos library names to | |
| # it without changing rustVX's own build output. | |
| run: | | |
| set -euo pipefail | |
| LIB_DIR=${{ github.workspace }}/rustvx-pkg/target/release | |
| chmod -R u+rwX "$LIB_DIR" | |
| cd "$LIB_DIR" | |
| ln -sf libopenvx_ffi.so libopenvx.so | |
| ln -sf libopenvx_ffi.so libvxu.so | |
| ls -la libopenvx*.so libvxu*.so | |
| echo "lib_dir=$LIB_DIR" >> "$GITHUB_OUTPUT" | |
| echo "include_dir=${{ github.workspace }}/rustvx-pkg/include" >> "$GITHUB_OUTPUT" | |
| - name: Inspect Khronos sample archive | |
| id: khronos | |
| run: | | |
| set -euo pipefail | |
| LIB_DIR=${{ github.workspace }}/khronos-pkg/lib | |
| INCLUDE_DIR=${{ github.workspace }}/khronos-pkg/include | |
| ls -la "$LIB_DIR" | |
| echo "lib_dir=$LIB_DIR" >> "$GITHUB_OUTPUT" | |
| echo "include_dir=$INCLUDE_DIR" >> "$GITHUB_OUTPUT" | |
| - name: Clone openvx-mark | |
| run: | | |
| git clone --depth 1 https://github.com/kiritigowda/openvx-mark.git \ | |
| ${{ github.workspace }}/openvx-mark | |
| # --------------------------------------------------------------------- | |
| # rustVX benchmark | |
| # --------------------------------------------------------------------- | |
| - name: Build openvx-mark against rustVX | |
| run: | | |
| mkdir -p ${{ github.workspace }}/openvx-mark/build-rustvx | |
| cd ${{ github.workspace }}/openvx-mark/build-rustvx | |
| cmake \ | |
| -DCMAKE_BUILD_TYPE=Release \ | |
| -DOPENVX_INCLUDES=${{ steps.rustvx.outputs.include_dir }} \ | |
| -DOPENVX_LIB_DIR=${{ steps.rustvx.outputs.lib_dir }} \ | |
| .. | |
| cmake --build . -j$(nproc) | |
| - name: Run benchmark (rustVX) | |
| run: | | |
| cd "${{ github.workspace }}/openvx-mark/build-rustvx" | |
| # Append the existing LD_LIBRARY_PATH only when it is non-empty: a | |
| # trailing ':' leaves an empty entry, which the dynamic loader | |
| # treats as the current working directory. | |
| export LD_LIBRARY_PATH="${{ steps.rustvx.outputs.lib_dir }}${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" | |
| ./openvx-mark --resolution FHD --iterations 20 --warmup 5 | |
| # --------------------------------------------------------------------- | |
| # Khronos sample benchmark | |
| # --------------------------------------------------------------------- | |
| - name: Build openvx-mark against Khronos sample | |
| run: | | |
| mkdir -p ${{ github.workspace }}/openvx-mark/build-khronos | |
| cd ${{ github.workspace }}/openvx-mark/build-khronos | |
| cmake \ | |
| -DCMAKE_BUILD_TYPE=Release \ | |
| -DOPENVX_INCLUDES=${{ steps.khronos.outputs.include_dir }} \ | |
| -DOPENVX_LIB_DIR=${{ steps.khronos.outputs.lib_dir }} \ | |
| .. | |
| cmake --build . -j$(nproc) | |
| - name: Run benchmark (Khronos sample) | |
| run: | | |
| cd "${{ github.workspace }}/openvx-mark/build-khronos" | |
| # Append the existing LD_LIBRARY_PATH only when it is non-empty: a | |
| # trailing ':' leaves an empty entry, which the dynamic loader | |
| # treats as the current working directory. | |
| export LD_LIBRARY_PATH="${{ steps.khronos.outputs.lib_dir }}${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" | |
| ./openvx-mark --resolution FHD --iterations 20 --warmup 5 | |
| # --------------------------------------------------------------------- | |
| # Compare results | |
| # --------------------------------------------------------------------- | |
| - name: Compare benchmark results (rustVX vs Khronos) | |
| # Runs openvx-mark's `compare_reports.py` on the two JSON reports. | |
| # Exits successfully without comparing when either report is missing. | |
| run: | | |
| RUSTVX="${{ github.workspace }}/openvx-mark/build-rustvx/benchmark_results/benchmark_results.json" | |
| KHRONOS="${{ github.workspace }}/openvx-mark/build-khronos/benchmark_results/benchmark_results.json" | |
| if [ ! -f "$RUSTVX" ] || [ ! -f "$KHRONOS" ]; then | |
| echo "Skipping comparison — one or both benchmark results missing" | |
| # Best-effort directory listing for debugging; the directories | |
| # may not exist, so errors are deliberately ignored. | |
| ls -la "$(dirname "$RUSTVX")" 2>/dev/null || true | |
| ls -la "$(dirname "$KHRONOS")" 2>/dev/null || true | |
| exit 0 | |
| fi | |
| # `compare_reports.py` defines Speedup as | |
| # speedup = throughput(report_b) / throughput(report_a) | |
| # i.e. ">1.00 means report_b is faster". To make the Speedup | |
| # column read as "rustVX over Khronos" (>1.00x = rustVX wins), | |
| # pass Khronos first (baseline / report_a) and rustVX second | |
| # (candidate / report_b). | |
| python3 "${{ github.workspace }}/openvx-mark/scripts/compare_reports.py" \ | |
| "$KHRONOS" "$RUSTVX" \ | |
| --output "${{ github.workspace }}/openvx-mark/comparison" | |
| - name: Post comparison to job summary | |
| if: always() | |
| run: | | |
| COMPARISON=${{ github.workspace }}/openvx-mark/comparison.md | |
| RUSTVX=${{ github.workspace }}/openvx-mark/build-rustvx/benchmark_results/benchmark_results.json | |
| KHRONOS=${{ github.workspace }}/openvx-mark/build-khronos/benchmark_results/benchmark_results.json | |
| # ----- Headline: aggregate speedup of rustVX over Khronos sample ----- | |
| if [ -f "$RUSTVX" ] && [ -f "$KHRONOS" ]; then | |
| python3 - "$RUSTVX" "$KHRONOS" >> "$GITHUB_STEP_SUMMARY" <<'PY' | |
| import json, math, sys | |
| rustvx_path, khronos_path = sys.argv[1], sys.argv[2] | |
| with open(rustvx_path) as f: rustvx = json.load(f) | |
| with open(khronos_path) as f: khronos = json.load(f) | |
| def by_key(report): | |
| return {(r['name'], r['mode'], r['resolution']): r | |
| for r in report.get('results', [])} | |
| a = by_key(rustvx) | |
| b = by_key(khronos) | |
| shared = sorted(set(a) & set(b)) | |
| speedups = [] | |
| wins, losses = 0, 0 | |
| best = (None, 0.0) | |
| worst = (None, math.inf) | |
| for key in shared: | |
| ra, rb = a[key], b[key] | |
| if not (ra.get('verified', True) and rb.get('verified', True)): | |
| continue | |
| mps_r = ra.get('megapixels_per_sec', 0) | |
| mps_k = rb.get('megapixels_per_sec', 0) | |
| if mps_r <= 0 or mps_k <= 0: | |
| continue | |
| s = mps_r / mps_k # >1.0 = rustVX faster than Khronos | |
| speedups.append(s) | |
| if s > 1.0: wins += 1 | |
| elif s < 1.0: losses += 1 | |
| if s > best[1]: best = (key, s) | |
| if s < worst[1]: worst = (key, s) | |
| print('# rustVX vs Khronos sample — headline') | |
| print() | |
| if not speedups: | |
| print('_No verified benchmarks were directly comparable._') | |
| else: | |
| geomean = math.exp(sum(math.log(s) for s in speedups) / len(speedups)) | |
| median = sorted(speedups)[len(speedups) // 2] | |
| print('| Metric | Value |') | |
| print('|:---|---:|') | |
| print(f'| Geomean speedup (rustVX / Khronos) | **{geomean:.2f}x** |') | |
| print(f'| Median speedup (rustVX / Khronos) | {median:.2f}x |') | |
| print(f'| Benchmarks compared | {len(speedups)} |') | |
| print(f'| rustVX faster | {wins} |') | |
| print(f'| Khronos sample faster | {losses} |') | |
| if best[0]: | |
| bk, bv = best | |
| print(f'| Best rustVX speedup | {bv:.2f}x ({bk[0]} / {bk[1]} / {bk[2]}) |') | |
| if worst[0] and worst[1] != math.inf: | |
| wk, wv = worst | |
| print(f'| Worst rustVX speedup | {wv:.2f}x ({wk[0]} / {wk[1]} / {wk[2]}) |') | |
| print() | |
| if geomean >= 1.0: | |
| print(f'> rustVX is **{geomean:.2f}x** faster than the Khronos sample on average (geomean across {len(speedups)} verified benchmarks).') | |
| else: | |
| print(f'> rustVX is **{1.0/geomean:.2f}x slower** than the Khronos sample on average (geomean across {len(speedups)} verified benchmarks).') | |
| print() | |
| PY | |
| fi | |
| # ----- Detailed comparison table from compare_reports.py ----- | |
| if [ -f "$COMPARISON" ]; then | |
| cat "$COMPARISON" >> "$GITHUB_STEP_SUMMARY" | |
| else | |
| echo "_No comparison report was produced._" >> "$GITHUB_STEP_SUMMARY" | |
| fi | |
| - name: Upload rustVX benchmark results | |
| if: always() | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: benchmark-results-rustvx | |
| path: ${{ github.workspace }}/openvx-mark/build-rustvx/benchmark_results/ | |
| if-no-files-found: ignore | |
# Publish the Khronos-sample benchmark output directory as a workflow
# artifact. Runs regardless of earlier step failures; an absent directory is
# silently ignored.
- name: Upload Khronos sample benchmark results
  uses: actions/upload-artifact@v4
  if: ${{ always() }}
  with:
    if-no-files-found: ignore
    name: benchmark-results-khronos-sample
    path: ${{ github.workspace }}/openvx-mark/build-khronos/benchmark_results/
# Publish every `comparison.*` file from the openvx-mark checkout (the glob
# picks up whichever report formats were written). Runs regardless of earlier
# step failures; no matching file is not an error.
- name: Upload comparison report
  uses: actions/upload-artifact@v4
  if: ${{ always() }}
  with:
    if-no-files-found: ignore
    name: benchmark-comparison
    path: ${{ github.workspace }}/openvx-mark/comparison.*
# ---------------------------------------------------------------------------
# Perf gate (PR rustVX vs main rustVX)
#
# This job is self-contained and runs in parallel with the existing
# `benchmark` job (which still does the rustVX-vs-Khronos comparison
# report). It pulls down the two rustVX `libopenvx_ffi.so` archives
# produced in Phase 1 (`build` for the PR, `build-main` for the merge
# target), builds openvx-mark twice (once per library), runs both
# benches back-to-back on this single runner VM, and compares the
# results with `.github/scripts/perf_gate.py`. Same-VM bench is the
# whole point — hardware variance between separate runs would swamp
# any real regression.
#
# Both rustVX binaries are now built with explicit AVX2 features +
# `-C target-cpu=x86-64-v3` in Phase 1 (no per-VM auto-detection),
# so any noise that remains is genuine same-VM jitter rather than
# divergent compile-time configuration. This lets us run the
# per-kernel floor much tighter than the previous 25% threshold.
#
# Threshold rationale (see `.github/scripts/perf_gate.py` for full
# docstring and per-flag semantics):
#
#   * --geomean-floor 0.97  -> aggregate move > 3% slower fails;
#                              the real signal for actual perf
#                              bugs that touch multiple kernels.
#   * --kernel-floor  0.90  -> a SINGLE-kernel hard fail requires
#                              > 10% regression. With explicit-
#                              AVX2 binaries the same-VM noise
#                              floor sits well below this, so
#                              anything tripping the gate is a
#                              real regression worth investigating.
#   * --warn-floor    0.95  -> soft-warn band [0.90, 0.95) — kernels
#                              in the 5-10% slower range get an
#                              advisory annotation but don't block
#                              merge.
#   * --max-cv        5.0   -> auto-skip kernels above this within-
#                              run CV%.
#
# Trigger:
#   * pull_request only — push events to main do not gate against
#     themselves (there's no merge target to diff against).
# ---------------------------------------------------------------------------
perf-gate:
  name: Perf gate (PR vs main)
  if: github.event_name == 'pull_request'
  runs-on: ubuntu-22.04
  needs:
    - build
    - build-main
  steps:
    - name: Checkout repo (for the perf_gate script)
      uses: actions/checkout@v4
      with:
        fetch-depth: 1

    - name: Install system dependencies
      run: |
        sudo apt-get update
        sudo apt-get install -y build-essential cmake git python3

    - name: Download PR rustVX archive
      uses: actions/download-artifact@v4
      with:
        name: build-artifacts
        path: ${{ github.workspace }}/rustvx-pr-pkg

    - name: Download main rustVX archive
      uses: actions/download-artifact@v4
      with:
        name: build-artifacts-main
        path: ${{ github.workspace }}/rustvx-main-pkg

    # Expose the PR build of libopenvx_ffi.so under the library names
    # openvx-mark links against, and publish the lib / include directories
    # as step outputs for the build and bench steps below.
    - name: Stage PR rustVX (libopenvx / libvxu symlinks for openvx-mark)
      id: pr_rustvx
      run: |
        set -euo pipefail
        LIB_DIR="${{ github.workspace }}/rustvx-pr-pkg/target/release"
        chmod -R u+rwX "$LIB_DIR"
        cd "$LIB_DIR"
        # `ln -sf` succeeds even when its target is missing, and `ls -l`
        # lists a dangling symlink without error, so a broken artifact
        # would otherwise only surface later as a confusing link failure.
        if [[ ! -f libopenvx_ffi.so ]]; then
          echo "::error::libopenvx_ffi.so not found in $LIB_DIR (PR build artifact)."
          ls -la . || true
          exit 1
        fi
        ln -sf libopenvx_ffi.so libopenvx.so
        ln -sf libopenvx_ffi.so libvxu.so
        ls -la libopenvx*.so libvxu*.so
        echo "lib_dir=$LIB_DIR" >> "$GITHUB_OUTPUT"
        echo "include_dir=${{ github.workspace }}/rustvx-pr-pkg/include" >> "$GITHUB_OUTPUT"

    # Same staging as above, for the merge-target (main) build.
    - name: Stage main rustVX (libopenvx / libvxu symlinks for openvx-mark)
      id: main_rustvx
      run: |
        set -euo pipefail
        LIB_DIR="${{ github.workspace }}/rustvx-main-pkg/target/release"
        chmod -R u+rwX "$LIB_DIR"
        cd "$LIB_DIR"
        # See the PR staging step: fail here rather than at link time when
        # the artifact does not contain the shared library.
        if [[ ! -f libopenvx_ffi.so ]]; then
          echo "::error::libopenvx_ffi.so not found in $LIB_DIR (main build artifact)."
          ls -la . || true
          exit 1
        fi
        ln -sf libopenvx_ffi.so libopenvx.so
        ln -sf libopenvx_ffi.so libvxu.so
        ls -la libopenvx*.so libvxu*.so
        echo "lib_dir=$LIB_DIR" >> "$GITHUB_OUTPUT"
        echo "include_dir=${{ github.workspace }}/rustvx-main-pkg/include" >> "$GITHUB_OUTPUT"

    # Shallow clone with no branch/tag/commit pin: this tracks whatever the
    # default branch of openvx-mark points at when the job runs.
    - name: Clone openvx-mark
      run: |
        git clone --depth 1 https://github.com/kiritigowda/openvx-mark.git \
          "${{ github.workspace }}/openvx-mark"

    # Build openvx-mark once per library. The two CMake configs differ
    # only in the include / lib paths, so we keep them in separate
    # build trees to avoid any re-link confusion.
    - name: Build openvx-mark against rustVX-PR
      run: |
        mkdir -p "${{ github.workspace }}/openvx-mark/build-pr"
        cd "${{ github.workspace }}/openvx-mark/build-pr"
        cmake \
          -DCMAKE_BUILD_TYPE=Release \
          -DOPENVX_INCLUDES="${{ steps.pr_rustvx.outputs.include_dir }}" \
          -DOPENVX_LIB_DIR="${{ steps.pr_rustvx.outputs.lib_dir }}" \
          ..
        cmake --build . -j"$(nproc)"

    - name: Build openvx-mark against rustVX-main
      run: |
        mkdir -p "${{ github.workspace }}/openvx-mark/build-main"
        cd "${{ github.workspace }}/openvx-mark/build-main"
        cmake \
          -DCMAKE_BUILD_TYPE=Release \
          -DOPENVX_INCLUDES="${{ steps.main_rustvx.outputs.include_dir }}" \
          -DOPENVX_LIB_DIR="${{ steps.main_rustvx.outputs.lib_dir }}" \
          ..
        cmake --build . -j"$(nproc)"

    # Per-library "warmup + measure" cycles, back-to-back for each lib.
    #
    # The previous design did all warmups first (PR warmup, main
    # warmup) and then both real measurements (PR measure, main
    # measure). That sequence is asymmetric: by the time the PR's
    # real measurement runs, the main warmup has just evicted the
    # PR-side instruction/data caches; by the time main's real
    # measurement runs, the PR measurement has been thrashing for
    # ~30s and the system is "warm" overall. The result was tight
    # kernels like Box3x3 / Gaussian3x3 (~1.8 ms) consistently
    # showing 25-30% slower numbers on the PR side on no-op PRs
    # — a methodology artefact, not a real regression.
    #
    # New design: run each lib's throwaway warmup IMMEDIATELY
    # before its measurement, in the same step. Both
    # warmup-then-measure cycles run back-to-back with no other
    # lib's bench process in between, so each measurement sees a
    # comparable warm-VM state.
    - name: Bench rustVX-PR (warmup + measure)
      run: |
        # `-u` is safe here: the script reads no shell variables. The
        # `${{ ... }}` expressions are substituted by the runner before
        # bash starts, and LD_LIBRARY_PATH is assigned outright (nothing
        # is appended to a possibly-unset previous value).
        set -euo pipefail
        cd "${{ github.workspace }}/openvx-mark/build-pr"
        export LD_LIBRARY_PATH="${{ steps.pr_rustvx.outputs.lib_dir }}"
        # Throwaway: prime instruction/data caches and any
        # is_x86_feature_detected! one-time-dispatch overhead.
        # Deliberately best-effort (output discarded, failure ignored):
        # a real problem will fail the measurement run right below.
        ./openvx-mark --resolution FHD --iterations 5 --warmup 0 \
          --output /tmp/warmup-pr-throwaway >/dev/null 2>&1 || true
        # Real measurement.
        ./openvx-mark --resolution FHD --iterations 20 --warmup 5

    - name: Bench rustVX-main (warmup + measure)
      run: |
        # Mirrors the PR bench step exactly, against the main library.
        set -euo pipefail
        cd "${{ github.workspace }}/openvx-mark/build-main"
        export LD_LIBRARY_PATH="${{ steps.main_rustvx.outputs.lib_dir }}"
        # Throwaway warmup; best-effort by design (see the PR bench step).
        ./openvx-mark --resolution FHD --iterations 5 --warmup 0 \
          --output /tmp/warmup-main-throwaway >/dev/null 2>&1 || true
        # Real measurement.
        ./openvx-mark --resolution FHD --iterations 20 --warmup 5

    # Compare the two result files. Argument order is baseline (main)
    # first, candidate (PR) second; the gate's report is appended to the
    # job summary via --summary-out.
    - name: Run perf gate
      run: |
        set -euo pipefail
        PR="${{ github.workspace }}/openvx-mark/build-pr/benchmark_results/benchmark_results.json"
        MAIN="${{ github.workspace }}/openvx-mark/build-main/benchmark_results/benchmark_results.json"
        if [[ ! -f "$PR" || ! -f "$MAIN" ]]; then
          echo "::error::Missing benchmark JSONs (PR=$PR, MAIN=$MAIN)."
          # Diagnostic listing only; the directories may not exist either.
          ls -la "$(dirname "$PR")" "$(dirname "$MAIN")" 2>/dev/null || true
          exit 1
        fi
        python3 "${{ github.workspace }}/.github/scripts/perf_gate.py" \
          "$MAIN" "$PR" \
          --geomean-floor 0.97 \
          --kernel-floor 0.90 \
          --warn-floor 0.95 \
          --max-cv 5.0 \
          --summary-out "$GITHUB_STEP_SUMMARY"

    # Result uploads run even when the gate (or a bench) failed, so the raw
    # numbers are always available for inspection.
    - name: Upload PR rustVX benchmark results (perf-gate)
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: perf-gate-results-pr
        path: ${{ github.workspace }}/openvx-mark/build-pr/benchmark_results/
        if-no-files-found: ignore

    - name: Upload main rustVX benchmark results (perf-gate)
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: perf-gate-results-main
        path: ${{ github.workspace }}/openvx-mark/build-main/benchmark_results/
        if-no-files-found: ignore