kiritigowda
diff --git a/.github/workflows/conformance.yml b/.github/workflows/conformance.yml
Lines changed: 8 additions & 3 deletions
diff --git a/README.md b/README.md
Lines changed: 24 additions & 4 deletions
diff --git a/openvx-core/Cargo.toml b/openvx-core/Cargo.toml
Lines changed: 11 additions & 0 deletions
diff --git a/openvx-core/src/lib.rs b/openvx-core/src/lib.rs
Lines changed: 1 addition & 0 deletions
@@ -75,13 +75,18 @@ jobs:
             x86_64|amd64)
               HAS_SSE2=false
               HAS_AVX2=false
+              # openvx-core hosts the C-API kernel callbacks (vxAdd /
+              # vxSubtract / vxBox3x3 / vxGaussian3x3 / vxColorConvert
+              # → crate::simd_kernels). openvx-vision hosts the public
+              # Rust-API SIMD kernels. Both crates need the matching
+              # feature flag for the SIMD path to actually compile in.
               if echo "$FLAGS" | grep -qw sse2; then
-                CARGO_FEATURES="$CARGO_FEATURES openvx-vision/sse2"
+                CARGO_FEATURES="$CARGO_FEATURES openvx-core/sse2 openvx-vision/sse2"
                 HAS_SSE2=true
                 echo "  + sse2 detected"
               fi
               if echo "$FLAGS" | grep -qw avx2; then
-                CARGO_FEATURES="$CARGO_FEATURES openvx-vision/avx2"
+                CARGO_FEATURES="$CARGO_FEATURES openvx-core/avx2 openvx-vision/avx2"
                 HAS_AVX2=true
                 echo "  + avx2 detected"
               fi
@@ -96,7 +101,7 @@ jobs:
               fi
               ;;
             aarch64|arm64)
-              CARGO_FEATURES="$CARGO_FEATURES openvx-vision/neon"
+              CARGO_FEATURES="$CARGO_FEATURES openvx-core/neon openvx-vision/neon"
               echo "  + neon (mandatory on aarch64)"
               ;;
             *)
 
@@ -74,19 +74,39 @@ The standard OpenVX 1.3 C headers are bundled in [`include/VX/`](include/VX/) an
 
 ### Cargo features
 
-The vision kernel crate exposes opt-in performance features:
+Both `openvx-core` (host of the C-API kernel callbacks the OpenVX graph executor invokes) and `openvx-vision` (host of the public Rust API kernels) expose a matching opt-in feature set:
 
 | Feature | Effect |
 |---------|--------|
 | `simd` | Enables architecture-neutral SIMD code paths |
 | `sse2` / `avx2` | x86_64 SIMD back-ends (imply `simd`) |
 | `neon` | AArch64 SIMD back-end (implies `simd`) |
-| `parallel` | Enables Rayon-based multi-threaded kernels |
+| `parallel` (`openvx-vision` only) | Enables Rayon-based multi-threaded kernels |
 
-Build with one or more features, e.g.:
+Enable the same SIMD feature on both crates (e.g. `openvx-core/avx2` together with `openvx-vision/avx2`) so that the FFI graph path and the direct Rust API path both pick up the SIMD kernels:
 
 ```bash
-cargo build --release -p openvx-ffi --features "openvx-vision/avx2 openvx-vision/parallel"
+cargo build --release -p openvx-ffi \
+  --features "openvx-core/sse2 openvx-core/avx2 openvx-vision/sse2 openvx-vision/avx2"
+```
+
+### Hardware acceleration
+
+Performance work targets **AMD Zen (Ryzen / EPYC, Zen 2+)** — that's what CI measures and what the *Benchmark & compare* numbers come from. Intel and ARM aren't penalised; the runtime dispatcher reads CPU **flags**, not vendor strings, so any host whose flags match the same gate runs the same path:
+
+- **AMD Zen 2+** (Ryzen 3000+, Threadripper 3000+, EPYC Rome / Milan / Genoa) → AVX2 kernels + `-C target-cpu=x86-64-v3` auto-vec.
+- **Intel Haswell and later** → same AVX2 path, parity with Zen.
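+- **Older x86_64** (pre-AVX2) → SSE2 kernels + `-C target-cpu=x86-64-v2` (note: `x86-64-v2` also assumes SSE3 / SSSE3 / SSE4.1 / SSE4.2 / POPCNT, so omit the flag on CPUs that only have SSE2).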
+- **AArch64** (Apple Silicon, AWS Graviton, etc.) → NEON path.
+- **Anything else / no features** → scalar slice loop (still ~50× faster than the original per-pixel kernels).
+
+Dispatch lives in `openvx-core::simd_kernels` (FFI graph path) and `openvx-vision::x86_64_simd` (Rust API). CI auto-detects host flags; for a manual Zen-targeted build:
+
+```bash
+RUSTFLAGS="-C target-cpu=x86-64-v3" \
+  cargo build --release -p openvx-ffi \
+    --features "openvx-core/sse2 openvx-core/avx2 \
+                openvx-vision/sse2 openvx-vision/avx2"
 ```
 
 ## Using rustVX from a C application
 
@@ -18,3 +18,14 @@ once_cell = { workspace = true }
 [features]
 default = []
 c-api = []
+# SIMD acceleration. Mirrors openvx-vision's feature set so the FFI
+# build can pass `openvx-core/sse2 openvx-core/avx2` (or
+# `openvx-core/neon` on aarch64) and have the C-API kernel callbacks
+# (vxAdd, vxSubtract, vxBox3x3, vxGaussian3x3, vxColorConvert) pick
+# up the SIMD-fast paths in `crate::simd_kernels`; on x86_64 the path
+# is chosen at runtime via `is_x86_feature_detected!`. With none of
+# these features set, the callbacks fall back to the tight scalar loops.
+simd = []
+sse2 = ["simd"]
+avx2 = ["simd"]
+neon = ["simd"]
@@ -4,6 +4,7 @@ pub mod c_api;
 pub mod c_api_data;
 pub mod context;
 pub mod reference;
+pub mod simd_kernels;
 pub mod types;
 pub mod unified_c_api;
 pub mod vxu_impl;