Skip to content

Commit 66c3a54

Browse files
authored
Merge branch 'main' into perf/median3x3
2 parents 7281667 + c16a4d4 commit 66c3a54

6 files changed

Lines changed: 723 additions & 150 deletions

File tree

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
//! Fast-path kernel implementations that avoid per-pixel get_pixel() overhead.
2+
//!
3+
//! These are called from `vxu_impl.rs` after C images have been converted to
4+
//! Rust `Image` objects. Keeping them in a separate module means we can grow
5+
//! this file without shifting the binary layout of hot functions inside
6+
//! `vxu_impl.rs` (Add, AbsDiff, etc.).
7+
8+
use crate::vxu_impl::{Image, ImageFormat};
9+
use crate::VxStatus;
10+
11+
/// Optimised ChannelCombine for RGB (interleaved R-G-B).
12+
pub fn channel_combine_rgb(
13+
r: &Image,
14+
g: &Image,
15+
b: &Image,
16+
dst: &mut Image,
17+
) -> Result<(), VxStatus> {
18+
let width = dst.width();
19+
let height = dst.height();
20+
let pixels = width * height;
21+
22+
let r_data = r.data();
23+
let g_data = g.data();
24+
let b_data = b.data();
25+
let dst_data = dst.data_mut();
26+
27+
// Bounds checks – the caller already verified dimensions match.
28+
if r_data.len() < pixels || g_data.len() < pixels || b_data.len() < pixels {
29+
return Err(VxStatus::ErrorInvalidDimension);
30+
}
31+
if dst_data.len() < pixels * 3 {
32+
return Err(VxStatus::ErrorInvalidDimension);
33+
}
34+
35+
// Interleave three planes into one RGB buffer.
36+
for y in 0..height {
37+
let src_base = y * width;
38+
let dst_base = y * width * 3;
39+
for x in 0..width {
40+
let s = src_base + x;
41+
let d = dst_base + x * 3;
42+
dst_data[d] = r_data[s];
43+
dst_data[d + 1] = g_data[s];
44+
dst_data[d + 2] = b_data[s];
45+
}
46+
}
47+
Ok(())
48+
}
49+
50+
/// Optimised ChannelCombine for RGBX (interleaved R-G-B-X).
51+
pub fn channel_combine_rgbx(
52+
r: &Image,
53+
g: &Image,
54+
b: &Image,
55+
a: &Image,
56+
dst: &mut Image,
57+
) -> Result<(), VxStatus> {
58+
let width = dst.width();
59+
let height = dst.height();
60+
let pixels = width * height;
61+
62+
let r_data = r.data();
63+
let g_data = g.data();
64+
let b_data = b.data();
65+
let a_data = a.data();
66+
let dst_data = dst.data_mut();
67+
68+
if r_data.len() < pixels || g_data.len() < pixels || b_data.len() < pixels || a_data.len() < pixels {
69+
return Err(VxStatus::ErrorInvalidDimension);
70+
}
71+
if dst_data.len() < pixels * 4 {
72+
return Err(VxStatus::ErrorInvalidDimension);
73+
}
74+
75+
for y in 0..height {
76+
let src_base = y * width;
77+
let dst_base = y * width * 4;
78+
for x in 0..width {
79+
let s = src_base + x;
80+
let d = dst_base + x * 4;
81+
dst_data[d] = r_data[s];
82+
dst_data[d + 1] = g_data[s];
83+
dst_data[d + 2] = b_data[s];
84+
dst_data[d + 3] = a_data[s];
85+
}
86+
}
87+
Ok(())
88+
}
89+
90+
/// Optimised 3×3 convolution for U8 output with Undefined border.
91+
/// `coeffs` are given in **OpenVX order** (already reversed by the caller).
92+
pub fn convolve_3x3_u8_undefined(
93+
src: &Image,
94+
coeffs: &[i16],
95+
scale: i32,
96+
dst: &mut Image,
97+
) -> Result<(), VxStatus> {
98+
let width = src.width();
99+
let height = src.height();
100+
let src_data = src.data();
101+
let dst_data = dst.data_mut();
102+
103+
if width < 3 || height < 3 {
104+
// Too small for a 3×3 kernel – caller should fall back to generic path.
105+
return Err(VxStatus::ErrorInvalidDimension);
106+
}
107+
108+
let w = width as isize;
109+
let row = |y: usize| &src_data[y * width..(y + 1) * width];
110+
111+
// Inner region: no bounds checks needed.
112+
for y in 1..height - 1 {
113+
let ym1 = row(y - 1);
114+
let y0 = row(y);
115+
let yp1 = row(y + 1);
116+
let dst_row = &mut dst_data[y * width..(y + 1) * width];
117+
118+
for x in 1..width - 1 {
119+
let mut sum: i32 = 0;
120+
// Row -1
121+
sum += ym1[x - 1] as i32 * coeffs[0] as i32;
122+
sum += ym1[x] as i32 * coeffs[1] as i32;
123+
sum += ym1[x + 1] as i32 * coeffs[2] as i32;
124+
// Row 0
125+
sum += y0[x - 1] as i32 * coeffs[3] as i32;
126+
sum += y0[x] as i32 * coeffs[4] as i32;
127+
sum += y0[x + 1] as i32 * coeffs[5] as i32;
128+
// Row +1
129+
sum += yp1[x - 1] as i32 * coeffs[6] as i32;
130+
sum += yp1[x] as i32 * coeffs[7] as i32;
131+
sum += yp1[x + 1] as i32 * coeffs[8] as i32;
132+
133+
let val = (sum / scale).clamp(0, 255) as u8;
134+
dst_row[x] = val;
135+
}
136+
}
137+
138+
// Edge handling (first/last row, first/last column) – for Undefined border
139+
// we replicate the nearest valid pixel, which is what Khronos reference does
140+
// for edges when border mode is Undefined.
141+
// Top row
142+
let y0 = 0;
143+
let dst_row = &mut dst_data[y0 * width..(y0 + 1) * width];
144+
for x in 0..width {
145+
let mut sum: i32 = 0;
146+
for ky in 0..3 {
147+
let py = if y0 + ky == 0 { 0 } else { y0 + ky - 1 };
148+
let src_row = row(py);
149+
for kx in 0..3 {
150+
let px = if x + kx == 0 { 0 } else if x + kx >= width { width - 1 } else { x + kx - 1 };
151+
let c = coeffs[ky * 3 + kx] as i32;
152+
sum += src_row[px] as i32 * c;
153+
}
154+
}
155+
dst_row[x] = (sum / scale).clamp(0, 255) as u8;
156+
}
157+
158+
// Bottom row
159+
let ylast = height - 1;
160+
let dst_row = &mut dst_data[ylast * width..(ylast + 1) * width];
161+
for x in 0..width {
162+
let mut sum: i32 = 0;
163+
for ky in 0..3 {
164+
let py = if ylast + ky >= height + 1 { height - 1 } else if ylast + ky == 0 { 0 } else { ylast + ky - 1 };
165+
let src_row = row(py);
166+
for kx in 0..3 {
167+
let px = if x + kx == 0 { 0 } else if x + kx >= width { width - 1 } else { x + kx - 1 };
168+
let c = coeffs[ky * 3 + kx] as i32;
169+
sum += src_row[px] as i32 * c;
170+
}
171+
}
172+
dst_row[x] = (sum / scale).clamp(0, 255) as u8;
173+
}
174+
175+
// First & last column for inner rows
176+
for y in 1..height - 1 {
177+
let ym1 = row(y - 1);
178+
let y0 = row(y);
179+
let yp1 = row(y + 1);
180+
let dst_row = &mut dst_data[y * width..(y + 1) * width];
181+
182+
// x = 0
183+
{
184+
let mut sum: i32 = 0;
185+
for ky in 0..3 {
186+
let py = y + ky - 1;
187+
let src_row = match ky {
188+
0 => ym1,
189+
1 => y0,
190+
2 => yp1,
191+
_ => unreachable!(),
192+
};
193+
for kx in 0..3 {
194+
let px = if kx == 0 { 0 } else { kx - 1 };
195+
let c = coeffs[ky * 3 + kx] as i32;
196+
sum += src_row[px] as i32 * c;
197+
}
198+
}
199+
dst_row[0] = (sum / scale).clamp(0, 255) as u8;
200+
}
201+
202+
// x = width - 1
203+
{
204+
let mut sum: i32 = 0;
205+
let xlast = width - 1;
206+
for ky in 0..3 {
207+
let src_row = match ky {
208+
0 => ym1,
209+
1 => y0,
210+
2 => yp1,
211+
_ => unreachable!(),
212+
};
213+
for kx in 0..3 {
214+
let px = if xlast + kx >= width + 1 { width - 1 } else { xlast + kx - 1 };
215+
let c = coeffs[ky * 3 + kx] as i32;
216+
sum += src_row[px] as i32 * c;
217+
}
218+
}
219+
dst_row[xlast] = (sum / scale).clamp(0, 255) as u8;
220+
}
221+
}
222+
223+
Ok(())
224+
}

openvx-core/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ pub mod simd_kernels;
88
pub mod types;
99
pub mod unified_c_api;
1010
pub mod vxu_impl;
11+
pub mod kernel_fast_paths;
1112

1213
pub use c_api::vx_status;
1314
pub use context::{Context, KernelTrait};

0 commit comments

Comments
 (0)