@@ -16,7 +16,6 @@ pub fn gaussian3x3_simd(src: &Image, dst: &Image) -> VxResult<()> {
1616 }
1717
1818 let src_data = src. data ( ) ;
19- // Use saturating_mul to prevent integer overflow
2019 let temp_size = width. saturating_mul ( height) ;
2120 let mut temp_buffer = vec ! [ 0u8 ; temp_size] ;
2221 let mut dst_data = dst. data_mut ( ) ;
@@ -37,7 +36,6 @@ pub fn gaussian3x3_simd(src: &Image, dst: &Image) -> VxResult<()> {
3736
3837 #[ cfg( not( any( target_arch = "x86_64" , target_arch = "aarch64" ) ) ) ]
3938 {
40- // Scalar fallback for unsupported architectures
4139 crate :: simd_utils:: scalar:: gaussian_h3_scalar ( & src_data, & mut temp_buffer, width, height) ;
4240 crate :: simd_utils:: scalar:: gaussian_v3_scalar ( & temp_buffer, & mut dst_data, width, height) ;
4341 }
@@ -56,13 +54,10 @@ pub fn gaussian5x5_simd(src: &Image, dst: &Image) -> VxResult<()> {
5654 }
5755
5856 let src_data = src. data ( ) ;
59- // Use saturating_mul to prevent integer overflow
6057 let temp_size = width. saturating_mul ( height) ;
6158 let mut temp_buffer = vec ! [ 0u8 ; temp_size] ;
6259 let mut dst_data = dst. data_mut ( ) ;
6360
64- // 5x5 kernel: [1, 4, 6, 4, 1] separable
65- // First pass: horizontal
6661 for y in 0 ..height {
6762 for x in 0 ..width {
6863 let mut sum: i32 = 0 ;
@@ -78,7 +73,6 @@ pub fn gaussian5x5_simd(src: &Image, dst: &Image) -> VxResult<()> {
7873 }
7974 }
8075
81- // Second pass: vertical
8276 for y in 0 ..height {
8377 for x in 0 ..width {
8478 let mut sum: i32 = 0 ;
@@ -109,56 +103,49 @@ pub fn box3x3_simd(src: &Image, dst: &Image) -> VxResult<()> {
109103 }
110104
111105 let src_data = src. data ( ) ;
112- // Use saturating_mul to prevent integer overflow
113106 let temp_size = width. saturating_mul ( height) ;
114- let mut temp_buffer = vec ! [ 0u8 ; temp_size] ;
107+ let mut temp_buffer = vec ! [ 0u16 ; temp_size] ;
115108 let mut dst_data = dst. data_mut ( ) ;
116109
117- // Horizontal box filter (moving average)
118- for y in 0 ..height {
119- // Initialize sliding window sum
120- let mut window_sum = ( src_data[ y * width] as u32 + src_data[ y * width + 1 ] as u32 ) * 2
121- + src_data[ y * width + 2 ] as u32 ;
122-
123- for x in 1 ..width - 1 {
124- temp_buffer[ y * width + x] = ( window_sum / 3 ) as u8 ;
110+ #[ cfg( target_arch = "x86_64" ) ]
111+ unsafe {
112+ use crate :: x86_64_simd;
113+ x86_64_simd:: box_h3 ( src_data. as_ptr ( ) , temp_buffer. as_mut_ptr ( ) , width, height) ;
114+ x86_64_simd:: box_v3 ( temp_buffer. as_ptr ( ) , dst_data. as_mut_ptr ( ) , width, height) ;
115+ }
125116
126- // Update window
127- if x + 2 < width {
128- window_sum = window_sum + src_data[ y * width + x + 2 ] as u32
129- - src_data[ y * width + x - 1 ] as u32 ;
117+ #[ cfg( not( target_arch = "x86_64" ) ) ]
118+ {
119+ // Scalar fallback using the same algorithm as filter.rs::box3x3
120+ for y in 0 ..height {
121+ let row = y * width;
122+ temp_buffer[ row] = src_data[ row] as u16 * 2 + src_data[ row + 1 ] as u16 ;
123+ let mut sum = src_data[ row] as u16 + src_data[ row + 1 ] as u16 + src_data[ row + 2 ] as u16 ;
124+ temp_buffer[ row + 1 ] = sum;
125+ for x in 2 ..width - 1 {
126+ sum += src_data[ row + x + 1 ] as u16 - src_data[ row + x - 2 ] as u16 ;
127+ temp_buffer[ row + x] = sum;
130128 }
129+ temp_buffer[ row + width - 1 ] =
130+ src_data[ row + width - 2 ] as u16 + src_data[ row + width - 1 ] as u16 * 2 ;
131+ }
132+ for x in 0 ..width {
133+ let mut sum = temp_buffer[ x] * 2 + temp_buffer[ width + x] ;
134+ dst_data[ x] = ( sum / 9 ) as u8 ;
131135 }
132-
133- // Handle edges
134- temp_buffer[ y * width] =
135- ( ( src_data[ y * width] as u16 + src_data[ y * width + 1 ] as u16 ) / 2 ) as u8 ;
136- temp_buffer[ y * width + width - 1 ] = ( ( src_data[ y * width + width - 2 ] as u16
137- + src_data[ y * width + width - 1 ] as u16 )
138- / 2 ) as u8 ;
139- }
140-
141- // Vertical box filter
142- for x in 0 ..width {
143- // Initialize sliding window sum
144- let mut window_sum = ( temp_buffer[ x] as u32 + temp_buffer[ width + x] as u32 ) * 2
145- + temp_buffer[ 2 * width + x] as u32 ;
146-
147136 for y in 1 ..height - 1 {
148- dst_data[ y * width + x] = ( window_sum / 3 ) as u8 ;
149-
150- // Update window
151- if y + 2 < height {
152- window_sum = window_sum + temp_buffer[ ( y + 2 ) * width + x] as u32
153- - temp_buffer[ ( y - 1 ) * width + x] as u32 ;
137+ for x in 0 ..width {
138+ let sum = temp_buffer[ ( y - 1 ) * width + x]
139+ + temp_buffer[ y * width + x]
140+ + temp_buffer[ ( y + 1 ) * width + x] ;
141+ dst_data[ y * width + x] = ( sum / 9 ) as u8 ;
154142 }
155143 }
156-
157- // Handle edges
158- dst_data[ x] = ( ( temp_buffer[ x] as u16 + temp_buffer[ width + x] as u16 ) / 2 ) as u8 ;
159- dst_data[ ( height - 1 ) * width + x] = ( ( temp_buffer[ ( height - 2 ) * width + x] as u16
160- + temp_buffer[ ( height - 1 ) * width + x] as u16 )
161- / 2 ) as u8 ;
144+ let last = ( height - 1 ) * width;
145+ for x in 0 ..width {
146+ let sum = temp_buffer[ last - width + x] + temp_buffer[ last + x] * 2 ;
147+ dst_data[ last + x] = ( sum / 9 ) as u8 ;
148+ }
162149 }
163150
164151 Ok ( ( ) )
@@ -176,34 +163,19 @@ pub fn sobel3x3_simd(src: &Image, grad_x: &mut [i16], grad_y: &mut [i16]) -> VxR
176163
177164 let src_data = src. data ( ) ;
178165
179- // Process gradients using SIMD where possible
180- // For simplicity, we process 8 pixels at a time for i16 output
181-
182166 for y in 1 ..height - 1 {
183167 let mut x = 1 ;
184168
185169 #[ cfg( target_arch = "x86_64" ) ]
186170 unsafe {
187171 use core:: arch:: x86_64:: * ;
188-
189- // Process in chunks of 8 for SSE2
190172 while x + 7 < width - 1 {
191- let row_offset = y * width + x;
192-
193- // Load 3 rows of 10 pixels each (for the 3x3 kernel)
194- // This is a simplified version - full optimization would unroll more
195-
196- // For now, use scalar for the complex Sobel kernel
197- // (Full SIMD would need careful shuffling for the kernel pattern)
198173 x += 1 ;
199174 }
200175 }
201176
202- // Scalar processing for remaining pixels
203177 while x < width - 1 {
204178 let idx = y * width + x;
205-
206- // Sobel X: [-1, 0, 1; -2, 0, 2; -1, 0, 1]
207179 let mut sum_x: i32 = 0 ;
208180 for ky in 0 ..3 {
209181 for kx in 0 ..3 {
@@ -221,8 +193,6 @@ pub fn sobel3x3_simd(src: &Image, grad_x: &mut [i16], grad_y: &mut [i16]) -> VxR
221193 }
222194 }
223195 grad_x[ idx] = sum_x as i16 ;
224-
225- // Sobel Y: [-1, -2, -1; 0, 0, 0; 1, 2, 1]
226196 let mut sum_y: i32 = 0 ;
227197 for ky in 0 ..3 {
228198 for kx in 0 ..3 {
@@ -240,7 +210,6 @@ pub fn sobel3x3_simd(src: &Image, grad_x: &mut [i16], grad_y: &mut [i16]) -> VxR
240210 }
241211 }
242212 grad_y[ idx] = sum_y as i16 ;
243-
244213 x += 1 ;
245214 }
246215 }
0 commit comments