@@ -32,7 +32,7 @@ use crate::saturate_narrow::SaturateNarrow;
3232use num_traits:: AsPrimitive ;
3333use std:: ops:: { AddAssign , Mul } ;
3434
35- #[ inline( always ) ]
35+ #[ inline( never ) ]
3636/// # Generics
3737/// `T` - template buffer type
3838/// `J` - accumulator type
@@ -86,11 +86,8 @@ pub(crate) fn convolve_column_handler_fixed_point_direct_buffer<
8686 }
8787}
8888
89- #[ inline( always) ]
90- /// # Generics
91- /// `T` - template buffer type
92- /// `J` - accumulator type
93- pub ( crate ) fn convolve_column_handler_fixed_point_direct_buffer_double <
89+ #[ inline( never) ]
90+ pub ( crate ) fn convolve_column_handler_fixed_point_direct_buffer16 <
9491 T : Copy + ' static + AsPrimitive < J > + Default ,
9592 J : Copy + ' static + AsPrimitive < T > + Mul < Output = J > + AddAssign + SaturateNarrow < T > + Default ,
9693 const BUFFER_SIZE : usize ,
@@ -109,49 +106,181 @@ pub(crate) fn convolve_column_handler_fixed_point_direct_buffer_double<
109106 if filter. is_empty ( ) {
110107 return ;
111108 }
112- let mut direct_store0: [ J ; BUFFER_SIZE ] = [ ROUNDING_CONST . as_ ( ) ; BUFFER_SIZE ] ;
113- let mut direct_store1: [ J ; BUFFER_SIZE ] = [ ROUNDING_CONST . as_ ( ) ; BUFFER_SIZE ] ;
114109
115- let v_start_px = x;
110+ let rc: J = ROUNDING_CONST . as_ ( ) ;
111+ let mut store0: [ J ; 4 ] = [ rc; 4 ] ;
112+ let mut store1: [ J ; 4 ] = [ rc; 4 ] ;
113+ let mut store2: [ J ; 4 ] = [ rc; 4 ] ;
114+ let mut store3: [ J ; 4 ] = [ rc; 4 ] ;
115+
116+ let base = src_stride * bounds. start + x;
117+ let quarter = BUFFER_SIZE / 4 ;
118+
119+ for j in 0 ..bounds. size {
120+ let w: J = filter[ j] . as_ ( ) ;
121+ let off = base + src_stride * j;
122+
123+ // Four contiguous non-overlapping chunks of the same row, each
124+ // `BUFFER_SIZE / 4` elements long. The accumulator arrays hold 4 lanes each,
125+ // so all elements are covered for `BUFFER_SIZE == 16`; a larger size
126+ // silently skips the tail of each chunk. The chunks carry no data dependency
127+ // on one another, which is meant to let the vectorizer run them as 4 independent chains.
128+ let ( chunk0, rest) = src[ off..off + BUFFER_SIZE ] . split_at ( quarter) ;
129+ let ( chunk1, rest) = rest. split_at ( quarter) ;
130+ let ( chunk2, chunk3) = rest. split_at ( quarter) ;
131+
132+ for ( acc, & s) in store0. iter_mut ( ) . zip ( chunk0) {
133+ * acc += s. as_ ( ) * w;
134+ }
135+ for ( acc, & s) in store1. iter_mut ( ) . zip ( chunk1) {
136+ * acc += s. as_ ( ) * w;
137+ }
138+ for ( acc, & s) in store2. iter_mut ( ) . zip ( chunk2) {
139+ * acc += s. as_ ( ) * w;
140+ }
141+ for ( acc, & s) in store3. iter_mut ( ) . zip ( chunk3) {
142+ * acc += s. as_ ( ) * w;
143+ }
144+ }
116145
117- let py = bounds . start ;
118- let weight = filter [ 0 ] . as_ ( ) ;
119- let offset = src_stride * py + v_start_px ;
120- let src_ptr0 = & src [ offset.. ( offset + BUFFER_SIZE ) ] ;
121- let src_ptr1 = & src [ ( offset + BUFFER_SIZE ) .. ( offset + BUFFER_SIZE * 2 ) ] ;
146+ // Writeback: four independent store chains
147+ let v_dst = & mut dst [ x..x + BUFFER_SIZE ] ;
148+ let ( out0 , rest ) = v_dst . split_at_mut ( quarter ) ;
149+ let ( out1 , rest ) = rest . split_at_mut ( quarter ) ;
150+ let ( out2 , out3 ) = rest . split_at_mut ( quarter ) ;
122151
123- for ( dst, src) in direct_store0. iter_mut ( ) . zip ( src_ptr0) {
124- * dst += src. as_ ( ) * weight;
152+ for ( d, s) in out0. iter_mut ( ) . zip ( store0) {
153+ * d = s. saturate_narrow ( bit_depth) ;
154+ }
155+ for ( d, s) in out1. iter_mut ( ) . zip ( store1) {
156+ * d = s. saturate_narrow ( bit_depth) ;
157+ }
158+ for ( d, s) in out2. iter_mut ( ) . zip ( store2) {
159+ * d = s. saturate_narrow ( bit_depth) ;
125160 }
161+ for ( d, s) in out3. iter_mut ( ) . zip ( store3) {
162+ * d = s. saturate_narrow ( bit_depth) ;
163+ }
164+ }
126165
127- for ( dst, src) in direct_store1. iter_mut ( ) . zip ( src_ptr1) {
128- * dst += src. as_ ( ) * weight;
166+ #[ inline( never) ]
167+ /// # Generics
168+ /// `T` - template buffer type
169+ /// `J` - accumulator type
170+ pub ( crate ) fn convolve_column_handler_fixed_point_direct_buffer_double <
171+ T : Copy + ' static + AsPrimitive < J > + Default ,
172+ J : Copy + ' static + AsPrimitive < T > + Mul < Output = J > + AddAssign + SaturateNarrow < T > + Default ,
173+ const BUFFER_SIZE : usize ,
174+ > (
175+ src : & [ T ] ,
176+ src_stride : usize ,
177+ dst : & mut [ T ] ,
178+ filter : & [ i16 ] ,
179+ bounds : & FilterBounds ,
180+ bit_depth : u32 ,
181+ x : usize ,
182+ ) where
183+ i32 : AsPrimitive < J > ,
184+ i16 : AsPrimitive < J > ,
185+ {
186+ if filter. is_empty ( ) {
187+ return ;
129188 }
130189
131- for ( j, & k_weight) in filter. iter ( ) . take ( bounds. size ) . skip ( 1 ) . enumerate ( ) {
132- // Adding 1 is necessary because skip do not incrementing value on values that skipped
133- let py = bounds. start + j + 1 ;
134- let weight = k_weight. as_ ( ) ;
135- let offset = src_stride * py + v_start_px;
136- let src_ptr0 = & src[ offset..( offset + BUFFER_SIZE ) ] ;
137- let src_ptr1 = & src[ ( offset + BUFFER_SIZE ) ..( offset + BUFFER_SIZE * 2 ) ] ;
190+ let rc: J = ROUNDING_CONST . as_ ( ) ;
191+ let quarter = BUFFER_SIZE / 4 ;
192+
193+ // 8 independent accumulator chains → 8 XMM registers, no spill
194+ let mut s00: [ J ; 4 ] = [ rc; 4 ] ;
195+ let mut s01: [ J ; 4 ] = [ rc; 4 ] ;
196+ let mut s02: [ J ; 4 ] = [ rc; 4 ] ;
197+ let mut s03: [ J ; 4 ] = [ rc; 4 ] ;
198+ // second 16-pixel block
199+ let mut s10: [ J ; 4 ] = [ rc; 4 ] ;
200+ let mut s11: [ J ; 4 ] = [ rc; 4 ] ;
201+ let mut s12: [ J ; 4 ] = [ rc; 4 ] ;
202+ let mut s13: [ J ; 4 ] = [ rc; 4 ] ;
203+
204+ let base = src_stride * bounds. start + x;
205+
206+ for j in 0 ..bounds. size {
207+ let w: J = filter[ j] . as_ ( ) ;
208+ let off = base + src_stride * j;
209+
210+ // First 16-pixel block split into 4 independent chains
211+ let ( c00, rest) = src[ off..off + BUFFER_SIZE ] . split_at ( quarter) ;
212+ let ( c01, rest) = rest. split_at ( quarter) ;
213+ let ( c02, c03) = rest. split_at ( quarter) ;
214+
215+ // Second 16-pixel block split into 4 independent chains
216+ let off1 = off + BUFFER_SIZE ;
217+ let ( c10, rest) = src[ off1..off1 + BUFFER_SIZE ] . split_at ( quarter) ;
218+ let ( c11, rest) = rest. split_at ( quarter) ;
219+ let ( c12, c13) = rest. split_at ( quarter) ;
220+
221+ for ( acc, & s) in s00. iter_mut ( ) . zip ( c00) {
222+ * acc += s. as_ ( ) * w;
223+ }
224+ for ( acc, & s) in s01. iter_mut ( ) . zip ( c01) {
225+ * acc += s. as_ ( ) * w;
226+ }
227+ for ( acc, & s) in s02. iter_mut ( ) . zip ( c02) {
228+ * acc += s. as_ ( ) * w;
229+ }
230+ for ( acc, & s) in s03. iter_mut ( ) . zip ( c03) {
231+ * acc += s. as_ ( ) * w;
232+ }
138233
139- for ( dst , src ) in direct_store0 . iter_mut ( ) . zip ( src_ptr0 . iter ( ) ) {
140- * dst += src . as_ ( ) * weight ;
234+ for ( acc , & s ) in s10 . iter_mut ( ) . zip ( c10 ) {
235+ * acc += s . as_ ( ) * w ;
141236 }
142- for ( dst, src) in direct_store1. iter_mut ( ) . zip ( src_ptr1. iter ( ) ) {
143- * dst += src. as_ ( ) * weight;
237+ for ( acc, & s) in s11. iter_mut ( ) . zip ( c11) {
238+ * acc += s. as_ ( ) * w;
239+ }
240+ for ( acc, & s) in s12. iter_mut ( ) . zip ( c12) {
241+ * acc += s. as_ ( ) * w;
242+ }
243+ for ( acc, & s) in s13. iter_mut ( ) . zip ( c13) {
244+ * acc += s. as_ ( ) * w;
144245 }
145246 }
146247
147- let v_dst0 = & mut dst[ v_start_px..( v_start_px + BUFFER_SIZE ) ] ;
148- for ( dst, src) in v_dst0. iter_mut ( ) . zip ( direct_store0) {
149- * dst = src. saturate_narrow ( bit_depth) ;
248+ // Writeback block 0
249+ let v_dst0 = & mut dst[ x..x + BUFFER_SIZE ] ;
250+ let ( o00, rest) = v_dst0. split_at_mut ( quarter) ;
251+ let ( o01, rest) = rest. split_at_mut ( quarter) ;
252+ let ( o02, o03) = rest. split_at_mut ( quarter) ;
253+
254+ for ( d, s) in o00. iter_mut ( ) . zip ( s00) {
255+ * d = s. saturate_narrow ( bit_depth) ;
256+ }
257+ for ( d, s) in o01. iter_mut ( ) . zip ( s01) {
258+ * d = s. saturate_narrow ( bit_depth) ;
259+ }
260+ for ( d, s) in o02. iter_mut ( ) . zip ( s02) {
261+ * d = s. saturate_narrow ( bit_depth) ;
262+ }
263+ for ( d, s) in o03. iter_mut ( ) . zip ( s03) {
264+ * d = s. saturate_narrow ( bit_depth) ;
150265 }
151266
152- let v_dst1 = & mut dst[ ( v_start_px + BUFFER_SIZE ) ..( v_start_px + BUFFER_SIZE * 2 ) ] ;
153- for ( dst, src) in v_dst1. iter_mut ( ) . zip ( direct_store1) {
154- * dst = src. saturate_narrow ( bit_depth) ;
267+ // Writeback block 1
268+ let v_dst1 = & mut dst[ x + BUFFER_SIZE ..x + BUFFER_SIZE * 2 ] ;
269+ let ( o10, rest) = v_dst1. split_at_mut ( quarter) ;
270+ let ( o11, rest) = rest. split_at_mut ( quarter) ;
271+ let ( o12, o13) = rest. split_at_mut ( quarter) ;
272+
273+ for ( d, s) in o10. iter_mut ( ) . zip ( s10) {
274+ * d = s. saturate_narrow ( bit_depth) ;
275+ }
276+ for ( d, s) in o11. iter_mut ( ) . zip ( s11) {
277+ * d = s. saturate_narrow ( bit_depth) ;
278+ }
279+ for ( d, s) in o12. iter_mut ( ) . zip ( s12) {
280+ * d = s. saturate_narrow ( bit_depth) ;
281+ }
282+ for ( d, s) in o13. iter_mut ( ) . zip ( s13) {
283+ * d = s. saturate_narrow ( bit_depth) ;
155284 }
156285}
157286
@@ -294,7 +423,7 @@ pub(crate) fn column_handler_fixed_point<
294423 }
295424
296425 while cx + 16 < total_width {
297- convolve_column_handler_fixed_point_direct_buffer :: < T , J , 16 > (
426+ convolve_column_handler_fixed_point_direct_buffer16 :: < T , J , 16 > (
298427 src, src_stride, dst, weight, bounds, bit_depth, cx,
299428 ) ;
300429
0 commit comments