Skip to content

Commit 2c51a0c

Browse files
committed
CI update
1 parent 83db285 commit 2c51a0c

1 file changed

Lines changed: 165 additions & 36 deletions

File tree

src/fixed_point_vertical.rs

Lines changed: 165 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ use crate::saturate_narrow::SaturateNarrow;
3232
use num_traits::AsPrimitive;
3333
use std::ops::{AddAssign, Mul};
3434

35-
#[inline(always)]
35+
#[inline(never)]
3636
/// # Generics
3737
/// `T` - template buffer type
3838
/// `J` - accumulator type
@@ -86,11 +86,8 @@ pub(crate) fn convolve_column_handler_fixed_point_direct_buffer<
8686
}
8787
}
8888

89-
#[inline(always)]
90-
/// # Generics
91-
/// `T` - template buffer type
92-
/// `J` - accumulator type
93-
pub(crate) fn convolve_column_handler_fixed_point_direct_buffer_double<
89+
#[inline(never)]
90+
pub(crate) fn convolve_column_handler_fixed_point_direct_buffer16<
9491
T: Copy + 'static + AsPrimitive<J> + Default,
9592
J: Copy + 'static + AsPrimitive<T> + Mul<Output = J> + AddAssign + SaturateNarrow<T> + Default,
9693
const BUFFER_SIZE: usize,
@@ -109,49 +106,181 @@ pub(crate) fn convolve_column_handler_fixed_point_direct_buffer_double<
109106
if filter.is_empty() {
110107
return;
111108
}
112-
let mut direct_store0: [J; BUFFER_SIZE] = [ROUNDING_CONST.as_(); BUFFER_SIZE];
113-
let mut direct_store1: [J; BUFFER_SIZE] = [ROUNDING_CONST.as_(); BUFFER_SIZE];
114109

115-
let v_start_px = x;
110+
let rc: J = ROUNDING_CONST.as_();
111+
let mut store0: [J; 4] = [rc; 4];
112+
let mut store1: [J; 4] = [rc; 4];
113+
let mut store2: [J; 4] = [rc; 4];
114+
let mut store3: [J; 4] = [rc; 4];
115+
116+
let base = src_stride * bounds.start + x;
117+
let quarter = BUFFER_SIZE / 4;
118+
119+
for j in 0..bounds.size {
120+
let w: J = filter[j].as_();
121+
let off = base + src_stride * j;
122+
123+
// Four contiguous non-overlapping chunks of the same row.
124+
// Each chunk has a statically known length (BUFFER_SIZE/4),
125+
// i.e. 4 lanes when BUFFER_SIZE == 16, which is exactly one XMM register's worth of accumulators when J is i32.
126+
// The lack of data dependency between chunks enables the vectorizer
127+
// to treat them as 4 independent pmovzxbd + pmulld chains.
128+
let (chunk0, rest) = src[off..off + BUFFER_SIZE].split_at(quarter);
129+
let (chunk1, rest) = rest.split_at(quarter);
130+
let (chunk2, chunk3) = rest.split_at(quarter);
131+
132+
for (acc, &s) in store0.iter_mut().zip(chunk0) {
133+
*acc += s.as_() * w;
134+
}
135+
for (acc, &s) in store1.iter_mut().zip(chunk1) {
136+
*acc += s.as_() * w;
137+
}
138+
for (acc, &s) in store2.iter_mut().zip(chunk2) {
139+
*acc += s.as_() * w;
140+
}
141+
for (acc, &s) in store3.iter_mut().zip(chunk3) {
142+
*acc += s.as_() * w;
143+
}
144+
}
116145

117-
let py = bounds.start;
118-
let weight = filter[0].as_();
119-
let offset = src_stride * py + v_start_px;
120-
let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)];
121-
let src_ptr1 = &src[(offset + BUFFER_SIZE)..(offset + BUFFER_SIZE * 2)];
146+
// Writeback: four independent store chains
147+
let v_dst = &mut dst[x..x + BUFFER_SIZE];
148+
let (out0, rest) = v_dst.split_at_mut(quarter);
149+
let (out1, rest) = rest.split_at_mut(quarter);
150+
let (out2, out3) = rest.split_at_mut(quarter);
122151

123-
for (dst, src) in direct_store0.iter_mut().zip(src_ptr0) {
124-
*dst += src.as_() * weight;
152+
for (d, s) in out0.iter_mut().zip(store0) {
153+
*d = s.saturate_narrow(bit_depth);
154+
}
155+
for (d, s) in out1.iter_mut().zip(store1) {
156+
*d = s.saturate_narrow(bit_depth);
157+
}
158+
for (d, s) in out2.iter_mut().zip(store2) {
159+
*d = s.saturate_narrow(bit_depth);
125160
}
161+
for (d, s) in out3.iter_mut().zip(store3) {
162+
*d = s.saturate_narrow(bit_depth);
163+
}
164+
}
126165

127-
for (dst, src) in direct_store1.iter_mut().zip(src_ptr1) {
128-
*dst += src.as_() * weight;
166+
#[inline(never)]
167+
/// # Generics
168+
/// `T` - template buffer type
169+
/// `J` - accumulator type
170+
pub(crate) fn convolve_column_handler_fixed_point_direct_buffer_double<
171+
T: Copy + 'static + AsPrimitive<J> + Default,
172+
J: Copy + 'static + AsPrimitive<T> + Mul<Output = J> + AddAssign + SaturateNarrow<T> + Default,
173+
const BUFFER_SIZE: usize,
174+
>(
175+
src: &[T],
176+
src_stride: usize,
177+
dst: &mut [T],
178+
filter: &[i16],
179+
bounds: &FilterBounds,
180+
bit_depth: u32,
181+
x: usize,
182+
) where
183+
i32: AsPrimitive<J>,
184+
i16: AsPrimitive<J>,
185+
{
186+
if filter.is_empty() {
187+
return;
129188
}
130189

131-
for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() {
132-
// Adding 1 is necessary because `enumerate()` is applied after `skip(1)`, so `j` starts at 0 for the second weight.
133-
let py = bounds.start + j + 1;
134-
let weight = k_weight.as_();
135-
let offset = src_stride * py + v_start_px;
136-
let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)];
137-
let src_ptr1 = &src[(offset + BUFFER_SIZE)..(offset + BUFFER_SIZE * 2)];
190+
let rc: J = ROUNDING_CONST.as_();
191+
let quarter = BUFFER_SIZE / 4;
192+
193+
// 8 independent accumulator chains → 8 XMM registers, no spill
194+
let mut s00: [J; 4] = [rc; 4];
195+
let mut s01: [J; 4] = [rc; 4];
196+
let mut s02: [J; 4] = [rc; 4];
197+
let mut s03: [J; 4] = [rc; 4];
198+
// Accumulators for the second BUFFER_SIZE-pixel block (4 chains x 4 lanes, i.e. 16 pixels when BUFFER_SIZE == 16)
199+
let mut s10: [J; 4] = [rc; 4];
200+
let mut s11: [J; 4] = [rc; 4];
201+
let mut s12: [J; 4] = [rc; 4];
202+
let mut s13: [J; 4] = [rc; 4];
203+
204+
let base = src_stride * bounds.start + x;
205+
206+
for j in 0..bounds.size {
207+
let w: J = filter[j].as_();
208+
let off = base + src_stride * j;
209+
210+
// First 16-pixel block split into 4 independent chains
211+
let (c00, rest) = src[off..off + BUFFER_SIZE].split_at(quarter);
212+
let (c01, rest) = rest.split_at(quarter);
213+
let (c02, c03) = rest.split_at(quarter);
214+
215+
// Second 16-pixel block split into 4 independent chains
216+
let off1 = off + BUFFER_SIZE;
217+
let (c10, rest) = src[off1..off1 + BUFFER_SIZE].split_at(quarter);
218+
let (c11, rest) = rest.split_at(quarter);
219+
let (c12, c13) = rest.split_at(quarter);
220+
221+
for (acc, &s) in s00.iter_mut().zip(c00) {
222+
*acc += s.as_() * w;
223+
}
224+
for (acc, &s) in s01.iter_mut().zip(c01) {
225+
*acc += s.as_() * w;
226+
}
227+
for (acc, &s) in s02.iter_mut().zip(c02) {
228+
*acc += s.as_() * w;
229+
}
230+
for (acc, &s) in s03.iter_mut().zip(c03) {
231+
*acc += s.as_() * w;
232+
}
138233

139-
for (dst, src) in direct_store0.iter_mut().zip(src_ptr0.iter()) {
140-
*dst += src.as_() * weight;
234+
for (acc, &s) in s10.iter_mut().zip(c10) {
235+
*acc += s.as_() * w;
141236
}
142-
for (dst, src) in direct_store1.iter_mut().zip(src_ptr1.iter()) {
143-
*dst += src.as_() * weight;
237+
for (acc, &s) in s11.iter_mut().zip(c11) {
238+
*acc += s.as_() * w;
239+
}
240+
for (acc, &s) in s12.iter_mut().zip(c12) {
241+
*acc += s.as_() * w;
242+
}
243+
for (acc, &s) in s13.iter_mut().zip(c13) {
244+
*acc += s.as_() * w;
144245
}
145246
}
146247

147-
let v_dst0 = &mut dst[v_start_px..(v_start_px + BUFFER_SIZE)];
148-
for (dst, src) in v_dst0.iter_mut().zip(direct_store0) {
149-
*dst = src.saturate_narrow(bit_depth);
248+
// Writeback block 0
249+
let v_dst0 = &mut dst[x..x + BUFFER_SIZE];
250+
let (o00, rest) = v_dst0.split_at_mut(quarter);
251+
let (o01, rest) = rest.split_at_mut(quarter);
252+
let (o02, o03) = rest.split_at_mut(quarter);
253+
254+
for (d, s) in o00.iter_mut().zip(s00) {
255+
*d = s.saturate_narrow(bit_depth);
256+
}
257+
for (d, s) in o01.iter_mut().zip(s01) {
258+
*d = s.saturate_narrow(bit_depth);
259+
}
260+
for (d, s) in o02.iter_mut().zip(s02) {
261+
*d = s.saturate_narrow(bit_depth);
262+
}
263+
for (d, s) in o03.iter_mut().zip(s03) {
264+
*d = s.saturate_narrow(bit_depth);
150265
}
151266

152-
let v_dst1 = &mut dst[(v_start_px + BUFFER_SIZE)..(v_start_px + BUFFER_SIZE * 2)];
153-
for (dst, src) in v_dst1.iter_mut().zip(direct_store1) {
154-
*dst = src.saturate_narrow(bit_depth);
267+
// Writeback block 1
268+
let v_dst1 = &mut dst[x + BUFFER_SIZE..x + BUFFER_SIZE * 2];
269+
let (o10, rest) = v_dst1.split_at_mut(quarter);
270+
let (o11, rest) = rest.split_at_mut(quarter);
271+
let (o12, o13) = rest.split_at_mut(quarter);
272+
273+
for (d, s) in o10.iter_mut().zip(s10) {
274+
*d = s.saturate_narrow(bit_depth);
275+
}
276+
for (d, s) in o11.iter_mut().zip(s11) {
277+
*d = s.saturate_narrow(bit_depth);
278+
}
279+
for (d, s) in o12.iter_mut().zip(s12) {
280+
*d = s.saturate_narrow(bit_depth);
281+
}
282+
for (d, s) in o13.iter_mut().zip(s13) {
283+
*d = s.saturate_narrow(bit_depth);
155284
}
156285
}
157286

@@ -294,7 +423,7 @@ pub(crate) fn column_handler_fixed_point<
294423
}
295424

296425
while cx + 16 < total_width {
297-
convolve_column_handler_fixed_point_direct_buffer::<T, J, 16>(
426+
convolve_column_handler_fixed_point_direct_buffer16::<T, J, 16>(
298427
src, src_stride, dst, weight, bounds, bit_depth, cx,
299428
);
300429

0 commit comments

Comments
 (0)