Skip to content

Commit ebd256f

Browse files
authored
Merge pull request #125 from awxkee/v703
RGB F32 AVX improvements
2 parents 91205c0 + a36c763 commit ebd256f

9 files changed

Lines changed: 96 additions & 100 deletions

File tree

app/src/main.rs

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -34,29 +34,29 @@ fn main() {
3434
.unwrap();
3535
// img.save("top_right.tga").unwrap();
3636
let dimensions = img.dimensions();
37-
let transient = img.to_luma_alpha8();
37+
let transient = img.to_rgb32f();
3838
let mut bytes = transient.to_vec();
3939

4040
// img.resize_exact(dimensions.0 as u32 / 4, dimensions.1 as u32 / 4, image::imageops::FilterType::Lanczos3).save("resized.png").unwrap();
4141

4242
let mut scaler = Scaler::new(ResamplingFunction::Lanczos3)
4343
.set_threading_policy(ThreadingPolicy::Single)
44-
.set_workload_strategy(WorkloadStrategy::PreferQuality);
44+
.set_workload_strategy(WorkloadStrategy::PreferSpeed);
4545
// scaler.set_workload_strategy(WorkloadStrategy::PreferSpeed);
4646

4747
let mut t_size = ImageSize::new(dimensions.0 as usize, dimensions.1 as usize) / 4;
4848
t_size.height += 1;
4949
let resizing_plan = scaler
50-
.plan_cbcr_resampling(
50+
.plan_rgb_resampling_f32(
5151
ImageSize::new(dimensions.0 as usize, dimensions.1 as usize),
5252
t_size,
5353
)
5454
.unwrap();
5555

5656
let mut store =
57-
CbCr8ImageStore::from_slice(&bytes, dimensions.0 as usize, dimensions.1 as usize).unwrap();
57+
RgbF32ImageStore::from_slice(&bytes, dimensions.0 as usize, dimensions.1 as usize).unwrap();
5858
store.bit_depth = 8;
59-
let mut dst_store = CbCr8ImageStoreMut::alloc_with_depth(
59+
let mut dst_store = RgbF32ImageStoreMut::alloc_with_depth(
6060
dimensions.0 as usize / 4,
6161
dimensions.1 as usize / 4 + 1,
6262
8,
@@ -113,9 +113,9 @@ fn main() {
113113
let dst = dst_store
114114
.as_bytes()
115115
.iter()
116-
.map(|&x| x)
116+
// .map(|&x| x)
117117
// .map(|&x| ((x >> 8) as u8).min(255))
118-
// .map(|&x| (x as f32 * 255.).round() as u8)
118+
.map(|&x| (x as f32 * 255.).round() as u8)
119119
.collect::<Vec<_>>();
120120

121121
if dst_store.channels == 4 {

src/avx2/check_alpha.rs

Lines changed: 2 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -39,11 +39,7 @@ pub(crate) fn avx_has_non_constant_cap_alpha_rgba8(
3939
}
4040

4141
#[target_feature(enable = "avx2")]
42-
unsafe fn avx_has_non_constant_cap_alpha_rgba8_impl(
43-
store: &[u8],
44-
width: usize,
45-
stride: usize,
46-
) -> bool {
42+
fn avx_has_non_constant_cap_alpha_rgba8_impl(store: &[u8], width: usize, stride: usize) -> bool {
4743
unsafe {
4844
if store.is_empty() {
4945
return true;
@@ -138,11 +134,7 @@ pub(crate) fn avx_has_non_constant_cap_alpha_rgba16(
138134
}
139135

140136
#[target_feature(enable = "avx2")]
141-
unsafe fn avx_has_non_constant_cap_alpha_rgba16_impl(
142-
store: &[u16],
143-
width: usize,
144-
stride: usize,
145-
) -> bool {
137+
fn avx_has_non_constant_cap_alpha_rgba16_impl(store: &[u16], width: usize, stride: usize) -> bool {
146138
unsafe {
147139
if store.is_empty() {
148140
return true;

src/avx2/horizontal_ar30.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -151,7 +151,7 @@ impl<const AR_TYPE: usize, const AR_ORDER: usize> Row4ExecutionUnit<AR_TYPE, AR_
151151

152152
#[inline]
153153
#[target_feature(enable = "avx2")]
154-
unsafe fn conv_horiz_rgba_1_u8_i16(
154+
fn conv_horiz_rgba_1_u8_i16(
155155
&self,
156156
start_x: usize,
157157
src0: &[u8],

src/avx2/mod.rs

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -82,7 +82,8 @@ pub(crate) use plane_f32_f64::{
8282
convolve_hor_plane_avx_row_one_f32_f64, convolve_hor_plane_avx_rows_4_f32_f64,
8383
};
8484
pub(crate) use plane_u16::{
85-
convolve_horizontal_plane_avx_rows_4_u16_f, convolve_horizontal_plane_avx_u16_row_f,
85+
convolve_horizontal_plane_avx_rows_4_u16_default, convolve_horizontal_plane_avx_rows_4_u16_fma,
86+
convolve_horizontal_plane_avx_u16_row_default, convolve_horizontal_plane_avx_u16_row_fma,
8687
};
8788
pub(crate) use plane_u16_lb::{
8889
convolve_horizontal_plane_avx_rows_4_u16, convolve_horizontal_plane_avx_u16lp_row,

src/avx2/plane_u16.rs

Lines changed: 44 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -110,7 +110,7 @@ fn conv_horiz_rgba_8_u16<const FMA: bool>(
110110
}
111111
}
112112

113-
pub(crate) fn convolve_horizontal_plane_avx_rows_4_u16_f(
113+
pub(crate) fn convolve_horizontal_plane_avx_rows_4_u16_default(
114114
src: &[u16],
115115
src_stride: usize,
116116
dst: &mut [u16],
@@ -119,25 +119,34 @@ pub(crate) fn convolve_horizontal_plane_avx_rows_4_u16_f(
119119
bit_depth: u32,
120120
) {
121121
unsafe {
122-
if std::arch::is_x86_feature_detected!("fma") {
123-
convolve_horizontal_plane_avx_rows_4_u16_fma(
124-
src,
125-
src_stride,
126-
dst,
127-
dst_stride,
128-
filter_weights,
129-
bit_depth,
130-
);
131-
} else {
132-
convolve_horizontal_plane_avx_rows_4_u16_def(
133-
src,
134-
src_stride,
135-
dst,
136-
dst_stride,
137-
filter_weights,
138-
bit_depth,
139-
);
140-
}
122+
convolve_horizontal_plane_avx_rows_4_u16_def(
123+
src,
124+
src_stride,
125+
dst,
126+
dst_stride,
127+
filter_weights,
128+
bit_depth,
129+
);
130+
}
131+
}
132+
133+
pub(crate) fn convolve_horizontal_plane_avx_rows_4_u16_fma(
134+
src: &[u16],
135+
src_stride: usize,
136+
dst: &mut [u16],
137+
dst_stride: usize,
138+
filter_weights: &FilterWeights<f32>,
139+
bit_depth: u32,
140+
) {
141+
unsafe {
142+
convolve_horizontal_plane_avx_rows_4_u16_fma_impl(
143+
src,
144+
src_stride,
145+
dst,
146+
dst_stride,
147+
filter_weights,
148+
bit_depth,
149+
);
141150
}
142151
}
143152

@@ -157,7 +166,7 @@ fn convolve_horizontal_plane_avx_rows_4_u16_def(
157166

158167
#[target_feature(enable = "avx2", enable = "fma")]
159168
/// This inlining is required to activate all features for runtime dispatch.
160-
fn convolve_horizontal_plane_avx_rows_4_u16_fma(
169+
fn convolve_horizontal_plane_avx_rows_4_u16_fma_impl(
161170
src: &[u16],
162171
src_stride: usize,
163172
dst: &mut [u16],
@@ -282,18 +291,25 @@ impl<const FMA: bool> Row4ExecutionHandler<FMA> {
282291
}
283292
}
284293

285-
pub(crate) fn convolve_horizontal_plane_avx_u16_row_f(
294+
pub(crate) fn convolve_horizontal_plane_avx_u16_row_fma(
286295
src: &[u16],
287296
dst: &mut [u16],
288297
filter_weights: &FilterWeights<f32>,
289298
bit_depth: u32,
290299
) {
291300
unsafe {
292-
if std::arch::is_x86_feature_detected!("fma") {
293-
convolve_horizontal_plane_avx_u16_row_fma(src, dst, filter_weights, bit_depth);
294-
} else {
295-
convolve_horizontal_plane_avx_u16_row_def(src, dst, filter_weights, bit_depth);
296-
}
301+
convolve_horizontal_plane_avx_u16_row_fma_impl(src, dst, filter_weights, bit_depth);
302+
}
303+
}
304+
305+
pub(crate) fn convolve_horizontal_plane_avx_u16_row_default(
306+
src: &[u16],
307+
dst: &mut [u16],
308+
filter_weights: &FilterWeights<f32>,
309+
bit_depth: u32,
310+
) {
311+
unsafe {
312+
convolve_horizontal_plane_avx_u16_row_def(src, dst, filter_weights, bit_depth);
297313
}
298314
}
299315

@@ -311,7 +327,7 @@ fn convolve_horizontal_plane_avx_u16_row_def(
311327

312328
#[target_feature(enable = "avx2", enable = "fma")]
313329
/// This inlining is required to activate all features for runtime dispatch.
314-
fn convolve_horizontal_plane_avx_u16_row_fma(
330+
fn convolve_horizontal_plane_avx_u16_row_fma_impl(
315331
src: &[u16],
316332
dst: &mut [u16],
317333
filter_weights: &FilterWeights<f32>,

src/avx2/plane_u16_lb.rs

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -252,7 +252,6 @@ pub(crate) fn convolve_horizontal_plane_avx_u16lp_row(
252252
) {
253253
unsafe {
254254
#[cfg(feature = "avx512")]
255-
#[allow(clippy::incompatible_msrv)]
256255
if std::arch::is_x86_feature_detected!("avxvnni") {
257256
return convolve_horizontal_plane_avx_u16_row_vn(src, dst, filter_weights, bit_depth);
258257
}

src/avx2/rgb_f32.rs

Lines changed: 12 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -48,12 +48,8 @@ fn ch_parts_4_rgb_f32_sse<const FMA: bool>(
4848
let rgb_pixel_0 = _mm_loadu_ps(src_ptr.as_ptr());
4949
let rgb_pixel_1 = _mm_loadu_ps(src_ptr.get_unchecked(3..).as_ptr());
5050
let rgb_pixel_2 = _mm_loadu_ps(src_ptr.get_unchecked(6..).as_ptr());
51-
let rgb_pixel_3 = _mm_setr_ps(
52-
*src_ptr.get_unchecked(9),
53-
*src_ptr.get_unchecked(10),
54-
*src_ptr.get_unchecked(11),
55-
0.,
56-
);
51+
let mut rgb_pixel_3 = _mm_loadu_ps(src_ptr.get_unchecked(8..).as_ptr());
52+
rgb_pixel_3 = _mm_shuffle_ps::<{ shuffle(0, 3, 2, 1) }>(rgb_pixel_3, rgb_pixel_3);
5753

5854
let acc = _mm_prefer_fma_ps::<FMA>(store_0, rgb_pixel_0, weight0);
5955
let acc = _mm_prefer_fma_ps::<FMA>(acc, rgb_pixel_1, weight1);
@@ -81,22 +77,14 @@ fn ch_parts_4_rgb_f32_avx<const FMA: bool>(
8177
let rgb_pixel_0_0 = _mm_loadu_ps(src_ptr0.as_ptr());
8278
let rgb_pixel_0_1 = _mm_loadu_ps(src_ptr0.get_unchecked(3..).as_ptr());
8379
let rgb_pixel_0_2 = _mm_loadu_ps(src_ptr0.get_unchecked(6..).as_ptr());
84-
let rgb_pixel_0_3 = _mm_setr_ps(
85-
*src_ptr0.get_unchecked(9),
86-
*src_ptr0.get_unchecked(10),
87-
*src_ptr0.get_unchecked(11),
88-
0.,
89-
);
80+
let mut rgb_pixel_0_3 = _mm_loadu_ps(src_ptr0.get_unchecked(8..).as_ptr());
81+
rgb_pixel_0_3 = _mm_shuffle_ps::<{ shuffle(0, 3, 2, 1) }>(rgb_pixel_0_3, rgb_pixel_0_3);
9082

9183
let rgb_pixel_1_0 = _mm_loadu_ps(src_ptr1.as_ptr());
9284
let rgb_pixel_1_1 = _mm_loadu_ps(src_ptr1.get_unchecked(3..).as_ptr());
9385
let rgb_pixel_1_2 = _mm_loadu_ps(src_ptr1.get_unchecked(6..).as_ptr());
94-
let rgb_pixel_1_3 = _mm_setr_ps(
95-
*src_ptr1.get_unchecked(9),
96-
*src_ptr1.get_unchecked(10),
97-
*src_ptr1.get_unchecked(11),
98-
0.,
99-
);
86+
let mut rgb_pixel_1_3 = _mm_loadu_ps(src_ptr1.get_unchecked(8..).as_ptr());
87+
rgb_pixel_1_3 = _mm_shuffle_ps::<{ shuffle(0, 3, 2, 1) }>(rgb_pixel_1_3, rgb_pixel_1_3);
10088

10189
let rgb_pixel_0 =
10290
_mm256_insertf128_ps::<1>(_mm256_castps128_ps256(rgb_pixel_0_0), rgb_pixel_1_0);
@@ -132,20 +120,12 @@ fn ch_parts_2_rgb_f32_avx<const FMA: bool>(
132120
let orig1 = _mm_loadu_ps(src_ptr1.as_ptr());
133121

134122
let rgb_pixel_0_0 = orig0;
135-
let rgb_pixel_0_1 = _mm_setr_ps(
136-
*src_ptr0.get_unchecked(3),
137-
*src_ptr0.get_unchecked(4),
138-
*src_ptr0.get_unchecked(5),
139-
0.,
140-
);
123+
let mut rgb_pixel_0_1 = _mm_loadu_ps(src_ptr0.get_unchecked(2..).as_ptr());
124+
rgb_pixel_0_1 = _mm_shuffle_ps::<{ shuffle(0, 3, 2, 1) }>(rgb_pixel_0_1, rgb_pixel_0_1);
141125

142126
let rgb_pixel_1_0 = orig1;
143-
let rgb_pixel_1_1 = _mm_setr_ps(
144-
*src_ptr1.get_unchecked(3),
145-
*src_ptr1.get_unchecked(4),
146-
*src_ptr1.get_unchecked(5),
147-
0.,
148-
);
127+
let mut rgb_pixel_1_1 = _mm_loadu_ps(src_ptr1.get_unchecked(2..).as_ptr());
128+
rgb_pixel_1_1 = _mm_shuffle_ps::<{ shuffle(0, 3, 2, 1) }>(rgb_pixel_1_1, rgb_pixel_1_1);
149129

150130
let rgb_pixel_0 =
151131
_mm256_insertf128_ps::<1>(_mm256_castps128_ps256(rgb_pixel_0_0), rgb_pixel_1_0);
@@ -172,12 +152,8 @@ fn ch_parts_2_rgb_f32<const FMA: bool>(
172152

173153
let orig1 = _mm_loadu_ps(src_ptr.as_ptr());
174154
let rgb_pixel_0 = orig1;
175-
let rgb_pixel_1 = _mm_setr_ps(
176-
*src_ptr.get_unchecked(3),
177-
*src_ptr.get_unchecked(4),
178-
*src_ptr.get_unchecked(5),
179-
0.,
180-
);
155+
let mut rgb_pixel_1 = _mm_loadu_ps(src_ptr.get_unchecked(2..).as_ptr());
156+
rgb_pixel_1 = _mm_shuffle_ps::<{ shuffle(0, 3, 2, 1) }>(rgb_pixel_1, rgb_pixel_1);
181157

182158
let mut acc = _mm_prefer_fma_ps::<FMA>(store_0, rgb_pixel_0, weight0);
183159
acc = _mm_prefer_fma_ps::<FMA>(acc, rgb_pixel_1, weight1);

src/handler_provider.rs

Lines changed: 24 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -252,8 +252,17 @@ impl RowHandlerFloatingPoint<u16, f32, f32> for u16 {
252252
bit_depth,
253253
);
254254
} else if CN == 1 {
255-
use crate::avx2::convolve_horizontal_plane_avx_u16_row_f;
256-
return convolve_horizontal_plane_avx_u16_row_f(
255+
if has_fma {
256+
use crate::avx2::convolve_horizontal_plane_avx_u16_row_fma;
257+
return convolve_horizontal_plane_avx_u16_row_fma(
258+
src,
259+
dst,
260+
filter_weights,
261+
bit_depth,
262+
);
263+
}
264+
use crate::avx2::convolve_horizontal_plane_avx_u16_row_default;
265+
return convolve_horizontal_plane_avx_u16_row_default(
257266
src,
258267
dst,
259268
filter_weights,
@@ -350,8 +359,19 @@ impl RowHandlerFloatingPoint<u16, f32, f32> for u16 {
350359
bit_depth,
351360
);
352361
} else if CN == 1 {
353-
use crate::avx2::convolve_horizontal_plane_avx_rows_4_u16_f;
354-
return convolve_horizontal_plane_avx_rows_4_u16_f(
362+
if has_fma {
363+
use crate::avx2::convolve_horizontal_plane_avx_rows_4_u16_fma;
364+
return convolve_horizontal_plane_avx_rows_4_u16_fma(
365+
src,
366+
src_stride,
367+
dst,
368+
dst_stride,
369+
filter_weights,
370+
bit_depth,
371+
);
372+
}
373+
use crate::avx2::convolve_horizontal_plane_avx_rows_4_u16_default;
374+
return convolve_horizontal_plane_avx_rows_4_u16_default(
355375
src,
356376
src_stride,
357377
dst,

src/sse/rgb_f32.rs

Lines changed: 4 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -51,12 +51,8 @@ fn convolve_horizontal_parts_4_rgb_f32<const FMA: bool>(
5151
let rgb_pixel_0 = _mm_loadu_ps(src_ptr);
5252
let rgb_pixel_1 = _mm_loadu_ps(src_ptr.add(3));
5353
let rgb_pixel_2 = _mm_loadu_ps(src_ptr.add(6));
54-
let rgb_pixel_3 = _mm_setr_ps(
55-
src_ptr.add(9).read_unaligned(),
56-
src_ptr.add(10).read_unaligned(),
57-
src_ptr.add(11).read_unaligned(),
58-
0f32,
59-
);
54+
let mut rgb_pixel_3 = _mm_loadu_ps(src_ptr.add(8));
55+
rgb_pixel_3 = _mm_shuffle_ps::<{ shuffle(0, 3, 2, 1) }>(rgb_pixel_3, rgb_pixel_3);
6056

6157
let acc = _mm_prefer_fma_ps::<FMA>(store_0, rgb_pixel_0, weight0);
6258
let acc = _mm_prefer_fma_ps::<FMA>(acc, rgb_pixel_1, weight1);
@@ -79,12 +75,8 @@ fn convolve_horizontal_parts_2_rgb_f32<const FMA: bool>(
7975

8076
let orig1 = _mm_loadu_ps(src_ptr);
8177
let rgb_pixel_0 = orig1;
82-
let rgb_pixel_1 = _mm_setr_ps(
83-
src_ptr.add(3).read_unaligned(),
84-
src_ptr.add(4).read_unaligned(),
85-
src_ptr.add(5).read_unaligned(),
86-
0f32,
87-
);
78+
let mut rgb_pixel_1 = _mm_loadu_ps(src_ptr.add(2));
79+
rgb_pixel_1 = _mm_shuffle_ps::<{ shuffle(0, 3, 2, 1) }>(rgb_pixel_1, rgb_pixel_1);
8880

8981
let mut acc = _mm_prefer_fma_ps::<FMA>(store_0, rgb_pixel_0, weight0);
9082
acc = _mm_prefer_fma_ps::<FMA>(acc, rgb_pixel_1, weight1);

0 commit comments

Comments
 (0)