Skip to content

Commit ff580d3

Browse files
authored
Merge pull request #66 from awxkee/dev
RDM bit-depth 16 improvements
2 parents c948569 + ab6ceee commit ff580d3

6 files changed

Lines changed: 207 additions & 133 deletions

File tree

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ workspace = { members = ["app", "wasm", "fuzz", "app/accelerate"], exclude = ["p
22

33
[package]
44
name = "pic-scale"
5-
version = "0.5.8"
5+
version = "0.5.9"
66
edition = "2021"
77
description = "High performance image scaling"
88
readme = "README.md"

app/src/main.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ fn resize_plane(
4646

4747
fn main() {
4848
// test_fast_image();
49-
let img = ImageReader::open("./assets/nasa-4928x3279-rgba.png")
49+
let img = ImageReader::open("./assets/asset_5.png")
5050
.unwrap()
5151
.decode()
5252
.unwrap();

src/neon/rgb_u16_hb.rs

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ unsafe fn conv_horiz_rgba_1_u16(
4444
vdup_n_u32(0),
4545
));
4646
let rgba_pixel = vld1_lane_u16::<2>(src_ptr.get_unchecked(2..).as_ptr() as *const _, vl0);
47-
let lo = vreinterpretq_s32_u32(vmovl_u16(rgba_pixel));
47+
let lo = vreinterpretq_s32_u32(vshll_n_u16::<6>(rgba_pixel));
4848
vqrdmlahq_s32(store, lo, w0)
4949
}
5050

@@ -68,8 +68,8 @@ unsafe fn conv_horiz_rgba_2_u16(
6868

6969
let hi = vext_u16::<3>(l1, vreinterpret_u16_u32(l2));
7070

71-
let acc = vqrdmlahq_s32(store, vreinterpretq_s32_u32(vmovl_u16(l1)), w1);
72-
vqrdmlahq_s32(acc, vreinterpretq_s32_u32(vmovl_u16(hi)), w0)
71+
let acc = vqrdmlahq_s32(store, vreinterpretq_s32_u32(vshll_n_u16::<6>(l1)), w1);
72+
vqrdmlahq_s32(acc, vreinterpretq_s32_u32(vshll_n_u16::<6>(hi)), w0)
7373
}
7474

7575
#[must_use]
@@ -90,12 +90,21 @@ unsafe fn conv_horiz_rgba_4_u16(
9090
let third = vext_u16::<2>(vget_high_u16(part0), part1);
9191
let fourth = vext_u16::<1>(part1, part1);
9292

93-
let acc = vqrdmlahq_laneq_s32::<3>(store, vreinterpretq_s32_u32(vmovl_u16(fourth)), weights);
94-
let acc = vqrdmlahq_laneq_s32::<2>(acc, vreinterpretq_s32_u32(vmovl_u16(third)), weights);
95-
let acc = vqrdmlahq_laneq_s32::<1>(acc, vreinterpretq_s32_u32(vmovl_u16(second)), weights);
93+
let acc = vqrdmlahq_laneq_s32::<3>(
94+
store,
95+
vreinterpretq_s32_u32(vshll_n_u16::<6>(fourth)),
96+
weights,
97+
);
98+
let acc =
99+
vqrdmlahq_laneq_s32::<2>(acc, vreinterpretq_s32_u32(vshll_n_u16::<6>(third)), weights);
100+
let acc = vqrdmlahq_laneq_s32::<1>(
101+
acc,
102+
vreinterpretq_s32_u32(vshll_n_u16::<6>(second)),
103+
weights,
104+
);
96105
vqrdmlahq_laneq_s32::<0>(
97106
acc,
98-
vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(part0))),
107+
vreinterpretq_s32_u32(vshll_n_u16::<6>(vget_low_u16(part0))),
99108
weights,
100109
)
101110
}
@@ -137,7 +146,7 @@ unsafe fn convolve_horizontal_rgb_neon_rows_4_hb_impl(
137146
) {
138147
unsafe {
139148
const CHANNELS: usize = 3;
140-
let init = vdupq_n_s32(0);
149+
let init = vdupq_n_s32((1 << 5) - 1);
141150

142151
let v_max_colors = vdup_n_u16(((1u32 << bit_depth) - 1) as u16);
143152

@@ -208,10 +217,10 @@ unsafe fn convolve_horizontal_rgb_neon_rows_4_hb_impl(
208217
jx += 1;
209218
}
210219

211-
let j0 = vqmovun_s32(store_0);
212-
let j1 = vqmovun_s32(store_1);
213-
let j2 = vqmovun_s32(store_2);
214-
let j3 = vqmovun_s32(store_3);
220+
let j0 = vqshrun_n_s32::<6>(store_0);
221+
let j1 = vqshrun_n_s32::<6>(store_1);
222+
let j2 = vqshrun_n_s32::<6>(store_2);
223+
let j3 = vqshrun_n_s32::<6>(store_3);
215224

216225
let store_16_0 = vmin_u16(j0, v_max_colors);
217226
let store_16_1 = vmin_u16(j1, v_max_colors);
@@ -260,7 +269,7 @@ unsafe fn convolve_horizontal_rgb_neon_u16_hb_impl(
260269
{
261270
let bounds_size = bounds.size;
262271
let mut jx = 0usize;
263-
let mut store = vdupq_n_s32(0);
272+
let mut store = vdupq_n_s32((1 << 5) - 1);
264273

265274
while jx + 4 < bounds_size {
266275
let w_ptr = weights.get_unchecked(jx..(jx + 4));
@@ -287,7 +296,7 @@ unsafe fn convolve_horizontal_rgb_neon_u16_hb_impl(
287296
jx += 1;
288297
}
289298

290-
let store_16_0 = vmin_u16(vqmovun_s32(store), v_max_colors);
299+
let store_16_0 = vmin_u16(vqshrun_n_s32::<6>(store), v_max_colors);
291300

292301
set_pixel(dst, store_16_0);
293302
}

src/neon/rgba_u16_hb.rs

Lines changed: 50 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ unsafe fn conv_horiz_rgba_1_u16(
4141
const COMPONENTS: usize = 4;
4242
let src_ptr = src.get_unchecked((start_x * COMPONENTS)..);
4343
let rgba_pixel = vld1_u16(src_ptr.as_ptr());
44-
let lo = vreinterpretq_s32_u32(vmovl_u16(rgba_pixel));
44+
let lo = vreinterpretq_s32_u32(vshll_n_u16::<6>(rgba_pixel));
4545
vqrdmlahq_s32(store, lo, w0)
4646
}
4747

@@ -59,10 +59,14 @@ unsafe fn conv_horiz_rgba_2_u16(
5959

6060
let rgb_pixel = vld1q_u16(src_ptr.as_ptr());
6161

62-
let acc = vqrdmlahq_s32(store, vreinterpretq_s32_u32(vmovl_high_u16(rgb_pixel)), w1);
62+
let acc = vqrdmlahq_s32(
63+
store,
64+
vreinterpretq_s32_u32(vshll_high_n_u16::<6>(rgb_pixel)),
65+
w1,
66+
);
6367
vqrdmlahq_s32(
6468
acc,
65-
vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(rgb_pixel))),
69+
vreinterpretq_s32_u32(vshll_n_u16::<6>(vget_low_u16(rgb_pixel))),
6670
w0,
6771
)
6872
}
@@ -83,16 +87,24 @@ unsafe fn conv_horiz_rgba_4_u16(
8387
let hi = rgba_pixel.1;
8488
let lo = rgba_pixel.0;
8589

86-
let acc = vqrdmlahq_laneq_s32::<3>(store, vreinterpretq_s32_u32(vmovl_high_u16(hi)), weights);
90+
let acc = vqrdmlahq_laneq_s32::<3>(
91+
store,
92+
vreinterpretq_s32_u32(vshll_high_n_u16::<6>(hi)),
93+
weights,
94+
);
8795
let acc = vqrdmlahq_laneq_s32::<2>(
8896
acc,
89-
vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(hi))),
97+
vreinterpretq_s32_u32(vshll_n_u16::<6>(vget_low_u16(hi))),
98+
weights,
99+
);
100+
let acc = vqrdmlahq_laneq_s32::<1>(
101+
acc,
102+
vreinterpretq_s32_u32(vshll_high_n_u16::<6>(lo)),
90103
weights,
91104
);
92-
let acc = vqrdmlahq_laneq_s32::<1>(acc, vreinterpretq_s32_u32(vmovl_high_u16(lo)), weights);
93105
vqrdmlahq_laneq_s32::<0>(
94106
acc,
95-
vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(lo))),
107+
vreinterpretq_s32_u32(vshll_n_u16::<6>(vget_low_u16(lo))),
96108
weights,
97109
)
98110
}
@@ -115,30 +127,45 @@ unsafe fn conv_horiz_rgba_8_u16(
115127
let hi1 = rgba_pixel.3;
116128
let lo1 = rgba_pixel.2;
117129

118-
let mut acc =
119-
vqrdmlahq_laneq_s32::<3>(store, vreinterpretq_s32_u32(vmovl_high_u16(hi0)), weights.0);
130+
let mut acc = vqrdmlahq_laneq_s32::<3>(
131+
store,
132+
vreinterpretq_s32_u32(vshll_high_n_u16::<6>(hi0)),
133+
weights.0,
134+
);
120135
acc = vqrdmlahq_laneq_s32::<2>(
121136
acc,
122-
vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(hi0))),
137+
vreinterpretq_s32_u32(vshll_n_u16::<6>(vget_low_u16(hi0))),
138+
weights.0,
139+
);
140+
acc = vqrdmlahq_laneq_s32::<1>(
141+
acc,
142+
vreinterpretq_s32_u32(vshll_high_n_u16::<6>(lo0)),
123143
weights.0,
124144
);
125-
acc = vqrdmlahq_laneq_s32::<1>(acc, vreinterpretq_s32_u32(vmovl_high_u16(lo0)), weights.0);
126145
acc = vqrdmlahq_laneq_s32::<0>(
127146
acc,
128-
vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(lo0))),
147+
vreinterpretq_s32_u32(vshll_n_u16::<6>(vget_low_u16(lo0))),
129148
weights.0,
130149
);
131150

132-
acc = vqrdmlahq_laneq_s32::<3>(acc, vreinterpretq_s32_u32(vmovl_high_u16(hi1)), weights.1);
151+
acc = vqrdmlahq_laneq_s32::<3>(
152+
acc,
153+
vreinterpretq_s32_u32(vshll_high_n_u16::<6>(hi1)),
154+
weights.1,
155+
);
133156
acc = vqrdmlahq_laneq_s32::<2>(
134157
acc,
135-
vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(hi1))),
158+
vreinterpretq_s32_u32(vshll_n_u16::<6>(vget_low_u16(hi1))),
159+
weights.1,
160+
);
161+
acc = vqrdmlahq_laneq_s32::<1>(
162+
acc,
163+
vreinterpretq_s32_u32(vshll_high_n_u16::<6>(lo1)),
136164
weights.1,
137165
);
138-
acc = vqrdmlahq_laneq_s32::<1>(acc, vreinterpretq_s32_u32(vmovl_high_u16(lo1)), weights.1);
139166
acc = vqrdmlahq_laneq_s32::<0>(
140167
acc,
141-
vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(lo1))),
168+
vreinterpretq_s32_u32(vshll_n_u16::<6>(vget_low_u16(lo1))),
142169
weights.1,
143170
);
144171
acc
@@ -175,7 +202,7 @@ unsafe fn convolve_horizontal_rgba_neon_rows_4_hb_impl(
175202
) {
176203
unsafe {
177204
const CHANNELS: usize = 4;
178-
let init = vdupq_n_s32(0);
205+
let init = vdupq_n_s32((1 << 5) - 1);
179206

180207
let v_max_colors = vdup_n_u16(((1u32 << bit_depth) - 1) as u16);
181208

@@ -260,10 +287,10 @@ unsafe fn convolve_horizontal_rgba_neon_rows_4_hb_impl(
260287
jx += 1;
261288
}
262289

263-
let j0 = vqmovun_s32(store_0);
264-
let j1 = vqmovun_s32(store_1);
265-
let j2 = vqmovun_s32(store_2);
266-
let j3 = vqmovun_s32(store_3);
290+
let j0 = vqshrun_n_s32::<6>(store_0);
291+
let j1 = vqshrun_n_s32::<6>(store_1);
292+
let j2 = vqshrun_n_s32::<6>(store_2);
293+
let j3 = vqshrun_n_s32::<6>(store_3);
267294

268295
let store_16_0 = vmin_u16(j0, v_max_colors);
269296
let store_16_1 = vmin_u16(j1, v_max_colors);
@@ -312,7 +339,7 @@ unsafe fn convolve_horizontal_rgba_neon_u16_hb_impl(
312339
{
313340
let bounds_size = bounds.size;
314341
let mut jx = 0usize;
315-
let mut store = vdupq_n_s32(0);
342+
let mut store = vdupq_n_s32((1 << 5) - 1);
316343

317344
while jx + 8 < bounds_size {
318345
let bounds_start = bounds.start + jx;
@@ -350,7 +377,7 @@ unsafe fn convolve_horizontal_rgba_neon_u16_hb_impl(
350377
jx += 1;
351378
}
352379

353-
let store_16_0 = vmin_u16(vqmovun_s32(store), v_max_colors);
380+
let store_16_0 = vmin_u16(vqshrun_n_s32::<6>(store), v_max_colors);
354381

355382
vst1_u16(dst.as_mut_ptr(), store_16_0);
356383
}

0 commit comments

Comments
 (0)