@@ -41,7 +41,7 @@ unsafe fn conv_horiz_rgba_1_u16(
4141 const COMPONENTS : usize = 4 ;
4242 let src_ptr = src. get_unchecked ( ( start_x * COMPONENTS ) ..) ;
4343 let rgba_pixel = vld1_u16 ( src_ptr. as_ptr ( ) ) ;
44- let lo = vreinterpretq_s32_u32 ( vmovl_u16 ( rgba_pixel) ) ;
44+ let lo = vreinterpretq_s32_u32 ( vshll_n_u16 :: < 6 > ( rgba_pixel) ) ;
4545 vqrdmlahq_s32 ( store, lo, w0)
4646}
4747
@@ -59,10 +59,14 @@ unsafe fn conv_horiz_rgba_2_u16(
5959
6060 let rgb_pixel = vld1q_u16 ( src_ptr. as_ptr ( ) ) ;
6161
62- let acc = vqrdmlahq_s32 ( store, vreinterpretq_s32_u32 ( vmovl_high_u16 ( rgb_pixel) ) , w1) ;
62+ let acc = vqrdmlahq_s32 (
63+ store,
64+ vreinterpretq_s32_u32 ( vshll_high_n_u16 :: < 6 > ( rgb_pixel) ) ,
65+ w1,
66+ ) ;
6367 vqrdmlahq_s32 (
6468 acc,
65- vreinterpretq_s32_u32 ( vmovl_u16 ( vget_low_u16 ( rgb_pixel) ) ) ,
69+ vreinterpretq_s32_u32 ( vshll_n_u16 :: < 6 > ( vget_low_u16 ( rgb_pixel) ) ) ,
6670 w0,
6771 )
6872}
@@ -83,16 +87,24 @@ unsafe fn conv_horiz_rgba_4_u16(
8387 let hi = rgba_pixel. 1 ;
8488 let lo = rgba_pixel. 0 ;
8589
86- let acc = vqrdmlahq_laneq_s32 :: < 3 > ( store, vreinterpretq_s32_u32 ( vmovl_high_u16 ( hi) ) , weights) ;
90+ let acc = vqrdmlahq_laneq_s32 :: < 3 > (
91+ store,
92+ vreinterpretq_s32_u32 ( vshll_high_n_u16 :: < 6 > ( hi) ) ,
93+ weights,
94+ ) ;
8795 let acc = vqrdmlahq_laneq_s32 :: < 2 > (
8896 acc,
89- vreinterpretq_s32_u32 ( vmovl_u16 ( vget_low_u16 ( hi) ) ) ,
97+ vreinterpretq_s32_u32 ( vshll_n_u16 :: < 6 > ( vget_low_u16 ( hi) ) ) ,
98+ weights,
99+ ) ;
100+ let acc = vqrdmlahq_laneq_s32 :: < 1 > (
101+ acc,
102+ vreinterpretq_s32_u32 ( vshll_high_n_u16 :: < 6 > ( lo) ) ,
90103 weights,
91104 ) ;
92- let acc = vqrdmlahq_laneq_s32 :: < 1 > ( acc, vreinterpretq_s32_u32 ( vmovl_high_u16 ( lo) ) , weights) ;
93105 vqrdmlahq_laneq_s32 :: < 0 > (
94106 acc,
95- vreinterpretq_s32_u32 ( vmovl_u16 ( vget_low_u16 ( lo) ) ) ,
107+ vreinterpretq_s32_u32 ( vshll_n_u16 :: < 6 > ( vget_low_u16 ( lo) ) ) ,
96108 weights,
97109 )
98110}
@@ -115,30 +127,45 @@ unsafe fn conv_horiz_rgba_8_u16(
115127 let hi1 = rgba_pixel. 3 ;
116128 let lo1 = rgba_pixel. 2 ;
117129
118- let mut acc =
119- vqrdmlahq_laneq_s32 :: < 3 > ( store, vreinterpretq_s32_u32 ( vmovl_high_u16 ( hi0) ) , weights. 0 ) ;
130+ let mut acc = vqrdmlahq_laneq_s32 :: < 3 > (
131+ store,
132+ vreinterpretq_s32_u32 ( vshll_high_n_u16 :: < 6 > ( hi0) ) ,
133+ weights. 0 ,
134+ ) ;
120135 acc = vqrdmlahq_laneq_s32 :: < 2 > (
121136 acc,
122- vreinterpretq_s32_u32 ( vmovl_u16 ( vget_low_u16 ( hi0) ) ) ,
137+ vreinterpretq_s32_u32 ( vshll_n_u16 :: < 6 > ( vget_low_u16 ( hi0) ) ) ,
138+ weights. 0 ,
139+ ) ;
140+ acc = vqrdmlahq_laneq_s32 :: < 1 > (
141+ acc,
142+ vreinterpretq_s32_u32 ( vshll_high_n_u16 :: < 6 > ( lo0) ) ,
123143 weights. 0 ,
124144 ) ;
125- acc = vqrdmlahq_laneq_s32 :: < 1 > ( acc, vreinterpretq_s32_u32 ( vmovl_high_u16 ( lo0) ) , weights. 0 ) ;
126145 acc = vqrdmlahq_laneq_s32 :: < 0 > (
127146 acc,
128- vreinterpretq_s32_u32 ( vmovl_u16 ( vget_low_u16 ( lo0) ) ) ,
147+ vreinterpretq_s32_u32 ( vshll_n_u16 :: < 6 > ( vget_low_u16 ( lo0) ) ) ,
129148 weights. 0 ,
130149 ) ;
131150
132- acc = vqrdmlahq_laneq_s32 :: < 3 > ( acc, vreinterpretq_s32_u32 ( vmovl_high_u16 ( hi1) ) , weights. 1 ) ;
151+ acc = vqrdmlahq_laneq_s32 :: < 3 > (
152+ acc,
153+ vreinterpretq_s32_u32 ( vshll_high_n_u16 :: < 6 > ( hi1) ) ,
154+ weights. 1 ,
155+ ) ;
133156 acc = vqrdmlahq_laneq_s32 :: < 2 > (
134157 acc,
135- vreinterpretq_s32_u32 ( vmovl_u16 ( vget_low_u16 ( hi1) ) ) ,
158+ vreinterpretq_s32_u32 ( vshll_n_u16 :: < 6 > ( vget_low_u16 ( hi1) ) ) ,
159+ weights. 1 ,
160+ ) ;
161+ acc = vqrdmlahq_laneq_s32 :: < 1 > (
162+ acc,
163+ vreinterpretq_s32_u32 ( vshll_high_n_u16 :: < 6 > ( lo1) ) ,
136164 weights. 1 ,
137165 ) ;
138- acc = vqrdmlahq_laneq_s32 :: < 1 > ( acc, vreinterpretq_s32_u32 ( vmovl_high_u16 ( lo1) ) , weights. 1 ) ;
139166 acc = vqrdmlahq_laneq_s32 :: < 0 > (
140167 acc,
141- vreinterpretq_s32_u32 ( vmovl_u16 ( vget_low_u16 ( lo1) ) ) ,
168+ vreinterpretq_s32_u32 ( vshll_n_u16 :: < 6 > ( vget_low_u16 ( lo1) ) ) ,
142169 weights. 1 ,
143170 ) ;
144171 acc
@@ -175,7 +202,7 @@ unsafe fn convolve_horizontal_rgba_neon_rows_4_hb_impl(
175202) {
176203 unsafe {
177204 const CHANNELS : usize = 4 ;
178- let init = vdupq_n_s32 ( 0 ) ;
205+ let init = vdupq_n_s32 ( ( 1 << 5 ) - 1 ) ;
179206
180207 let v_max_colors = vdup_n_u16 ( ( ( 1u32 << bit_depth) - 1 ) as u16 ) ;
181208
@@ -260,10 +287,10 @@ unsafe fn convolve_horizontal_rgba_neon_rows_4_hb_impl(
260287 jx += 1 ;
261288 }
262289
263- let j0 = vqmovun_s32 ( store_0) ;
264- let j1 = vqmovun_s32 ( store_1) ;
265- let j2 = vqmovun_s32 ( store_2) ;
266- let j3 = vqmovun_s32 ( store_3) ;
290+ let j0 = vqshrun_n_s32 :: < 6 > ( store_0) ;
291+ let j1 = vqshrun_n_s32 :: < 6 > ( store_1) ;
292+ let j2 = vqshrun_n_s32 :: < 6 > ( store_2) ;
293+ let j3 = vqshrun_n_s32 :: < 6 > ( store_3) ;
267294
268295 let store_16_0 = vmin_u16 ( j0, v_max_colors) ;
269296 let store_16_1 = vmin_u16 ( j1, v_max_colors) ;
@@ -312,7 +339,7 @@ unsafe fn convolve_horizontal_rgba_neon_u16_hb_impl(
312339 {
313340 let bounds_size = bounds. size ;
314341 let mut jx = 0usize ;
315- let mut store = vdupq_n_s32 ( 0 ) ;
342+ let mut store = vdupq_n_s32 ( ( 1 << 5 ) - 1 ) ;
316343
317344 while jx + 8 < bounds_size {
318345 let bounds_start = bounds. start + jx;
@@ -350,7 +377,7 @@ unsafe fn convolve_horizontal_rgba_neon_u16_hb_impl(
350377 jx += 1 ;
351378 }
352379
353- let store_16_0 = vmin_u16 ( vqmovun_s32 ( store) , v_max_colors) ;
380+ let store_16_0 = vmin_u16 ( vqshrun_n_s32 :: < 6 > ( store) , v_max_colors) ;
354381
355382 vst1_u16 ( dst. as_mut_ptr ( ) , store_16_0) ;
356383 }
0 commit comments