4747namespace ojph {
4848 namespace local {
4949
50+ // ////////////////////////////////////////////////////////////////////////
51+ static inline
52+ v128_t ojph_convert_float_to_i32 (v128_t a, v128_t zero, v128_t half)
53+ { // We implement ojph_round, which is
54+ // val + (val >= 0.0f ? 0.5f : -0.5f), where val is float
55+ v128_t c = wasm_f32x4_ge (a, zero); // greater or equal to zero
56+ v128_t p = wasm_f32x4_add (a, half); // for positive, add half
57+ v128_t n = wasm_f32x4_sub (a, half); // for negative, subtract half
58+ v128_t d = wasm_v128_and (c, p); // keep positive only
59+ v128_t e = wasm_v128_andnot (n, c); // keep negative only
60+ v128_t v = wasm_v128_or (d, e); // combine
61+ return wasm_i32x4_trunc_sat_f32x4 (v);// truncate (towards 0)
62+ }
63+
5064 // ////////////////////////////////////////////////////////////////////////
5165 void wasm_rev_convert (const line_buf *src_line,
5266 const ui32 src_line_offset,
@@ -129,7 +143,7 @@ namespace ojph {
129143 v128_t c = wasm_i32x4_lt (s, zero); // 0xFFFFFFFF for -ve value
130144 v128_t v_m_sh = wasm_i32x4_sub (sh, s); // - shift - value
131145 v_m_sh = wasm_v128_and (c, v_m_sh); // keep only - shift - value
132- s = wasm_v128_andnot (c, s ); // keep only +ve or 0
146+ s = wasm_v128_andnot (s, c ); // keep only +ve or 0
133147 s = wasm_v128_or (s, v_m_sh); // combine
134148 wasm_v128_store (dp, s);
135149 }
@@ -149,7 +163,7 @@ namespace ojph {
149163 c = wasm_i64x2_lt (u, zero); // 64b -1 for -ve value
150164 v_m_sh = wasm_i64x2_sub (sh, u); // - shift - value
151165 v_m_sh = wasm_v128_and (c, v_m_sh); // keep only - shift - value
152- u = wasm_v128_andnot (c, u ); // keep only +ve or 0
166+ u = wasm_v128_andnot (u, c ); // keep only +ve or 0
153167 u = wasm_v128_or (u, v_m_sh); // combine
154168
155169 wasm_v128_store (dp, u);
@@ -158,7 +172,7 @@ namespace ojph {
158172 c = wasm_i64x2_lt (u, zero); // 64b -1 for -ve value
159173 v_m_sh = wasm_i64x2_sub (sh, u); // - shift - value
160174 v_m_sh = wasm_v128_and (c, v_m_sh); // keep only - shift - value
161- u = wasm_v128_andnot (c, u ); // keep only +ve or 0
175+ u = wasm_v128_andnot (u, c ); // keep only +ve or 0
162176 u = wasm_v128_or (u, v_m_sh); // combine
163177
164178 wasm_v128_store (dp + 2 , u);
@@ -182,14 +196,14 @@ namespace ojph {
182196 m = wasm_i64x2_lt (s, zero); // 64b -1 for -ve value
183197 tm = wasm_i64x2_sub (sh, s); // - shift - value
184198 n = wasm_v128_and (m, tm); // -ve
185- p = wasm_v128_andnot (m, s ); // +ve
199+ p = wasm_v128_andnot (s, m ); // +ve
186200 t0 = wasm_v128_or (n, p);
187201
188202 s = wasm_v128_load (sp + 2 );
189203 m = wasm_i64x2_lt (s, zero); // 64b -1 for -ve value
190204 tm = wasm_i64x2_sub (sh, s); // - shift - value
191205 n = wasm_v128_and (m, tm); // -ve
192- p = wasm_v128_andnot (m, s ); // +ve
206+ p = wasm_v128_andnot (s, m ); // +ve
193207 t1 = wasm_v128_or (n, p);
194208
195209 t0 = wasm_i32x4_shuffle (t0, t1, 0 , 2 , 4 + 0 , 4 + 2 );
@@ -232,32 +246,32 @@ namespace ojph {
232246 void wasm_cnvrt_float_to_si32_shftd (const float *sp, si32 *dp, float mul,
233247 ui32 width)
234248 {
235- // rounding mode is always set to _MM_ROUND_NEAREST
236- v128_t shift = wasm_f32x4_splat (0 .5f );
249+ const v128_t zero = wasm_f32x4_splat ( 0 . 0f );
250+ const v128_t half = wasm_f32x4_splat (0 .5f );
237251 v128_t m = wasm_f32x4_splat (mul);
238252 for (int i = (width + 3 ) >> 2 ; i > 0 ; --i, sp+=4 , dp+=4 )
239253 {
240254 v128_t t = wasm_v128_load (sp);
241- v128_t s = wasm_f32x4_add (t, shift );
255+ v128_t s = wasm_f32x4_add (t, half );
242256 s = wasm_f32x4_mul (s, m);
243- s = wasm_f32x4_add (s, shift ); // + 0.5 and followed by floor next
244- wasm_v128_store (dp, wasm_i32x4_trunc_sat_f32x4 (s ));
257+ s = wasm_f32x4_add (s, half ); // + 0.5 and followed by floor next
258+ wasm_v128_store (dp, ojph_convert_float_to_i32 (s, zero, half ));
245259 }
246260 }
247261
248262 // ////////////////////////////////////////////////////////////////////////
249263 void wasm_cnvrt_float_to_si32 (const float *sp, si32 *dp, float mul,
250264 ui32 width)
251265 {
252- // rounding mode is always set to _MM_ROUND_NEAREST
253- v128_t shift = wasm_f32x4_splat (0 .5f );
266+ const v128_t zero = wasm_f32x4_splat ( 0 . 0f );
267+ const v128_t half = wasm_f32x4_splat (0 .5f );
254268 v128_t m = wasm_f32x4_splat (mul);
255269 for (int i = (width + 3 ) >> 2 ; i > 0 ; --i, sp+=4 , dp+=4 )
256270 {
257271 v128_t t = wasm_v128_load (sp);
258272 v128_t s = wasm_f32x4_mul (t, m);
259- s = wasm_f32x4_add (s, shift ); // + 0.5 and followed by floor next
260- wasm_v128_store (dp, wasm_i32x4_trunc_sat_f32x4 (s ));
273+ s = wasm_f32x4_add (s, half ); // + 0.5 and followed by floor next
274+ wasm_v128_store (dp, ojph_convert_float_to_i32 (s, zero, half ));
261275 }
262276 }
263277
@@ -267,7 +281,7 @@ namespace ojph {
267281 {
268282 v128_t c = wasm_i32x4_ge (x, y); // 0xFFFFFFFF for x >= y
269283 v128_t d = wasm_v128_and (c, a); // keep only a, where x >= y
270- v128_t e = wasm_v128_andnot (c, b ); // keep only b, where x < y
284+ v128_t e = wasm_v128_andnot (b, c ); // keep only b, where x < y
271285 return wasm_v128_or (d, e); // combine
272286 }
273287
@@ -277,7 +291,7 @@ namespace ojph {
277291 {
278292 v128_t c = wasm_i32x4_lt (x, y); // 0xFFFFFFFF for x < y
279293 v128_t d = wasm_v128_and (c, a); // keep only a, where x < y
280- v128_t e = wasm_v128_andnot (c, b ); // keep only b, where x >= y
294+ v128_t e = wasm_v128_andnot (b, c ); // keep only b, where x >= y
281295 return wasm_v128_or (d, e); // combine
282296 }
283297
@@ -291,8 +305,6 @@ namespace ojph {
291305 (dst_line->flags & line_buf::LFT_32BIT) &&
292306 (dst_line->flags & line_buf::LFT_INTEGER));
293307
294- // rounding mode is always set to _MM_ROUND_NEAREST
295-
296308 const float * sp = src_line->f32 ;
297309 si32* dp = dst_line->i32 + dst_line_offset;
298310 if (bit_depth <= 30 )
@@ -306,34 +318,37 @@ namespace ojph {
306318
307319 if (is_signed)
308320 {
309- v128_t zero = wasm_i32x4_splat (0 );
321+ const v128_t zero = wasm_f32x4_splat (0 .0f );
322+ const v128_t half = wasm_f32x4_splat (0 .5f );
310323 v128_t bias = wasm_i32x4_splat (-((1 << (bit_depth - 1 )) + 1 ));
311324 for (ui32 i = width; i > 0 ; i -= 4 , sp += 4 , dp += 4 )
312325 {
313326 v128_t t = wasm_v128_load (sp);
314327 t = wasm_f32x4_mul (t, mul);
315- v128_t u = wasm_i32x4_trunc_sat_f32x4 (t );
328+ v128_t u = ojph_convert_float_to_i32 (t, zero, half );
316329 u = wasm_i32x4_max (u, lower_limit);
317330 u = wasm_i32x4_min (u, upper_limit);
318331
319332 v128_t c = wasm_i32x4_gt (zero, u); // 0xFFFFFFFF for -ve value
320333 v128_t neg = wasm_i32x4_sub (bias, u); // -bias -value
321334 neg = wasm_v128_and (c, neg); // keep only - bias - value
322- v128_t v = wasm_v128_andnot (c, u ); // keep only +ve or 0
335+ v128_t v = wasm_v128_andnot (u, c ); // keep only +ve or 0
323336 v = wasm_v128_or (neg, v); // combine
324337 wasm_v128_store (dp, v);
325338 }
326339 }
327340 else
328341 {
329- v128_t half = wasm_i32x4_splat (-(1 << (bit_depth - 1 )));
342+ const v128_t zero = wasm_f32x4_splat (0 .0f );
343+ const v128_t half = wasm_f32x4_splat (0 .5f );
344+ v128_t ihalf = wasm_i32x4_splat (-(1 << (bit_depth - 1 )));
330345 for (ui32 i = width; i > 0 ; i -= 4 , sp += 4 , dp += 4 ) {
331346 v128_t t = wasm_v128_load (sp);
332347 t = wasm_f32x4_mul (t, mul);
333- v128_t u = wasm_i32x4_trunc_sat_f32x4 (t );
348+ v128_t u = ojph_convert_float_to_i32 (t, zero, half );
334349 u = wasm_i32x4_max (u, lower_limit);
335350 u = wasm_i32x4_min (u, upper_limit);
336- u = wasm_i32x4_add (u, half );
351+ u = wasm_i32x4_add (u, ihalf );
337352 wasm_v128_store (dp, u);
338353 }
339354 }
@@ -359,32 +374,35 @@ namespace ojph {
359374
360375 if (is_signed)
361376 {
362- v128_t zero = wasm_i32x4_splat (0 );
377+ const v128_t zero = wasm_f32x4_splat (0 .0f );
378+ const v128_t half = wasm_f32x4_splat (0 .5f );
363379 v128_t bias = wasm_i32x4_splat (-((1 << (bit_depth - 1 )) + 1 ));
364380 for (ui32 i = width; i > 0 ; i -= 4 , sp += 4 , dp += 4 ) {
365381 v128_t t = wasm_v128_load (sp);
366382 t = wasm_f32x4_mul (t, mul);
367- v128_t u = wasm_i32x4_trunc_sat_f32x4 (t );
383+ v128_t u = ojph_convert_float_to_i32 (t, zero, half );
368384 u = ojph_wasm_i32x4_max_ge (u, s32_lower_limit, t, fl_lower_limit);
369385 u = ojph_wasm_i32x4_min_lt (u, s32_upper_limit, t, fl_upper_limit);
370386 v128_t c = wasm_i32x4_gt (zero, u); // 0xFFFFFFFF for -ve value
371387 v128_t neg = wasm_i32x4_sub (bias, u); // -bias -value
372388 neg = wasm_v128_and (c, neg); // keep only - bias - value
373- v128_t v = wasm_v128_andnot (c, u ); // keep only +ve or 0
389+ v128_t v = wasm_v128_andnot (u, c ); // keep only +ve or 0
374390 v = wasm_v128_or (neg, v); // combine
375391 wasm_v128_store (dp, v);
376392 }
377393 }
378394 else
379395 {
380- v128_t half = wasm_i32x4_splat (-(1 << (bit_depth - 1 )));
396+ const v128_t zero = wasm_f32x4_splat (0 .0f );
397+ const v128_t half = wasm_f32x4_splat (0 .5f );
398+ v128_t ihalf = wasm_i32x4_splat (-(1 << (bit_depth - 1 )));
381399 for (ui32 i = width; i > 0 ; i -= 4 , sp += 4 , dp += 4 ) {
382400 v128_t t = wasm_v128_load (sp);
383401 t = wasm_f32x4_mul (t, mul);
384- v128_t u = wasm_i32x4_trunc_sat_f32x4 (t );
402+ v128_t u = ojph_convert_float_to_i32 (t, zero, half );
385403 u = ojph_wasm_i32x4_max_ge (u, s32_lower_limit, t, fl_lower_limit);
386404 u = ojph_wasm_i32x4_min_lt (u, s32_upper_limit, t, fl_upper_limit);
387- u = wasm_i32x4_add (u, half );
405+ u = wasm_i32x4_add (u, ihalf );
388406 wasm_v128_store (dp, u);
389407 }
390408 }
@@ -416,7 +434,7 @@ namespace ojph {
416434 v128_t c = wasm_i32x4_lt (u, zero); // 0xFFFFFFFF for -ve value
417435 v128_t neg = wasm_i32x4_sub (bias, u); // - bias - value
418436 neg = wasm_v128_and (c, neg); // keep only - bias - value
419- t = wasm_v128_andnot (c, u ); // keep only +ve or 0
437+ t = wasm_v128_andnot (u, c ); // keep only +ve or 0
420438 u = wasm_v128_or (neg, t); // combine
421439 v128_t v = wasm_f32x4_convert_i32x4 (u);
422440 v = wasm_f32x4_mul (v, mul);
0 commit comments