@@ -50,7 +50,7 @@ namespace ojph {
5050 // ////////////////////////////////////////////////////////////////////////
5151 static inline
5252 v128_t ojph_convert_float_to_i32 (v128_t a, v128_t zero, v128_t half)
53- { // We implement ojph_round, which is
53+ { // We implement ojph_round, which is
5454 // val + (val >= 0.0f ? 0.5f : -0.5f), where val is float
5555 v128_t c = wasm_f32x4_ge (a, zero); // greater or equal to zero
5656 v128_t p = wasm_f32x4_add (a, half); // for positive, add half
@@ -279,7 +279,7 @@ namespace ojph {
279279 static inline
280280 v128_t ojph_wasm_i32x4_max_ge (v128_t a, v128_t b, v128_t x, v128_t y)
281281 {
282- v128_t c = wasm_i32x4_ge (x, y); // 0xFFFFFFFF for x >= y
282+ v128_t c = wasm_f32x4_ge (x, y); // 0xFFFFFFFF for x >= y
283283 v128_t d = wasm_v128_and (c, a); // keep only a, where x >= y
284284 v128_t e = wasm_v128_andnot (b, c); // keep only b, where x < y
285285 return wasm_v128_or (d, e); // combine
@@ -289,7 +289,7 @@ namespace ojph {
289289 static inline
290290 v128_t ojph_wasm_i32x4_min_lt (v128_t a, v128_t b, v128_t x, v128_t y)
291291 {
292- v128_t c = wasm_i32x4_lt (x, y); // 0xFFFFFFFF for x < y
292+ v128_t c = wasm_f32x4_lt (x, y); // 0xFFFFFFFF for x < y
293293 v128_t d = wasm_v128_and (c, a); // keep only a, where x < y
294294 v128_t e = wasm_v128_andnot (b, c); // keep only b, where x >= y
295295 return wasm_v128_or (d, e); // combine
@@ -305,106 +305,54 @@ namespace ojph {
305305 (dst_line->flags & line_buf::LFT_32BIT) &&
306306 (dst_line->flags & line_buf::LFT_INTEGER));
307307
308+ assert (bit_depth <= 32 );
308309 const float * sp = src_line->f32 ;
309310 si32* dp = dst_line->i32 + dst_line_offset;
310- if (bit_depth <= 30 )
311+ // There is the possibility that converting to integer will
312+ // exceed the dynamic range of a 32-bit integer; therefore, care must be
313+ // exercised.
314+ // We check whether the floating point number is outside the half-closed
315+ // interval [-0.5f, 0.5f). If so, we clamp the resulting integer
316+ // to the maximum/minimum value that the bit depth supports.
317+ si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth);
318+ v128_t mul = wasm_f32x4_splat ((float )(1ull << bit_depth));
319+ v128_t fl_up_lim = wasm_f32x4_splat (-(float )neg_limit); // val < upper
320+ v128_t fl_low_lim = wasm_f32x4_splat ((float )neg_limit); // val >= lower
321+ v128_t s32_up_lim = wasm_i32x4_splat (INT_MAX >> (32 - bit_depth));
322+ v128_t s32_low_lim = wasm_i32x4_splat (INT_MIN >> (32 - bit_depth));
323+
324+ if (is_signed)
311325 {
312- // We are leaving two bit overhead -- here, we are assuming that after
313- // multiplications, the resulting number can still be represented
314- // using 32 bit integer
315- v128_t mul = wasm_f32x4_splat ((float )(1 << bit_depth));
316- v128_t upper_limit = wasm_i32x4_splat (INT_MAX >> (32 - bit_depth));
317- v128_t lower_limit = wasm_i32x4_splat (INT_MIN >> (32 - bit_depth));
318-
319- if (is_signed)
320- {
321- const v128_t zero = wasm_f32x4_splat (0 .0f );
322- const v128_t half = wasm_f32x4_splat (0 .5f );
323- v128_t bias = wasm_i32x4_splat (-((1 << (bit_depth - 1 )) + 1 ));
324- for (ui32 i = width; i > 0 ; i -= 4 , sp += 4 , dp += 4 )
325- {
326- v128_t t = wasm_v128_load (sp);
327- t = wasm_f32x4_mul (t, mul);
328- v128_t u = ojph_convert_float_to_i32 (t, zero, half);
329- u = wasm_i32x4_max (u, lower_limit);
330- u = wasm_i32x4_min (u, upper_limit);
331-
332- v128_t c = wasm_i32x4_gt (zero, u); // 0xFFFFFFFF for -ve value
333- v128_t neg = wasm_i32x4_sub (bias, u); // -bias -value
334- neg = wasm_v128_and (c, neg); // keep only - bias - value
335- v128_t v = wasm_v128_andnot (u, c); // keep only +ve or 0
336- v = wasm_v128_or (neg, v); // combine
337- wasm_v128_store (dp, v);
338- }
339- }
340- else
341- {
342- const v128_t zero = wasm_f32x4_splat (0 .0f );
343- const v128_t half = wasm_f32x4_splat (0 .5f );
344- v128_t ihalf = wasm_i32x4_splat (-(1 << (bit_depth - 1 )));
345- for (ui32 i = width; i > 0 ; i -= 4 , sp += 4 , dp += 4 ) {
346- v128_t t = wasm_v128_load (sp);
347- t = wasm_f32x4_mul (t, mul);
348- v128_t u = ojph_convert_float_to_i32 (t, zero, half);
349- u = wasm_i32x4_max (u, lower_limit);
350- u = wasm_i32x4_min (u, upper_limit);
351- u = wasm_i32x4_add (u, ihalf);
352- wasm_v128_store (dp, u);
353- }
326+ const v128_t zero = wasm_f32x4_splat (0 .0f );
327+ const v128_t half = wasm_f32x4_splat (0 .5f );
328+ v128_t bias = wasm_i32x4_splat (-((1 << (bit_depth - 1 )) + 1 ));
329+ for (ui32 i = width; i > 0 ; i -= 4 , sp += 4 , dp += 4 ) {
330+ v128_t t = wasm_v128_load (sp);
331+ t = wasm_f32x4_mul (t, mul);
332+ v128_t u = ojph_convert_float_to_i32 (t, zero, half);
333+ u = ojph_wasm_i32x4_max_ge (u, s32_low_lim, t, fl_low_lim);
334+ u = ojph_wasm_i32x4_min_lt (u, s32_up_lim, t, fl_up_lim);
335+ v128_t c = wasm_i32x4_gt (zero, u); // 0xFFFFFFFF for -ve value
336+ v128_t neg = wasm_i32x4_sub (bias, u); // -bias -value
337+ neg = wasm_v128_and (c, neg); // keep only - bias - value
338+ v128_t v = wasm_v128_andnot (u, c); // keep only +ve or 0
339+ v = wasm_v128_or (neg, v); // combine
340+ wasm_v128_store (dp, v);
354341 }
355342 }
356343 else
357344 {
358- // There is the possibility that converting to integer will
359- // exceed the dynamic range of 32bit integer; therefore, we need
360- // to use 64 bit. One may think, why not limit the floats to the
361- // range of [-0.5f, 0.5f)?
362- // Notice the half closed range -- we need a value just below 0.5f.
363- // While getting this number is possible, after multiplication, the
364- // resulting number will not be exactly the maximum that the integer
365- // can achieve. All this is academic, because here are talking
366- // about a number which has all the exponent bits set, meaning
367- // it is either infinity, -infinity, qNan or sNan.
368- si64 neg_limit = (si64)LLONG_MIN >> (64 - bit_depth);
369- v128_t mul = wasm_f32x4_splat ((float )(1 << bit_depth));
370- v128_t fl_upper_limit = wasm_f32x4_splat (-(float )neg_limit); // val< up
371- v128_t fl_lower_limit = wasm_f32x4_splat ( (float )neg_limit); // val>=lo
372- v128_t s32_upper_limit = wasm_i32x4_splat (INT_MAX >> (32 - bit_depth));
373- v128_t s32_lower_limit = wasm_i32x4_splat (INT_MIN >> (32 - bit_depth));
374-
375- if (is_signed)
376- {
377- const v128_t zero = wasm_f32x4_splat (0 .0f );
378- const v128_t half = wasm_f32x4_splat (0 .5f );
379- v128_t bias = wasm_i32x4_splat (-((1 << (bit_depth - 1 )) + 1 ));
380- for (ui32 i = width; i > 0 ; i -= 4 , sp += 4 , dp += 4 ) {
381- v128_t t = wasm_v128_load (sp);
382- t = wasm_f32x4_mul (t, mul);
383- v128_t u = ojph_convert_float_to_i32 (t, zero, half);
384- u = ojph_wasm_i32x4_max_ge (u, s32_lower_limit, t, fl_lower_limit);
385- u = ojph_wasm_i32x4_min_lt (u, s32_upper_limit, t, fl_upper_limit);
386- v128_t c = wasm_i32x4_gt (zero, u); // 0xFFFFFFFF for -ve value
387- v128_t neg = wasm_i32x4_sub (bias, u); // -bias -value
388- neg = wasm_v128_and (c, neg); // keep only - bias - value
389- v128_t v = wasm_v128_andnot (u, c); // keep only +ve or 0
390- v = wasm_v128_or (neg, v); // combine
391- wasm_v128_store (dp, v);
392- }
393- }
394- else
395- {
396- const v128_t zero = wasm_f32x4_splat (0 .0f );
397- const v128_t half = wasm_f32x4_splat (0 .5f );
398- v128_t ihalf = wasm_i32x4_splat (-(1 << (bit_depth - 1 )));
399- for (ui32 i = width; i > 0 ; i -= 4 , sp += 4 , dp += 4 ) {
400- v128_t t = wasm_v128_load (sp);
401- t = wasm_f32x4_mul (t, mul);
402- v128_t u = ojph_convert_float_to_i32 (t, zero, half);
403- u = ojph_wasm_i32x4_max_ge (u, s32_lower_limit, t, fl_lower_limit);
404- u = ojph_wasm_i32x4_min_lt (u, s32_upper_limit, t, fl_upper_limit);
405- u = wasm_i32x4_add (u, ihalf);
406- wasm_v128_store (dp, u);
407- }
345+ const v128_t zero = wasm_f32x4_splat (0 .0f );
346+ const v128_t half = wasm_f32x4_splat (0 .5f );
347+ v128_t ihalf = wasm_i32x4_splat (-(1 << (bit_depth - 1 )));
348+ for (ui32 i = width; i > 0 ; i -= 4 , sp += 4 , dp += 4 ) {
349+ v128_t t = wasm_v128_load (sp);
350+ t = wasm_f32x4_mul (t, mul);
351+ v128_t u = ojph_convert_float_to_i32 (t, zero, half);
352+ u = ojph_wasm_i32x4_max_ge (u, s32_low_lim, t, fl_low_lim);
353+ u = ojph_wasm_i32x4_min_lt (u, s32_up_lim, t, fl_up_lim);
354+ u = wasm_i32x4_add (u, ihalf);
355+ wasm_v128_store (dp, u);
408356 }
409357 }
410358 }
0 commit comments