Skip to content

Commit 2ea19eb

Browse files
committed
Updated WASM SIMD
1 parent ef9f713 commit 2ea19eb

File tree

1 file changed

+44
-96
lines changed

1 file changed

+44
-96
lines changed

src/core/transform/ojph_colour_wasm.cpp

Lines changed: 44 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ namespace ojph {
5050
//////////////////////////////////////////////////////////////////////////
5151
static inline
5252
v128_t ojph_convert_float_to_i32(v128_t a, v128_t zero, v128_t half)
53-
{ // We implement ojph_round, which is
53+
{ // We implement ojph_round, which is
5454
// val + (val >= 0.0f ? 0.5f : -0.5f), where val is float
5555
v128_t c = wasm_f32x4_ge(a, zero); // greater or equal to zero
5656
v128_t p = wasm_f32x4_add(a, half); // for positive, add half
@@ -279,7 +279,7 @@ namespace ojph {
279279
static inline
280280
v128_t ojph_wasm_i32x4_max_ge(v128_t a, v128_t b, v128_t x, v128_t y)
281281
{
282-
v128_t c = wasm_i32x4_ge(x, y); // 0xFFFFFFFF for x >= y
282+
v128_t c = wasm_f32x4_ge(x, y); // 0xFFFFFFFF for x >= y
283283
v128_t d = wasm_v128_and(c, a); // keep only a, where x >= y
284284
v128_t e = wasm_v128_andnot(b, c); // keep only b, where x < y
285285
return wasm_v128_or(d, e); // combine
@@ -289,7 +289,7 @@ namespace ojph {
289289
static inline
290290
v128_t ojph_wasm_i32x4_min_lt(v128_t a, v128_t b, v128_t x, v128_t y)
291291
{
292-
v128_t c = wasm_i32x4_lt(x, y); // 0xFFFFFFFF for x < y
292+
v128_t c = wasm_f32x4_lt(x, y); // 0xFFFFFFFF for x < y
293293
v128_t d = wasm_v128_and(c, a); // keep only a, where x < y
294294
v128_t e = wasm_v128_andnot(b, c); // keep only b, where x >= y
295295
return wasm_v128_or(d, e); // combine
@@ -305,106 +305,54 @@ namespace ojph {
305305
(dst_line->flags & line_buf::LFT_32BIT) &&
306306
(dst_line->flags & line_buf::LFT_INTEGER));
307307

308+
assert(bit_depth <= 32);
308309
const float* sp = src_line->f32;
309310
si32* dp = dst_line->i32 + dst_line_offset;
310-
if (bit_depth <= 30)
311+
// There is the possibility that converting to integer will
312+
// exceed the dynamic range of 32bit integer; therefore, care must be
313+
// exercised.
314+
// We check whether the floating point number is outside the half-closed
315+
// interval [-0.5f, 0.5f). If so, we clamp the resulting integer
316+
// to the maximum/minimum value that bit_depth can represent.
317+
si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth);
318+
v128_t mul = wasm_f32x4_splat((float)(1ull << bit_depth));
319+
v128_t fl_up_lim = wasm_f32x4_splat(-(float)neg_limit); // val < upper
320+
v128_t fl_low_lim = wasm_f32x4_splat((float)neg_limit); // val >= lower
321+
v128_t s32_up_lim = wasm_i32x4_splat(INT_MAX >> (32 - bit_depth));
322+
v128_t s32_low_lim = wasm_i32x4_splat(INT_MIN >> (32 - bit_depth));
323+
324+
if (is_signed)
311325
{
312-
// We are leaving two bit overhead -- here, we are assuming that after
313-
// multiplications, the resulting number can still be represented
314-
// using 32 bit integer
315-
v128_t mul = wasm_f32x4_splat((float)(1 << bit_depth));
316-
v128_t upper_limit = wasm_i32x4_splat(INT_MAX >> (32 - bit_depth));
317-
v128_t lower_limit = wasm_i32x4_splat(INT_MIN >> (32 - bit_depth));
318-
319-
if (is_signed)
320-
{
321-
const v128_t zero = wasm_f32x4_splat(0.0f);
322-
const v128_t half = wasm_f32x4_splat(0.5f);
323-
v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1));
324-
for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4)
325-
{
326-
v128_t t = wasm_v128_load(sp);
327-
t = wasm_f32x4_mul(t, mul);
328-
v128_t u = ojph_convert_float_to_i32(t, zero, half);
329-
u = wasm_i32x4_max(u, lower_limit);
330-
u = wasm_i32x4_min(u, upper_limit);
331-
332-
v128_t c = wasm_i32x4_gt(zero, u); //0xFFFFFFFF for -ve value
333-
v128_t neg = wasm_i32x4_sub(bias, u); //-bias -value
334-
neg = wasm_v128_and(c, neg); //keep only - bias - value
335-
v128_t v = wasm_v128_andnot(u, c); //keep only +ve or 0
336-
v = wasm_v128_or(neg, v); //combine
337-
wasm_v128_store(dp, v);
338-
}
339-
}
340-
else
341-
{
342-
const v128_t zero = wasm_f32x4_splat(0.0f);
343-
const v128_t half = wasm_f32x4_splat(0.5f);
344-
v128_t ihalf = wasm_i32x4_splat(-(1 << (bit_depth - 1)));
345-
for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
346-
v128_t t = wasm_v128_load(sp);
347-
t = wasm_f32x4_mul(t, mul);
348-
v128_t u = ojph_convert_float_to_i32(t, zero, half);
349-
u = wasm_i32x4_max(u, lower_limit);
350-
u = wasm_i32x4_min(u, upper_limit);
351-
u = wasm_i32x4_add(u, ihalf);
352-
wasm_v128_store(dp, u);
353-
}
326+
const v128_t zero = wasm_f32x4_splat(0.0f);
327+
const v128_t half = wasm_f32x4_splat(0.5f);
328+
v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1));
329+
for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
330+
v128_t t = wasm_v128_load(sp);
331+
t = wasm_f32x4_mul(t, mul);
332+
v128_t u = ojph_convert_float_to_i32(t, zero, half);
333+
u = ojph_wasm_i32x4_max_ge(u, s32_low_lim, t, fl_low_lim);
334+
u = ojph_wasm_i32x4_min_lt(u, s32_up_lim, t, fl_up_lim);
335+
v128_t c = wasm_i32x4_gt(zero, u); // 0xFFFFFFFF for -ve value
336+
v128_t neg = wasm_i32x4_sub(bias, u); // -bias -value
337+
neg = wasm_v128_and(c, neg); // keep only - bias - value
338+
v128_t v = wasm_v128_andnot(u, c); // keep only +ve or 0
339+
v = wasm_v128_or(neg, v); // combine
340+
wasm_v128_store(dp, v);
354341
}
355342
}
356343
else
357344
{
358-
// There is the possibility that converting to integer will
359-
// exceed the dynamic range of 32bit integer; therefore, we need
360-
// to use 64 bit. One may think, why not limit the floats to the
361-
// range of [-0.5f, 0.5f)?
362-
// Notice the half closed range -- we need a value just below 0.5f.
363-
// While getting this number is possible, after multiplication, the
364-
// resulting number will not be exactly the maximum that the integer
365-
// can achieve. All this is academic, because here we are talking
366-
// about a number which has all the exponent bits set, meaning
367-
// it is either infinity, -infinity, qNan or sNan.
368-
si64 neg_limit = (si64)LLONG_MIN >> (64 - bit_depth);
369-
v128_t mul = wasm_f32x4_splat((float)(1 << bit_depth));
370-
v128_t fl_upper_limit = wasm_f32x4_splat(-(float)neg_limit); // val< up
371-
v128_t fl_lower_limit = wasm_f32x4_splat( (float)neg_limit); // val>=lo
372-
v128_t s32_upper_limit = wasm_i32x4_splat(INT_MAX >> (32 - bit_depth));
373-
v128_t s32_lower_limit = wasm_i32x4_splat(INT_MIN >> (32 - bit_depth));
374-
375-
if (is_signed)
376-
{
377-
const v128_t zero = wasm_f32x4_splat(0.0f);
378-
const v128_t half = wasm_f32x4_splat(0.5f);
379-
v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1));
380-
for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
381-
v128_t t = wasm_v128_load(sp);
382-
t = wasm_f32x4_mul(t, mul);
383-
v128_t u = ojph_convert_float_to_i32(t, zero, half);
384-
u = ojph_wasm_i32x4_max_ge(u, s32_lower_limit, t, fl_lower_limit);
385-
u = ojph_wasm_i32x4_min_lt(u, s32_upper_limit, t, fl_upper_limit);
386-
v128_t c = wasm_i32x4_gt(zero, u); //0xFFFFFFFF for -ve value
387-
v128_t neg = wasm_i32x4_sub(bias, u); //-bias -value
388-
neg = wasm_v128_and(c, neg); //keep only - bias - value
389-
v128_t v = wasm_v128_andnot(u, c); //keep only +ve or 0
390-
v = wasm_v128_or(neg, v); //combine
391-
wasm_v128_store(dp, v);
392-
}
393-
}
394-
else
395-
{
396-
const v128_t zero = wasm_f32x4_splat(0.0f);
397-
const v128_t half = wasm_f32x4_splat(0.5f);
398-
v128_t ihalf = wasm_i32x4_splat(-(1 << (bit_depth - 1)));
399-
for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
400-
v128_t t = wasm_v128_load(sp);
401-
t = wasm_f32x4_mul(t, mul);
402-
v128_t u = ojph_convert_float_to_i32(t, zero, half);
403-
u = ojph_wasm_i32x4_max_ge(u, s32_lower_limit, t, fl_lower_limit);
404-
u = ojph_wasm_i32x4_min_lt(u, s32_upper_limit, t, fl_upper_limit);
405-
u = wasm_i32x4_add(u, ihalf);
406-
wasm_v128_store(dp, u);
407-
}
345+
const v128_t zero = wasm_f32x4_splat(0.0f);
346+
const v128_t half = wasm_f32x4_splat(0.5f);
347+
v128_t ihalf = wasm_i32x4_splat(-(1 << (bit_depth - 1)));
348+
for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
349+
v128_t t = wasm_v128_load(sp);
350+
t = wasm_f32x4_mul(t, mul);
351+
v128_t u = ojph_convert_float_to_i32(t, zero, half);
352+
u = ojph_wasm_i32x4_max_ge(u, s32_low_lim, t, fl_low_lim);
353+
u = ojph_wasm_i32x4_min_lt(u, s32_up_lim, t, fl_up_lim);
354+
u = wasm_i32x4_add(u, ihalf);
355+
wasm_v128_store(dp, u);
408356
}
409357
}
410358
}

0 commit comments

Comments
 (0)