Skip to content

Commit e0a3c2b

Browse files
committed
Added SIMD, except for the 32bit path. Wasm SIMD is missing. Not tested.
1 parent e60473c commit e0a3c2b

File tree

5 files changed

+571
-76
lines changed

5 files changed

+571
-76
lines changed

src/core/transform/ojph_colour.cpp

Lines changed: 46 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -79,16 +79,16 @@ namespace ojph {
7979
void (*cnvrt_float_to_si32)
8080
(const float *sp, si32 *dp, float mul, ui32 width) = NULL;
8181

82-
//////////////////////////////////////////////////////////////////////////
83-
void (*irv_convert_to_float_nlt_type3) (
84-
const line_buf *src_line, ui32 src_line_offset,
85-
line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width) = NULL;
86-
8782
//////////////////////////////////////////////////////////////////////////
8883
void (*irv_convert_to_integer_nlt_type3) (
8984
const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
9085
ui32 bit_depth, bool is_signed, ui32 width) = NULL;
9186

87+
//////////////////////////////////////////////////////////////////////////
88+
void (*irv_convert_to_float_nlt_type3) (
89+
const line_buf *src_line, ui32 src_line_offset,
90+
line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width) = NULL;
91+
9292
//////////////////////////////////////////////////////////////////////////
9393
void (*rct_forward)
9494
(const line_buf* r, const line_buf* g, const line_buf* b,
@@ -156,6 +156,10 @@ namespace ojph {
156156
rev_convert_nlt_type3 = sse2_rev_convert_nlt_type3;
157157
cnvrt_float_to_si32_shftd = sse2_cnvrt_float_to_si32_shftd;
158158
cnvrt_float_to_si32 = sse2_cnvrt_float_to_si32;
159+
irv_convert_to_integer_nlt_type3 =
160+
sse2_irv_convert_to_integer_nlt_type3;
161+
irv_convert_to_float_nlt_type3 =
162+
sse2_irv_convert_to_float_nlt_type3;
159163
rct_forward = sse2_rct_forward;
160164
rct_backward = sse2_rct_backward;
161165
}
@@ -178,6 +182,10 @@ namespace ojph {
178182
{
179183
rev_convert = avx2_rev_convert;
180184
rev_convert_nlt_type3 = avx2_rev_convert_nlt_type3;
185+
irv_convert_to_integer_nlt_type3 =
186+
avx2_irv_convert_to_integer_nlt_type3;
187+
irv_convert_to_float_nlt_type3 =
188+
avx2_irv_convert_to_float_nlt_type3;
181189
rct_forward = avx2_rct_forward;
182190
rct_backward = avx2_rct_backward;
183191
}
@@ -332,39 +340,6 @@ namespace ojph {
332340
*dp++ = ojph_round(*sp++ * mul);
333341
}
334342

335-
//////////////////////////////////////////////////////////////////////////
336-
void gen_irv_convert_to_float_nlt_type3(const line_buf *src_line,
337-
ui32 src_line_offset, line_buf *dst_line,
338-
ui32 bit_depth, bool is_signed, ui32 width)
339-
{
340-
assert((src_line->flags & line_buf::LFT_32BIT) &&
341-
(src_line->flags & line_buf::LFT_INTEGER) &&
342-
(dst_line->flags & line_buf::LFT_32BIT) &&
343-
(dst_line->flags & line_buf::LFT_INTEGER) == 0);
344-
345-
float mul = (float)(1.0 / 65536.0 / 65536.0);
346-
347-
const si32* sp = src_line->i32 + src_line_offset;
348-
float* dp = dst_line->f32;
349-
ui32 shift = 32 - bit_depth;
350-
if (is_signed)
351-
{
352-
si32 bias = (si32)((ui32)INT_MIN + 1);
353-
for (ui32 i = width; i > 0; --i) {
354-
si32 v = *sp++ << shift;
355-
v = (v >= 0) ? v : (- v - bias);
356-
*dp++ = (float)v * mul;
357-
}
358-
}
359-
else
360-
{
361-
for (ui32 i = width; i > 0; --i) {
362-
si32 v = *sp++ << shift;
363-
*dp++ = (float)v * mul - 0.5f;
364-
}
365-
}
366-
}
367-
368343
//////////////////////////////////////////////////////////////////////////
369344
void gen_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
370345
line_buf *dst_line, ui32 dst_line_offset,
@@ -450,6 +425,39 @@ namespace ojph {
450425
}
451426
}
452427

428+
//////////////////////////////////////////////////////////////////////////
429+
void gen_irv_convert_to_float_nlt_type3(const line_buf *src_line,
430+
ui32 src_line_offset, line_buf *dst_line,
431+
ui32 bit_depth, bool is_signed, ui32 width)
432+
{
433+
assert((src_line->flags & line_buf::LFT_32BIT) &&
434+
(src_line->flags & line_buf::LFT_INTEGER) &&
435+
(dst_line->flags & line_buf::LFT_32BIT) &&
436+
(dst_line->flags & line_buf::LFT_INTEGER) == 0);
437+
438+
float mul = (float)(1.0 / 65536.0 / 65536.0);
439+
440+
const si32* sp = src_line->i32 + src_line_offset;
441+
float* dp = dst_line->f32;
442+
ui32 shift = 32 - bit_depth;
443+
if (is_signed)
444+
{
445+
si32 bias = (si32)((ui32)INT_MIN + 1);
446+
for (ui32 i = width; i > 0; --i) {
447+
si32 v = *sp++ << shift;
448+
v = (v >= 0) ? v : (- v - bias);
449+
*dp++ = (float)v * mul;
450+
}
451+
}
452+
else
453+
{
454+
for (ui32 i = width; i > 0; --i) {
455+
si32 v = *sp++ << shift;
456+
*dp++ = (float)v * mul - 0.5f;
457+
}
458+
}
459+
}
460+
453461
//////////////////////////////////////////////////////////////////////////
454462
void gen_rct_forward(
455463
const line_buf *r, const line_buf *g, const line_buf *b,

src/core/transform/ojph_colour_avx2.cpp

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,153 @@ namespace ojph {
236236
}
237237
}
238238

239+
//////////////////////////////////////////////////////////////////////////
240+
void avx2_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
241+
line_buf *dst_line, ui32 dst_line_offset,
242+
ui32 bit_depth, bool is_signed, ui32 width)
243+
{
244+
assert((src_line->flags & line_buf::LFT_32BIT) &&
245+
(src_line->flags & line_buf::LFT_INTEGER) == 0 &&
246+
(dst_line->flags & line_buf::LFT_32BIT) &&
247+
(dst_line->flags & line_buf::LFT_INTEGER));
248+
249+
const float* sp = src_line->f32;
250+
si32* dp = dst_line->i32 + dst_line_offset;
251+
if (bit_depth <= 30)
252+
{
253+
// We are leaving two bit overhead -- here, we are assuming that after
254+
// multiplications, the resulting number can still be represented
255+
// using 32 bit integer
256+
__m256 mul = _mm256_set1_ps((float)(1 << bit_depth));
257+
__m256i upper_limit = _mm256_set1_epi32(INT_MAX >> (32 - bit_depth));
258+
__m256i lower_limit = _mm256_set1_epi32(INT_MIN >> (32 - bit_depth));
259+
260+
if (is_signed)
261+
{
262+
__m256i zero = _mm256_setzero_si256();
263+
__m256i bias = _mm256_set1_epi32(-((1 << (bit_depth - 1)) + 1));
264+
for (ui32 i = width; i > 0; i -= 8, sp += 8, dp += 8)
265+
{
266+
__m256 t = _mm256_loadu_ps(sp);
267+
t = _mm256_mul_ps(t, mul);
268+
t = _mm256_round_ps(t,
269+
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
270+
__m256i u = _mm256_cvtps_epi32(t);
271+
u = _mm256_max_epi32(u, lower_limit);
272+
u = _mm256_min_epi32(u, upper_limit);
273+
274+
__m256i c = _mm256_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value
275+
__m256i neg = _mm256_sub_epi32(bias, u); //-bias -value
276+
neg = _mm256_and_si256(c, neg); //keep only - bias - value
277+
__m256i v = _mm256_andnot_si256(c, u); //keep only +ve or 0
278+
v = _mm256_or_si256(neg, v); //combine
279+
_mm256_storeu_si256((__m256i*)dp, v);
280+
}
281+
}
282+
else
283+
{
284+
__m256i half = _mm256_set1_epi32(-(1 << (bit_depth - 1)));
285+
for (ui32 i = width; i > 0; i -= 8, sp += 8, dp += 8) {
286+
__m256 t = _mm256_loadu_ps(sp);
287+
t = _mm256_mul_ps(t, mul);
288+
t = _mm256_round_ps(t,
289+
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
290+
__m256i u = _mm256_cvtps_epi32(t);
291+
u = _mm256_max_epi32(u, lower_limit);
292+
u = _mm256_min_epi32(u, upper_limit);
293+
u = _mm256_add_epi32(u, half);
294+
_mm256_storeu_si256((__m256i*)dp, u);
295+
}
296+
}
297+
}
298+
else
299+
{
300+
// There is the possibility that converting to integer will
301+
// exceed the dynamic range of 32bit integer; therefore, we need
302+
// to use 64 bit. One may think, why not limit the floats to the
303+
// range of [-0.5f, 0.5f)?
304+
// Notice the half closed range -- we need a value just below 0.5f.
305+
// While getting this number is possible, after multiplication, the
306+
// resulting number will not be exactly the maximum that the integer
307+
// can achieve. All this is academic, because here are talking
308+
// about a number which has all the exponent bits set, meaning
309+
// it is either infinity, -infinity, qNan or sNan.
310+
float mul = (float)(1ull << bit_depth);
311+
const si64 upper_limit = (si64)LLONG_MAX >> (64 - bit_depth);
312+
const si64 lower_limit = (si64)LLONG_MIN >> (64 - bit_depth);
313+
314+
if (is_signed)
315+
{
316+
const si32 bias = (1 << (bit_depth - 1)) + 1;
317+
for (ui32 i = width; i > 0; --i) {
318+
si64 t = ojph_round64(*sp++ * mul);
319+
t = ojph_max(t, lower_limit);
320+
t = ojph_min(t, upper_limit);
321+
si32 v = (si32)t;
322+
v = (v >= 0) ? v : (- v - bias);
323+
*dp++ = v;
324+
}
325+
}
326+
else
327+
{
328+
const si32 half = (1 << (bit_depth - 1));
329+
for (ui32 i = width; i > 0; --i) {
330+
si64 t = ojph_round64(*sp++ * mul);
331+
t = ojph_max(t, lower_limit);
332+
t = ojph_min(t, upper_limit);
333+
si32 v = (si32)t;
334+
*dp++ = v + half;
335+
}
336+
}
337+
}
338+
}
339+
340+
//////////////////////////////////////////////////////////////////////////
341+
void avx2_irv_convert_to_float_nlt_type3(const line_buf *src_line,
342+
ui32 src_line_offset, line_buf *dst_line,
343+
ui32 bit_depth, bool is_signed, ui32 width)
344+
{
345+
assert((src_line->flags & line_buf::LFT_32BIT) &&
346+
(src_line->flags & line_buf::LFT_INTEGER) &&
347+
(dst_line->flags & line_buf::LFT_32BIT) &&
348+
(dst_line->flags & line_buf::LFT_INTEGER) == 0);
349+
350+
__m256 mul = _mm256_set1_ps((float)(1.0 / 65536.0 / 65536.0));
351+
352+
const si32* sp = src_line->i32 + src_line_offset;
353+
float* dp = dst_line->f32;
354+
si32 shift = 32 - (si32)bit_depth;
355+
if (is_signed)
356+
{
357+
__m256i zero = _mm256_setzero_si256();
358+
__m256i bias = _mm256_set1_epi32(-(si32)((ui32)INT_MIN + 1));
359+
for (ui32 i = width; i > 0; i -= 8, sp += 8, dp += 8) {
360+
__m256i t = _mm256_loadu_si256((__m256i*)sp);
361+
__m256i u = _mm256_slli_epi32(t, shift);
362+
__m256i c = _mm256_cmpgt_epi32(zero, u); // 0xFFFFFFFF for -ve value
363+
__m256i neg = _mm256_sub_epi32(bias, u); // - bias - value
364+
neg = _mm256_and_si256(c, neg); // keep only - bias - value
365+
t = _mm256_andnot_si256(c, u); // keep only +ve or 0
366+
u = _mm256_or_si256(neg, t); // combine
367+
__m256 v = _mm256_cvtepi32_ps(u);
368+
v = _mm256_mul_ps(v, mul);
369+
_mm256_storeu_ps(dp, v);
370+
}
371+
}
372+
else
373+
{
374+
__m256 half = _mm256_set1_ps(0.5f);
375+
for (ui32 i = width; i > 0; i -= 8, sp += 8, dp += 8) {
376+
__m256i t = _mm256_loadu_si256((__m256i*)sp);
377+
__m256i u = _mm256_slli_epi32(t, shift);
378+
__m256 v = _mm256_cvtepi32_ps(u);
379+
v = _mm256_mul_ps(v, mul);
380+
v = _mm256_add_ps(v, half);
381+
_mm256_storeu_ps(dp, v);
382+
}
383+
}
384+
}
385+
239386
//////////////////////////////////////////////////////////////////////////
240387
void avx2_rct_forward(const line_buf *r,
241388
const line_buf *g,

src/core/transform/ojph_colour_local.h

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,11 @@ namespace ojph {
168168
void sse2_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
169169
ui32 width);
170170

171+
//////////////////////////////////////////////////////////////////////////
172+
void sse2_irv_convert_to_integer_nlt_type3(
173+
const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
174+
ui32 bit_depth, bool is_signed, ui32 width);
175+
171176
//////////////////////////////////////////////////////////////////////////
172177
//
173178
//
@@ -188,6 +193,11 @@ namespace ojph {
188193
line_buf *dst_line, const ui32 dst_line_offset,
189194
si64 shift, ui32 width);
190195

196+
//////////////////////////////////////////////////////////////////////////
197+
void sse2_irv_convert_to_float_nlt_type3(
198+
const line_buf *src_line, ui32 src_line_offset,
199+
line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
200+
191201
//////////////////////////////////////////////////////////////////////////
192202
void sse2_rct_forward(
193203
const line_buf *r, const line_buf *g, const line_buf *b,
@@ -250,6 +260,16 @@ namespace ojph {
250260
line_buf *dst_line, const ui32 dst_line_offset,
251261
si64 shift, ui32 width);
252262

263+
//////////////////////////////////////////////////////////////////////////
264+
void avx2_irv_convert_to_integer_nlt_type3(
265+
const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
266+
ui32 bit_depth, bool is_signed, ui32 width);
267+
268+
//////////////////////////////////////////////////////////////////////////
269+
void avx2_irv_convert_to_float_nlt_type3(
270+
const line_buf *src_line, ui32 src_line_offset,
271+
line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
272+
253273
//////////////////////////////////////////////////////////////////////////
254274
void avx2_rct_forward(
255275
const line_buf *r, const line_buf *g, const line_buf *b,
@@ -296,6 +316,16 @@ namespace ojph {
296316
line_buf *dst_line, const ui32 dst_line_offset,
297317
si64 shift, ui32 width);
298318

319+
//////////////////////////////////////////////////////////////////////////
320+
void wasm_irv_convert_to_integer_nlt_type3(
321+
const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
322+
ui32 bit_depth, bool is_signed, ui32 width);
323+
324+
//////////////////////////////////////////////////////////////////////////
325+
void wasm_irv_convert_to_float_nlt_type3(
326+
const line_buf *src_line, ui32 src_line_offset,
327+
line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
328+
299329
//////////////////////////////////////////////////////////////////////////
300330
void wasm_rct_forward(
301331
const line_buf *r, const line_buf *g, const line_buf *b,

0 commit comments

Comments
 (0)