Skip to content

Commit e21bfd0

Browse files
committed
Improvement and bug fixes.
1 parent 42b2efd commit e21bfd0

File tree

5 files changed

+394
-75
lines changed

5 files changed

+394
-75
lines changed

src/core/transform/ojph_colour.cpp

Lines changed: 9 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -326,7 +326,7 @@ namespace ojph {
326326

327327
if (is_signed)
328328
{
329-
const si32 bias = (1 << (bit_depth - 1)) + 1;
329+
const si32 bias = (si32)((1ULL << (bit_depth - 1)) + 1);
330330
for (int i = (int)width; i > 0; --i) {
331331
float t = *sp++ * mul;
332332
si32 v = ojph_round(t);
@@ -339,7 +339,7 @@ namespace ojph {
339339
}
340340
else
341341
{
342-
const si32 half = 1 << (bit_depth - 1);
342+
const si32 half = (si32)(1ULL << (bit_depth - 1));
343343
for (int i = (int)width; i > 0; --i) {
344344
float t = *sp++ * mul;
345345
si32 v = ojph_round(t);
@@ -380,29 +380,28 @@ namespace ojph {
380380
(dst_line->flags & line_buf::LFT_32BIT) &&
381381
(dst_line->flags & line_buf::LFT_INTEGER) == 0);
382382

383-
float mul = (float)(1.0 / 65536.0 / 65536.0);
383+
assert(bit_depth <= 32);
384+
float mul = (float)(1.0 / (double)(1ULL << bit_depth));
384385

385386
const si32* sp = src_line->i32 + src_line_offset;
386387
float* dp = dst_line->f32;
387-
ui32 shift = 32 - bit_depth;
388388
if (is_signed)
389389
{
390-
si32 bias = (si32)((ui32)INT_MIN + 1);
390+
const si32 bias = (si32)((1ULL << (bit_depth - 1)) + 1);
391391
for (int i = (int)width; i > 0; --i) {
392-
si32 v = *sp++ << shift;
392+
si32 v = *sp++;
393393
if (NLT_TYPE3)
394394
v = (v >= 0) ? v : (- v - bias);
395395
*dp++ = (float)v * mul;
396396
}
397397
}
398398
else
399399
{
400-
const ui32 half = (ui32)INT_MIN;
400+
const si32 half = (si32)(1ULL << (bit_depth - 1));
401401
for (int i = (int)width; i > 0; --i) {
402-
ui32 v = (ui32)*sp++;
403-
v <<= shift;
402+
si32 v = *sp++;
404403
v -= half;
405-
*dp++ = (float)(si32)v * mul;
404+
*dp++ = (float)v * mul;
406405
}
407406
}
408407
}

src/core/transform/ojph_colour_avx2.cpp

Lines changed: 13 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -293,7 +293,8 @@ namespace ojph {
293293
if (is_signed)
294294
{
295295
__m256i zero = _mm256_setzero_si256();
296-
__m256i bias = _mm256_set1_epi32(-((1 << (bit_depth - 1)) + 1));
296+
__m256i bias =
297+
_mm256_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
297298
for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
298299
__m256 t = _mm256_loadu_ps(sp);
299300
t = _mm256_mul_ps(t, mul);
@@ -313,7 +314,7 @@ namespace ojph {
313314
}
314315
else
315316
{
316-
__m256i half = _mm256_set1_epi32(1 << (bit_depth - 1));
317+
__m256i half = _mm256_set1_epi32((si32)(1ULL << (bit_depth - 1)));
317318
for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
318319
__m256 t = _mm256_loadu_ps(sp);
319320
t = _mm256_mul_ps(t, mul);
@@ -356,37 +357,36 @@ namespace ojph {
356357
(dst_line->flags & line_buf::LFT_32BIT) &&
357358
(dst_line->flags & line_buf::LFT_INTEGER) == 0);
358359

359-
__m256 mul = _mm256_set1_ps((float)(1.0 / 65536.0 / 65536.0));
360+
assert(bit_depth <= 32);
361+
__m256 mul = _mm256_set1_ps((float)(1.0 / (double)(1ULL << bit_depth)));
360362

361363
const si32* sp = src_line->i32 + src_line_offset;
362364
float* dp = dst_line->f32;
363-
si32 shift = 32 - (si32)bit_depth;
364365
if (is_signed)
365366
{
366367
__m256i zero = _mm256_setzero_si256();
367-
__m256i bias = _mm256_set1_epi32(-(si32)((ui32)INT_MIN + 1));
368+
__m256i bias =
369+
_mm256_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
368370
for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
369371
__m256i t = _mm256_loadu_si256((__m256i*)sp);
370-
__m256i u = _mm256_slli_epi32(t, shift);
371372
if (NLT_TYPE3)
372373
{
373-
__m256i c = _mm256_cmpgt_epi32(zero, u); // 0xFFFFFFFF for -ve val
374-
__m256i neg = _mm256_sub_epi32(bias, u); // - bias - value
374+
__m256i c = _mm256_cmpgt_epi32(zero, t); // 0xFFFFFFFF for -ve val
375+
__m256i neg = _mm256_sub_epi32(bias, t); // - bias - value
375376
neg = _mm256_and_si256(c, neg); // keep only - bias - val
376-
t = _mm256_andnot_si256(c, u); // keep only +ve or 0
377-
u = _mm256_or_si256(neg, t); // combine
377+
c = _mm256_andnot_si256(c, t); // keep only +ve or 0
378+
t = _mm256_or_si256(neg, c); // combine
378379
}
379-
__m256 v = _mm256_cvtepi32_ps(u);
380+
__m256 v = _mm256_cvtepi32_ps(t);
380381
v = _mm256_mul_ps(v, mul);
381382
_mm256_storeu_ps(dp, v);
382383
}
383384
}
384385
else
385386
{
386-
__m256i half = _mm256_set1_epi32(INT_MIN);
387+
__m256i half = _mm256_set1_epi32((si32)(1ULL << (bit_depth - 1)));
387388
for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
388389
__m256i t = _mm256_loadu_si256((__m256i*)sp);
389-
t = _mm256_slli_epi32(t, shift);
390390
t = _mm256_sub_epi32(t, half);
391391
__m256 v = _mm256_cvtepi32_ps(t);
392392
v = _mm256_mul_ps(v, mul);

src/core/transform/ojph_colour_sse2.cpp

Lines changed: 11 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -160,7 +160,7 @@ namespace ojph {
160160
if (is_signed)
161161
{
162162
__m128i zero = _mm_setzero_si128();
163-
__m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1));
163+
__m128i bias = _mm_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
164164
for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
165165
__m128 t = _mm_loadu_ps(sp);
166166
t = _mm_mul_ps(t, mul);
@@ -180,7 +180,7 @@ namespace ojph {
180180
}
181181
else
182182
{
183-
__m128i half = _mm_set1_epi32(1 << (bit_depth - 1));
183+
__m128i half = _mm_set1_epi32((si32)(1ULL << (bit_depth - 1)));
184184
for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
185185
__m128 t = _mm_loadu_ps(sp);
186186
t = _mm_mul_ps(t, mul);
@@ -426,37 +426,35 @@ namespace ojph {
426426
(dst_line->flags & line_buf::LFT_32BIT) &&
427427
(dst_line->flags & line_buf::LFT_INTEGER) == 0);
428428

429-
__m128 mul = _mm_set1_ps((float)(1.0 / 65536.0 / 65536.0));
429+
assert(bit_depth <= 32);
430+
__m128 mul = _mm_set1_ps((float)(1.0 / (double)(1ULL << bit_depth)));
430431

431432
const si32* sp = src_line->i32 + src_line_offset;
432433
float* dp = dst_line->f32;
433-
si32 shift = 32 - (si32)bit_depth;
434434
if (is_signed)
435435
{
436436
__m128i zero = _mm_setzero_si128();
437-
__m128i bias = _mm_set1_epi32(-(si32)((ui32)INT_MIN + 1));
437+
__m128i bias = _mm_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
438438
for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
439439
__m128i t = _mm_loadu_si128((__m128i*)sp);
440-
__m128i u = _mm_slli_epi32(t, shift);
441440
if (NLT_TYPE3)
442441
{
443-
__m128i c = _mm_cmplt_epi32(u, zero); // 0xFFFFFFFF for -ve value
444-
__m128i neg = _mm_sub_epi32(bias, u); // - bias - value
442+
__m128i c = _mm_cmplt_epi32(t, zero); // 0xFFFFFFFF for -ve value
443+
__m128i neg = _mm_sub_epi32(bias, t); // - bias - value
445444
neg = _mm_and_si128(c, neg); // keep only - bias - value
446-
t = _mm_andnot_si128(c, u); // keep only +ve or 0
447-
u = _mm_or_si128(neg, t); // combine
445+
c = _mm_andnot_si128(c, t); // keep only +ve or 0
446+
t = _mm_or_si128(neg, c); // combine
448447
}
449-
__m128 v = _mm_cvtepi32_ps(u);
448+
__m128 v = _mm_cvtepi32_ps(t);
450449
v = _mm_mul_ps(v, mul);
451450
_mm_storeu_ps(dp, v);
452451
}
453452
}
454453
else
455454
{
456-
__m128i half = _mm_set1_epi32(INT_MIN);
455+
__m128i half = _mm_set1_epi32((si32)(1ULL << (bit_depth - 1)));
457456
for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
458457
__m128i t = _mm_loadu_si128((__m128i*)sp);
459-
t = _mm_slli_epi32(t, shift);
460458
t = _mm_sub_epi32(t, half);
461459
__m128 v = _mm_cvtepi32_ps(t);
462460
v = _mm_mul_ps(v, mul);

src/core/transform/ojph_colour_wasm.cpp

Lines changed: 10 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -327,7 +327,7 @@ namespace ojph {
327327
{
328328
const v128_t zero = wasm_f32x4_splat(0.0f);
329329
const v128_t half = wasm_f32x4_splat(0.5f);
330-
v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1));
330+
v128_t bias = wasm_i32x4_splat(-(si32)((1ULL << (bit_depth - 1)) + 1));
331331
for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
332332
v128_t t = wasm_v128_load(sp);
333333
t = wasm_f32x4_mul(t, mul);
@@ -349,7 +349,7 @@ namespace ojph {
349349
{
350350
const v128_t zero = wasm_f32x4_splat(0.0f);
351351
const v128_t half = wasm_f32x4_splat(0.5f);
352-
v128_t ihalf = wasm_i32x4_splat(1 << (bit_depth - 1));
352+
v128_t ihalf = wasm_i32x4_splat((si32)(1ULL << (bit_depth - 1)));
353353
for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
354354
v128_t t = wasm_v128_load(sp);
355355
t = wasm_f32x4_mul(t, mul);
@@ -392,25 +392,24 @@ namespace ojph {
392392
(dst_line->flags & line_buf::LFT_32BIT) &&
393393
(dst_line->flags & line_buf::LFT_INTEGER) == 0);
394394

395-
v128_t mul = wasm_f32x4_splat((float)(1.0 / 65536.0 / 65536.0));
395+
assert(bit_depth <= 32);
396+
v128_t mul = wasm_f32x4_splat((float)(1.0 / (double)(1ULL << bit_depth)));
396397

397398
const si32* sp = src_line->i32 + src_line_offset;
398399
float* dp = dst_line->f32;
399-
ui32 shift = (ui32)32 - bit_depth;
400400
if (is_signed)
401401
{
402402
v128_t zero = wasm_i32x4_splat(0);
403-
v128_t bias = wasm_i32x4_splat(-(si32)((ui32)INT_MIN + 1));
403+
v128_t bias = wasm_i32x4_splat(-(si32)((1ULL << (bit_depth - 1)) + 1));
404404
for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
405405
v128_t t = wasm_v128_load(sp);
406-
v128_t u = wasm_i32x4_shl(t, shift);
407406
if (NLT_TYPE3)
408407
{
409-
v128_t c = wasm_i32x4_lt(u, zero); // 0xFFFFFFFF for -ve value
410-
v128_t neg = wasm_i32x4_sub(bias, u); // - bias - value
408+
v128_t c = wasm_i32x4_lt(t, zero); // 0xFFFFFFFF for -ve value
409+
v128_t neg = wasm_i32x4_sub(bias, t); // - bias - value
411410
neg = wasm_v128_and(c, neg); // keep only - bias - value
412-
t = wasm_v128_andnot(u, c); // keep only +ve or 0
413-
u = wasm_v128_or(neg, t); // combine
411+
c = wasm_v128_andnot(t, c); // keep only +ve or 0
412+
t = wasm_v128_or(neg, c); // combine
414413
}
415414
v128_t v = wasm_f32x4_convert_i32x4(u);
416415
v = wasm_f32x4_mul(v, mul);
@@ -419,10 +418,9 @@ namespace ojph {
419418
}
420419
else
421420
{
422-
v128_t half = wasm_i32x4_splat(INT_MIN);
421+
v128_t half = wasm_i32x4_splat((si32)(1ULL << (bit_depth - 1)));
423422
for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
424423
v128_t t = wasm_v128_load(sp);
425-
v128_t u = wasm_i32x4_shl(t, shift);
426424
u = wasm_i32x4_sub(u, half);
427425
v128_t v = wasm_f32x4_convert_i32x4(u);
428426
v = wasm_f32x4_mul(v, mul);

0 commit comments

Comments
 (0)