Skip to content

Commit ae77a31

Browse files
Merge branch 'aous72:master' into feature/add-openexr-support
2 parents 3eafb16 + caa3d3d commit ae77a31

File tree

5 files changed

+80
-63
lines changed

5 files changed

+80
-63
lines changed

src/core/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ if(EMSCRIPTEN)
4040
endif()
4141
else()
4242
if (NOT OJPH_DISABLE_SIMD)
43-
if (("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_X86_64")
43+
if (("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_X86_64")
4444
OR ("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_I386")
4545
OR MULTI_GEN_X86_64)
4646

@@ -95,7 +95,7 @@ else()
9595
set_source_files_properties(coding/ojph_block_decoder_ssse3.cpp PROPERTIES COMPILE_FLAGS -mssse3)
9696
set_source_files_properties(coding/ojph_block_decoder_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
9797
set_source_files_properties(coding/ojph_block_encoder_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
98-
set_source_files_properties(coding/ojph_block_encoder_avx512.cpp PROPERTIES COMPILE_FLAGS -mavx512cd)
98+
set_source_files_properties(coding/ojph_block_encoder_avx512.cpp PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512cd")
9999
set_source_files_properties(transform/ojph_colour_sse.cpp PROPERTIES COMPILE_FLAGS -msse)
100100
set_source_files_properties(transform/ojph_colour_sse2.cpp PROPERTIES COMPILE_FLAGS -msse2)
101101
set_source_files_properties(transform/ojph_colour_avx.cpp PROPERTIES COMPILE_FLAGS -mavx)
@@ -104,7 +104,7 @@ else()
104104
set_source_files_properties(transform/ojph_transform_sse2.cpp PROPERTIES COMPILE_FLAGS -msse2)
105105
set_source_files_properties(transform/ojph_transform_avx.cpp PROPERTIES COMPILE_FLAGS -mavx)
106106
set_source_files_properties(transform/ojph_transform_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
107-
set_source_files_properties(transform/ojph_transform_avx512.cpp PROPERTIES COMPILE_FLAGS -mavx512f)
107+
set_source_files_properties(transform/ojph_transform_avx512.cpp PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512cd")
108108
endif()
109109
endif()
110110

src/core/codestream/ojph_codeblock.h

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,21 +3,21 @@
33
// This software is released under the 2-Clause BSD license, included
44
// below.
55
//
6-
// Copyright (c) 2019, Aous Naman
6+
// Copyright (c) 2019, Aous Naman
77
// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
88
// Copyright (c) 2019, The University of New South Wales, Australia
9-
//
9+
//
1010
// Redistribution and use in source and binary forms, with or without
1111
// modification, are permitted provided that the following conditions are
1212
// met:
13-
//
13+
//
1414
// 1. Redistributions of source code must retain the above copyright
1515
// notice, this list of conditions and the following disclaimer.
16-
//
16+
//
1717
// 2. Redistributions in binary form must reproduce the above copyright
1818
// notice, this list of conditions and the following disclaimer in the
1919
// documentation and/or other materials provided with the distribution.
20-
//
20+
//
2121
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
2222
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
2323
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -71,11 +71,11 @@ namespace ojph {
7171
};
7272

7373
public:
74-
static void pre_alloc(codestream *codestream, const size& nominal,
74+
static void pre_alloc(codestream *codestream, const size& nominal,
7575
ui32 precision);
7676
void finalize_alloc(codestream *codestream, subband* parent,
7777
const size& nominal, const size& cb_size,
78-
coded_cb_header* coded_cb, ui32 K_max,
78+
coded_cb_header* coded_cb, ui32 K_max,
7979
int tbx0, ui32 precision, ui32 comp_idx);
8080
void push(line_buf *line);
8181
void encode(mem_elastic_allocator *elastic);
@@ -115,7 +115,7 @@ namespace ojph {
115115
struct coded_cb_header
116116
{
117117
ui32 pass_length[2];
118-
ui32 num_passes;
118+
ui32 num_passes; // number of passes to be decoded
119119
ui32 Kmax;
120120
ui32 missing_msbs;
121121
coded_lists *next_coded;

src/core/codestream/ojph_precinct.cpp

Lines changed: 38 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,21 @@
22
// This software is released under the 2-Clause BSD license, included
33
// below.
44
//
5-
// Copyright (c) 2019, Aous Naman
5+
// Copyright (c) 2019, Aous Naman
66
// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
77
// Copyright (c) 2019, The University of New South Wales, Australia
8-
//
8+
//
99
// Redistribution and use in source and binary forms, with or without
1010
// modification, are permitted provided that the following conditions are
1111
// met:
12-
//
12+
//
1313
// 1. Redistributions of source code must retain the above copyright
1414
// notice, this list of conditions and the following disclaimer.
15-
//
15+
//
1616
// 2. Redistributions in binary form must reproduce the above copyright
1717
// notice, this list of conditions and the following disclaimer in the
1818
// documentation and/or other materials provided with the distribution.
19-
//
19+
//
2020
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
2121
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
2222
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -254,7 +254,7 @@ namespace ojph {
254254
bits2 = 32 - (int)count_leading_zeros(cp->pass_length[1]);
255255
int bits = ojph_max(bits1, bits2 - extra_bit) - 3;
256256
bits = ojph_max(bits, 0);
257-
bb_put_bits(&bb, 0xFFFFFFFEu, bits+1,
257+
bb_put_bits(&bb, 0xFFFFFFFEu, bits+1,
258258
elastic, cur_coded_list, ph_bytes);
259259

260260
bb_put_bits(&bb, cp->pass_length[0], bits+3,
@@ -463,37 +463,54 @@ namespace ojph {
463463
}
464464
cp->num_passes = num_passes;
465465

466-
//parse pass lengths
467-
//for one pass, one length, but for 2 or 3 passes, two lengths
468-
int extra_bit = cp->num_passes > 2 ? 1 : 0;
469-
int bits1 = 3;
466+
// Parse pass lengths
467+
// When number of passes is one, one length.
468+
// When number of passes is two or three, two lengths.
469+
// When number of passes > 3, we have placeholder passes;
470+
// In this case, subtract multiples of 3 from the number of
471+
// passes; for example, if we have 10 passes, we subtract 9,
472+
// producing 1 pass.
473+
474+
// signalled passes => decoded passes: 1 => 1, 2 => 2, 3 => 3, 4 => 1, 5 => 2, 6 => 3
475+
ui32 num_phld_passes = (num_passes - 1) / 3;
476+
cp->missing_msbs += num_phld_passes;
477+
478+
num_phld_passes *= 3;
479+
cp->num_passes = num_passes - num_phld_passes;
480+
cp->pass_length[0] = cp->pass_length[1] = 0;
481+
482+
int Lblock = 3;
470483
bit = 1;
471484
while (bit)
472485
{
486+
// add any extra bits here
473487
if (bb_read_bit(&bb, bit) == false)
474488
{ data_left = 0; throw "error reading from file p8"; }
475-
bits1 += bit;
489+
Lblock += bit;
476490
}
477491

478-
if (bb_read_bits(&bb, bits1, bit) == false)
492+
int bits = Lblock + 31 -
493+
(int)count_leading_zeros(num_phld_passes + 1);
494+
if (bb_read_bits(&bb, bits, bit) == false)
479495
{ data_left = 0; throw "error reading from file p9"; }
480-
if (bit < 2) {
496+
if (bit < 2)
481497
throw "The cleanup segment of an HT codeblock cannot contain "
482498
"less than 2 bytes";
483-
}
484-
if (bit >= 65535) {
499+
if (bit >= 65535)
485500
throw "The cleanup segment of an HT codeblock must contain "
486501
"less than 65535 bytes";
487-
}
488502
cp->pass_length[0] = bit;
489-
if (num_passes > 1)
503+
504+
if (cp->num_passes > 1)
490505
{
491-
if (bb_read_bits(&bb, bits1 + extra_bit, bit) == false)
506+
//bits = Lblock + 31 - count_leading_zeros(cp->num_passes - 1);
507+
// The following is a simpler equivalent of the above: here cp->num_passes is 2 or 3, giving 0 or 1 extra bit.
508+
bits = Lblock + (cp->num_passes > 2 ? 1 : 0);
509+
if (bb_read_bits(&bb, bits, bit) == false)
492510
{ data_left = 0; throw "error reading from file p10"; }
493-
if (bit >= 2047) {
511+
if (bit >= 2047)
494512
throw "The refinement segment (SigProp and MagRep passes) of "
495513
"an HT codeblock must contain less than 2047 bytes";
496-
}
497514
cp->pass_length[1] = bit;
498515
}
499516
}
@@ -532,7 +549,7 @@ namespace ojph {
532549
ui32 t = ojph_min(num_bytes, bb.bytes_left);
533550
file->seek(t, infile_base::OJPH_SEEK_CUR);
534551
ui32 bytes_read = (ui32)(file->tell() - cur_loc);
535-
cp->pass_length[0] = cp->pass_length[1] = 0;
552+
cp->pass_length[0] = cp->pass_length[1] = 0;
536553
bb.bytes_left -= bytes_read;
537554
assert(bytes_read == t || bb.bytes_left == 0);
538555
}

src/core/coding/ojph_block_encoder_avx512.cpp

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -539,7 +539,7 @@ static void proc_pixel(__m512i *src_vec, ui32 p,
539539
/* s[0] = --val + (t >> 31); //v_n = 2(\mu_p-1) + s_n */
540540
val_vec[i] = _mm512_mask_sub_epi32(ZERO, val_mask[i], val_vec[i], ONE);
541541
_s_vec[i] = _mm512_mask_srli_epi32(ZERO, val_mask[i], src_vec[i], 31);
542-
_s_vec[i] =
542+
_s_vec[i] =
543543
_mm512_mask_add_epi32(ZERO, val_mask[i], _s_vec[i], val_vec[i]);
544544
/* } */
545545
}
@@ -571,18 +571,18 @@ static void proc_pixel(__m512i *src_vec, ui32 p,
571571
ui32 o_idx = i & 0x1;
572572

573573
eq_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], _eq_vec[o_idx]);
574-
eq_vec[i] = _mm512_mask_permutexvar_epi32(eq_vec[i], 0xFF00,
575-
idx[e_idx],
574+
eq_vec[i] = _mm512_mask_permutexvar_epi32(eq_vec[i], 0xFF00,
575+
idx[e_idx],
576576
_eq_vec[o_idx + 2]);
577577

578578
s_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], _s_vec[o_idx]);
579579
s_vec[i] = _mm512_mask_permutexvar_epi32(s_vec[i], 0xFF00,
580-
idx[e_idx],
580+
idx[e_idx],
581581
_s_vec[o_idx + 2]);
582582

583583
_rho_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], val_vec[o_idx]);
584584
_rho_vec[i] = _mm512_mask_permutexvar_epi32(_rho_vec[i], 0xFF00,
585-
idx[e_idx],
585+
idx[e_idx],
586586
val_vec[o_idx + 2]);
587587
_rho_vec[i] = _mm512_slli_epi32(_rho_vec[i], i);
588588

@@ -695,11 +695,11 @@ static void proc_ms_encode(ms_struct *msp,
695695
/* cwd = s[i * 4 + 0] & ((1U << m) - 1)
696696
* cwd_len = m
697697
*/
698-
_mm512_store_epi32(cwd_len, m_vec[i]);
698+
_mm512_storeu_si512(cwd_len, m_vec[i]);
699699
tmp = _mm512_sllv_epi32(ONE, m_vec[i]);
700700
tmp = _mm512_sub_epi32(tmp, ONE);
701701
tmp = _mm512_and_epi32(tmp, s_vec[i]);
702-
_mm512_store_epi32(cwd, tmp);
702+
_mm512_storeu_si512(cwd, tmp);
703703

704704
for (ui32 j = 0; j < 8; ++j) {
705705
ui32 idx = j * 2;
@@ -712,7 +712,7 @@ static void proc_ms_encode(ms_struct *msp,
712712
}
713713
}
714714

715-
static __m512i cal_eps_vec(__m512i *eq_vec, __m512i &u_q_vec,
715+
static __m512i cal_eps_vec(__m512i *eq_vec, __m512i &u_q_vec,
716716
__m512i &e_qmax_vec)
717717
{
718718
/* if (u_q[i] > 0) {
@@ -755,7 +755,7 @@ static void update_lep(ui32 x, __m512i &prev_e_val_vec,
755755
*/
756756
auto tmp = _mm512_mask_permutexvar_epi32(prev_e_val_vec, 0xFFFE,
757757
left_shift, eq_vec[3]);
758-
prev_e_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
758+
prev_e_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
759759
eq_vec[3]);
760760
e_val_vec[x] = _mm512_max_epi32(eq_vec[1], tmp);
761761
}
@@ -769,9 +769,9 @@ static void update_lcxp(ui32 x, __m512i &prev_cx_val_vec,
769769
* lcxp[0] = (ui8)((rho[0] & 8) >> 3);
770770
* Or (rho[0] & 2) and (rho[0] of the previous round & 8).
771771
*/
772-
auto tmp = _mm512_mask_permutexvar_epi32(prev_cx_val_vec, 0xFFFE,
772+
auto tmp = _mm512_mask_permutexvar_epi32(prev_cx_val_vec, 0xFFFE,
773773
left_shift, rho_vec);
774-
prev_cx_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
774+
prev_cx_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
775775
rho_vec);
776776

777777
tmp = _mm512_and_epi32(tmp, _mm512_set1_epi32(8));
@@ -793,7 +793,7 @@ static __m512i cal_tuple(__m512i &cq_vec, __m512i &rho_vec,
793793
return _mm512_i32gather_epi32(tmp, vlc_tbl, 4);
794794
}
795795

796-
static __m512i proc_cq1(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
796+
static __m512i proc_cq1(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
797797
const __m512i right_shift)
798798
{
799799
ojph_unused(x);
@@ -809,8 +809,8 @@ static __m512i proc_cq1(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
809809
static __m512i proc_cq2(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
810810
const __m512i right_shift)
811811
{
812-
// c_q[i + 1] = (lcxp[i + 1] + (lcxp[i + 2] << 2))
813-
// | (((rho[i] & 4) >> 1) | ((rho[i] & 8) >> 2));
812+
// c_q[i + 1] = (lcxp[i + 1] + (lcxp[i + 2] << 2))
813+
// | (((rho[i] & 4) >> 1) | ((rho[i] & 8) >> 2));
814814
auto lcxp1_vec = _mm512_permutexvar_epi32(right_shift, cx_val_vec[x]);
815815
auto lcxp2_vec = _mm512_permutexvar_epi32(right_shift, cx_val_vec[x + 1]);
816816
auto tmp = _mm512_permutexvar_epi32(right_shift, lcxp1_vec);
@@ -831,7 +831,7 @@ static __m512i proc_cq2(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
831831

832832
using fn_proc_cq = __m512i (*)(ui32, __m512i *, __m512i &, const __m512i);
833833

834-
static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec,
834+
static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec,
835835
__m512i &rho_vec, __m512i u_q_vec, ui32 ignore,
836836
const __m512i right_shift)
837837
{
@@ -849,7 +849,7 @@ static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec,
849849

850850
/* if (u_q[i] > 0 && u_q[i + 1] > 0) { } */
851851
auto mel_need_encode2 = (ui16)_mm512_cmpgt_epi32_mask(u_q_vec, ZERO);
852-
mel_need_encode2 =
852+
mel_need_encode2 =
853853
mel_need_encode2 & (ui16)_mm512_cmpgt_epi32_mask(tmp, ZERO);
854854

855855
ui32 i_max = 16 - (ignore / 2);
@@ -873,7 +873,7 @@ static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec,
873873
}
874874
}
875875

876-
static void proc_mel_encode2(mel_struct *melp, __m512i &cq_vec,
876+
static void proc_mel_encode2(mel_struct *melp, __m512i &cq_vec,
877877
__m512i &rho_vec, __m512i u_q_vec, ui32 ignore,
878878
const __m512i right_shift)
879879
{
@@ -897,7 +897,7 @@ static void proc_mel_encode2(mel_struct *melp, __m512i &cq_vec,
897897
}
898898
}
899899

900-
using fn_proc_mel_encode = void (*)(mel_struct *, __m512i &, __m512i &,
900+
using fn_proc_mel_encode = void (*)(mel_struct *, __m512i &, __m512i &,
901901
__m512i, ui32, const __m512i);
902902

903903
static void proc_vlc_encode1(vlc_struct_avx512 *vlcp, ui32 *tuple,
@@ -1006,8 +1006,8 @@ static void proc_vlc_encode2(vlc_struct_avx512 *vlcp, ui32 *tuple,
10061006

10071007
using fn_proc_vlc_encode = void (*)(vlc_struct_avx512 *, ui32 *, ui32 *, ui32);
10081008

1009-
void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
1010-
ui32 num_passes, ui32 _width, ui32 height,
1009+
void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
1010+
ui32 num_passes, ui32 _width, ui32 height,
10111011
ui32 stride, ui32* lengths,
10121012
ojph::mem_elastic_allocator *elastic,
10131013
ojph::coded_lists *& coded)
@@ -1111,7 +1111,7 @@ void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
11111111

11121112
if (y + 1 < height) {
11131113
src_vec[1] = _mm512_maskz_loadu_epi32(load_mask0, sp + stride);
1114-
src_vec[3] =
1114+
src_vec[3] =
11151115
_mm512_maskz_loadu_epi32(load_mask1, sp + 16 + stride);
11161116
} else {
11171117
src_vec[1] = ZERO;
@@ -1148,7 +1148,7 @@ void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
11481148
tmp = proc_cq(x, cx_val_vec, rho_vec, right_shift);
11491149
auto cq_vec = _mm512_mask_permutexvar_epi32(prev_cq_vec, 0xFFFE,
11501150
left_shift, tmp);
1151-
prev_cq_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
1151+
prev_cq_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
11521152
tmp);
11531153

11541154
update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift);
@@ -1163,7 +1163,7 @@ void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
11631163
__m512i tuple_vec = cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl);
11641164
ui32 _ignore = ((n_loop - 1) == x) ? ignore : 0;
11651165

1166-
proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
1166+
proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
11671167
right_shift);
11681168

11691169
proc_ms_encode(&ms, tuple_vec, uq_vec, rho_vec, s_vec);
@@ -1177,8 +1177,8 @@ void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
11771177
* So in the vlc_encode, the tuple will only be scaled by 2.
11781178
*/
11791179
tuple_vec = _mm512_srli_epi32(tuple_vec, 4);
1180-
_mm512_store_epi32(tuple, tuple_vec);
1181-
_mm512_store_epi32(u_q, u_q_vec);
1180+
_mm512_storeu_si512(tuple, tuple_vec);
1181+
_mm512_storeu_si512(u_q, u_q_vec);
11821182
proc_vlc_encode(&vlc, tuple, u_q, _ignore);
11831183
}
11841184

0 commit comments

Comments
 (0)