Merge branch 'aous72:master' into feature/add-openexr-support

michaeldsmith · web-flow · commit ae77a310e79a · 2025-08-20T22:15:14.000-07:00
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
@@ -40,7 +40,7 @@ if(EMSCRIPTEN)
   endif()
 else()
   if (NOT OJPH_DISABLE_SIMD)
-    if (("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_X86_64") 
+    if (("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_X86_64")
       OR ("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_I386")
       OR MULTI_GEN_X86_64)
 
@@ -95,7 +95,7 @@ else()
         set_source_files_properties(coding/ojph_block_decoder_ssse3.cpp PROPERTIES COMPILE_FLAGS -mssse3)
         set_source_files_properties(coding/ojph_block_decoder_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
         set_source_files_properties(coding/ojph_block_encoder_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
-        set_source_files_properties(coding/ojph_block_encoder_avx512.cpp PROPERTIES COMPILE_FLAGS -mavx512cd)
+        set_source_files_properties(coding/ojph_block_encoder_avx512.cpp PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512cd")
         set_source_files_properties(transform/ojph_colour_sse.cpp PROPERTIES COMPILE_FLAGS -msse)
         set_source_files_properties(transform/ojph_colour_sse2.cpp PROPERTIES COMPILE_FLAGS -msse2)
         set_source_files_properties(transform/ojph_colour_avx.cpp PROPERTIES COMPILE_FLAGS -mavx)
@@ -104,7 +104,7 @@ else()
         set_source_files_properties(transform/ojph_transform_sse2.cpp PROPERTIES COMPILE_FLAGS -msse2)
         set_source_files_properties(transform/ojph_transform_avx.cpp PROPERTIES COMPILE_FLAGS -mavx)
         set_source_files_properties(transform/ojph_transform_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
-        set_source_files_properties(transform/ojph_transform_avx512.cpp PROPERTIES COMPILE_FLAGS -mavx512f)
+        set_source_files_properties(transform/ojph_transform_avx512.cpp PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512cd")
       endif()
     endif()
 
diff --git a/src/core/codestream/ojph_codeblock.h b/src/core/codestream/ojph_codeblock.h
@@ -3,21 +3,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -71,11 +71,11 @@ namespace ojph {
       };
 
     public:
-      static void pre_alloc(codestream *codestream, const size& nominal, 
+      static void pre_alloc(codestream *codestream, const size& nominal,
                             ui32 precision);
       void finalize_alloc(codestream *codestream, subband* parent,
                           const size& nominal, const size& cb_size,
-                          coded_cb_header* coded_cb, ui32 K_max, 
+                          coded_cb_header* coded_cb, ui32 K_max,
                           int tbx0, ui32 precision, ui32 comp_idx);
       void push(line_buf *line);
       void encode(mem_elastic_allocator *elastic);
@@ -115,7 +115,7 @@ namespace ojph {
     struct coded_cb_header
     {
       ui32 pass_length[2];
-      ui32 num_passes;
+      ui32 num_passes;       // number of passes to be decoded
       ui32 Kmax;
       ui32 missing_msbs;
       coded_lists *next_coded;
diff --git a/src/core/codestream/ojph_precinct.cpp b/src/core/codestream/ojph_precinct.cpp
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -254,7 +254,7 @@ namespace ojph {
               bits2 = 32 - (int)count_leading_zeros(cp->pass_length[1]);
             int bits = ojph_max(bits1, bits2 - extra_bit) - 3;
             bits = ojph_max(bits, 0);
-            bb_put_bits(&bb, 0xFFFFFFFEu, bits+1, 
+            bb_put_bits(&bb, 0xFFFFFFFEu, bits+1,
               elastic, cur_coded_list, ph_bytes);
 
             bb_put_bits(&bb, cp->pass_length[0], bits+3,
@@ -463,37 +463,54 @@ namespace ojph {
             }
             cp->num_passes = num_passes;
 
-            //parse pass lengths
-            //for one pass, one length, but for 2 or 3 passes, two lengths
-            int extra_bit = cp->num_passes > 2 ? 1 : 0;
-            int bits1 = 3;
+            // Parse pass lengths
+            // When number of passes is one, one length.
+            // When number of passes is two or three, two lengths.
+            // When number of passes > 3, we have placeholder passes.
+            // In this case, subtract multiples of 3 from the number of
+            // passes; for example, if we have 10 passes, we subtract 9,
+            // producing 1 pass.
+
+            // 1 => 1, 2 => 2, 3 => 3, 4 => 1, 5 => 2, 6 => 3
+            ui32 num_phld_passes = (num_passes - 1) / 3;
+            cp->missing_msbs += num_phld_passes;
+
+            num_phld_passes *= 3;
+            cp->num_passes = num_passes - num_phld_passes;
+            cp->pass_length[0] = cp->pass_length[1] = 0;
+
+            int Lblock = 3;
             bit = 1;
             while (bit)
             {
+              // add any extra bits here
               if (bb_read_bit(&bb, bit) == false)
               { data_left = 0; throw "error reading from file p8"; }
-              bits1 += bit;
+              Lblock += bit;
             }
 
-            if (bb_read_bits(&bb, bits1, bit) == false)
+            int bits = Lblock + 31 -
+              (int)count_leading_zeros(num_phld_passes + 1);
+            if (bb_read_bits(&bb, bits, bit) == false)
             { data_left = 0; throw "error reading from file p9"; }
-            if (bit < 2) { 
+            if (bit < 2)
               throw "The cleanup segment of an HT codeblock cannot contain "
                 "less than 2 bytes";
-            }
-            if (bit >= 65535) {
+            if (bit >= 65535)
               throw "The cleanup segment of an HT codeblock must contain "
                 "less than 65535 bytes";
-            }
             cp->pass_length[0] = bit;
-            if (num_passes > 1)
+
+            if (cp->num_passes > 1)
             {
-              if (bb_read_bits(&bb, bits1 + extra_bit, bit) == false)
+              //bits = Lblock + 31 - count_leading_zeros(cp->num_passes - 1);
+              // Equivalent to the above for 2 or 3 passes, but simpler.
+              bits = Lblock + (cp->num_passes > 2 ? 1 : 0);
+              if (bb_read_bits(&bb, bits, bit) == false)
               { data_left = 0; throw "error reading from file p10"; }
-              if (bit >= 2047) {
+              if (bit >= 2047)
                 throw "The refinement segment (SigProp and MagRep passes) of "
                   "an HT codeblock must contain less than 2047 bytes";
-              }
               cp->pass_length[1] = bit;
             }
           }
@@ -532,7 +549,7 @@ namespace ojph {
                   ui32 t = ojph_min(num_bytes, bb.bytes_left);
                   file->seek(t, infile_base::OJPH_SEEK_CUR);
                   ui32 bytes_read = (ui32)(file->tell() - cur_loc);
-                  cp->pass_length[0] = cp->pass_length[1] = 0; 
+                  cp->pass_length[0] = cp->pass_length[1] = 0;
                   bb.bytes_left -= bytes_read;
                   assert(bytes_read == t || bb.bytes_left == 0);
                 }
diff --git a/src/core/coding/ojph_block_encoder_avx512.cpp b/src/core/coding/ojph_block_encoder_avx512.cpp
@@ -539,7 +539,7 @@ static void proc_pixel(__m512i *src_vec, ui32 p,
         /*   s[0] = --val + (t >> 31); //v_n = 2(\mu_p-1) + s_n */
         val_vec[i] = _mm512_mask_sub_epi32(ZERO, val_mask[i], val_vec[i], ONE);
         _s_vec[i] = _mm512_mask_srli_epi32(ZERO, val_mask[i], src_vec[i], 31);
-        _s_vec[i] = 
+        _s_vec[i] =
           _mm512_mask_add_epi32(ZERO, val_mask[i], _s_vec[i], val_vec[i]);
         /* } */
     }
@@ -571,18 +571,18 @@ static void proc_pixel(__m512i *src_vec, ui32 p,
         ui32 o_idx = i & 0x1;
 
         eq_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], _eq_vec[o_idx]);
-        eq_vec[i] = _mm512_mask_permutexvar_epi32(eq_vec[i], 0xFF00, 
-                                                  idx[e_idx], 
+        eq_vec[i] = _mm512_mask_permutexvar_epi32(eq_vec[i], 0xFF00,
+                                                  idx[e_idx],
                                                   _eq_vec[o_idx + 2]);
 
         s_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], _s_vec[o_idx]);
         s_vec[i] = _mm512_mask_permutexvar_epi32(s_vec[i], 0xFF00,
-                                                 idx[e_idx], 
+                                                 idx[e_idx],
                                                  _s_vec[o_idx + 2]);
 
         _rho_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], val_vec[o_idx]);
         _rho_vec[i] = _mm512_mask_permutexvar_epi32(_rho_vec[i], 0xFF00,
-                                                    idx[e_idx], 
+                                                    idx[e_idx],
                                                     val_vec[o_idx + 2]);
         _rho_vec[i] = _mm512_slli_epi32(_rho_vec[i], i);
 
@@ -695,11 +695,11 @@ static void proc_ms_encode(ms_struct *msp,
         /* cwd = s[i * 4 + 0] & ((1U << m) - 1)
          * cwd_len = m
          */
-        _mm512_store_epi32(cwd_len, m_vec[i]);
+        _mm512_storeu_si512(cwd_len, m_vec[i]);
         tmp = _mm512_sllv_epi32(ONE, m_vec[i]);
         tmp = _mm512_sub_epi32(tmp, ONE);
         tmp = _mm512_and_epi32(tmp, s_vec[i]);
-        _mm512_store_epi32(cwd, tmp);
+        _mm512_storeu_si512(cwd, tmp);
 
         for (ui32 j = 0; j < 8; ++j) {
             ui32 idx = j * 2;
@@ -712,7 +712,7 @@ static void proc_ms_encode(ms_struct *msp,
     }
 }
 
-static __m512i cal_eps_vec(__m512i *eq_vec, __m512i &u_q_vec, 
+static __m512i cal_eps_vec(__m512i *eq_vec, __m512i &u_q_vec,
                            __m512i &e_qmax_vec)
 {
     /* if (u_q[i] > 0) {
@@ -755,7 +755,7 @@ static void update_lep(ui32 x, __m512i &prev_e_val_vec,
      */
     auto tmp = _mm512_mask_permutexvar_epi32(prev_e_val_vec, 0xFFFE,
                                              left_shift, eq_vec[3]);
-    prev_e_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift, 
+    prev_e_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
                                                    eq_vec[3]);
     e_val_vec[x] = _mm512_max_epi32(eq_vec[1], tmp);
 }
@@ -769,9 +769,9 @@ static void update_lcxp(ui32 x, __m512i &prev_cx_val_vec,
      * lcxp[0] = (ui8)((rho[0] & 8) >> 3);
      * Or (rho[0] & 2) and (rho[0] of the previous round & 8).
      */
-    auto tmp = _mm512_mask_permutexvar_epi32(prev_cx_val_vec, 0xFFFE, 
+    auto tmp = _mm512_mask_permutexvar_epi32(prev_cx_val_vec, 0xFFFE,
                                              left_shift, rho_vec);
-    prev_cx_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift, 
+    prev_cx_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
                                                     rho_vec);
 
     tmp = _mm512_and_epi32(tmp, _mm512_set1_epi32(8));
@@ -793,7 +793,7 @@ static __m512i cal_tuple(__m512i &cq_vec, __m512i &rho_vec,
     return _mm512_i32gather_epi32(tmp, vlc_tbl, 4);
 }
 
-static __m512i proc_cq1(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec, 
+static __m512i proc_cq1(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
                         const __m512i right_shift)
 {
     ojph_unused(x);
@@ -809,8 +809,8 @@ static __m512i proc_cq1(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
 static __m512i proc_cq2(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
                         const __m512i right_shift)
 {
-    // c_q[i + 1] = (lcxp[i + 1] + (lcxp[i + 2] << 2)) 
-    //            | (((rho[i] & 4) >> 1) | ((rho[i] & 8) >> 2)); 
+    // c_q[i + 1] = (lcxp[i + 1] + (lcxp[i + 2] << 2))
+    //            | (((rho[i] & 4) >> 1) | ((rho[i] & 8) >> 2));
     auto lcxp1_vec = _mm512_permutexvar_epi32(right_shift, cx_val_vec[x]);
     auto lcxp2_vec = _mm512_permutexvar_epi32(right_shift, cx_val_vec[x + 1]);
     auto tmp = _mm512_permutexvar_epi32(right_shift, lcxp1_vec);
@@ -831,7 +831,7 @@ static __m512i proc_cq2(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
 
 using fn_proc_cq = __m512i (*)(ui32, __m512i *, __m512i &, const __m512i);
 
-static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec, 
+static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec,
                              __m512i &rho_vec, __m512i u_q_vec, ui32 ignore,
                              const __m512i right_shift)
 {
@@ -849,7 +849,7 @@ static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec,
 
     /* if (u_q[i] > 0 && u_q[i + 1] > 0) { } */
     auto mel_need_encode2 = (ui16)_mm512_cmpgt_epi32_mask(u_q_vec, ZERO);
-    mel_need_encode2 = 
+    mel_need_encode2 =
       mel_need_encode2 & (ui16)_mm512_cmpgt_epi32_mask(tmp, ZERO);
 
     ui32 i_max = 16 - (ignore / 2);
@@ -873,7 +873,7 @@ static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec,
     }
 }
 
-static void proc_mel_encode2(mel_struct *melp, __m512i &cq_vec, 
+static void proc_mel_encode2(mel_struct *melp, __m512i &cq_vec,
                              __m512i &rho_vec, __m512i u_q_vec, ui32 ignore,
                              const __m512i right_shift)
 {
@@ -897,7 +897,7 @@ static void proc_mel_encode2(mel_struct *melp, __m512i &cq_vec,
     }
 }
 
-using fn_proc_mel_encode = void (*)(mel_struct *, __m512i &, __m512i &, 
+using fn_proc_mel_encode = void (*)(mel_struct *, __m512i &, __m512i &,
                                     __m512i, ui32, const __m512i);
 
 static void proc_vlc_encode1(vlc_struct_avx512 *vlcp, ui32 *tuple,
@@ -1006,8 +1006,8 @@ static void proc_vlc_encode2(vlc_struct_avx512 *vlcp, ui32 *tuple,
 
 using fn_proc_vlc_encode = void (*)(vlc_struct_avx512 *, ui32 *, ui32 *, ui32);
 
-void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs, 
-                                  ui32 num_passes, ui32 _width, ui32 height, 
+void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
+                                  ui32 num_passes, ui32 _width, ui32 height,
                                   ui32 stride, ui32* lengths,
                                   ojph::mem_elastic_allocator *elastic,
                                   ojph::coded_lists *& coded)
@@ -1111,7 +1111,7 @@ void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
 
             if (y + 1 < height) {
                 src_vec[1] = _mm512_maskz_loadu_epi32(load_mask0, sp + stride);
-                src_vec[3] = 
+                src_vec[3] =
                   _mm512_maskz_loadu_epi32(load_mask1, sp + 16 + stride);
             } else {
                 src_vec[1] = ZERO;
@@ -1148,7 +1148,7 @@ void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
             tmp = proc_cq(x, cx_val_vec, rho_vec, right_shift);
             auto cq_vec = _mm512_mask_permutexvar_epi32(prev_cq_vec, 0xFFFE,
                                                         left_shift, tmp);
-            prev_cq_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift, 
+            prev_cq_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
                                                         tmp);
 
             update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift);
@@ -1163,7 +1163,7 @@ void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
             __m512i tuple_vec = cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl);
             ui32 _ignore = ((n_loop - 1) == x) ? ignore : 0;
 
-            proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore, 
+            proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
                             right_shift);
 
             proc_ms_encode(&ms, tuple_vec, uq_vec, rho_vec, s_vec);
@@ -1177,8 +1177,8 @@ void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
              * So in the vlc_encode, the tuple will only be scaled by 2.
              */
             tuple_vec = _mm512_srli_epi32(tuple_vec, 4);
-            _mm512_store_epi32(tuple, tuple_vec);
-            _mm512_store_epi32(u_q, u_q_vec);
+            _mm512_storeu_si512(tuple, tuple_vec);
+            _mm512_storeu_si512(u_q, u_q_vec);
             proc_vlc_encode(&vlc, tuple, u_q, _ignore);
         }
 
diff --git a/src/core/common/ojph_version.h b/src/core/common/ojph_version.h