@@ -539,7 +539,7 @@ static void proc_pixel(__m512i *src_vec, ui32 p,
539539 /* s[0] = --val + (t >> 31); //v_n = 2(\mu_p-1) + s_n */
540540 val_vec[i] = _mm512_mask_sub_epi32 (ZERO, val_mask[i], val_vec[i], ONE);
541541 _s_vec[i] = _mm512_mask_srli_epi32 (ZERO, val_mask[i], src_vec[i], 31 );
542- _s_vec[i] =
542+ _s_vec[i] =
543543 _mm512_mask_add_epi32 (ZERO, val_mask[i], _s_vec[i], val_vec[i]);
544544 /* } */
545545 }
@@ -571,18 +571,18 @@ static void proc_pixel(__m512i *src_vec, ui32 p,
571571 ui32 o_idx = i & 0x1 ;
572572
573573 eq_vec[i] = _mm512_permutexvar_epi32 (idx[e_idx], _eq_vec[o_idx]);
574- eq_vec[i] = _mm512_mask_permutexvar_epi32 (eq_vec[i], 0xFF00 ,
575- idx[e_idx],
574+ eq_vec[i] = _mm512_mask_permutexvar_epi32 (eq_vec[i], 0xFF00 ,
575+ idx[e_idx],
576576 _eq_vec[o_idx + 2 ]);
577577
578578 s_vec[i] = _mm512_permutexvar_epi32 (idx[e_idx], _s_vec[o_idx]);
579579 s_vec[i] = _mm512_mask_permutexvar_epi32 (s_vec[i], 0xFF00 ,
580- idx[e_idx],
580+ idx[e_idx],
581581 _s_vec[o_idx + 2 ]);
582582
583583 _rho_vec[i] = _mm512_permutexvar_epi32 (idx[e_idx], val_vec[o_idx]);
584584 _rho_vec[i] = _mm512_mask_permutexvar_epi32 (_rho_vec[i], 0xFF00 ,
585- idx[e_idx],
585+ idx[e_idx],
586586 val_vec[o_idx + 2 ]);
587587 _rho_vec[i] = _mm512_slli_epi32 (_rho_vec[i], i);
588588
@@ -695,11 +695,11 @@ static void proc_ms_encode(ms_struct *msp,
695695 /* cwd = s[i * 4 + 0] & ((1U << m) - 1)
696696 * cwd_len = m
697697 */
698- _mm512_store_epi32 (cwd_len, m_vec[i]);
698+ _mm512_storeu_epi32 (cwd_len, m_vec[i]);
699699 tmp = _mm512_sllv_epi32 (ONE, m_vec[i]);
700700 tmp = _mm512_sub_epi32 (tmp, ONE);
701701 tmp = _mm512_and_epi32 (tmp, s_vec[i]);
702- _mm512_store_epi32 (cwd, tmp);
702+ _mm512_storeu_epi32 (cwd, tmp);
703703
704704 for (ui32 j = 0 ; j < 8 ; ++j) {
705705 ui32 idx = j * 2 ;
@@ -712,7 +712,7 @@ static void proc_ms_encode(ms_struct *msp,
712712 }
713713}
714714
715- static __m512i cal_eps_vec (__m512i *eq_vec, __m512i &u_q_vec,
715+ static __m512i cal_eps_vec (__m512i *eq_vec, __m512i &u_q_vec,
716716 __m512i &e_qmax_vec)
717717{
718718 /* if (u_q[i] > 0) {
@@ -755,7 +755,7 @@ static void update_lep(ui32 x, __m512i &prev_e_val_vec,
755755 */
756756 auto tmp = _mm512_mask_permutexvar_epi32 (prev_e_val_vec, 0xFFFE ,
757757 left_shift, eq_vec[3 ]);
758- prev_e_val_vec = _mm512_mask_permutexvar_epi32 (ZERO, 0x1 , left_shift,
758+ prev_e_val_vec = _mm512_mask_permutexvar_epi32 (ZERO, 0x1 , left_shift,
759759 eq_vec[3 ]);
760760 e_val_vec[x] = _mm512_max_epi32 (eq_vec[1 ], tmp);
761761}
@@ -769,9 +769,9 @@ static void update_lcxp(ui32 x, __m512i &prev_cx_val_vec,
769769 * lcxp[0] = (ui8)((rho[0] & 8) >> 3);
770770 * Or (rho[0] & 2) and (rho[0] of the previous round & 8).
771771 */
772- auto tmp = _mm512_mask_permutexvar_epi32 (prev_cx_val_vec, 0xFFFE ,
772+ auto tmp = _mm512_mask_permutexvar_epi32 (prev_cx_val_vec, 0xFFFE ,
773773 left_shift, rho_vec);
774- prev_cx_val_vec = _mm512_mask_permutexvar_epi32 (ZERO, 0x1 , left_shift,
774+ prev_cx_val_vec = _mm512_mask_permutexvar_epi32 (ZERO, 0x1 , left_shift,
775775 rho_vec);
776776
777777 tmp = _mm512_and_epi32 (tmp, _mm512_set1_epi32 (8 ));
@@ -793,7 +793,7 @@ static __m512i cal_tuple(__m512i &cq_vec, __m512i &rho_vec,
793793 return _mm512_i32gather_epi32 (tmp, vlc_tbl, 4 );
794794}
795795
796- static __m512i proc_cq1 (ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
796+ static __m512i proc_cq1 (ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
797797 const __m512i right_shift)
798798{
799799 ojph_unused (x);
@@ -809,8 +809,8 @@ static __m512i proc_cq1(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
809809static __m512i proc_cq2 (ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
810810 const __m512i right_shift)
811811{
812- // c_q[i + 1] = (lcxp[i + 1] + (lcxp[i + 2] << 2))
813- // | (((rho[i] & 4) >> 1) | ((rho[i] & 8) >> 2));
812+ // c_q[i + 1] = (lcxp[i + 1] + (lcxp[i + 2] << 2))
813+ // | (((rho[i] & 4) >> 1) | ((rho[i] & 8) >> 2));
814814 auto lcxp1_vec = _mm512_permutexvar_epi32 (right_shift, cx_val_vec[x]);
815815 auto lcxp2_vec = _mm512_permutexvar_epi32 (right_shift, cx_val_vec[x + 1 ]);
816816 auto tmp = _mm512_permutexvar_epi32 (right_shift, lcxp1_vec);
@@ -831,7 +831,7 @@ static __m512i proc_cq2(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
831831
832832using fn_proc_cq = __m512i (*)(ui32, __m512i *, __m512i &, const __m512i);
833833
834- static void proc_mel_encode1 (mel_struct *melp, __m512i &cq_vec,
834+ static void proc_mel_encode1 (mel_struct *melp, __m512i &cq_vec,
835835 __m512i &rho_vec, __m512i u_q_vec, ui32 ignore,
836836 const __m512i right_shift)
837837{
@@ -849,7 +849,7 @@ static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec,
849849
850850 /* if (u_q[i] > 0 && u_q[i + 1] > 0) { } */
851851 auto mel_need_encode2 = (ui16)_mm512_cmpgt_epi32_mask (u_q_vec, ZERO);
852- mel_need_encode2 =
852+ mel_need_encode2 =
853853 mel_need_encode2 & (ui16)_mm512_cmpgt_epi32_mask (tmp, ZERO);
854854
855855 ui32 i_max = 16 - (ignore / 2 );
@@ -873,7 +873,7 @@ static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec,
873873 }
874874}
875875
876- static void proc_mel_encode2 (mel_struct *melp, __m512i &cq_vec,
876+ static void proc_mel_encode2 (mel_struct *melp, __m512i &cq_vec,
877877 __m512i &rho_vec, __m512i u_q_vec, ui32 ignore,
878878 const __m512i right_shift)
879879{
@@ -897,7 +897,7 @@ static void proc_mel_encode2(mel_struct *melp, __m512i &cq_vec,
897897 }
898898}
899899
900- using fn_proc_mel_encode = void (*)(mel_struct *, __m512i &, __m512i &,
900+ using fn_proc_mel_encode = void (*)(mel_struct *, __m512i &, __m512i &,
901901 __m512i, ui32, const __m512i);
902902
903903static void proc_vlc_encode1 (vlc_struct_avx512 *vlcp, ui32 *tuple,
@@ -1006,8 +1006,8 @@ static void proc_vlc_encode2(vlc_struct_avx512 *vlcp, ui32 *tuple,
10061006
10071007using fn_proc_vlc_encode = void (*)(vlc_struct_avx512 *, ui32 *, ui32 *, ui32);
10081008
1009- void ojph_encode_codeblock_avx512 (ui32* buf, ui32 missing_msbs,
1010- ui32 num_passes, ui32 _width, ui32 height,
1009+ void ojph_encode_codeblock_avx512 (ui32* buf, ui32 missing_msbs,
1010+ ui32 num_passes, ui32 _width, ui32 height,
10111011 ui32 stride, ui32* lengths,
10121012 ojph::mem_elastic_allocator *elastic,
10131013 ojph::coded_lists *& coded)
@@ -1111,7 +1111,7 @@ void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
11111111
11121112 if (y + 1 < height) {
11131113 src_vec[1 ] = _mm512_maskz_loadu_epi32 (load_mask0, sp + stride);
1114- src_vec[3 ] =
1114+ src_vec[3 ] =
11151115 _mm512_maskz_loadu_epi32 (load_mask1, sp + 16 + stride);
11161116 } else {
11171117 src_vec[1 ] = ZERO;
@@ -1148,7 +1148,7 @@ void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
11481148 tmp = proc_cq (x, cx_val_vec, rho_vec, right_shift);
11491149 auto cq_vec = _mm512_mask_permutexvar_epi32 (prev_cq_vec, 0xFFFE ,
11501150 left_shift, tmp);
1151- prev_cq_vec = _mm512_mask_permutexvar_epi32 (ZERO, 0x1 , left_shift,
1151+ prev_cq_vec = _mm512_mask_permutexvar_epi32 (ZERO, 0x1 , left_shift,
11521152 tmp);
11531153
11541154 update_lep (x, prev_e_val_vec, eq_vec, e_val_vec, left_shift);
@@ -1163,7 +1163,7 @@ void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
11631163 __m512i tuple_vec = cal_tuple (cq_vec, rho_vec, eps_vec, vlc_tbl);
11641164 ui32 _ignore = ((n_loop - 1 ) == x) ? ignore : 0 ;
11651165
1166- proc_mel_encode (&mel, cq_vec, rho_vec, u_q_vec, _ignore,
1166+ proc_mel_encode (&mel, cq_vec, rho_vec, u_q_vec, _ignore,
11671167 right_shift);
11681168
11691169 proc_ms_encode (&ms, tuple_vec, uq_vec, rho_vec, s_vec);
@@ -1177,8 +1177,8 @@ void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
11771177 * So in the vlc_encode, the tuple will only be scaled by 2.
11781178 */
11791179 tuple_vec = _mm512_srli_epi32 (tuple_vec, 4 );
1180- _mm512_store_epi32 (tuple, tuple_vec);
1181- _mm512_store_epi32 (u_q, u_q_vec);
1180+ _mm512_storeu_epi32 (tuple, tuple_vec);
1181+ _mm512_storeu_epi32 (u_q, u_q_vec);
11821182 proc_vlc_encode (&vlc, tuple, u_q, _ignore);
11831183 }
11841184
0 commit comments