Skip to content

Commit 4ca0a9d

Browse files
committed
Merge branch 'master' into placeholder_passes
2 parents 8246e79 + 2ff2b1b commit 4ca0a9d

File tree

1 file changed

+25
-25
lines changed

1 file changed

+25
-25
lines changed

src/core/coding/ojph_block_encoder_avx512.cpp

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -539,7 +539,7 @@ static void proc_pixel(__m512i *src_vec, ui32 p,
539539
/* s[0] = --val + (t >> 31); //v_n = 2(\mu_p-1) + s_n */
540540
val_vec[i] = _mm512_mask_sub_epi32(ZERO, val_mask[i], val_vec[i], ONE);
541541
_s_vec[i] = _mm512_mask_srli_epi32(ZERO, val_mask[i], src_vec[i], 31);
542-
_s_vec[i] =
542+
_s_vec[i] =
543543
_mm512_mask_add_epi32(ZERO, val_mask[i], _s_vec[i], val_vec[i]);
544544
/* } */
545545
}
@@ -571,18 +571,18 @@ static void proc_pixel(__m512i *src_vec, ui32 p,
571571
ui32 o_idx = i & 0x1;
572572

573573
eq_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], _eq_vec[o_idx]);
574-
eq_vec[i] = _mm512_mask_permutexvar_epi32(eq_vec[i], 0xFF00,
575-
idx[e_idx],
574+
eq_vec[i] = _mm512_mask_permutexvar_epi32(eq_vec[i], 0xFF00,
575+
idx[e_idx],
576576
_eq_vec[o_idx + 2]);
577577

578578
s_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], _s_vec[o_idx]);
579579
s_vec[i] = _mm512_mask_permutexvar_epi32(s_vec[i], 0xFF00,
580-
idx[e_idx],
580+
idx[e_idx],
581581
_s_vec[o_idx + 2]);
582582

583583
_rho_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], val_vec[o_idx]);
584584
_rho_vec[i] = _mm512_mask_permutexvar_epi32(_rho_vec[i], 0xFF00,
585-
idx[e_idx],
585+
idx[e_idx],
586586
val_vec[o_idx + 2]);
587587
_rho_vec[i] = _mm512_slli_epi32(_rho_vec[i], i);
588588

@@ -695,11 +695,11 @@ static void proc_ms_encode(ms_struct *msp,
695695
/* cwd = s[i * 4 + 0] & ((1U << m) - 1)
696696
* cwd_len = m
697697
*/
698-
_mm512_store_epi32(cwd_len, m_vec[i]);
698+
_mm512_storeu_epi32(cwd_len, m_vec[i]);
699699
tmp = _mm512_sllv_epi32(ONE, m_vec[i]);
700700
tmp = _mm512_sub_epi32(tmp, ONE);
701701
tmp = _mm512_and_epi32(tmp, s_vec[i]);
702-
_mm512_store_epi32(cwd, tmp);
702+
_mm512_storeu_epi32(cwd, tmp);
703703

704704
for (ui32 j = 0; j < 8; ++j) {
705705
ui32 idx = j * 2;
@@ -712,7 +712,7 @@ static void proc_ms_encode(ms_struct *msp,
712712
}
713713
}
714714

715-
static __m512i cal_eps_vec(__m512i *eq_vec, __m512i &u_q_vec,
715+
static __m512i cal_eps_vec(__m512i *eq_vec, __m512i &u_q_vec,
716716
__m512i &e_qmax_vec)
717717
{
718718
/* if (u_q[i] > 0) {
@@ -755,7 +755,7 @@ static void update_lep(ui32 x, __m512i &prev_e_val_vec,
755755
*/
756756
auto tmp = _mm512_mask_permutexvar_epi32(prev_e_val_vec, 0xFFFE,
757757
left_shift, eq_vec[3]);
758-
prev_e_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
758+
prev_e_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
759759
eq_vec[3]);
760760
e_val_vec[x] = _mm512_max_epi32(eq_vec[1], tmp);
761761
}
@@ -769,9 +769,9 @@ static void update_lcxp(ui32 x, __m512i &prev_cx_val_vec,
769769
* lcxp[0] = (ui8)((rho[0] & 8) >> 3);
770770
* Or (rho[0] & 2) and (rho[0] of the previous round & 8).
771771
*/
772-
auto tmp = _mm512_mask_permutexvar_epi32(prev_cx_val_vec, 0xFFFE,
772+
auto tmp = _mm512_mask_permutexvar_epi32(prev_cx_val_vec, 0xFFFE,
773773
left_shift, rho_vec);
774-
prev_cx_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
774+
prev_cx_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
775775
rho_vec);
776776

777777
tmp = _mm512_and_epi32(tmp, _mm512_set1_epi32(8));
@@ -793,7 +793,7 @@ static __m512i cal_tuple(__m512i &cq_vec, __m512i &rho_vec,
793793
return _mm512_i32gather_epi32(tmp, vlc_tbl, 4);
794794
}
795795

796-
static __m512i proc_cq1(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
796+
static __m512i proc_cq1(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
797797
const __m512i right_shift)
798798
{
799799
ojph_unused(x);
@@ -809,8 +809,8 @@ static __m512i proc_cq1(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
809809
static __m512i proc_cq2(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
810810
const __m512i right_shift)
811811
{
812-
// c_q[i + 1] = (lcxp[i + 1] + (lcxp[i + 2] << 2))
813-
// | (((rho[i] & 4) >> 1) | ((rho[i] & 8) >> 2));
812+
// c_q[i + 1] = (lcxp[i + 1] + (lcxp[i + 2] << 2))
813+
// | (((rho[i] & 4) >> 1) | ((rho[i] & 8) >> 2));
814814
auto lcxp1_vec = _mm512_permutexvar_epi32(right_shift, cx_val_vec[x]);
815815
auto lcxp2_vec = _mm512_permutexvar_epi32(right_shift, cx_val_vec[x + 1]);
816816
auto tmp = _mm512_permutexvar_epi32(right_shift, lcxp1_vec);
@@ -831,7 +831,7 @@ static __m512i proc_cq2(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
831831

832832
using fn_proc_cq = __m512i (*)(ui32, __m512i *, __m512i &, const __m512i);
833833

834-
static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec,
834+
static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec,
835835
__m512i &rho_vec, __m512i u_q_vec, ui32 ignore,
836836
const __m512i right_shift)
837837
{
@@ -849,7 +849,7 @@ static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec,
849849

850850
/* if (u_q[i] > 0 && u_q[i + 1] > 0) { } */
851851
auto mel_need_encode2 = (ui16)_mm512_cmpgt_epi32_mask(u_q_vec, ZERO);
852-
mel_need_encode2 =
852+
mel_need_encode2 =
853853
mel_need_encode2 & (ui16)_mm512_cmpgt_epi32_mask(tmp, ZERO);
854854

855855
ui32 i_max = 16 - (ignore / 2);
@@ -873,7 +873,7 @@ static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec,
873873
}
874874
}
875875

876-
static void proc_mel_encode2(mel_struct *melp, __m512i &cq_vec,
876+
static void proc_mel_encode2(mel_struct *melp, __m512i &cq_vec,
877877
__m512i &rho_vec, __m512i u_q_vec, ui32 ignore,
878878
const __m512i right_shift)
879879
{
@@ -897,7 +897,7 @@ static void proc_mel_encode2(mel_struct *melp, __m512i &cq_vec,
897897
}
898898
}
899899

900-
using fn_proc_mel_encode = void (*)(mel_struct *, __m512i &, __m512i &,
900+
using fn_proc_mel_encode = void (*)(mel_struct *, __m512i &, __m512i &,
901901
__m512i, ui32, const __m512i);
902902

903903
static void proc_vlc_encode1(vlc_struct_avx512 *vlcp, ui32 *tuple,
@@ -1006,8 +1006,8 @@ static void proc_vlc_encode2(vlc_struct_avx512 *vlcp, ui32 *tuple,
10061006

10071007
using fn_proc_vlc_encode = void (*)(vlc_struct_avx512 *, ui32 *, ui32 *, ui32);
10081008

1009-
void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
1010-
ui32 num_passes, ui32 _width, ui32 height,
1009+
void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
1010+
ui32 num_passes, ui32 _width, ui32 height,
10111011
ui32 stride, ui32* lengths,
10121012
ojph::mem_elastic_allocator *elastic,
10131013
ojph::coded_lists *& coded)
@@ -1111,7 +1111,7 @@ void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
11111111

11121112
if (y + 1 < height) {
11131113
src_vec[1] = _mm512_maskz_loadu_epi32(load_mask0, sp + stride);
1114-
src_vec[3] =
1114+
src_vec[3] =
11151115
_mm512_maskz_loadu_epi32(load_mask1, sp + 16 + stride);
11161116
} else {
11171117
src_vec[1] = ZERO;
@@ -1148,7 +1148,7 @@ void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
11481148
tmp = proc_cq(x, cx_val_vec, rho_vec, right_shift);
11491149
auto cq_vec = _mm512_mask_permutexvar_epi32(prev_cq_vec, 0xFFFE,
11501150
left_shift, tmp);
1151-
prev_cq_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
1151+
prev_cq_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
11521152
tmp);
11531153

11541154
update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift);
@@ -1163,7 +1163,7 @@ void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
11631163
__m512i tuple_vec = cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl);
11641164
ui32 _ignore = ((n_loop - 1) == x) ? ignore : 0;
11651165

1166-
proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
1166+
proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
11671167
right_shift);
11681168

11691169
proc_ms_encode(&ms, tuple_vec, uq_vec, rho_vec, s_vec);
@@ -1177,8 +1177,8 @@ void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
11771177
* So in the vlc_encode, the tuple will only be scaled by 2.
11781178
*/
11791179
tuple_vec = _mm512_srli_epi32(tuple_vec, 4);
1180-
_mm512_store_epi32(tuple, tuple_vec);
1181-
_mm512_store_epi32(u_q, u_q_vec);
1180+
_mm512_storeu_epi32(tuple, tuple_vec);
1181+
_mm512_storeu_epi32(u_q, u_q_vec);
11821182
proc_vlc_encode(&vlc, tuple, u_q, _ignore);
11831183
}
11841184

0 commit comments

Comments
 (0)