diff --git a/CMakeLists.txt b/CMakeLists.txt
index 292b21321..dfd3b5811 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -135,6 +135,13 @@ if (APPLE)
   option(OPJ_USE_DSYMUTIL "Call dsymutil on binaries after build." OFF)
 endif()
 
+if (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "loongarch64")
+  # Add the -mlsx -mlasx flags to enable SIMD optimizations on the LoongArch platform.
+  # Add -fno-expensive-optimizations to avoid the following three unit test failures in Release builds:
+  # NR-C1P1-p1_05.j2k-compare2base, NR-JP2-file2.jp2-compare2base, NR-DEC-issue205.jp2-253-decode-md5
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mlsx -mlasx -fno-expensive-optimizations")
+endif()
+
 #-----------------------------------------------------------------------------
 # Big endian test:
 if (NOT EMSCRIPTEN)
diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c
index abc500eca..19a60abeb 100644
--- a/src/lib/openjp2/dwt.c
+++ b/src/lib/openjp2/dwt.c
@@ -55,6 +55,12 @@
 #ifdef __AVX2__
 #include <immintrin.h>
 #endif
+#ifdef __loongarch_sx
+#include <lsxintrin.h>
+#endif
+#ifdef __loongarch_asx
+#include <lasxintrin.h>
+#endif
 
 #if defined(__GNUC__)
 #pragma GCC poison malloc calloc realloc free
@@ -66,7 +72,7 @@
 #define OPJ_WS(i) v->mem[(i)*2]
 #define OPJ_WD(i) v->mem[(1+(i)*2)]
 
-#ifdef __AVX2__
+#if defined(__AVX2__) || defined(__loongarch_asx)
 /** Number of int32 values in a AVX2 register */
 #define VREG_INT_COUNT 8
 #else
@@ -511,7 +517,7 @@ static void opj_idwt53_h(const opj_dwt_t *dwt,
 #endif
 }
 
-#if (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION)
+#if (defined(__SSE2__) || defined(__AVX2__) || defined(__loongarch_asx)) && !defined(STANDARD_SLOW_VERSION)
 
 /* Conveniency macros to improve the readability of the formulas */
 #if __AVX2__
@@ -524,6 +530,16 @@ static void opj_idwt53_h(const opj_dwt_t *dwt,
 #define ADD(x,y) _mm256_add_epi32((x),(y))
 #define SUB(x,y) _mm256_sub_epi32((x),(y))
 #define SAR(x,y) _mm256_srai_epi32((x),(y))
+#elif __loongarch_asx
+#define VREG __m256i
+#define LOAD_CST(x) __lasx_xvreplgr2vr_w(x)
+#define LOAD(x) __lasx_xvld((const VREG*)(x), 0)
+#define LOADU(x) __lasx_xvld((const VREG*)(x), 0)
+#define STORE(x,y) __lasx_xvst((y), (VREG*)(x), 0)
+#define STOREU(x,y) __lasx_xvst((y), (VREG*)(x), 0)
+#define ADD(x,y) __lasx_xvadd_w((x),(y))
+#define SUB(x,y) __lasx_xvsub_w((x),(y))
+#define SAR(x,y) __lasx_xvsrai_w((x),(y))
 #else
 #define VREG __m128i
 #define LOAD_CST(x) _mm_set1_epi32(x)
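Note on the macro layer above: LOAD/LOADU and STORE/STOREU both map to plain xvld/xvst because LASX vector loads and stores have no alignment requirement, so the unaligned variants cost nothing extra. As a minimal standalone sketch (hypothetical file, not part of this patch) of how the VREG abstraction keeps the lifting code ISA-neutral, the following compiles to LASX with gcc -mlasx on LoongArch and to AVX2 with gcc -mavx2 on x86; the file name and the add8 helper are illustrative only:

    /* vreg_demo.c - hypothetical sketch of the VREG abstraction. */
    #include <stdint.h>
    #if defined(__loongarch_asx)
    #include <lasxintrin.h>
    #define VREG        __m256i
    #define LOADU(x)    __lasx_xvld((const VREG*)(x), 0)
    #define STOREU(x,y) __lasx_xvst((y), (VREG*)(x), 0)
    #define ADD(x,y)    __lasx_xvadd_w((x),(y))
    #elif defined(__AVX2__)
    #include <immintrin.h>
    #define VREG        __m256i
    #define LOADU(x)    _mm256_loadu_si256((const VREG*)(x))
    #define STOREU(x,y) _mm256_storeu_si256((VREG*)(x),(y))
    #define ADD(x,y)    _mm256_add_epi32((x),(y))
    #endif

    /* Adds 8 int32 lanes of b into a; identical source on either ISA. */
    void add8(int32_t *a, const int32_t *b)
    {
        STOREU(a, ADD(LOADU(a), LOADU(b)));
    }

On either target add8() lowers to a single 256-bit vector add, which is exactly the property the dwt.c lifting kernels rely on.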
@@ -558,8 +574,8 @@ void opj_idwt53_v_final_memcpy(OPJ_INT32* tiledp_col,
 }
 
 /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
- * 16 in AVX2, when top-most pixel is on even coordinate */
-static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(
+ * 16 in AVX2, or 16 in LASX, when top-most pixel is on even coordinate */
+static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2_OR_LASX(
     OPJ_INT32* tmp,
     const OPJ_INT32 sn,
     const OPJ_INT32 len,
@@ -576,7 +592,7 @@
     const VREG two = LOAD_CST(2);
 
     assert(len > 1);
-#if __AVX2__
+#if defined(__AVX2__) || defined(__loongarch_asx)
     assert(PARALLEL_COLS_53 == 16);
     assert(VREG_INT_COUNT == 8);
 #else
@@ -659,8 +675,8 @@
 
 
 /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
- * 16 in AVX2, when top-most pixel is on odd coordinate */
-static void opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(
+ * 16 in AVX2, or 16 in LASX, when top-most pixel is on odd coordinate */
+static void opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2_OR_LASX(
     OPJ_INT32* tmp,
     const OPJ_INT32 sn,
     const OPJ_INT32 len,
@@ -678,7 +694,7 @@
     const OPJ_INT32* in_odd = &tiledp_col[0];
 
     assert(len > 2);
-#if __AVX2__
+#if defined(__AVX2__) || defined(__loongarch_asx)
     assert(PARALLEL_COLS_53 == 16);
     assert(VREG_INT_COUNT == 8);
 #else
@@ -767,7 +783,7 @@
 #undef SUB
 #undef SAR
 
-#endif /* (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION) */
+#endif /* (defined(__SSE2__) || defined(__AVX2__) || defined(__loongarch_asx)) && !defined(STANDARD_SLOW_VERSION) */
 
 #if !defined(STANDARD_SLOW_VERSION)
 /** Vertical inverse 5x3 wavelet transform for one column, when top-most
@@ -894,11 +910,11 @@ static void opj_idwt53_v(const opj_dwt_t *dwt,
     if (dwt->cas == 0) {
         /* If len == 1, unmodified value */
 
-#if (defined(__SSE2__) || defined(__AVX2__))
+#if (defined(__SSE2__) || defined(__AVX2__) || defined(__loongarch_asx))
         if (len > 1 && nb_cols == PARALLEL_COLS_53) {
             /* Same as below general case, except that thanks to SSE2/AVX2 */
             /* we can efficiently process 8/16 columns in parallel */
-            opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
+            opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2_OR_LASX(dwt->mem, sn, len, tiledp_col, stride);
             return;
         }
 #endif
@@ -937,11 +953,11 @@ static void opj_idwt53_v(const opj_dwt_t *dwt,
             return;
         }
 
-#if (defined(__SSE2__) || defined(__AVX2__))
+#if (defined(__SSE2__) || defined(__AVX2__) || defined(__loongarch_asx))
         if (len > 2 && nb_cols == PARALLEL_COLS_53) {
             /* Same as below general case, except that thanks to SSE2/AVX2 */
             /* we can efficiently process 8/16 columns in parallel */
-            opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
+            opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2_OR_LASX(dwt->mem, sn, len, tiledp_col, stride);
             return;
         }
 #endif
@@ -2428,6 +2444,24 @@ static void opj_dwt_decode_partial_1_parallel(OPJ_INT32 *a,
         }
     }
 #endif
+#ifdef __loongarch_sx
+    if (i + 1 < i_max) {
+        const __m128i two = __lsx_vreplgr2vr_w(2);
+        __m128i Dm1 = __lsx_vld((const __m128i*)(a + 4 + (i - 1) * 8), 0);
+        for (; i + 1 < i_max; i += 2) {
+            /* No bound checking */
+            __m128i S = __lsx_vld((const __m128i*)(a + i * 8), 0);
+            __m128i D = __lsx_vld((const __m128i*)(a + 4 + i * 8), 0);
+            __m128i S1 = __lsx_vld((const __m128i*)(a + (i + 1) * 8), 0);
+            __m128i D1 = __lsx_vld((const __m128i*)(a + 4 + (i + 1) * 8), 0);
+            S = __lsx_vsub_w(S, __lsx_vsrai_w(__lsx_vadd_w(__lsx_vadd_w(Dm1, D), two), 2));
+            S1 = __lsx_vsub_w(S1, __lsx_vsrai_w(__lsx_vadd_w(__lsx_vadd_w(D, D1), two), 2));
+            __lsx_vst(S, (__m128i*)(a + i * 8), 0);
+            __lsx_vst(S1, (__m128i*)(a + (i + 1) * 8), 0);
+            Dm1 = D1;
+        }
+    }
+#endif
 
     for (; i < i_max; i++) {
         /* No bound checking */
@@ -2467,6 +2501,23 @@ static void opj_dwt_decode_partial_1_parallel(OPJ_INT32 *a,
         }
     }
 #endif
+#ifdef __loongarch_sx
+    if (i + 1 < i_max) {
+        __m128i S = __lsx_vld((const __m128i*)(a + i * 8), 0);
+        for (; i + 1 < i_max; i += 2) {
+            /* No bound checking */
+            __m128i D = __lsx_vld((const __m128i*)(a + 4 + i * 8), 0);
+            __m128i S1 = __lsx_vld((const __m128i*)(a + (i + 1) * 8), 0);
+            __m128i D1 = __lsx_vld((const __m128i*)(a + 4 + (i + 1) * 8), 0);
+            __m128i S2 = __lsx_vld((const __m128i*)(a + (i + 2) * 8), 0);
+            D = __lsx_vadd_w(D, __lsx_vsrai_w(__lsx_vadd_w(S, S1), 1));
+            D1 = __lsx_vadd_w(D1, __lsx_vsrai_w(__lsx_vadd_w(S1, S2), 1));
+            __lsx_vst(D, (__m128i*)(a + 4 + i * 8), 0);
+            __lsx_vst(D1, (__m128i*)(a + 4 + (i + 1) * 8), 0);
+            S = S2;
+        }
+    }
+#endif
 
     for (; i < i_max; i++) {
         /* No bound checking */
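For context, the two LSX blocks above vectorize the two 5/3 inverse lifting steps over four interleaved columns per 128-bit register, and they unroll by two while carrying the previously loaded register across iterations (Dm1 = D1 in the first block, S = S2 in the second), so each iteration issues four loads instead of six. A scalar paraphrase of the same arithmetic (hypothetical helper, not code from the patch; the caller guarantees i >= 1 so that D(i-1) exists, matching the loop bounds above):

    #include <stdint.h>

    /* lift53_scalar_ref: scalar paraphrase of the two LSX blocks.
     * Four columns are interleaved per group of 8 int32 values:
     * S(i) occupies a[i*8 .. i*8+3], D(i) occupies a[i*8+4 .. i*8+7]. */
    static void lift53_scalar_ref(int32_t *a, uint32_t i, uint32_t i_max)
    {
        uint32_t j;
        int k;
        /* First block: S(i) -= (D(i-1) + D(i) + 2) >> 2 */
        for (j = i; j < i_max; j++) {
            for (k = 0; k < 4; k++) {
                a[j * 8 + k] -= (a[(j - 1) * 8 + 4 + k] + a[j * 8 + 4 + k] + 2) >> 2;
            }
        }
        /* Second block: D(i) += (S(i) + S(i+1)) >> 1 */
        for (j = i; j < i_max; j++) {
            for (k = 0; k < 4; k++) {
                a[j * 8 + 4 + k] += (a[j * 8 + k] + a[(j + 1) * 8 + k]) >> 1;
            }
        }
    }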
@@ -3015,7 +3066,7 @@ static void opj_v8dwt_interleave_partial_v(opj_v8dwt_t* OPJ_RESTRICT dwt,
     OPJ_UNUSED(ret);
 }
 
-#ifdef __SSE__
+#if defined(__SSE__)
 
 static void opj_v8dwt_decode_step1_sse(opj_v8_t* w,
                                        OPJ_UINT32 start,
@@ -3070,6 +3121,66 @@ static void opj_v8dwt_decode_step2_sse(opj_v8_t* l, opj_v8_t* w,
     }
 }
 
+#elif defined(__loongarch_asx)
+
+static void opj_v8dwt_decode_step1_lasx(opj_v8_t* w,
+                                        OPJ_UINT32 start,
+                                        OPJ_UINT32 end,
+                                        const OPJ_FLOAT32 c)
+{
+    OPJ_UINT32 i;
+    OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w;
+    __m256 vfw, vmul, vc;
+    /* To be adapted if NB_ELTS_V8 changes */
+    fw += start * 16;
+    vc = (__m256)__lasx_xvldrepl_w(&c, 0);
+    for (i = start; i < end; ++i) {
+        vfw = (__m256)__lasx_xvld(fw, 0);
+        vmul = __lasx_xvfmul_s(vfw, vc);
+        __lasx_xvst(vmul, fw, 0);
+        fw += 16;
+    }
+}
+
+static void opj_v8dwt_decode_step2_lasx(opj_v8_t* l, opj_v8_t* w,
+                                        OPJ_UINT32 start,
+                                        OPJ_UINT32 end,
+                                        OPJ_UINT32 m,
+                                        OPJ_FLOAT32 c)
+{
+    OPJ_UINT32 i;
+    OPJ_FLOAT32* fl = (OPJ_FLOAT32*) l;
+    OPJ_FLOAT32* fw = (OPJ_FLOAT32*) w;
+    OPJ_UINT32 imax = opj_uint_min(end, m);
+    __m256 vfl0, vfw0, vfw1, vret;
+    __m256 vc = (__m256)__lasx_xvldrepl_w(&c, 0);
+    if (start > 0) {
+        fw += 2 * NB_ELTS_V8 * start;
+        fl = fw - 2 * NB_ELTS_V8;
+    }
+    /* To be adapted if NB_ELTS_V8 changes */
+    for (i = start; i < imax; ++i) {
+        vfl0 = (__m256)__lasx_xvld(fl, 0);
+        vfw0 = (__m256)__lasx_xvld(fw, 0);
+        vfw1 = (__m256)__lasx_xvld(fw, -32);
+        vret = __lasx_xvfadd_s(vfl0, vfw0);
+        vret = __lasx_xvfmul_s(vret, vc);
+        vret = __lasx_xvfadd_s(vret, vfw1);
+        __lasx_xvst(vret, fw, -32);
+        fl = fw;
+        fw += 2 * NB_ELTS_V8;
+    }
+    if (m < end) {
+        assert(m + 1 == end);
+        vc = __lasx_xvfadd_s(vc, vc);
+        vfl0 = (__m256)__lasx_xvld(fl, 0);
+        vfw1 = (__m256)__lasx_xvld(fw, -32);
+        vret = __lasx_xvfmul_s(vfl0, vc);
+        vret = __lasx_xvfadd_s(vret, vfw1);
+        __lasx_xvst(vret, fw, -32);
+    }
+}
+
 #else
 
 static void opj_v8dwt_decode_step1(opj_v8_t* w,
@@ -3162,7 +3273,7 @@ static void opj_v8dwt_decode(opj_v8dwt_t* OPJ_RESTRICT dwt)
         a = 1;
         b = 0;
     }
-#ifdef __SSE__
+#if defined(__SSE__)
     opj_v8dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
                                _mm_set1_ps(opj_K));
     opj_v8dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1,
@@ -3183,6 +3294,27 @@ static void opj_v8dwt_decode(opj_v8dwt_t* OPJ_RESTRICT dwt)
                                dwt->win_h_x0, dwt->win_h_x1,
                                (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
                                _mm_set1_ps(-opj_dwt_alpha));
+#elif defined(__loongarch_asx)
+    opj_v8dwt_decode_step1_lasx(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
+                                opj_K);
+    opj_v8dwt_decode_step1_lasx(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1,
+                                two_invK);
+    opj_v8dwt_decode_step2_lasx(dwt->wavelet + b, dwt->wavelet + a + 1,
+                                dwt->win_l_x0, dwt->win_l_x1,
+                                (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
+                                -opj_dwt_delta);
+    opj_v8dwt_decode_step2_lasx(dwt->wavelet + a, dwt->wavelet + b + 1,
+                                dwt->win_h_x0, dwt->win_h_x1,
+                                (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
+                                -opj_dwt_gamma);
+    opj_v8dwt_decode_step2_lasx(dwt->wavelet + b, dwt->wavelet + a + 1,
+                                dwt->win_l_x0, dwt->win_l_x1,
+                                (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
+                                -opj_dwt_beta);
+    opj_v8dwt_decode_step2_lasx(dwt->wavelet + a, dwt->wavelet + b + 1,
+                                dwt->win_h_x0, dwt->win_h_x1,
+                                (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
+                                -opj_dwt_alpha);
 #else
     opj_v8dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
                            opj_K);
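The step1/step2 pair above implements the 9/7 inverse lifting. Since NB_ELTS_V8 is 8 and each opj_v8_t holds 8 floats, one 256-bit LASX register covers a full column group, so the per-column loop of the scalar fallback collapses to one xvld/xvfmul/xvfadd sequence, and the xvld(fw, -32) loads reach back 32 bytes (8 floats) to the samples being updated. A scalar paraphrase of one step2 pass (hypothetical helper, not code from the patch; it assumes the start > 0 pointer adjustment has already been applied):

    #define NB_ELTS_V8 8 /* 8 columns per opj_v8_t, as in dwt.c */

    /* step2_scalar_ref: per-column view of one step2 lifting pass.
     * fl points at the previous interleaved element, fw at the current
     * one; fw[k - 8] is the __lasx_xvld(fw, -32) access above. */
    static void step2_scalar_ref(float *fl, float *fw,
                                 unsigned start, unsigned imax, float c)
    {
        unsigned i;
        int k;
        for (i = start; i < imax; ++i) {
            for (k = 0; k < NB_ELTS_V8; k++) {
                fw[k - 8] += (fl[k] + fw[k]) * c; /* one lifting update */
            }
            fl = fw;
            fw += 2 * NB_ELTS_V8;
        }
    }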
diff --git a/src/lib/openjp2/t1.c b/src/lib/openjp2/t1.c
index 52e466eb9..5b24204bb 100644
--- a/src/lib/openjp2/t1.c
+++ b/src/lib/openjp2/t1.c
@@ -1786,6 +1786,34 @@ static void opj_t1_clbl_decode_processor(void* user_data, opj_tls_t* tls)
             }
         }
 #endif
+
+#ifdef __loongarch_asx
+        {
+            asm volatile
+            (
+                "srli.w      $t0,  %[cblk], 4      \n\t"
+                "xvldrepl.w  $xr0, %[step], 0      \n\t"
+                "beqz        $t0,  999f            \n\t"
+                "16:                               \n\t"
+                "xvld        $xr1, %[data], 0      \n\t"
+                "xvld        $xr2, %[data], 32     \n\t"
+                "xvffint.s.w $xr1, $xr1            \n\t"
+                "xvffint.s.w $xr2, $xr2            \n\t"
+                "xvfmul.s    $xr1, $xr1, $xr0      \n\t"
+                "xvfmul.s    $xr2, $xr2, $xr0      \n\t"
+                "xvst        $xr1, %[data], 0      \n\t"
+                "xvst        $xr2, %[data], 32     \n\t"
+                "addi.d      %[data], %[data], 64  \n\t"
+                "addi.w      $t0,  $t0, -1         \n\t"
+                "bnez        $t0,  16b             \n\t"
+                "999:                              \n\t"
+                : [data]"+r"(datap)
+                : [step]"r"(&stepsize), [cblk]"r"(cblk_size)
+                : "$t0", "$xr0", "$xr1", "$xr2", "memory"
+            );
+            i = cblk_size & ~15U;
+        }
+#endif
         for (; i < cblk_size; ++i) {
             OPJ_FLOAT32 tmp = ((OPJ_FLOAT32)(*datap)) * stepsize;
             memcpy(datap, &tmp, sizeof(tmp));
@@ -1797,6 +1825,34 @@ static void opj_t1_clbl_decode_processor(void* user_data, opj_tls_t* tls)
                                          (OPJ_SIZE_T)x];
         for (j = 0; j < cblk_h; ++j) {
             i = 0;
+#ifdef __loongarch_asx
+            {
+                OPJ_INT32* ptr1 = datap + j * cblk_w;
+                OPJ_INT32* ptr2 = tiledp + j * (OPJ_SIZE_T)tile_w;
+                OPJ_UINT32 step_size = 0;
+                asm volatile
+                (
+                    "srli.w     $t0,  %[cblk], 3        \n\t"
+                    "xvxor.v    $xr1, $xr1, $xr1        \n\t"
+                    "xvaddi.wu  $xr1, $xr1, 2           \n\t"
+                    "beqz       $t0,  999f              \n\t"
+                    "8:                                 \n\t"
+                    "xvldx      $xr0, %[ptr1], %[step]  \n\t"
+                    "xvdiv.w    $xr0, $xr0, $xr1        \n\t"
+                    "xvstx      $xr0, %[ptr2], %[step]  \n\t"
+                    "addi.w     %[step], %[step], 32    \n\t"
+                    "addi.w     $t0,  $t0, -1           \n\t"
+                    "bnez       $t0,  8b                \n\t"
+                    "999:                               \n\t"
+                    : [step]"+r"(step_size)
+                    : [ptr1]"r"(ptr1),
+                      [ptr2]"r"(ptr2),
+                      [cblk]"r"(cblk_w)
+                    : "$t0", "$xr0", "$xr1", "memory"
+                );
+                i = cblk_w & ~7U;
+            }
+#endif
            for (; i < (cblk_w & ~(OPJ_UINT32)3U); i += 4U) {
                 OPJ_INT32 tmp0 = datap[(j * cblk_w) + i + 0U];
                 OPJ_INT32 tmp1 = datap[(j * cblk_w) + i + 1U];
@@ -1807,6 +1863,7 @@ static void opj_t1_clbl_decode_processor(void* user_data, opj_tls_t* tls)
                 ((OPJ_INT32*)tiledp)[(j * (OPJ_SIZE_T)tile_w) + i + 2U] = tmp2 / 2;
                 ((OPJ_INT32*)tiledp)[(j * (OPJ_SIZE_T)tile_w) + i + 3U] = tmp3 / 2;
             }
+
             for (; i < cblk_w; ++i) {
                 OPJ_INT32 tmp = datap[(j * cblk_w) + i];
                 ((OPJ_INT32*)tiledp)[(j * (OPJ_SIZE_T)tile_w) + i] = tmp / 2;
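Two remarks on the inline assembly above. First, the second block divides by a vector of twos with xvdiv.w instead of using an arithmetic shift because C's tmp / 2 truncates toward zero while srai.w rounds toward negative infinity; the two differ for negative odd coefficients, so a shift would change decoded output. Second, a hedged intrinsics paraphrase of the first block (not code from the patch; intrinsic names follow GCC's <lasxintrin.h> as used in the dwt.c hunks, and the helper name is illustrative): it dequantizes 16 int32 coefficients per iteration by converting them to float and scaling by the quantization step size.

    #include <lasxintrin.h>
    #include <stdint.h>
    #include <string.h>

    /* dequant_lasx: intrinsics paraphrase of the first asm block. */
    static void dequant_lasx(int32_t *datap, uint32_t cblk_size, float stepsize)
    {
        uint32_t i;
        /* $xr0 in the asm: stepsize broadcast to all 8 float lanes */
        __m256 vstep = (__m256)__lasx_xvldrepl_w(&stepsize, 0);
        for (i = 0; i + 16 <= cblk_size; i += 16, datap += 16) {
            __m256i v0 = __lasx_xvld(datap, 0);
            __m256i v1 = __lasx_xvld(datap, 32);
            /* xvffint.s.w: per-lane int32 -> float32 conversion */
            __m256 f0 = __lasx_xvffint_s_w(v0);
            __m256 f1 = __lasx_xvffint_s_w(v1);
            f0 = __lasx_xvfmul_s(f0, vstep);
            f1 = __lasx_xvfmul_s(f1, vstep);
            __lasx_xvst((__m256i)f0, datap, 0);
            __lasx_xvst((__m256i)f1, datap, 32);
        }
        /* scalar tail for the remaining cblk_size % 16 coefficients */
        for (; i < cblk_size; ++i, ++datap) {
            float tmp = (float)*datap * stepsize;
            memcpy(datap, &tmp, sizeof(tmp));
        }
    }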