@@ -3224,6 +3224,35 @@ static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
32243224 vint16m1_t out3 = __riscv_vnsra_wx_i16m1 (__riscv_vadd_vv_i32m2 (x3, x4, vl), 17 , vl);
32253225 vint16m1_t out4 = __riscv_vnsra_wx_i16m1 (__riscv_vsub_vv_i32m2 (x3, x4, vl), 17 , vl);
32263226
3227+ #ifdef __THEAD_VERSION__
3228+ // clamp 0~255
3229+ out0 = __riscv_vmax_vx_i16m1 (out0, 0 , vl);
3230+ out7 = __riscv_vmax_vx_i16m1 (out7, 0 , vl);
3231+ out1 = __riscv_vmax_vx_i16m1 (out1, 0 , vl);
3232+ out6 = __riscv_vmax_vx_i16m1 (out6, 0 , vl);
3233+ out2 = __riscv_vmax_vx_i16m1 (out2, 0 , vl);
3234+ out5 = __riscv_vmax_vx_i16m1 (out5, 0 , vl);
3235+ out3 = __riscv_vmax_vx_i16m1 (out3, 0 , vl);
3236+ out4 = __riscv_vmax_vx_i16m1 (out4, 0 , vl);
3237+ vuint8m1_t out0u8 = __riscv_vnclipu_wx_u8m1 (__riscv_vreinterpret_v_i16m2_u16m2 (__riscv_vcreate_v_i16m1_i16m2 (out0, out0)), 0 , __RISCV_VXRM_RNU, vl);
3238+ vuint8m1_t out7u8 = __riscv_vnclipu_wx_u8m1 (__riscv_vreinterpret_v_i16m2_u16m2 (__riscv_vcreate_v_i16m1_i16m2 (out7, out7)), 0 , __RISCV_VXRM_RNU, vl);
3239+ vuint8m1_t out1u8 = __riscv_vnclipu_wx_u8m1 (__riscv_vreinterpret_v_i16m2_u16m2 (__riscv_vcreate_v_i16m1_i16m2 (out1, out1)), 0 , __RISCV_VXRM_RNU, vl);
3240+ vuint8m1_t out6u8 = __riscv_vnclipu_wx_u8m1 (__riscv_vreinterpret_v_i16m2_u16m2 (__riscv_vcreate_v_i16m1_i16m2 (out6, out6)), 0 , __RISCV_VXRM_RNU, vl);
3241+ vuint8m1_t out2u8 = __riscv_vnclipu_wx_u8m1 (__riscv_vreinterpret_v_i16m2_u16m2 (__riscv_vcreate_v_i16m1_i16m2 (out2, out2)), 0 , __RISCV_VXRM_RNU, vl);
3242+ vuint8m1_t out5u8 = __riscv_vnclipu_wx_u8m1 (__riscv_vreinterpret_v_i16m2_u16m2 (__riscv_vcreate_v_i16m1_i16m2 (out5, out5)), 0 , __RISCV_VXRM_RNU, vl);
3243+ vuint8m1_t out3u8 = __riscv_vnclipu_wx_u8m1 (__riscv_vreinterpret_v_i16m2_u16m2 (__riscv_vcreate_v_i16m1_i16m2 (out3, out3)), 0 , __RISCV_VXRM_RNU, vl);
3244+ vuint8m1_t out4u8 = __riscv_vnclipu_wx_u8m1 (__riscv_vreinterpret_v_i16m2_u16m2 (__riscv_vcreate_v_i16m1_i16m2 (out4, out4)), 0 , __RISCV_VXRM_RNU, vl);
3245+
3246+ // 8x8 transpose
3247+ __riscv_vsse8_v_u8m1 (out + 0 , out_stride, out0u8, vl);
3248+ __riscv_vsse8_v_u8m1 (out + 1 , out_stride, out1u8, vl);
3249+ __riscv_vsse8_v_u8m1 (out + 2 , out_stride, out2u8, vl);
3250+ __riscv_vsse8_v_u8m1 (out + 3 , out_stride, out3u8, vl);
3251+ __riscv_vsse8_v_u8m1 (out + 4 , out_stride, out4u8, vl);
3252+ __riscv_vsse8_v_u8m1 (out + 5 , out_stride, out5u8, vl);
3253+ __riscv_vsse8_v_u8m1 (out + 6 , out_stride, out6u8, vl);
3254+ __riscv_vsse8_v_u8m1 (out + 7 , out_stride, out7u8, vl);
3255+ #else // __THEAD_VERSION__
32273256 // clamp 0~255
32283257 vuint8mf2_t out0u8 = __riscv_vnclipu_wx_u8mf2 (__riscv_vreinterpret_v_i16m1_u16m1 (__riscv_vmax_vx_i16m1 (out0, 0 , vl)), 0 , __RISCV_VXRM_RNU, vl);
32293258 vuint8mf2_t out7u8 = __riscv_vnclipu_wx_u8mf2 (__riscv_vreinterpret_v_i16m1_u16m1 (__riscv_vmax_vx_i16m1 (out7, 0 , vl)), 0 , __RISCV_VXRM_RNU, vl);
@@ -3243,6 +3272,7 @@ static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
32433272 __riscv_vsse8_v_u8mf2 (out + 5 , out_stride, out5u8, vl);
32443273 __riscv_vsse8_v_u8mf2 (out + 6 , out_stride, out6u8, vl);
32453274 __riscv_vsse8_v_u8mf2 (out + 7 , out_stride, out7u8, vl);
3275+ #endif // __THEAD_VERSION__
32463276 }
32473277#endif
32483278}
0 commit comments