
Commit 60adb8d

mshabunin authored and klatism committed
Merge pull request opencv#25364 from mshabunin:fix-unaligned-filter
imgproc: fix unaligned memory access in filters and Gaussian blur opencv#25364

* filter/SIMD: removed parts which cast 8u pointers to int, causing unaligned memory access on the RISC-V platform.
* GaussianBlur/fixed_point: replaced casts from s16 to u32 with union operations.

Performance comparison:

- [x] check performance on x86_64 (4 threads, `-DCPU_BASELINE=AVX2`, GCC 11.4, Ubuntu 22): [report_imgproc_x86_64.ods](https://github.com/opencv/opencv/files/14904702/report_x86_64.ods)
- [x] check performance on AArch64 (4 cores of RK3588, GCC 11.4 aarch64, Raspbian): [report_imgproc_aarch64.ods](https://github.com/opencv/opencv/files/14908437/report_aarch64.ods)

Note: for some reason my performance results are quite unstable; unaffected functions show speedups and slowdowns in many cases. Filter2D and GaussianBlur seem to be OK.

Slightly related PR: opencv/ci-gha-workflow#165
1 parent e5f2aae commit 60adb8d
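
As background for the two fixes in the commit message above, here is a minimal sketch of the underlying issue; it is not code from the patch, and `store4_cast` / `store4_portable` are hypothetical names. The removed filter code stored 4 packed bytes into a `uint8_t` row through an integer pointer cast; a cast like that asserts an alignment the pointer may not have, while a `memcpy` (or union, as the fixed-point path now uses) expresses the same 4-byte move without any alignment assumption.

```cpp
#include <cstdint>
#include <cstring>

// Hypothetical illustration only: 'dst + i' points into a uint8_t image row,
// so it is 1-byte aligned at best.
void store4_cast(uint8_t* dst, int i, uint32_t packed)
{
    // Simplified version of the pattern this commit removes/replaces: the cast
    // asserts 4-byte alignment the pointer may not have. On strict-alignment
    // targets (e.g. many RISC-V cores) this can fault; it is also UB in C++.
    *(uint32_t*)(dst + i) = packed;
}

void store4_portable(uint8_t* dst, int i, uint32_t packed)
{
    // memcpy makes no alignment claim; compilers lower it to a single
    // unaligned store on ISAs that support one.
    std::memcpy(dst + i, &packed, sizeof(packed));
}
```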

File tree

3 files changed: +12 -47 lines


modules/imgproc/src/filter.simd.hpp

-44 lines

@@ -86,7 +86,6 @@ Ptr<BaseFilter> getLinearFilter(
 
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
-typedef int CV_DECL_ALIGNED(1) unaligned_int;
 #define VEC_ALIGN CV_MALLOC_ALIGN
 
 int FilterEngine__start(FilterEngine& this_, const Size &_wholeSize, const Size &sz, const Point &ofs)
@@ -1083,21 +1082,6 @@ struct SymmColumnVec_32s8u
                 v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
                 i += VTraits<v_uint16>::vlanes();
             }
-#if CV_SIMD_WIDTH > 16
-            while( i <= width - 4 /*VTraits<v_int32x4>::vlanes()*/ )
-#else
-            if( i <= width - VTraits<v_int32>::vlanes() )
-#endif
-            {
-                v_float32 s0 = v_muladd(v_cvt_f32(vx_load(src[0] + i)), vx_setall_f32(ky[0]), vx_setall_f32(delta));
-                s0 = v_muladd(v_cvt_f32(v_add(vx_load(src[1] + i), vx_load(src[-1] + i))), vx_setall_f32(ky[1]), s0);
-                for( k = 2; k <= ksize2; k++ )
-                    s0 = v_muladd(v_cvt_f32(v_add(vx_load(src[k] + i), vx_load(src[-k] + i))), vx_setall_f32(ky[k]), s0);
-                v_int32 s32 = v_round(s0);
-                v_int16 s16 = v_pack(s32, s32);
-                *(unaligned_int*)(dst + i) = v_get0(v_reinterpret_as_s32(v_pack_u(s16, s16)));
-                i += 4 /*v_int32x4::nlanes*/ ;
-            }
         }
         else
         {
@@ -1139,20 +1123,6 @@ struct SymmColumnVec_32s8u
                 v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
                 i += VTraits<v_uint16>::vlanes();
             }
-#if CV_SIMD_WIDTH > 16
-            while( i <= width - 4 /*VTraits<v_int32x4>::vlanes()*/ )
-#else
-            if( i <= width - VTraits<v_int32>::vlanes() )
-#endif
-            {
-                v_float32 s0 = v_muladd(v_cvt_f32(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i))), vx_setall_f32(ky[1]), vx_setall_f32(delta));
-                for (k = 2; k <= ksize2; k++)
-                    s0 = v_muladd(v_cvt_f32(v_sub(vx_load(src[k] + i), vx_load(src[-k] + i))), vx_setall_f32(ky[k]), s0);
-                v_int32 s32 = v_round(s0);
-                v_int16 s16 = v_pack(s32, s32);
-                *(unaligned_int*)(dst + i) = v_get0(v_reinterpret_as_s32(v_pack_u(s16, s16)));
-                i += 4 /*v_int32x4::nlanes*/ ;
-            }
         }
         return i;
     }
@@ -2236,20 +2206,6 @@ struct FilterVec_8u
            v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
            i += VTraits<v_uint16>::vlanes();
        }
-#if CV_SIMD_WIDTH > 16
-       while( i <= width - 4 /*VTraits<v_int32x4>::vlanes()*/ )
-#else
-       if( i <= width - VTraits<v_int32>::vlanes() )
-#endif
-       {
-           v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[0] + i))), vx_setall_f32(kf[0]), vx_setall_f32(delta));
-           for( k = 1; k < nz; k++ )
-               s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[k] + i))), vx_setall_f32(kf[k]), s0);
-           v_int32 s32 = v_round(s0);
-           v_int16 s16 = v_pack(s32, s32);
-           *(unaligned_int*)(dst + i) = v_get0(v_reinterpret_as_s32(v_pack_u(s16, s16)));
-           i += 4 /*VTraits<v_int32x4>::vlanes()*/ ;
-       }
        return i;
    }
 
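
All three removed blocks are the same 4-pixel tail: accumulate one v_float32 of results, pack down to 8-bit, and write the low 32 bits of the vector through the `unaligned_int` cast. The patch simply drops that tail, so the function returns `i` earlier and the remaining few columns are presumably finished by the caller's scalar path. For reference only, a hedged sketch of how such a 4-byte vector store could be written without the cast, using the same universal intrinsics; this is an alternative the patch does not take, `store_low4_u8` is a hypothetical helper, and it assumes an OpenCV build with universal intrinsics enabled:

```cpp
#include <cstdint>
#include <cstring>
#include "opencv2/core/hal/intrin.hpp"

using namespace cv;

// Hypothetical helper: write the four packed 8-bit results held in the low
// 32 bits of v_pack_u(s16, s16) to an arbitrarily aligned destination.
static inline void store_low4_u8(uint8_t* dst, const v_int16& s16)
{
    int32_t low = v_get0(v_reinterpret_as_s32(v_pack_u(s16, s16)));
    std::memcpy(dst, &low, sizeof(low));  // alignment-agnostic 4-byte store
}
```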

modules/imgproc/src/fixedpoint.inl.hpp

+1 -1 lines

@@ -370,7 +370,7 @@ class ufixedpoint16
     static CV_ALWAYS_INLINE ufixedpoint16 one() { return ufixedpoint16((uint16_t)(1 << fixedShift)); }
 
     static CV_ALWAYS_INLINE ufixedpoint16 fromRaw(uint16_t v) { return ufixedpoint16(v); }
-    CV_ALWAYS_INLINE uint16_t raw() { return val; }
+    CV_ALWAYS_INLINE uint16_t raw() const { return val; }
 };
 
}
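
This one-word change is what lets the new helper in smooth.simd.hpp call `raw()` through a `const ufixedpoint16*`. A minimal stand-alone illustration, where `coeff16` is a simplified hypothetical stand-in for `cv::ufixedpoint16`:

```cpp
#include <cstdint>

class coeff16
{
    uint16_t val = 0;
public:
    // Marked const so it can be called on const objects and through
    // pointers-to-const, as a helper taking 'const coeff16*' requires.
    uint16_t raw() const { return val; }
};

uint32_t first_raw(const coeff16* m)
{
    return m->raw();   // would not compile if raw() were non-const
}
```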

modules/imgproc/src/smooth.simd.hpp

+11 -2 lines

@@ -1634,6 +1634,15 @@ void vlineSmooth(const FT* const * src, const FT* m, int n, ET* dst, int len)
        dst[i] = val;
    }
 }
+
+inline uint32_t read_pair_as_u32(const ufixedpoint16 * mem)
+{
+    union Cv32sufX2 { uint32_t v32; int16_t v16[2]; } res;
+    res.v16[0] = mem->raw();
+    res.v16[1] = (mem + 1)->raw();
+    return res.v32;
+}
+
 template <>
 void vlineSmooth<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, const ufixedpoint16* m, int n, uint8_t* dst, int len)
 {
@@ -1655,7 +1664,7 @@ void vlineSmooth<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, const
        v_int16 v_src00, v_src10, v_src01, v_src11, v_src02, v_src12, v_src03, v_src13;
        v_int16 v_tmp0, v_tmp1;
 
-       v_int16 v_mul = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)m)));
+       v_int16 v_mul = v_reinterpret_as_s16(vx_setall_u32(read_pair_as_u32(m)));
 
        const int16_t* src0 = (const int16_t*)src[0] + i;
        const int16_t* src1 = (const int16_t*)src[1] + i;
@@ -1683,7 +1692,7 @@ void vlineSmooth<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, const
        int j = 2;
        for (; j < n - 1; j+=2)
        {
-           v_mul = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)(m+j))));
+           v_mul = v_reinterpret_as_s16(vx_setall_u32(read_pair_as_u32(m + j)));
 
            const int16_t* srcj0 = (const int16_t*)src[j] + i;
            const int16_t* srcj1 = (const int16_t*)src[j + 1] + i;
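
The union-based `read_pair_as_u32` replaces `*((uint32_t*)m)`: `ufixedpoint16` is a 2-byte type, so reading two adjacent coefficients through a `uint32_t*` assumes 4-byte alignment that `m + j` does not guarantee, and it also reads the objects through an unrelated type. A simplified stand-alone sketch of the same idea follows, with `coeff16` as a hypothetical stand-in for `ufixedpoint16`; on a little-endian target it yields the same 32-bit value the old cast produced, using only 2-byte loads and the widely supported union type-punning idiom.

```cpp
#include <cstdint>

struct coeff16            // stand-in for cv::ufixedpoint16 (2-byte raw value)
{
    uint16_t val;
    uint16_t raw() const { return val; }
};

// Build a 32-bit broadcast value from two adjacent 16-bit coefficients
// without ever dereferencing a uint32_t*.
inline uint32_t read_pair_as_u32(const coeff16* mem)
{
    union { uint32_t v32; uint16_t v16[2]; } res;
    res.v16[0] = mem->raw();
    res.v16[1] = (mem + 1)->raw();
    return res.v32;
}

int main()
{
    coeff16 m[2] = { {0x1234}, {0x5678} };
    // Little-endian layout: m[0] occupies the low half of the 32-bit value.
    return read_pair_as_u32(m) == 0x56781234u ? 0 : 1;
}
```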
