Skip to content

Commit 5f99c89

Browse files
committed
Most of the SIMD is done. No wasm yet.
1 parent eafb965 commit 5f99c89

20 files changed

+1834
-280
lines changed

src/core/codestream/ojph_codeblock.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ namespace ojph {
245245
cb_size.w);
246246
}
247247
else
248-
this->codeblock_functions.mem_clear32(dp, cb_size.w * sizeof(ui32));
248+
this->codeblock_functions.mem_clear(dp, cb_size.w * sizeof(ui32));
249249
}
250250
else
251251
{
@@ -259,9 +259,7 @@ namespace ojph {
259259
cb_size.w);
260260
}
261261
else
262-
this->codeblock_functions.mem_clear64(dp, cb_size.w * sizeof(*dp));
263-
264-
262+
this->codeblock_functions.mem_clear(dp, cb_size.w * sizeof(*dp));
265263
}
266264

267265
++cur_line;

src/core/codestream/ojph_codeblock_fun.cpp

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -57,15 +57,10 @@ namespace ojph {
5757
{
5858

5959
//////////////////////////////////////////////////////////////////////////
60-
void gen_mem_clear32(si32* addr, size_t count);
61-
void sse_mem_clear32(si32* addr, size_t count);
62-
void avx_mem_clear32(si32* addr, size_t count);
63-
void wasm_mem_clear32(si32* addr, size_t count);
64-
65-
void gen_mem_clear64(si64* addr, size_t count);
66-
void sse_mem_clear64(si64* addr, size_t count);
67-
void avx_mem_clear64(si64* addr, size_t count);
68-
void wasm_mem_clear64(si64* addr, size_t count);
60+
void gen_mem_clear(void* addr, size_t count);
61+
void sse_mem_clear(void* addr, size_t count);
62+
void avx_mem_clear(void* addr, size_t count);
63+
void wasm_mem_clear(void* addr, size_t count);
6964

7065
//////////////////////////////////////////////////////////////////////////
7166
ui32 gen_find_max_val32(ui32* address);
@@ -135,7 +130,7 @@ namespace ojph {
135130
// Default path, no acceleration. We may change this later
136131
decode_cb32 = ojph_decode_codeblock32;
137132
find_max_val32 = gen_find_max_val32;
138-
mem_clear32 = gen_mem_clear32;
133+
mem_clear = gen_mem_clear;
139134
if (reversible) {
140135
tx_to_cb32 = gen_rev_tx_to_cb32;
141136
tx_from_cb32 = gen_rev_tx_from_cb32;
@@ -149,7 +144,6 @@ namespace ojph {
149144

150145
decode_cb64 = ojph_decode_codeblock64;
151146
find_max_val64 = gen_find_max_val64;
152-
mem_clear64 = gen_mem_clear64;
153147
if (reversible) {
154148
tx_to_cb64 = gen_rev_tx_to_cb64;
155149
tx_from_cb64 = gen_rev_tx_from_cb64;
@@ -168,7 +162,7 @@ namespace ojph {
168162
// Accelerated functions for INTEL/AMD CPUs
169163
#ifndef OJPH_DISABLE_SSE
170164
if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE)
171-
mem_clear32 = sse_mem_clear32;
165+
mem_clear = sse_mem_clear;
172166
#endif // !OJPH_DISABLE_SSE
173167

174168
#ifndef OJPH_DISABLE_SSE2
@@ -182,6 +176,16 @@ namespace ojph {
182176
tx_to_cb32 = sse2_irv_tx_to_cb32;
183177
tx_from_cb32 = sse2_irv_tx_from_cb32;
184178
}
179+
find_max_val64 = sse2_find_max_val64;
180+
if (reversible) {
181+
tx_to_cb64 = sse2_rev_tx_to_cb64;
182+
tx_from_cb64 = sse2_rev_tx_from_cb64;
183+
}
184+
else
185+
{
186+
tx_to_cb64 = NULL;
187+
tx_from_cb64 = NULL;
188+
}
185189
}
186190
#endif // !OJPH_DISABLE_SSE2
187191

@@ -192,7 +196,7 @@ namespace ojph {
192196

193197
#ifndef OJPH_DISABLE_AVX
194198
if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX)
195-
mem_clear32 = avx_mem_clear32;
199+
mem_clear = avx_mem_clear;
196200
#endif // !OJPH_DISABLE_AVX
197201

198202
#ifndef OJPH_DISABLE_AVX2
@@ -208,6 +212,17 @@ namespace ojph {
208212
}
209213
encode_cb32 = ojph_encode_codeblock_avx2;
210214
decode_cb32 = ojph_decode_codeblock_avx2;
215+
216+
find_max_val64 = avx2_find_max_val64;
217+
if (reversible) {
218+
tx_to_cb64 = avx2_rev_tx_to_cb64;
219+
tx_from_cb64 = avx2_rev_tx_from_cb64;
220+
}
221+
else
222+
{
223+
tx_to_cb64 = NULL;
224+
tx_from_cb64 = NULL;
225+
}
211226
}
212227
#endif // !OJPH_DISABLE_AVX2
213228

src/core/codestream/ojph_codeblock_fun.h

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,7 @@ namespace ojph {
4848
namespace local {
4949

5050
// define function signature for simple memory clearing
51-
typedef void (*mem_clear_fun32)(si32* addr, size_t count);
52-
typedef void (*mem_clear_fun64)(si64* addr, size_t count);
51+
typedef void (*mem_clear_fun)(void* addr, size_t count);
5352

5453
// define function signature for max value finding
5554
typedef ui32 (*find_max_val_fun32)(ui32* addr);
@@ -96,8 +95,7 @@ namespace ojph {
9695
void init(bool reversible);
9796

9897
// a pointer to the memory clearing function
99-
mem_clear_fun32 mem_clear32;
100-
mem_clear_fun64 mem_clear64;
98+
mem_clear_fun mem_clear;
10199

102100
// a pointer to the max value finding function
103101
find_max_val_fun32 find_max_val32;

src/core/codestream/ojph_codestream_avx.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ namespace ojph {
4242
namespace local {
4343

4444
//////////////////////////////////////////////////////////////////////////
45-
void avx_mem_clear32(si32* addr, size_t count)
45+
void avx_mem_clear(void* addr, size_t count)
4646
{
4747
float* p = (float*)addr;
4848
__m256 zero = _mm256_setzero_ps();

src/core/codestream/ojph_codestream_avx2.cpp

Lines changed: 72 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,18 @@ namespace ojph {
5555
return t;
5656
}
5757

58+
//////////////////////////////////////////////////////////////////////////
59+
ui64 avx2_find_max_val64(ui64* address)
60+
{
61+
__m128i x0 = _mm_loadu_si128((__m128i*)address);
62+
__m128i x1 = _mm_loadu_si128((__m128i*)address + 1);
63+
x0 = _mm_or_si128(x0, x1);
64+
x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
65+
x0 = _mm_or_si128(x0, x1);
66+
ui64 t = (ui64)_mm_extract_epi64(x0, 0);
67+
return t;
68+
}
69+
5870
//////////////////////////////////////////////////////////////////////////
5971
void avx2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
6072
float delta_inv, ui32 count, ui32* max_val)
@@ -78,7 +90,7 @@ namespace ojph {
7890
}
7991
_mm256_storeu_si256((__m256i*)max_val, tmax);
8092
}
81-
93+
8294
//////////////////////////////////////////////////////////////////////////
8395
void avx2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
8496
float delta_inv, ui32 count, ui32* max_val)
@@ -115,11 +127,11 @@ namespace ojph {
115127
si32 *p = (si32*)dp;
116128
for (ui32 i = 0; i < count; i += 8, sp += 8, p += 8)
117129
{
118-
__m256i v = _mm256_load_si256((__m256i*)sp);
119-
__m256i val = _mm256_and_si256(v, m1);
120-
val = _mm256_srli_epi32(val, (int)shift);
121-
val = _mm256_sign_epi32(val, v);
122-
_mm256_storeu_si256((__m256i*)p, val);
130+
__m256i v = _mm256_load_si256((__m256i*)sp);
131+
__m256i val = _mm256_and_si256(v, m1);
132+
val = _mm256_srli_epi32(val, (int)shift);
133+
val = _mm256_sign_epi32(val, v);
134+
_mm256_storeu_si256((__m256i*)p, val);
123135
}
124136
}
125137

@@ -142,5 +154,58 @@ namespace ojph {
142154
_mm256_storeu_ps(p, valf);
143155
}
144156
}
157+
158+
//////////////////////////////////////////////////////////////////////////
159+
void avx2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
160+
float delta_inv, ui32 count, ui64* max_val)
161+
{
162+
ojph_unused(delta_inv);
163+
164+
// convert to sign and magnitude and keep max_val
165+
ui32 shift = 63 - K_max;
166+
__m256i m0 = _mm256_set1_epi64x(0x8000000000000000LL);
167+
__m256i zero = _mm256_setzero_si256();
168+
__m256i one = _mm256_set1_epi64x(1);
169+
__m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
170+
__m256i *p = (__m256i*)sp;
171+
for (ui32 i = 0; i < count; i += 4, p += 1, dp += 4)
172+
{
173+
__m256i v = _mm256_loadu_si256(p);
174+
__m256i sign = _mm256_cmpgt_epi64(zero, v);
175+
__m256i val = _mm256_xor_si256(v, sign); // negate 1's complement
176+
__m256i ones = _mm256_and_si256(sign, one);
177+
val = _mm256_add_epi64(val, ones); // 2's complement
178+
sign = _mm256_and_si256(sign, m0);
179+
val = _mm256_slli_epi64(val, (int)shift);
180+
tmax = _mm256_or_si256(tmax, val);
181+
val = _mm256_or_si256(val, sign);
182+
_mm256_storeu_si256((__m256i*)dp, val);
183+
}
184+
_mm256_storeu_si256((__m256i*)max_val, tmax);
185+
}
186+
187+
//////////////////////////////////////////////////////////////////////////
188+
void avx2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
189+
float delta, ui32 count)
190+
{
191+
ojph_unused(delta);
192+
193+
ui32 shift = 63 - K_max;
194+
__m256i m1 = _mm256_set1_epi64x(0x7FFFFFFFFFFFFFFFLL);
195+
__m256i zero = _mm256_setzero_si256();
196+
__m256i one = _mm256_set1_epi64x(1);
197+
si64 *p = (si64*)dp;
198+
for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
199+
{
200+
__m256i v = _mm256_load_si256((__m256i*)sp);
201+
__m256i val = _mm256_and_si256(v, m1);
202+
val = _mm256_srli_epi64(val, (int)shift);
203+
__m256i sign = _mm256_cmpgt_epi64(zero, v);
204+
val = _mm256_xor_si256(val, sign); // negate 1's complement
205+
__m256i ones = _mm256_and_si256(sign, one);
206+
val = _mm256_add_epi64(val, ones); // 2's complement
207+
_mm256_storeu_si256((__m256i*)p, val);
208+
}
209+
}
145210
}
146-
}
211+
}

src/core/codestream/ojph_codestream_gen.cpp

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -42,17 +42,11 @@ namespace ojph {
4242
namespace local {
4343

4444
//////////////////////////////////////////////////////////////////////////
45-
void gen_mem_clear32(si32* addr, size_t count)
46-
{
47-
for (size_t i = 0; i < count; i += 4)
48-
*addr++ = 0;
49-
}
50-
51-
//////////////////////////////////////////////////////////////////////////
52-
void gen_mem_clear64(si64* addr, size_t count)
45+
void gen_mem_clear(void* addr, size_t count)
5346
{
47+
si64* p = (si64*)addr;
5448
for (size_t i = 0; i < count; i += 8)
55-
*addr++ = 0;
49+
*p++ = 0;
5650
}
5751

5852
//////////////////////////////////////////////////////////////////////////

src/core/codestream/ojph_codestream_sse.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,12 @@ namespace ojph {
4242
namespace local {
4343

4444
//////////////////////////////////////////////////////////////////////////
45-
void sse_mem_clear32(si32* addr, size_t count)
45+
void sse_mem_clear(void* addr, size_t count)
4646
{
4747
float* p = (float*)addr;
4848
__m128 zero = _mm_setzero_ps();
4949
for (size_t i = 0; i < count; i += 16, p += 4)
5050
_mm_storeu_ps(p, zero);
5151
}
52-
5352
}
5453
}

src/core/codestream/ojph_codestream_sse2.cpp

Lines changed: 77 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,21 @@ namespace ojph {
5858
// return t;
5959
}
6060

61+
//////////////////////////////////////////////////////////////////////////
62+
ui64 sse2_find_max_val64(ui64* address)
63+
{
64+
__m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
65+
x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
66+
x0 = _mm_or_si128(x0, x1);
67+
_mm_storeu_si128((__m128i*)address, x0);
68+
return *address;
69+
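// NOTE(review): this remark and the commented-out sketch below appear to be
// carried over from the 32-bit variant (ui32, epi16/epi32) -- verify for 64-bit.
// A single movd t, xmm0 can do the trick, but it is not available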
70+
// in SSE2 intrinsics. extract_epi32 is available in sse4.1
71+
// ui32 t = (ui32)_mm_extract_epi16(x0, 0);
72+
// t |= (ui32)_mm_extract_epi16(x0, 1) << 16;
73+
// return t;
74+
}
75+
6176
//////////////////////////////////////////////////////////////////////////
6277
void sse2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
6378
float delta_inv, ui32 count, ui32* max_val)
@@ -129,14 +144,14 @@ namespace ojph {
129144
si32 *p = (si32*)dp;
130145
for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
131146
{
132-
__m128i v = _mm_load_si128((__m128i*)sp);
133-
__m128i val = _mm_and_si128(v, m1);
134-
val = _mm_srli_epi32(val, (int)shift);
135-
__m128i sign = _mm_cmplt_epi32(v, zero);
136-
val = _mm_xor_si128(val, sign); // negate 1's complement
137-
__m128i ones = _mm_and_si128(sign, one);
138-
val = _mm_add_epi32(val, ones); // 2's complement
139-
_mm_storeu_si128((__m128i*)p, val);
147+
__m128i v = _mm_load_si128((__m128i*)sp);
148+
__m128i val = _mm_and_si128(v, m1);
149+
val = _mm_srli_epi32(val, (int)shift);
150+
__m128i sign = _mm_cmplt_epi32(v, zero);
151+
val = _mm_xor_si128(val, sign); // negate 1's complement
152+
__m128i ones = _mm_and_si128(sign, one);
153+
val = _mm_add_epi32(val, ones); // 2's complement
154+
_mm_storeu_si128((__m128i*)p, val);
140155
}
141156
}
142157

@@ -159,5 +174,59 @@ namespace ojph {
159174
_mm_storeu_ps(p, valf);
160175
}
161176
}
177+
178+
//////////////////////////////////////////////////////////////////////////
179+
void sse2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
180+
float delta_inv, ui32 count, ui64* max_val)
181+
{
182+
ojph_unused(delta_inv);
183+
184+
// convert to sign and magnitude and keep max_val
185+
ui32 shift = 63 - K_max;
186+
__m128i m0 = _mm_set1_epi64x(0x8000000000000000LL);
187+
__m128i zero = _mm_setzero_si128();
188+
__m128i one = _mm_set1_epi64x(1);
189+
__m128i tmax = _mm_loadu_si128((__m128i*)max_val);
190+
__m128i *p = (__m128i*)sp;
191+
for (ui32 i = 0; i < count; i += 2, p += 1, dp += 2)
192+
{
193+
__m128i v = _mm_loadu_si128(p);
194+
__m128i sign = _mm_cmplt_epi32(v, zero);
195+
sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
196+
__m128i val = _mm_xor_si128(v, sign); // negate 1's complement
197+
__m128i ones = _mm_and_si128(sign, one);
198+
val = _mm_add_epi64(val, ones); // 2's complement
199+
sign = _mm_and_si128(sign, m0);
200+
val = _mm_slli_epi64(val, (int)shift);
201+
tmax = _mm_or_si128(tmax, val);
202+
val = _mm_or_si128(val, sign);
203+
_mm_storeu_si128((__m128i*)dp, val);
204+
}
205+
_mm_storeu_si128((__m128i*)max_val, tmax);
206+
}
207+
208+
//////////////////////////////////////////////////////////////////////////
209+
void sse2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
210+
float delta, ui32 count)
211+
{
212+
ojph_unused(delta);
213+
ui32 shift = 63 - K_max;
214+
__m128i m1 = _mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL);
215+
__m128i zero = _mm_setzero_si128();
216+
__m128i one = _mm_set1_epi64x(1);
217+
si64 *p = (si64*)dp;
218+
for (ui32 i = 0; i < count; i += 2, sp += 2, p += 2)
219+
{
220+
__m128i v = _mm_load_si128((__m128i*)sp);
221+
__m128i val = _mm_and_si128(v, m1);
222+
val = _mm_srli_epi64(val, (int)shift);
223+
__m128i sign = _mm_cmplt_epi32(v, zero);
224+
sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
225+
val = _mm_xor_si128(val, sign); // negate 1's complement
226+
__m128i ones = _mm_and_si128(sign, one);
227+
val = _mm_add_epi64(val, ones); // 2's complement
228+
_mm_storeu_si128((__m128i*)p, val);
229+
}
230+
}
162231
}
163232
}

0 commit comments

Comments
 (0)