Skip to content

Commit d5618a8

Browse files
Merge branch 'aous72:master' into feature/add-openexr-support
2 parents 442aff3 + 5df0f8c commit d5618a8

File tree

4 files changed

+179
-13
lines changed

4 files changed

+179
-13
lines changed

src/core/codestream/ojph_codestream_avx2.cpp

Lines changed: 55 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ namespace ojph {
8888
__m256i m0 = _mm256_set1_epi32(INT_MIN);
8989
__m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
9090
__m256i *p = (__m256i*)sp;
91-
for (ui32 i = 0; i < count; i += 8, p += 1, dp += 8)
91+
for ( ; count >= 8; count -= 8, p += 1, dp += 8)
9292
{
9393
__m256i v = _mm256_loadu_si256(p);
9494
__m256i sign = _mm256_and_si256(v, m0);
@@ -98,6 +98,22 @@ namespace ojph {
9898
val = _mm256_or_si256(val, sign);
9999
_mm256_storeu_si256((__m256i*)dp, val);
100100
}
101+
if (count)
102+
{
103+
__m256i v = _mm256_loadu_si256(p);
104+
__m256i sign = _mm256_and_si256(v, m0);
105+
__m256i val = _mm256_abs_epi32(v);
106+
val = _mm256_slli_epi32(val, (int)shift);
107+
108+
__m256i c = _mm256_set1_epi32((si32)count);
109+
__m256i idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
110+
__m256i mask = _mm256_cmpgt_epi32(c, idx);
111+
c = _mm256_and_si256(val, mask);
112+
tmax = _mm256_or_si256(tmax, c);
113+
114+
val = _mm256_or_si256(val, sign);
115+
_mm256_storeu_si256((__m256i*)dp, val);
116+
}
101117
_mm256_storeu_si256((__m256i*)max_val, tmax);
102118
}
103119

@@ -113,7 +129,7 @@ namespace ojph {
113129
__m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
114130
float *p = (float*)sp;
115131

116-
for (ui32 i = 0; i < count; i += 8, p += 8, dp += 8)
132+
for ( ; count >= 8; count -= 8, p += 8, dp += 8)
117133
{
118134
__m256 vf = _mm256_loadu_ps(p);
119135
vf = _mm256_mul_ps(vf, d); // multiply
@@ -124,6 +140,23 @@ namespace ojph {
124140
val = _mm256_or_si256(val, sign);
125141
_mm256_storeu_si256((__m256i*)dp, val);
126142
}
143+
if (count)
144+
{
145+
__m256 vf = _mm256_loadu_ps(p);
146+
vf = _mm256_mul_ps(vf, d); // multiply
147+
__m256i val = _mm256_cvtps_epi32(vf); // convert to int
148+
__m256i sign = _mm256_and_si256(val, m0); // get sign
149+
val = _mm256_abs_epi32(val);
150+
151+
__m256i c = _mm256_set1_epi32((si32)count);
152+
__m256i idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
153+
__m256i mask = _mm256_cmpgt_epi32(c, idx);
154+
c = _mm256_and_si256(val, mask);
155+
tmax = _mm256_or_si256(tmax, c);
156+
157+
val = _mm256_or_si256(val, sign);
158+
_mm256_storeu_si256((__m256i*)dp, val);
159+
}
127160
_mm256_storeu_si256((__m256i*)max_val, tmax);
128161
}
129162

@@ -178,7 +211,7 @@ namespace ojph {
178211
__m256i one = _mm256_set1_epi64x(1);
179212
__m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
180213
__m256i *p = (__m256i*)sp;
181-
for (ui32 i = 0; i < count; i += 4, p += 1, dp += 4)
214+
for ( ; count >= 4; count -= 4, p += 1, dp += 4)
182215
{
183216
__m256i v = _mm256_loadu_si256(p);
184217
__m256i sign = _mm256_cmpgt_epi64(zero, v);
@@ -191,6 +224,25 @@ namespace ojph {
191224
val = _mm256_or_si256(val, sign);
192225
_mm256_storeu_si256((__m256i*)dp, val);
193226
}
227+
if (count)
228+
{
229+
__m256i v = _mm256_loadu_si256(p);
230+
__m256i sign = _mm256_cmpgt_epi64(zero, v);
231+
__m256i val = _mm256_xor_si256(v, sign); // negate 1's complement
232+
__m256i ones = _mm256_and_si256(sign, one);
233+
val = _mm256_add_epi64(val, ones); // 2's complement
234+
sign = _mm256_and_si256(sign, m0);
235+
val = _mm256_slli_epi64(val, (int)shift);
236+
237+
__m256i c = _mm256_set1_epi64x(count);
238+
__m256i idx = _mm256_set_epi64x(3, 2, 1, 0);
239+
__m256i mask = _mm256_cmpgt_epi64(c, idx);
240+
c = _mm256_and_si256(val, mask);
241+
tmax = _mm256_or_si256(tmax, c);
242+
243+
val = _mm256_or_si256(val, sign);
244+
_mm256_storeu_si256((__m256i*)dp, val);
245+
}
194246
_mm256_storeu_si256((__m256i*)max_val, tmax);
195247
}
196248

src/core/codestream/ojph_codestream_sse2.cpp

Lines changed: 63 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ namespace ojph {
8787
__m128i one = _mm_set1_epi32(1);
8888
__m128i tmax = _mm_loadu_si128((__m128i*)max_val);
8989
__m128i *p = (__m128i*)sp;
90-
for (ui32 i = 0; i < count; i += 4, p += 1, dp += 4)
90+
for ( ; count >= 4; count -= 4, p += 1, dp += 4)
9191
{
9292
__m128i v = _mm_loadu_si128(p);
9393
__m128i sign = _mm_cmplt_epi32(v, zero);
@@ -100,6 +100,25 @@ namespace ojph {
100100
val = _mm_or_si128(val, sign);
101101
_mm_storeu_si128((__m128i*)dp, val);
102102
}
103+
if (count)
104+
{
105+
__m128i v = _mm_loadu_si128(p);
106+
__m128i sign = _mm_cmplt_epi32(v, zero);
107+
__m128i val = _mm_xor_si128(v, sign); // negate 1's complement
108+
__m128i ones = _mm_and_si128(sign, one);
109+
val = _mm_add_epi32(val, ones); // 2's complement
110+
sign = _mm_and_si128(sign, m0);
111+
val = _mm_slli_epi32(val, (int)shift);
112+
113+
__m128i c = _mm_set1_epi32((si32)count);
114+
__m128i idx = _mm_set_epi32(3, 2, 1, 0);
115+
__m128i mask = _mm_cmpgt_epi32(c, idx);
116+
c = _mm_and_si128(val, mask);
117+
tmax = _mm_or_si128(tmax, c);
118+
119+
val = _mm_or_si128(val, sign);
120+
_mm_storeu_si128((__m128i*)dp, val);
121+
}
103122
_mm_storeu_si128((__m128i*)max_val, tmax);
104123
}
105124

@@ -116,7 +135,7 @@ namespace ojph {
116135
__m128i one = _mm_set1_epi32(1);
117136
__m128i tmax = _mm_loadu_si128((__m128i*)max_val);
118137
float *p = (float*)sp;
119-
for (ui32 i = 0; i < count; i += 4, p += 4, dp += 4)
138+
for ( ; count >= 4; count -= 4, p += 4, dp += 4)
120139
{
121140
__m128 vf = _mm_loadu_ps(p);
122141
vf = _mm_mul_ps(vf, d); // multiply
@@ -130,6 +149,26 @@ namespace ojph {
130149
val = _mm_or_si128(val, sign);
131150
_mm_storeu_si128((__m128i*)dp, val);
132151
}
152+
if (count)
153+
{
154+
__m128 vf = _mm_loadu_ps(p);
155+
vf = _mm_mul_ps(vf, d); // multiply
156+
__m128i val = _mm_cvtps_epi32(vf); // convert to int
157+
__m128i sign = _mm_cmplt_epi32(val, zero); // get sign
158+
val = _mm_xor_si128(val, sign); // negate 1's complement
159+
__m128i ones = _mm_and_si128(sign, one);
160+
val = _mm_add_epi32(val, ones); // 2's complement
161+
162+
__m128i c = _mm_set1_epi32((si32)count);
163+
__m128i idx = _mm_set_epi32(3, 2, 1, 0);
164+
__m128i mask = _mm_cmpgt_epi32(c, idx);
165+
c = _mm_and_si128(val, mask);
166+
tmax = _mm_or_si128(tmax, c);
167+
168+
sign = _mm_slli_epi32(sign, 31);
169+
val = _mm_or_si128(val, sign);
170+
_mm_storeu_si128((__m128i*)dp, val);
171+
}
133172
_mm_storeu_si128((__m128i*)max_val, tmax);
134173
}
135174

@@ -189,7 +228,7 @@ namespace ojph {
189228
__m128i one = _mm_set1_epi64x(1);
190229
__m128i tmax = _mm_loadu_si128((__m128i*)max_val);
191230
__m128i *p = (__m128i*)sp;
192-
for (ui32 i = 0; i < count; i += 2, p += 1, dp += 2)
231+
for ( ; count >= 2; count -= 2, p += 1, dp += 2)
193232
{
194233
__m128i v = _mm_loadu_si128(p);
195234
__m128i sign = _mm_cmplt_epi32(v, zero);
@@ -203,6 +242,24 @@ namespace ojph {
203242
val = _mm_or_si128(val, sign);
204243
_mm_storeu_si128((__m128i*)dp, val);
205244
}
245+
if (count)
246+
{
247+
__m128i v = _mm_loadu_si128(p);
248+
__m128i sign = _mm_cmplt_epi32(v, zero);
249+
sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
250+
__m128i val = _mm_xor_si128(v, sign); // negate 1's complement
251+
__m128i ones = _mm_and_si128(sign, one);
252+
val = _mm_add_epi64(val, ones); // 2's complement
253+
sign = _mm_and_si128(sign, m0);
254+
val = _mm_slli_epi64(val, (int)shift);
255+
256+
__m128i c = _mm_set_epi32(0, 0, (si32)0xFFFFFFFF, (si32)0xFFFFFFFF);
257+
c = _mm_and_si128(val, c);
258+
tmax = _mm_or_si128(tmax, c);
259+
260+
val = _mm_or_si128(val, sign);
261+
_mm_storeu_si128((__m128i*)dp, val);
262+
}
206263
_mm_storeu_si128((__m128i*)max_val, tmax);
207264
}
208265

@@ -222,10 +279,10 @@ namespace ojph {
222279
__m128i val = _mm_and_si128(v, m1);
223280
val = _mm_srli_epi64(val, (int)shift);
224281
__m128i sign = _mm_cmplt_epi32(v, zero);
225-
sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
226-
val = _mm_xor_si128(val, sign); // negate 1's complement
282+
sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
283+
val = _mm_xor_si128(val, sign); // negate 1's complement
227284
__m128i ones = _mm_and_si128(sign, one);
228-
val = _mm_add_epi64(val, ones); // 2's complement
285+
val = _mm_add_epi64(val, ones); // 2's complement
229286
_mm_storeu_si128((__m128i*)p, val);
230287
}
231288
}

src/core/codestream/ojph_codestream_wasm.cpp

Lines changed: 60 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ namespace ojph {
8888
v128_t one = wasm_i32x4_splat(1);
8989
v128_t tmax = wasm_v128_load(max_val);
9090
si32 *p = (si32*)sp;
91-
for (ui32 i = 0; i < count; i += 4, p += 4, dp += 4)
91+
for ( ; count >= 4; count -= 4, p += 4, dp += 4)
9292
{
9393
v128_t v = wasm_v128_load(p);
9494
v128_t sign = wasm_i32x4_lt(v, zero);
@@ -101,6 +101,25 @@ namespace ojph {
101101
val = wasm_v128_or(val, sign);
102102
wasm_v128_store(dp, val);
103103
}
104+
if (count)
105+
{
106+
v128_t v = wasm_v128_load(p);
107+
v128_t sign = wasm_i32x4_lt(v, zero);
108+
v128_t val = wasm_v128_xor(v, sign); // negate 1's complement
109+
v128_t ones = wasm_v128_and(sign, one);
110+
val = wasm_i32x4_add(val, ones); // 2's complement
111+
sign = wasm_v128_and(sign, m0);
112+
val = wasm_i32x4_shl(val, shift);
113+
114+
v128_t c = wasm_i32x4_splat((si32)count);
115+
v128_t idx = wasm_i32x4_make(0, 1, 2, 3);
116+
v128_t mask = wasm_i32x4_gt(c, idx);
117+
c = wasm_v128_and(val, mask);
118+
tmax = wasm_v128_or(tmax, c);
119+
120+
val = wasm_v128_or(val, sign);
121+
wasm_v128_store(dp, val);
122+
}
104123
wasm_v128_store(max_val, tmax);
105124
}
106125

@@ -117,7 +136,7 @@ namespace ojph {
117136
v128_t one = wasm_i32x4_splat(1);
118137
v128_t tmax = wasm_v128_load(max_val);
119138
float *p = (float*)sp;
120-
for (ui32 i = 0; i < count; i += 4, p += 4, dp += 4)
139+
for ( ; count >= 4; count -= 4, p += 4, dp += 4)
121140
{
122141
v128_t vf = wasm_v128_load(p);
123142
vf = wasm_f32x4_mul(vf, d); // multiply
@@ -131,6 +150,26 @@ namespace ojph {
131150
val = wasm_v128_or(val, sign);
132151
wasm_v128_store(dp, val);
133152
}
153+
if (count)
154+
{
155+
v128_t vf = wasm_v128_load(p);
156+
vf = wasm_f32x4_mul(vf, d); // multiply
157+
v128_t val = wasm_i32x4_trunc_sat_f32x4(vf); // convert to signed int
158+
v128_t sign = wasm_i32x4_lt(val, zero); // get sign
159+
val = wasm_v128_xor(val, sign); // negate 1's complement
160+
v128_t ones = wasm_v128_and(sign, one);
161+
val = wasm_i32x4_add(val, ones); // 2's complement
162+
163+
v128_t c = wasm_i32x4_splat((si32)count);
164+
v128_t idx = wasm_i32x4_make(0, 1, 2, 3);
165+
v128_t mask = wasm_i32x4_gt(c, idx);
166+
c = wasm_v128_and(val, mask);
167+
tmax = wasm_v128_or(tmax, c);
168+
169+
sign = wasm_i32x4_shl(sign, 31);
170+
val = wasm_v128_or(val, sign);
171+
wasm_v128_store(dp, val);
172+
}
134173
wasm_v128_store(max_val, tmax);
135174
}
136175

@@ -190,7 +229,7 @@ namespace ojph {
190229
v128_t one = wasm_i64x2_splat(1);
191230
v128_t tmax = wasm_v128_load(max_val);
192231
si64 *p = (si64*)sp;
193-
for (ui32 i = 0; i < count; i += 2, p += 2, dp += 2)
232+
for ( ; count >= 2; count -= 2, p += 2, dp += 2)
194233
{
195234
v128_t v = wasm_v128_load(p);
196235
v128_t sign = wasm_i64x2_lt(v, zero);
@@ -203,6 +242,24 @@ namespace ojph {
203242
val = wasm_v128_or(val, sign);
204243
wasm_v128_store(dp, val);
205244
}
245+
if (count)
246+
{
247+
v128_t v = wasm_v128_load(p);
248+
v128_t sign = wasm_i64x2_lt(v, zero);
249+
v128_t val = wasm_v128_xor(v, sign); // negate 1's complement
250+
v128_t ones = wasm_v128_and(sign, one);
251+
val = wasm_i64x2_add(val, ones); // 2's complement
252+
sign = wasm_v128_and(sign, m0);
253+
val = wasm_i64x2_shl(val, shift);
254+
255+
v128_t c = wasm_i32x4_make((si32)0xFFFFFFFF, (si32)0xFFFFFFFF, 0, 0);
256+
c = wasm_v128_and(val, c);
257+
tmax = wasm_v128_or(tmax, c);
258+
259+
val = wasm_v128_or(val, sign);
260+
wasm_v128_store(dp, val);
261+
}
262+
206263
wasm_v128_store(max_val, tmax);
207264
}
208265

src/core/common/ojph_version.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,4 +35,4 @@
3535

3636
#define OPENJPH_VERSION_MAJOR 0
3737
#define OPENJPH_VERSION_MINOR 21
38-
#define OPENJPH_VERSION_PATCH 1
38+
#define OPENJPH_VERSION_PATCH 2

0 commit comments

Comments
 (0)