Skip to content

Commit 293eacd

Browse files
committed
Added missing path in AVX2. Not tested.
1 parent 66286f3 commit 293eacd

File tree

2 files changed

+111
-39
lines changed

2 files changed

+111
-39
lines changed

src/core/transform/ojph_colour_avx2.cpp

Lines changed: 55 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,32 @@ namespace ojph {
236236
}
237237
}
238238

239+
//////////////////////////////////////////////////////////////////////////
240+
static inline
241+
__m256i ojph_mm256_max_ge_epi32(__m256i a, __m256i b, __m256 x, __m256 y)
242+
{
243+
// We must use _CMP_NLT_UQ or _CMP_GE_OQ, _CMP_GE_OS, or _CMP_NLT_US
244+
// It is not clear to me which to use
245+
__m256 ct = _mm256_cmp_ps(x, y, _CMP_NLT_UQ); // 0xFFFFFFFF for x >= y
246+
__m256i c = _mm256_castps_si256(ct); // does not generate any code
247+
__m256i d = _mm256_and_si256(c, a); // keep only a, where x >= y
248+
__m256i e = _mm256_andnot_si256(c, b); // keep only b, where x < y
249+
return _mm256_or_si256(d, e); // combine
250+
}
251+
252+
//////////////////////////////////////////////////////////////////////////
253+
static inline
254+
__m256i ojph_mm256_min_lt_epi32(__m256i a, __m256i b, __m256 x, __m256 y)
255+
{
256+
// We must use _CMP_LT_OQ or _CMP_NGE_UQ, _CMP_LT_OS, or _CMP_NGE_US
257+
// It is not clear to me which to use
258+
__m256 ct = _mm256_cmp_ps(x, y, _CMP_NGE_UQ); // 0xFFFFFFFF for x < y
259+
__m256i c = _mm256_castps_si256(ct); // does not generate any code
260+
__m256i d = _mm256_and_si256(c, a); // keep only a, where x < y
261+
__m256i e = _mm256_andnot_si256(c, b); // keep only b, where x >= y
262+
return _mm256_or_si256(d, e); // combine
263+
}
264+
239265
//////////////////////////////////////////////////////////////////////////
240266
void avx2_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
241267
line_buf *dst_line, ui32 dst_line_offset,
@@ -307,31 +333,42 @@ namespace ojph {
307333
// can achieve. All this is academic, because here are talking
308334
// about a number which has all the exponent bits set, meaning
309335
// it is either infinity, -infinity, qNan or sNan.
310-
float mul = (float)(1ull << bit_depth);
311-
const si64 upper_limit = (si64)LLONG_MAX >> (64 - bit_depth);
312-
const si64 lower_limit = (si64)LLONG_MIN >> (64 - bit_depth);
336+
si64 neg_limit = (si64)LLONG_MIN >> (64 - bit_depth);
337+
__m256 mul = _mm256_set1_ps((float)(1 << bit_depth));
338+
__m256 fl_up_lim = _mm256_set1_ps(-(float)neg_limit); // val < upper
339+
__m256 fl_low_lim = _mm256_set1_ps((float)neg_limit); // val >= lower
340+
__m256i s32_up_lim = _mm256_set1_epi32(INT_MAX >> (32 - bit_depth));
341+
__m256i s32_low_lim = _mm256_set1_epi32(INT_MIN >> (32 - bit_depth));
313342

314343
if (is_signed)
315344
{
316-
const si32 bias = (1 << (bit_depth - 1)) + 1;
317-
for (ui32 i = width; i > 0; --i) {
318-
si64 t = ojph_round64(*sp++ * mul);
319-
t = ojph_max(t, lower_limit);
320-
t = ojph_min(t, upper_limit);
321-
si32 v = (si32)t;
322-
v = (v >= 0) ? v : (- v - bias);
323-
*dp++ = v;
345+
__m256i zero = _mm256_setzero_si256();
346+
__m256i bias = _mm256_set1_epi32(-((1 << (bit_depth - 1)) + 1));
347+
for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
348+
__m256 t = _mm256_loadu_ps(sp);
349+
t = _mm256_mul_ps(t, mul);
350+
__m256i u = _mm256_cvtps_epi32(t);
351+
u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
352+
u = ojph_mm256_min_lt_epi32(u, s32_up_lim, t, fl_up_lim);
353+
__m256i c = _mm256_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value
354+
__m256i neg = _mm256_sub_epi32(bias, u); //-bias -value
355+
neg = _mm256_and_si256(c, neg); //keep only - bias - value
356+
__m256i v = _mm256_andnot_si256(c, u); //keep only +ve or 0
357+
v = _mm256_or_si256(neg, v); //combine
358+
_mm256_storeu_si256((__m256i*)dp, v);
324359
}
325360
}
326361
else
327362
{
328-
const si32 half = (1 << (bit_depth - 1));
329-
for (ui32 i = width; i > 0; --i) {
330-
si64 t = ojph_round64(*sp++ * mul);
331-
t = ojph_max(t, lower_limit);
332-
t = ojph_min(t, upper_limit);
333-
si32 v = (si32)t;
334-
*dp++ = v + half;
363+
__m256i half = _mm256_set1_epi32(-(1 << (bit_depth - 1)));
364+
for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
365+
__m256 t = _mm256_loadu_ps(sp);
366+
t = _mm256_mul_ps(t, mul);
367+
__m256i u = _mm256_cvtps_epi32(t);
368+
u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
369+
u = ojph_mm256_min_lt_epi32(u, s32_up_lim, t, fl_up_lim);
370+
u = _mm256_add_epi32(u, half);
371+
_mm256_storeu_si256((__m256i*)dp, u);
335372
}
336373
}
337374
}

src/core/transform/ojph_colour_sse2.cpp

Lines changed: 56 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,8 @@ namespace ojph {
8383
}
8484

8585
//////////////////////////////////////////////////////////////////////////
86-
// This requires SSE4.1
86+
// _mm_max_epi32 requires SSE4.1, so here we implement it in SSE2
87+
static inline
8788
__m128i ojph_mm_max_epi32(__m128i a, __m128i b)
8889
{
8990
__m128i c = _mm_cmpgt_epi32(a, b); // 0xFFFFFFFF for a > b
@@ -93,7 +94,8 @@ namespace ojph {
9394
}
9495

9596
//////////////////////////////////////////////////////////////////////////
96-
// This requires SSE4.1
97+
// _mm_min_epi32 requires SSE4.1, so here we implement it in SSE2
98+
static inline
9799
__m128i ojph_mm_min_epi32 (__m128i a, __m128i b)
98100
{
99101
__m128i c = _mm_cmplt_epi32(a, b); // 0xFFFFFFFF for a < b
@@ -102,6 +104,28 @@ namespace ojph {
102104
return _mm_or_si128(d, e); // combine
103105
}
104106

107+
//////////////////////////////////////////////////////////////////////////
108+
static inline
109+
__m128i ojph_mm_max_ge_epi32(__m128i a, __m128i b, __m128 x, __m128 y)
110+
{
111+
__m128 ct = _mm_cmpge_ps(x, y); // 0xFFFFFFFF for x >= y
112+
__m128i c = _mm_castps_si128(ct); // does not generate any code
113+
__m128i d = _mm_and_si128(c, a); // keep only a, where x >= y
114+
__m128i e = _mm_andnot_si128(c, b); // keep only b, where x < y
115+
return _mm_or_si128(d, e); // combine
116+
}
117+
118+
//////////////////////////////////////////////////////////////////////////
119+
static inline
120+
__m128i ojph_mm_min_lt_epi32(__m128i a, __m128i b, __m128 x, __m128 y)
121+
{
122+
__m128 ct = _mm_cmplt_ps(x, y); // 0xFFFFFFFF for x < y
123+
__m128i c = _mm_castps_si128(ct); // does not generate any code
124+
__m128i d = _mm_and_si128(c, a); // keep only a, where x < y
125+
__m128i e = _mm_andnot_si128(c, b); // keep only b, where x >= y
126+
return _mm_or_si128(d, e); // combine
127+
}
128+
105129
//////////////////////////////////////////////////////////////////////////
106130
void sse2_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
107131
line_buf *dst_line, ui32 dst_line_offset,
@@ -129,7 +153,7 @@ namespace ojph {
129153
if (is_signed)
130154
{
131155
__m128i zero = _mm_setzero_si128();
132-
__m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1));
156+
__m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1));
133157
for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4)
134158
{
135159
__m128 t = _mm_loadu_ps(sp);
@@ -172,31 +196,42 @@ namespace ojph {
172196
// can achieve. All this is academic, because here are talking
173197
// about a number which has all the exponent bits set, meaning
174198
// it is either infinity, -infinity, qNan or sNan.
175-
float mul = (float)(1ull << bit_depth);
176-
const si64 upper_limit = (si64)LLONG_MAX >> (64 - bit_depth);
177-
const si64 lower_limit = (si64)LLONG_MIN >> (64 - bit_depth);
199+
si64 neg_limit = (si64)LLONG_MIN >> (64 - bit_depth);
200+
__m128 mul = _mm_set1_ps((float)(1 << bit_depth));
201+
__m128 fl_upper_limit = _mm_set1_ps(-(float)neg_limit); // val < upper
202+
__m128 fl_lower_limit = _mm_set1_ps( (float)neg_limit); // val >= lower
203+
__m128i s32_upper_limit = _mm_set1_epi32(INT_MAX >> (32 - bit_depth));
204+
__m128i s32_lower_limit = _mm_set1_epi32(INT_MIN >> (32 - bit_depth));
178205

179206
if (is_signed)
180207
{
181-
const si32 bias = (1 << (bit_depth - 1)) + 1;
182-
for (ui32 i = width; i > 0; --i) {
183-
si64 t = ojph_round64(*sp++ * mul);
184-
t = ojph_max(t, lower_limit);
185-
t = ojph_min(t, upper_limit);
186-
si32 v = (si32)t;
187-
v = (v >= 0) ? v : (- v - bias);
188-
*dp++ = v;
208+
__m128i zero = _mm_setzero_si128();
209+
__m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1));
210+
for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
211+
__m128 t = _mm_loadu_ps(sp);
212+
t = _mm_mul_ps(t, mul);
213+
__m128i u = _mm_cvtps_epi32(t);
214+
u = ojph_mm_max_ge_epi32(u, s32_lower_limit, t, fl_lower_limit);
215+
u = ojph_mm_min_lt_epi32(u, s32_upper_limit, t, fl_upper_limit);
216+
__m128i c = _mm_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value
217+
__m128i neg = _mm_sub_epi32(bias, u); //-bias -value
218+
neg = _mm_and_si128(c, neg); //keep only - bias - value
219+
__m128i v = _mm_andnot_si128(c, u); //keep only +ve or 0
220+
v = _mm_or_si128(neg, v); //combine
221+
_mm_storeu_si128((__m128i*)dp, v);
189222
}
190223
}
191224
else
192225
{
193-
const si32 half = (1 << (bit_depth - 1));
194-
for (ui32 i = width; i > 0; --i) {
195-
si64 t = ojph_round64(*sp++ * mul);
196-
t = ojph_max(t, lower_limit);
197-
t = ojph_min(t, upper_limit);
198-
si32 v = (si32)t;
199-
*dp++ = v + half;
226+
__m128i half = _mm_set1_epi32(-(1 << (bit_depth - 1)));
227+
for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
228+
__m128 t = _mm_loadu_ps(sp);
229+
t = _mm_mul_ps(t, mul);
230+
__m128i u = _mm_cvtps_epi32(t);
231+
u = ojph_mm_max_ge_epi32(u, s32_lower_limit, t, fl_lower_limit);
232+
u = ojph_mm_min_lt_epi32(u, s32_upper_limit, t, fl_upper_limit);
233+
u = _mm_add_epi32(u, half);
234+
_mm_storeu_si128((__m128i*)dp, u);
200235
}
201236
}
202237
}

0 commit comments

Comments
 (0)