Skip to content

Commit 9834f17

Browse files
committed
Incomplete colour_avx2, avx512, and wasm.
1 parent 4339c8f commit 9834f17

File tree

4 files changed

+127
-34
lines changed

4 files changed

+127
-34
lines changed

src/core/transform/ojph_colour_local.h

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -229,12 +229,16 @@ namespace ojph {
229229
//////////////////////////////////////////////////////////////////////////
230230

231231
//////////////////////////////////////////////////////////////////////////
232-
void avx2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift,
233-
ui32 width);
232+
void avx2_rev_convert(
233+
const line_buf *src_line, const ui32 src_line_offset,
234+
line_buf *dst_line, const ui32 dst_line_offset,
235+
si64 shift, ui32 width);
234236

235237
//////////////////////////////////////////////////////////////////////////
236-
void avx2_cnvrt_si32_to_si32_nlt_type3(const si32 *sp, si32 *dp,
237-
int shift, ui32 width);
238+
void avx2_rev_convert_nlt_type3(
239+
const line_buf *src_line, const ui32 src_line_offset,
240+
line_buf *dst_line, const ui32 dst_line_offset,
241+
si64 shift, ui32 width);
238242

239243
//////////////////////////////////////////////////////////////////////////
240244
void avx2_rct_forward(

src/core/transform/ojph_colour_sse2.cpp

Lines changed: 117 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ namespace ojph {
5353
static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m)
5454
{
5555
// note that m must be obtained using
56-
// __m128i ve = _mm_set1_epi64x(1ULL << (63 - amt));
56+
// __m128i m = _mm_set1_epi64x(1ULL << (63 - amt));
5757
__m128i x = _mm_srli_epi64(a, amt);
5858
x = _mm_xor_si128(x, m);
5959
__m128i result = _mm_sub_epi64(x, m);
@@ -63,23 +63,19 @@ namespace ojph {
6363
//////////////////////////////////////////////////////////////////////////
6464
static inline __m128i sse2_cvtlo_epi32_epi64(__m128i a, __m128i zero)
6565
{
66-
__m128i s, t;
67-
s = _mm_unpacklo_epi32(a, zero); // missing extended -ve
66+
__m128i t;
6867
t = _mm_cmplt_epi32(a, zero); // get -ve
69-
t = _mm_unpacklo_epi32(zero, t);
70-
s = _mm_or_si128(t, s); // put -ve
71-
return s;
68+
t = _mm_unpacklo_epi32(a, t);
69+
return t;
7270
}
7371

7472
//////////////////////////////////////////////////////////////////////////
7573
static inline __m128i sse2_cvthi_epi32_epi64(__m128i a, __m128i zero)
7674
{
77-
__m128i s, t;
78-
s = _mm_unpackhi_epi32(a, zero); // missing extended -ve
75+
__m128i t;
7976
t = _mm_cmplt_epi32(a, zero); // get -ve
80-
t = _mm_unpackhi_epi32(zero, t);
81-
s = _mm_or_si128(t, s); // put -ve
82-
return s;
77+
t = _mm_unpackhi_epi32(a, t);
78+
return t;
8379
}
8480

8581
//////////////////////////////////////////////////////////////////////////
@@ -95,16 +91,33 @@ namespace ojph {
9591
{
9692
const si32 *sp = src_line->i32 + src_line_offset;
9793
si32 *dp = dst_line->i32 + dst_line_offset;
98-
si32 s = (si32)shift;
99-
for (ui32 i = width; i > 0; --i)
100-
*dp++ = *sp++ + s;
94+
__m128i sh = _mm_set1_epi32((si32)shift);
95+
for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
96+
{
97+
__m128i s = _mm_loadu_si128((__m128i*)sp);
98+
s = _mm_add_epi32(s, sh);
99+
_mm_storeu_si128((__m128i*)dp, s);
100+
}
101101
}
102102
else
103103
{
104104
const si32 *sp = src_line->i32 + src_line_offset;
105105
si64 *dp = dst_line->i64 + dst_line_offset;
106-
for (ui32 i = width; i > 0; --i)
107-
*dp++ = *sp++ + shift;
106+
__m128i zero = _mm_setzero_si128();
107+
__m128i sh = _mm_set1_epi64x(shift);
108+
for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
109+
{
110+
__m128i s, t;
111+
s = _mm_loadu_si128((__m128i*)sp);
112+
113+
t = sse2_cvtlo_epi32_epi64(s, zero);
114+
t = _mm_add_epi64(t, sh);
115+
_mm_storeu_si128((__m128i*)dp, t);
116+
117+
t = sse2_cvthi_epi32_epi64(s, zero);
118+
t = _mm_add_epi64(t, sh);
119+
_mm_storeu_si128((__m128i*)dp + 1, t);
120+
}
108121
}
109122
}
110123
else
@@ -113,8 +126,26 @@ namespace ojph {
113126
assert(dst_line->flags | line_buf::LFT_32BIT);
114127
const si64 *sp = src_line->i64 + src_line_offset;
115128
si32 *dp = dst_line->i32 + dst_line_offset;
116-
for (ui32 i = width; i > 0; --i)
117-
*dp++ = (si32)(*sp++ + shift);
129+
__m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX);
130+
__m128i sh = _mm_set1_epi64x(shift);
131+
for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
132+
{
133+
__m128i s, t;
134+
s = _mm_loadu_si128((__m128i*)sp);
135+
s = _mm_add_epi64(s, sh);
136+
137+
t = _mm_shuffle_epi32(s, _MM_SHUFFLE(0, 0, 2, 0));
138+
t = _mm_and_si128(low_bits, t);
139+
140+
s = _mm_loadu_si128((__m128i*)sp + 1);
141+
s = _mm_add_epi64(s, sh);
142+
143+
s = _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0));
144+
s = _mm_andnot_si128(low_bits, s);
145+
146+
t = _mm_or_si128(s, t);
147+
_mm_storeu_si128((__m128i*)dp, t);
148+
}
118149
}
119150
}
120151

@@ -131,19 +162,49 @@ namespace ojph {
131162
{
132163
const si32 *sp = src_line->i32 + src_line_offset;
133164
si32 *dp = dst_line->i32 + dst_line_offset;
134-
si32 s = (si32)shift;
135-
for (ui32 i = width; i > 0; --i) {
136-
const si32 v = *sp++;
137-
*dp++ = v >= 0 ? v : (- v - s);
165+
__m128i sh = _mm_set1_epi32((si32)(-shift));
166+
__m128i zero = _mm_setzero_si128();
167+
for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
168+
{
169+
__m128i s = _mm_loadu_si128((__m128i*)sp);
170+
__m128i c = _mm_cmplt_epi32(s, zero); // 0xFFFFFFFF for -ve value
171+
__m128i v_m_sh = _mm_sub_epi32(sh, s); // - shift - value
172+
v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value
173+
s = _mm_andnot_si128(c, s); // keep only +ve or 0
174+
s = _mm_or_si128(s, v_m_sh); // combine
175+
_mm_storeu_si128((__m128i*)dp, s);
138176
}
139177
}
140178
else
141179
{
142180
const si32 *sp = src_line->i32 + src_line_offset;
143181
si64 *dp = dst_line->i64 + dst_line_offset;
144-
for (ui32 i = width; i > 0; --i) {
145-
const si64 v = *sp++;
146-
*dp++ = v >= 0 ? v : (- v - shift);
182+
__m128i sh = _mm_set1_epi64x(-shift);
183+
__m128i zero = _mm_setzero_si128();
184+
for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
185+
{
186+
__m128i s, t, u, c, v_m_sh;
187+
s = _mm_loadu_si128((__m128i*)sp);
188+
189+
t = _mm_cmplt_epi32(s, zero); // find -ve 32bit -1
190+
u = _mm_unpacklo_epi32(s, t); // correct 64bit data
191+
c = _mm_unpacklo_epi32(t, t); // 64bit -1 for -ve value
192+
193+
v_m_sh = _mm_sub_epi64(sh, u); // - shift - value
194+
v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value
195+
u = _mm_andnot_si128(c, u); // keep only +ve or 0
196+
u = _mm_or_si128(u, v_m_sh); // combine
197+
198+
_mm_storeu_si128((__m128i*)dp, u);
199+
u = _mm_unpackhi_epi32(s, t); // correct 64bit data
200+
c = _mm_unpackhi_epi32(t, t); // 64bit -1 for -ve value
201+
202+
v_m_sh = _mm_sub_epi64(sh, u); // - shift - value
203+
v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value
204+
u = _mm_andnot_si128(c, u); // keep only +ve or 0
205+
u = _mm_or_si128(u, v_m_sh); // combine
206+
207+
_mm_storeu_si128((__m128i*)dp + 1, u);
147208
}
148209
}
149210
}
@@ -153,9 +214,37 @@ namespace ojph {
153214
assert(dst_line->flags | line_buf::LFT_32BIT);
154215
const si64 *sp = src_line->i64 + src_line_offset;
155216
si32 *dp = dst_line->i32 + dst_line_offset;
156-
for (ui32 i = width; i > 0; --i) {
157-
const si64 v = *sp++;
158-
*dp++ = (si32)(v >= 0 ? v : (- v - shift));
217+
__m128i sh = _mm_set1_epi64x(-shift);
218+
__m128i zero = _mm_setzero_si128();
219+
__m128i half_mask = _mm_set_epi64x(0, (si64)ULLONG_MAX);
220+
for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
221+
{
222+
// s for source, t for target, p for positive, n for negative,
223+
// m for mask, and tm for temp
224+
__m128i s, t, p, n, m, tm;
225+
s = _mm_loadu_si128((__m128i*)sp);
226+
227+
tm = _mm_cmplt_epi32(s, zero); // 32b -1 for -ve value
228+
m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // expand to 64b
229+
tm = _mm_sub_epi64(sh, s); // - shift - value
230+
n = _mm_and_si128(m, tm); // -ve
231+
p = _mm_andnot_si128(m, s); // +ve
232+
tm = _mm_or_si128(n, p);
233+
tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(0, 0, 2, 0));
234+
t = _mm_and_si128(half_mask, tm);
235+
236+
s = _mm_loadu_si128((__m128i*)sp + 1);
237+
tm = _mm_cmplt_epi32(s, zero); // 32b -1 for -ve value
238+
m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // expand to 64b
239+
tm = _mm_sub_epi64(sh, s); // - shift - value
240+
n = _mm_and_si128(m, tm); // -ve
241+
p = _mm_andnot_si128(m, s); // +ve
242+
tm = _mm_or_si128(n, p);
243+
tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(2, 0, 0, 0));
244+
tm = _mm_andnot_si128(half_mask, tm);
245+
246+
t = _mm_or_si128(t, tm);
247+
_mm_storeu_si128((__m128i*)dp, t);
159248
}
160249
}
161250
}

src/core/transform/ojph_transform_avx2.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ namespace ojph {
5858
__m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m)
5959
{
6060
// note that m must be obtained using
61-
// __m256i ve = _mm256_set1_epi64x(1ULL << (63 - amt));
61+
// __m256i m = _mm256_set1_epi64x(1ULL << (63 - amt));
6262
__m256i x = _mm256_srli_epi64(a, amt);
6363
x = _mm256_xor_si256(x, m);
6464
__m256i result = _mm256_sub_epi64(x, m);

src/core/transform/ojph_transform_sse2.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ namespace ojph {
5757
static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m)
5858
{
5959
// note that m must be obtained using
60-
// __m128i ve = _mm_set1_epi64x(1ULL << (63 - amt));
60+
// __m128i m = _mm_set1_epi64x(1ULL << (63 - amt));
6161
__m128i x = _mm_srli_epi64(a, amt);
6262
x = _mm_xor_si128(x, m);
6363
__m128i result = _mm_sub_epi64(x, m);

0 commit comments

Comments
 (0)