@@ -53,7 +53,7 @@ namespace ojph {
5353 static inline __m128i sse2_mm_srai_epi64 (__m128i a, int amt, __m128i m)
5454 {
5555 // note that m must be obtained using
56- // __m128i ve = _mm_set1_epi64x(1ULL << (63 - amt));
56+ // __m128i m = _mm_set1_epi64x(1ULL << (63 - amt));
5757 __m128i x = _mm_srli_epi64 (a, amt);
5858 x = _mm_xor_si128 (x, m);
5959 __m128i result = _mm_sub_epi64 (x, m);
@@ -63,23 +63,19 @@ namespace ojph {
6363 // ////////////////////////////////////////////////////////////////////////
// NOTE(review): this span reads as a rendered diff hunk -- lines tagged '-'
// look like the previous implementation and lines tagged '+' like its
// replacement; confirm before treating it as plain source.
//
// Widens the two low 32-bit lanes of `a` into two 64-bit lanes, filling the
// upper half of each 64-bit lane with a mask derived from comparing `a`
// against `zero`. This is a sign extension when `zero` is an all-zero vector
// (presumably always the case, given the parameter name -- verify at callers).
//
// '+' version: compare once, then interleave `a` with the compare mask.
// '-' version: zero-extend `a`, place the mask in the upper halves
// separately, and OR the two together; the resulting value is the same.
6464 static inline __m128i sse2_cvtlo_epi32_epi64 (__m128i a, __m128i zero)
6565 {
66- __m128i s, t;
67- s = _mm_unpacklo_epi32 (a, zero); // missing extended -ve
66+ __m128i t;
6867 t = _mm_cmplt_epi32 (a, zero); // all-ones in each 32-bit lane where a < zero
69- t = _mm_unpacklo_epi32 (zero, t);
70- s = _mm_or_si128 (t, s); // put -ve
71- return s;
68+ t = _mm_unpacklo_epi32 (a, t); // low lanes of a, each followed by its mask
69+ return t;
7270 }
7371
7472 // ////////////////////////////////////////////////////////////////////////
// NOTE(review): this span reads as a rendered diff hunk -- lines tagged '-'
// look like the previous implementation and lines tagged '+' like its
// replacement; confirm before treating it as plain source.
//
// Widens the two high 32-bit lanes of `a` into two 64-bit lanes, filling the
// upper half of each 64-bit lane with a mask derived from comparing `a`
// against `zero`. This is a sign extension when `zero` is an all-zero vector
// (presumably always the case, given the parameter name -- verify at callers).
//
// '+' version: compare once, then interleave `a` with the compare mask.
// '-' version: zero-extend `a`, place the mask in the upper halves
// separately, and OR the two together; the resulting value is the same.
7573 static inline __m128i sse2_cvthi_epi32_epi64 (__m128i a, __m128i zero)
7674 {
77- __m128i s, t;
78- s = _mm_unpackhi_epi32 (a, zero); // missing extended -ve
75+ __m128i t;
7976 t = _mm_cmplt_epi32 (a, zero); // all-ones in each 32-bit lane where a < zero
80- t = _mm_unpackhi_epi32 (zero, t);
81- s = _mm_or_si128 (t, s); // put -ve
82- return s;
77+ t = _mm_unpackhi_epi32 (a, t); // high lanes of a, each followed by its mask
78+ return t;
8379 }
8480
8581 // ////////////////////////////////////////////////////////////////////////
@@ -95,16 +91,33 @@ namespace ojph {
9591 {
9692 const si32 *sp = src_line->i32 + src_line_offset;
9793 si32 *dp = dst_line->i32 + dst_line_offset;
98- si32 s = (si32)shift;
99- for (ui32 i = width; i > 0 ; --i)
100- *dp++ = *sp++ + s;
94+ __m128i sh = _mm_set1_epi32 ((si32)shift);
95+ for (int i = (width + 3 ) >> 2 ; i > 0 ; --i, sp+=4 , dp+=4 )
96+ {
97+ __m128i s = _mm_loadu_si128 ((__m128i*)sp);
98+ s = _mm_add_epi32 (s, sh);
99+ _mm_storeu_si128 ((__m128i*)dp, s);
100+ }
101101 }
102102 else
103103 {
104104 const si32 *sp = src_line->i32 + src_line_offset;
105105 si64 *dp = dst_line->i64 + dst_line_offset;
106- for (ui32 i = width; i > 0 ; --i)
107- *dp++ = *sp++ + shift;
106+ __m128i zero = _mm_setzero_si128 ();
107+ __m128i sh = _mm_set1_epi64x (shift);
108+ for (int i = (width + 3 ) >> 2 ; i > 0 ; --i, sp+=4 , dp+=4 )
109+ {
110+ __m128i s, t;
111+ s = _mm_loadu_si128 ((__m128i*)sp);
112+
113+ t = sse2_cvtlo_epi32_epi64 (s, zero);
114+ t = _mm_add_epi64 (t, sh);
115+ _mm_storeu_si128 ((__m128i*)dp, t);
116+
117+ t = sse2_cvthi_epi32_epi64 (s, zero);
118+ t = _mm_add_epi64 (t, sh);
119+ _mm_storeu_si128 ((__m128i*)dp + 1 , t);
120+ }
108121 }
109122 }
110123 else
@@ -113,8 +126,26 @@ namespace ojph {
113126 assert (dst_line->flags | line_buf::LFT_32BIT);
114127 const si64 *sp = src_line->i64 + src_line_offset;
115128 si32 *dp = dst_line->i32 + dst_line_offset;
116- for (ui32 i = width; i > 0 ; --i)
117- *dp++ = (si32)(*sp++ + shift);
129+ __m128i low_bits = _mm_set_epi64x (0 , (si64)ULLONG_MAX);
130+ __m128i sh = _mm_set1_epi64x (shift);
131+ for (int i = (width + 3 ) >> 2 ; i > 0 ; --i, sp+=4 , dp+=4 )
132+ {
133+ __m128i s, t;
134+ s = _mm_loadu_si128 ((__m128i*)sp);
135+ s = _mm_add_epi64 (s, sh);
136+
137+ t = _mm_shuffle_epi32 (s, _MM_SHUFFLE (0 , 0 , 2 , 0 ));
138+ t = _mm_and_si128 (low_bits, t);
139+
140+ s = _mm_loadu_si128 ((__m128i*)sp + 1 );
141+ s = _mm_add_epi64 (s, sh);
142+
143+ s = _mm_shuffle_epi32 (s, _MM_SHUFFLE (2 , 0 , 0 , 0 ));
144+ s = _mm_andnot_si128 (low_bits, s);
145+
146+ t = _mm_or_si128 (s, t);
147+ _mm_storeu_si128 ((__m128i*)dp, t);
148+ }
118149 }
119150 }
120151
@@ -131,19 +162,49 @@ namespace ojph {
131162 {
132163 const si32 *sp = src_line->i32 + src_line_offset;
133164 si32 *dp = dst_line->i32 + dst_line_offset;
134- si32 s = (si32)shift;
135- for (ui32 i = width; i > 0 ; --i) {
136- const si32 v = *sp++;
137- *dp++ = v >= 0 ? v : (- v - s);
165+ __m128i sh = _mm_set1_epi32 ((si32)(-shift));
166+ __m128i zero = _mm_setzero_si128 ();
167+ for (int i = (width + 3 ) >> 2 ; i > 0 ; --i, sp += 4 , dp += 4 )
168+ {
169+ __m128i s = _mm_loadu_si128 ((__m128i*)sp);
170+ __m128i c = _mm_cmplt_epi32 (s, zero); // 0xFFFFFFFF for -ve value
171+ __m128i v_m_sh = _mm_sub_epi32 (sh, s); // - shift - value
172+ v_m_sh = _mm_and_si128 (c, v_m_sh); // keep only - shift - value
173+ s = _mm_andnot_si128 (c, s); // keep only +ve or 0
174+ s = _mm_or_si128 (s, v_m_sh); // combine
175+ _mm_storeu_si128 ((__m128i*)dp, s);
138176 }
139177 }
140178 else
141179 {
142180 const si32 *sp = src_line->i32 + src_line_offset;
143181 si64 *dp = dst_line->i64 + dst_line_offset;
144- for (ui32 i = width; i > 0 ; --i) {
145- const si64 v = *sp++;
146- *dp++ = v >= 0 ? v : (- v - shift);
182+ __m128i sh = _mm_set1_epi64x (-shift);
183+ __m128i zero = _mm_setzero_si128 ();
184+ for (int i = (width + 3 ) >> 2 ; i > 0 ; --i, sp += 4 , dp += 4 )
185+ {
186+ __m128i s, t, u, c, v_m_sh;
187+ s = _mm_loadu_si128 ((__m128i*)sp);
188+
189+ t = _mm_cmplt_epi32 (s, zero); // find -ve 32bit -1
190+ u = _mm_unpacklo_epi32 (s, t); // correct 64bit data
191+ c = _mm_unpacklo_epi32 (t, t); // 64bit -1 for -ve value
192+
193+ v_m_sh = _mm_sub_epi64 (sh, u); // - shift - value
194+ v_m_sh = _mm_and_si128 (c, v_m_sh); // keep only - shift - value
195+ u = _mm_andnot_si128 (c, u); // keep only +ve or 0
196+ u = _mm_or_si128 (u, v_m_sh); // combine
197+
198+ _mm_storeu_si128 ((__m128i*)dp, u);
199+ u = _mm_unpackhi_epi32 (s, t); // correct 64bit data
200+ c = _mm_unpackhi_epi32 (t, t); // 64bit -1 for -ve value
201+
202+ v_m_sh = _mm_sub_epi64 (sh, u); // - shift - value
203+ v_m_sh = _mm_and_si128 (c, v_m_sh); // keep only - shift - value
204+ u = _mm_andnot_si128 (c, u); // keep only +ve or 0
205+ u = _mm_or_si128 (u, v_m_sh); // combine
206+
207+ _mm_storeu_si128 ((__m128i*)dp + 1 , u);
147208 }
148209 }
149210 }
@@ -153,9 +214,37 @@ namespace ojph {
153214 assert (dst_line->flags | line_buf::LFT_32BIT);
154215 const si64 *sp = src_line->i64 + src_line_offset;
155216 si32 *dp = dst_line->i32 + dst_line_offset;
156- for (ui32 i = width; i > 0 ; --i) {
157- const si64 v = *sp++;
158- *dp++ = (si32)(v >= 0 ? v : (- v - shift));
217+ __m128i sh = _mm_set1_epi64x (-shift);
218+ __m128i zero = _mm_setzero_si128 ();
219+ __m128i half_mask = _mm_set_epi64x (0 , (si64)ULLONG_MAX);
220+ for (int i = (width + 3 ) >> 2 ; i > 0 ; --i, sp += 4 , dp += 4 )
221+ {
222+ // s for source, t for target, p for positive, n for negative,
223+ // m for mask, and tm for temp
224+ __m128i s, t, p, n, m, tm;
225+ s = _mm_loadu_si128 ((__m128i*)sp);
226+
227+ tm = _mm_cmplt_epi32 (s, zero); // 32b -1 for -ve value
228+ m = _mm_shuffle_epi32 (tm, _MM_SHUFFLE (3 , 3 , 1 , 1 )); // expand to 64b
229+ tm = _mm_sub_epi64 (sh, s); // - shift - value
230+ n = _mm_and_si128 (m, tm); // -ve
231+ p = _mm_andnot_si128 (m, s); // +ve
232+ tm = _mm_or_si128 (n, p);
233+ tm = _mm_shuffle_epi32 (tm, _MM_SHUFFLE (0 , 0 , 2 , 0 ));
234+ t = _mm_and_si128 (half_mask, tm);
235+
236+ s = _mm_loadu_si128 ((__m128i*)sp + 1 );
237+ tm = _mm_cmplt_epi32 (s, zero); // 32b -1 for -ve value
238+ m = _mm_shuffle_epi32 (tm, _MM_SHUFFLE (3 , 3 , 1 , 1 )); // expand to 64b
239+ tm = _mm_sub_epi64 (sh, s); // - shift - value
240+ n = _mm_and_si128 (m, tm); // -ve
241+ p = _mm_andnot_si128 (m, s); // +ve
242+ tm = _mm_or_si128 (n, p);
243+ tm = _mm_shuffle_epi32 (tm, _MM_SHUFFLE (2 , 0 , 0 , 0 ));
244+ tm = _mm_andnot_si128 (half_mask, tm);
245+
246+ t = _mm_or_si128 (t, tm);
247+ _mm_storeu_si128 ((__m128i*)dp, t);
159248 }
160249 }
161250 }
0 commit comments