@@ -83,7 +83,8 @@ namespace ojph {
8383 }
8484
8585 // ////////////////////////////////////////////////////////////////////////
86- // This requires SSE4.1
86+ // _mm_max_epi32 requires SSE4.1, so here we implement it in SSE2
87+ static inline
8788 __m128i ojph_mm_max_epi32 (__m128i a, __m128i b)
8889 {
8990 __m128i c = _mm_cmpgt_epi32 (a, b); // 0xFFFFFFFF for a > b
@@ -93,7 +94,8 @@ namespace ojph {
9394 }
9495
9596 // ////////////////////////////////////////////////////////////////////////
96- // This requires SSE4.1
97+ // _mm_min_epi32 requires SSE4.1, so here we implement it in SSE2
98+ static inline
9799 __m128i ojph_mm_min_epi32 (__m128i a, __m128i b)
98100 {
99101 __m128i c = _mm_cmplt_epi32 (a, b); // 0xFFFFFFFF for a < b
@@ -102,6 +104,28 @@ namespace ojph {
102104 return _mm_or_si128 (d, e); // combine
103105 }
104106
107+ // ////////////////////////////////////////////////////////////////////////
108+ static inline
109+ __m128i ojph_mm_max_ge_epi32 (__m128i a, __m128i b, __m128 x, __m128 y)
110+ {
111+ __m128 ct = _mm_cmpge_ps (x, y); // 0xFFFFFFFF for x >= y
112+ __m128i c = _mm_castps_si128 (ct); // does not generate any code
113+ __m128i d = _mm_and_si128 (c, a); // keep only a, where x >= y
114+ __m128i e = _mm_andnot_si128 (c, b); // keep only b, where x < y
115+ return _mm_or_si128 (d, e); // combine
116+ }
117+
118+ // ////////////////////////////////////////////////////////////////////////
119+ static inline
120+ __m128i ojph_mm_min_lt_epi32 (__m128i a, __m128i b, __m128 x, __m128 y)
121+ {
122+ __m128 ct = _mm_cmplt_ps (x, y); // 0xFFFFFFFF for x < y
123+ __m128i c = _mm_castps_si128 (ct); // does not generate any code
124+ __m128i d = _mm_and_si128 (c, a); // keep only a, where x < y
125+ __m128i e = _mm_andnot_si128 (c, b); // keep only b, where x >= y
126+ return _mm_or_si128 (d, e); // combine
127+ }
128+
105129 // ////////////////////////////////////////////////////////////////////////
106130 void sse2_irv_convert_to_integer_nlt_type3 (const line_buf *src_line,
107131 line_buf *dst_line, ui32 dst_line_offset,
@@ -129,7 +153,7 @@ namespace ojph {
129153 if (is_signed)
130154 {
131155 __m128i zero = _mm_setzero_si128 ();
132- __m128i bias = _mm_set1_epi32 (-((1 << (bit_depth - 1 )) + 1 ));
156+ __m128i bias = _mm_set1_epi32 (-((1 << (bit_depth - 1 )) + 1 ));
133157 for (ui32 i = width; i > 0 ; i -= 4 , sp += 4 , dp += 4 )
134158 {
135159 __m128 t = _mm_loadu_ps (sp);
@@ -172,31 +196,42 @@ namespace ojph {
172196 // can achieve. All this is academic, because here are talking
173197 // about a number which has all the exponent bits set, meaning
174198 // it is either infinity, -infinity, qNan or sNan.
175- float mul = (float )(1ull << bit_depth);
176- const si64 upper_limit = (si64)LLONG_MAX >> (64 - bit_depth);
177- const si64 lower_limit = (si64)LLONG_MIN >> (64 - bit_depth);
199+ si64 neg_limit = (si64)LLONG_MIN >> (64 - bit_depth);
200+ __m128 mul = _mm_set1_ps ((float )(1 << bit_depth));
201+ __m128 fl_upper_limit = _mm_set1_ps (-(float )neg_limit); // val < upper
202+ __m128 fl_lower_limit = _mm_set1_ps ( (float )neg_limit); // val >= lower
203+ __m128i s32_upper_limit = _mm_set1_epi32 (INT_MAX >> (32 - bit_depth));
204+ __m128i s32_lower_limit = _mm_set1_epi32 (INT_MIN >> (32 - bit_depth));
178205
179206 if (is_signed)
180207 {
181- const si32 bias = (1 << (bit_depth - 1 )) + 1 ;
182- for (ui32 i = width; i > 0 ; --i) {
183- si64 t = ojph_round64 (*sp++ * mul);
184- t = ojph_max (t, lower_limit);
185- t = ojph_min (t, upper_limit);
186- si32 v = (si32)t;
187- v = (v >= 0 ) ? v : (- v - bias);
188- *dp++ = v;
208+ __m128i zero = _mm_setzero_si128 ();
209+ __m128i bias = _mm_set1_epi32 (-((1 << (bit_depth - 1 )) + 1 ));
210+ for (ui32 i = width; i > 0 ; i -= 4 , sp += 4 , dp += 4 ) {
211+ __m128 t = _mm_loadu_ps (sp);
212+ t = _mm_mul_ps (t, mul);
213+ __m128i u = _mm_cvtps_epi32 (t);
214+ u = ojph_mm_max_ge_epi32 (u, s32_lower_limit, t, fl_lower_limit);
215+ u = ojph_mm_min_lt_epi32 (u, s32_upper_limit, t, fl_upper_limit);
216+ __m128i c = _mm_cmpgt_epi32 (zero, u); // 0xFFFFFFFF for -ve value
217+ __m128i neg = _mm_sub_epi32 (bias, u); // -bias -value
218+ neg = _mm_and_si128 (c, neg); // keep only - bias - value
219+ __m128i v = _mm_andnot_si128 (c, u); // keep only +ve or 0
220+ v = _mm_or_si128 (neg, v); // combine
221+ _mm_storeu_si128 ((__m128i*)dp, v);
189222 }
190223 }
191224 else
192225 {
193- const si32 half = (1 << (bit_depth - 1 ));
194- for (ui32 i = width; i > 0 ; --i) {
195- si64 t = ojph_round64 (*sp++ * mul);
196- t = ojph_max (t, lower_limit);
197- t = ojph_min (t, upper_limit);
198- si32 v = (si32)t;
199- *dp++ = v + half;
226+ __m128i half = _mm_set1_epi32 (-(1 << (bit_depth - 1 )));
227+ for (ui32 i = width; i > 0 ; i -= 4 , sp += 4 , dp += 4 ) {
228+ __m128 t = _mm_loadu_ps (sp);
229+ t = _mm_mul_ps (t, mul);
230+ __m128i u = _mm_cvtps_epi32 (t);
231+ u = ojph_mm_max_ge_epi32 (u, s32_lower_limit, t, fl_lower_limit);
232+ u = ojph_mm_min_lt_epi32 (u, s32_upper_limit, t, fl_upper_limit);
233+ u = _mm_add_epi32 (u, half);
234+ _mm_storeu_si128 ((__m128i*)dp, u);
200235 }
201236 }
202237 }
0 commit comments