@@ -87,7 +87,7 @@ namespace ojph {
8787 __m128i one = _mm_set1_epi32 (1 );
8888 __m128i tmax = _mm_loadu_si128 ((__m128i*)max_val);
8989 __m128i *p = (__m128i*)sp;
90- for (ui32 i = 0 ; i < count; i += 4 , p += 1 , dp += 4 )
90+ for ( ; count >= 4 ; count -= 4 , p += 1 , dp += 4 )
9191 {
9292 __m128i v = _mm_loadu_si128 (p);
9393 __m128i sign = _mm_cmplt_epi32 (v, zero);
@@ -100,6 +100,25 @@ namespace ojph {
100100 val = _mm_or_si128 (val, sign);
101101 _mm_storeu_si128 ((__m128i*)dp, val);
102102 }
103+ if (count)
104+ {
105+ __m128i v = _mm_loadu_si128 (p);
106+ __m128i sign = _mm_cmplt_epi32 (v, zero);
107+ __m128i val = _mm_xor_si128 (v, sign); // negate 1's complement
108+ __m128i ones = _mm_and_si128 (sign, one);
109+ val = _mm_add_epi32 (val, ones); // 2's complement
110+ sign = _mm_and_si128 (sign, m0);
111+ val = _mm_slli_epi32 (val, (int )shift);
112+
113+ __m128i c = _mm_set1_epi32 ((si32)count);
114+ __m128i idx = _mm_set_epi32 (3 , 2 , 1 , 0 );
115+ __m128i mask = _mm_cmpgt_epi32 (c, idx);
116+ c = _mm_and_si128 (val, mask);
117+ tmax = _mm_or_si128 (tmax, c);
118+
119+ val = _mm_or_si128 (val, sign);
120+ _mm_storeu_si128 ((__m128i*)dp, val);
121+ }
103122 _mm_storeu_si128 ((__m128i*)max_val, tmax);
104123 }
105124
@@ -116,7 +135,7 @@ namespace ojph {
116135 __m128i one = _mm_set1_epi32 (1 );
117136 __m128i tmax = _mm_loadu_si128 ((__m128i*)max_val);
118137 float *p = (float *)sp;
119- for (ui32 i = 0 ; i < count; i += 4 , p += 4 , dp += 4 )
138+ for ( ; count >= 4 ; count -= 4 , p += 4 , dp += 4 )
120139 {
121140 __m128 vf = _mm_loadu_ps (p);
122141 vf = _mm_mul_ps (vf, d); // multiply
@@ -130,6 +149,26 @@ namespace ojph {
130149 val = _mm_or_si128 (val, sign);
131150 _mm_storeu_si128 ((__m128i*)dp, val);
132151 }
152+ if (count)
153+ {
154+ __m128 vf = _mm_loadu_ps (p);
155+ vf = _mm_mul_ps (vf, d); // multiply
156+ __m128i val = _mm_cvtps_epi32 (vf); // convert to int
157+ __m128i sign = _mm_cmplt_epi32 (val, zero); // get sign
158+ val = _mm_xor_si128 (val, sign); // negate 1's complement
159+ __m128i ones = _mm_and_si128 (sign, one);
160+ val = _mm_add_epi32 (val, ones); // 2's complement
161+
162+ __m128i c = _mm_set1_epi32 ((si32)count);
163+ __m128i idx = _mm_set_epi32 (3 , 2 , 1 , 0 );
164+ __m128i mask = _mm_cmpgt_epi32 (c, idx);
165+ c = _mm_and_si128 (val, mask);
166+ tmax = _mm_or_si128 (tmax, c);
167+
168+ sign = _mm_slli_epi32 (sign, 31 );
169+ val = _mm_or_si128 (val, sign);
170+ _mm_storeu_si128 ((__m128i*)dp, val);
171+ }
133172 _mm_storeu_si128 ((__m128i*)max_val, tmax);
134173 }
135174
@@ -189,7 +228,7 @@ namespace ojph {
189228 __m128i one = _mm_set1_epi64x (1 );
190229 __m128i tmax = _mm_loadu_si128 ((__m128i*)max_val);
191230 __m128i *p = (__m128i*)sp;
192- for (ui32 i = 0 ; i < count; i += 2 , p += 1 , dp += 2 )
231+ for ( ; count >= 2 ; count -= 2 , p += 1 , dp += 2 )
193232 {
194233 __m128i v = _mm_loadu_si128 (p);
195234 __m128i sign = _mm_cmplt_epi32 (v, zero);
@@ -203,6 +242,24 @@ namespace ojph {
203242 val = _mm_or_si128 (val, sign);
204243 _mm_storeu_si128 ((__m128i*)dp, val);
205244 }
245+ if (count)
246+ {
247+ __m128i v = _mm_loadu_si128 (p);
248+ __m128i sign = _mm_cmplt_epi32 (v, zero);
249+ sign = _mm_shuffle_epi32 (sign, 0xF5 ); // sign = sign[1,1,3,3];
250+ __m128i val = _mm_xor_si128 (v, sign); // negate 1's complement
251+ __m128i ones = _mm_and_si128 (sign, one);
252+ val = _mm_add_epi64 (val, ones); // 2's complement
253+ sign = _mm_and_si128 (sign, m0);
254+ val = _mm_slli_epi64 (val, (int )shift);
255+
256+ __m128i c = _mm_set_epi32 (0 , 0 , (si32)0xFFFFFFFF , (si32)0xFFFFFFFF );
257+ c = _mm_and_si128 (val, c);
258+ tmax = _mm_or_si128 (tmax, c);
259+
260+ val = _mm_or_si128 (val, sign);
261+ _mm_storeu_si128 ((__m128i*)dp, val);
262+ }
206263 _mm_storeu_si128 ((__m128i*)max_val, tmax);
207264 }
208265
@@ -222,10 +279,10 @@ namespace ojph {
222279 __m128i val = _mm_and_si128 (v, m1);
223280 val = _mm_srli_epi64 (val, (int )shift);
224281 __m128i sign = _mm_cmplt_epi32 (v, zero);
225- sign = _mm_shuffle_epi32 (sign, 0xF5 ); // sign = sign[1,1,3,3];
226- val = _mm_xor_si128 (val, sign); // negate 1's complement
282+ sign = _mm_shuffle_epi32 (sign, 0xF5 ); // sign = sign[1,1,3,3];
283+ val = _mm_xor_si128 (val, sign); // negate 1's complement
227284 __m128i ones = _mm_and_si128 (sign, one);
228- val = _mm_add_epi64 (val, ones); // 2's complement
285+ val = _mm_add_epi64 (val, ones); // 2's complement
229286 _mm_storeu_si128 ((__m128i*)p, val);
230287 }
231288 }
0 commit comments