@@ -58,6 +58,21 @@ namespace ojph {
5858 // return t;
5959 }
6060
61+ // ////////////////////////////////////////////////////////////////////////
62+ ui64 sse2_find_max_val64 (ui64* address)
63+ {
64+ __m128i x1, x0 = _mm_loadu_si128 ((__m128i*)address);
65+ x1 = _mm_shuffle_epi32 (x0, 0xEE ); // x1 = x0[2,3,2,3]
66+ x0 = _mm_or_si128 (x0, x1);
67+ _mm_storeu_si128 ((__m128i*)address, x0);
68+ return *address;
69+ // A single movd t, xmm0 can do the trick, but it is not available
70+ // in SSE2 intrinsics. extract_epi32 is available in sse4.1
71+ // ui32 t = (ui32)_mm_extract_epi16(x0, 0);
72+ // t |= (ui32)_mm_extract_epi16(x0, 1) << 16;
73+ // return t;
74+ }
75+
6176 // ////////////////////////////////////////////////////////////////////////
6277 void sse2_rev_tx_to_cb32 (const void *sp, ui32 *dp, ui32 K_max,
6378 float delta_inv, ui32 count, ui32* max_val)
@@ -129,14 +144,14 @@ namespace ojph {
129144 si32 *p = (si32*)dp;
130145 for (ui32 i = 0 ; i < count; i += 4 , sp += 4 , p += 4 )
131146 {
132- __m128i v = _mm_load_si128 ((__m128i*)sp);
133- __m128i val = _mm_and_si128 (v, m1);
134- val = _mm_srli_epi32 (val, (int )shift);
135- __m128i sign = _mm_cmplt_epi32 (v, zero);
136- val = _mm_xor_si128 (val, sign); // negate 1's complement
137- __m128i ones = _mm_and_si128 (sign, one);
138- val = _mm_add_epi32 (val, ones); // 2's complement
139- _mm_storeu_si128 ((__m128i*)p, val);
147+ __m128i v = _mm_load_si128 ((__m128i*)sp);
148+ __m128i val = _mm_and_si128 (v, m1);
149+ val = _mm_srli_epi32 (val, (int )shift);
150+ __m128i sign = _mm_cmplt_epi32 (v, zero);
151+ val = _mm_xor_si128 (val, sign); // negate 1's complement
152+ __m128i ones = _mm_and_si128 (sign, one);
153+ val = _mm_add_epi32 (val, ones); // 2's complement
154+ _mm_storeu_si128 ((__m128i*)p, val);
140155 }
141156 }
142157
@@ -159,5 +174,59 @@ namespace ojph {
159174 _mm_storeu_ps (p, valf);
160175 }
161176 }
177+
178+ // ////////////////////////////////////////////////////////////////////////
179+ void sse2_rev_tx_to_cb64 (const void *sp, ui64 *dp, ui32 K_max,
180+ float delta_inv, ui32 count, ui64* max_val)
181+ {
182+ ojph_unused (delta_inv);
183+
184+ // convert to sign and magnitude and keep max_val
185+ ui32 shift = 63 - K_max;
186+ __m128i m0 = _mm_set1_epi64x (0x8000000000000000LL );
187+ __m128i zero = _mm_setzero_si128 ();
188+ __m128i one = _mm_set1_epi64x (1 );
189+ __m128i tmax = _mm_loadu_si128 ((__m128i*)max_val);
190+ __m128i *p = (__m128i*)sp;
191+ for (ui32 i = 0 ; i < count; i += 2 , p += 1 , dp += 2 )
192+ {
193+ __m128i v = _mm_loadu_si128 (p);
194+ __m128i sign = _mm_cmplt_epi32 (v, zero);
195+ sign = _mm_shuffle_epi32 (sign, 0xF5 ); // sign = sign[1,1,3,3];
196+ __m128i val = _mm_xor_si128 (v, sign); // negate 1's complement
197+ __m128i ones = _mm_and_si128 (sign, one);
198+ val = _mm_add_epi64 (val, ones); // 2's complement
199+ sign = _mm_and_si128 (sign, m0);
200+ val = _mm_slli_epi64 (val, (int )shift);
201+ tmax = _mm_or_si128 (tmax, val);
202+ val = _mm_or_si128 (val, sign);
203+ _mm_storeu_si128 ((__m128i*)dp, val);
204+ }
205+ _mm_storeu_si128 ((__m128i*)max_val, tmax);
206+ }
207+
208+ // ////////////////////////////////////////////////////////////////////////
209+ void sse2_rev_tx_from_cb64 (const ui64 *sp, void *dp, ui32 K_max,
210+ float delta, ui32 count)
211+ {
212+ ojph_unused (delta);
213+ ui32 shift = 63 - K_max;
214+ __m128i m1 = _mm_set1_epi64x (0x7FFFFFFFFFFFFFFFLL );
215+ __m128i zero = _mm_setzero_si128 ();
216+ __m128i one = _mm_set1_epi64x (1 );
217+ si64 *p = (si64*)dp;
218+ for (ui32 i = 0 ; i < count; i += 2 , sp += 2 , p += 2 )
219+ {
220+ __m128i v = _mm_load_si128 ((__m128i*)sp);
221+ __m128i val = _mm_and_si128 (v, m1);
222+ val = _mm_srli_epi64 (val, (int )shift);
223+ __m128i sign = _mm_cmplt_epi32 (v, zero);
224+ sign = _mm_shuffle_epi32 (sign, 0xF5 ); // sign = sign[1,1,3,3];
225+ val = _mm_xor_si128 (val, sign); // negate 1's complement
226+ __m128i ones = _mm_and_si128 (sign, one);
227+ val = _mm_add_epi64 (val, ones); // 2's complement
228+ _mm_storeu_si128 ((__m128i*)p, val);
229+ }
230+ }
162231 }
163232}
0 commit comments