@@ -275,124 +275,6 @@ namespace ojph {
275275 }
276276 }
277277
278- // ///////////////////////////////////////////////////////////////////////////
279- void avx2_cvrt_32b3c_to_16ub3c_le (const line_buf *ln0, const line_buf *ln1,
280- const line_buf *ln2, void *dp,
281- int bit_depth, int count)
282- {
283- const si32 *sp0 = ln0->i32 ;
284- const si32 *sp1 = ln1->i32 ;
285- const si32 *sp2 = ln2->i32 ;
286- ui16* p = (ui16*)dp;
287-
288- __m256i max_val_vec = _mm256_set1_epi32 ((1 << bit_depth) - 1 );
289- __m256i zero = _mm256_setzero_si256 ();
290-
291- __m256i m0 = _mm256_set_epi64x (0x0B0A0908FFFF0706 , 0x0504FFFF03020100 ,
292- 0x0B0A0908FFFF0706 , 0x0504FFFF03020100 );
293- __m256i m1 = _mm256_set_epi64x (0xFFFFFFFF0504FFFF , 0xFFFF0100FFFFFFFF ,
294- 0xFFFFFFFF0504FFFF , 0xFFFF0100FFFFFFFF );
295- __m256i m2 = _mm256_set_epi64x (0xFFFFFFFFFFFFFFFF , 0xFFFF0F0E0D0CFFFF ,
296- 0xFFFFFFFFFFFFFFFF , 0xFFFF0F0E0D0CFFFF );
297- __m256i m3 = _mm256_set_epi64x (0x0706FFFFFFFF0302 , 0x0D0CFFFFFFFF0908 ,
298- 0x0706FFFFFFFF0302 , 0x0D0CFFFFFFFF0908 );
299- __m256i m4 = _mm256_set_epi64x (0xFFFF03020100FFFF , 0xFFFFFFFFFFFFFFFF ,
300- 0xFFFF03020100FFFF , 0xFFFFFFFFFFFFFFFF );
301- __m256i m5 = _mm256_set_epi64x (0xFFFFFFFF0F0EFFFF , 0xFFFF0B0AFFFFFFFF ,
302- 0xFFFFFFFF0F0EFFFF , 0xFFFF0B0AFFFFFFFF );
303- __m256i m6 = _mm256_set_epi64x (0x0F0E0D0CFFFF0B0A , 0x0908FFFF07060504 ,
304- 0x0F0E0D0CFFFF0B0A , 0x0908FFFF07060504 );
305-
306- // 24 entries in each loop
307- for ( ; count >= 16 ; count -= 16 , sp0 += 16 , sp1 += 16 , sp2 += 16 , p += 48 )
308- {
309- __m256i a, b, t, u, v;
310- a = _mm256_load_si256 ((__m256i*)sp0);
311- a = _mm256_max_epi32 (a, zero);
312- t = _mm256_min_epi32 (a, max_val_vec);
313-
314- a = _mm256_load_si256 ((__m256i*)sp1);
315- a = _mm256_max_epi32 (a, zero);
316- a = _mm256_min_epi32 (a, max_val_vec);
317- a = _mm256_slli_epi32 (a, 16 );
318- t = _mm256_or_si256 (t, a);
319-
320- a = _mm256_load_si256 ((__m256i*)sp2);
321- a = _mm256_max_epi32 (a, zero);
322- u = _mm256_min_epi32 (a, max_val_vec);
323-
324- a = _mm256_load_si256 ((__m256i*)sp0 + 1 );
325- a = _mm256_max_epi32 (a, zero);
326- a = _mm256_min_epi32 (a, max_val_vec);
327- a = _mm256_slli_epi32 (a, 16 );
328- u = _mm256_or_si256 (u, a);
329-
330- a = _mm256_load_si256 ((__m256i*)sp1 + 1 );
331- a = _mm256_max_epi32 (a, zero);
332- v = _mm256_min_epi32 (a, max_val_vec);
333-
334- a = _mm256_load_si256 ((__m256i*)sp2 + 1 );
335- a = _mm256_max_epi32 (a, zero);
336- a = _mm256_min_epi32 (a, max_val_vec);
337- a = _mm256_slli_epi32 (a, 16 );
338- v = _mm256_or_si256 (v, a);
339-
340- // start combining using the sse41 method
341- __m256i xt, xu, xv;
342-
343- a = _mm256_shuffle_epi8 (t, m0);
344- b = _mm256_shuffle_epi8 (u, m1);
345- xt = _mm256_or_si256 (a, b);
346-
347- a = _mm256_shuffle_epi8 (t, m2);
348- b = _mm256_shuffle_epi8 (u, m3);
349- a = _mm256_or_si256 (a, b);
350- b = _mm256_shuffle_epi8 (v, m4);
351- xu = _mm256_or_si256 (a, b);
352-
353- a = _mm256_shuffle_epi8 (u, m5);
354- b = _mm256_shuffle_epi8 (v, m6);
355- xv = _mm256_or_si256 (a, b);
356-
357- // reorder them in the correct order
358- t = _mm256_set_epi64x (_mm256_extract_epi64 (xt, 2 ),
359- _mm256_extract_epi64 (xu, 0 ),
360- _mm256_extract_epi64 (xt, 1 ),
361- _mm256_extract_epi64 (xt, 0 ));
362- _mm256_storeu_si256 ((__m256i*)p , t);
363-
364- t = _mm256_set_epi64x (_mm256_extract_epi64 (xv, 0 ),
365- _mm256_extract_epi64 (xu, 1 ),
366- _mm256_extract_epi64 (xu, 2 ),
367- _mm256_extract_epi64 (xt, 3 ));
368- _mm256_storeu_si256 ((__m256i*)p + 1 , t);
369-
370- t = _mm256_set_epi64x (_mm256_extract_epi64 (xv, 3 ),
371- _mm256_extract_epi64 (xv, 2 ),
372- _mm256_extract_epi64 (xu, 3 ),
373- _mm256_extract_epi64 (xv, 1 ));
374- _mm256_storeu_si256 ((__m256i*)p + 2 , t);
375- }
376-
377- int max_val = (1 <<bit_depth) - 1 ;
378- for ( ; count > 0 ; --count)
379- {
380- int val;
381- val = *sp0++;
382- val = val >= 0 ? val : 0 ;
383- val = val <= max_val ? val : max_val;
384- *p++ = be2le ((ui16) val);
385- val = *sp1++;
386- val = val >= 0 ? val : 0 ;
387- val = val <= max_val ? val : max_val;
388- *p++ = be2le ((ui16) val);
389- val = *sp2++;
390- val = val >= 0 ? val : 0 ;
391- val = val <= max_val ? val : max_val;
392- *p++ = (ui16) val;
393- }
394- }
395-
396278 // ///////////////////////////////////////////////////////////////////////////
397279 void avx2_cvrt_32b1c_to_16ub1c_be (const line_buf *ln0, const line_buf *ln1,
398280 const line_buf *ln2, void *dp,
@@ -436,122 +318,4 @@ namespace ojph {
436318 *p++ = be2le ((ui16) val);
437319 }
438320 }
439-
440- // ///////////////////////////////////////////////////////////////////////////
441- void avx2_cvrt_32b3c_to_16ub3c_be (const line_buf *ln0, const line_buf *ln1,
442- const line_buf *ln2, void *dp,
443- int bit_depth, int count)
444- {
445- const si32 *sp0 = ln0->i32 ;
446- const si32 *sp1 = ln1->i32 ;
447- const si32 *sp2 = ln2->i32 ;
448- ui16* p = (ui16*)dp;
449-
450- __m256i max_val_vec = _mm256_set1_epi32 ((1 << bit_depth) - 1 );
451- __m256i zero = _mm256_setzero_si256 ();
452-
453- __m256i m0 = _mm256_set_epi64x (0x0A0B0809FFFF0607 , 0x0405FFFF02030001 ,
454- 0x0A0B0809FFFF0607 , 0x0405FFFF02030001 );
455- __m256i m1 = _mm256_set_epi64x (0xFFFFFFFF0405FFFF , 0xFFFF0001FFFFFFFF ,
456- 0xFFFFFFFF0405FFFF , 0xFFFF0001FFFFFFFF );
457- __m256i m2 = _mm256_set_epi64x (0xFFFFFFFFFFFFFFFF , 0xFFFF0E0F0C0DFFFF ,
458- 0xFFFFFFFFFFFFFFFF , 0xFFFF0E0F0C0DFFFF );
459- __m256i m3 = _mm256_set_epi64x (0x0607FFFFFFFF0203 , 0x0C0DFFFFFFFF0809 ,
460- 0x0607FFFFFFFF0203 , 0x0C0DFFFFFFFF0809 );
461- __m256i m4 = _mm256_set_epi64x (0xFFFF02030001FFFF , 0xFFFFFFFFFFFFFFFF ,
462- 0xFFFF02030001FFFF , 0xFFFFFFFFFFFFFFFF );
463- __m256i m5 = _mm256_set_epi64x (0xFFFFFFFF0E0FFFFF , 0xFFFF0A0BFFFFFFFF ,
464- 0xFFFFFFFF0E0FFFFF , 0xFFFF0A0BFFFFFFFF );
465- __m256i m6 = _mm256_set_epi64x (0x0E0F0C0DFFFF0A0B , 0x0809FFFF06070405 ,
466- 0x0E0F0C0DFFFF0A0B , 0x0809FFFF06070405 );
467-
468- // 24 entries in each loop
469- for ( ; count >= 16 ; count -= 16 , sp0 += 16 , sp1 += 16 , sp2 += 16 , p += 48 )
470- {
471- __m256i a, b, t, u, v;
472- a = _mm256_load_si256 ((__m256i*)sp0);
473- a = _mm256_max_epi32 (a, zero);
474- t = _mm256_min_epi32 (a, max_val_vec);
475-
476- a = _mm256_load_si256 ((__m256i*)sp1);
477- a = _mm256_max_epi32 (a, zero);
478- a = _mm256_min_epi32 (a, max_val_vec);
479- a = _mm256_slli_epi32 (a, 16 );
480- t = _mm256_or_si256 (t, a);
481-
482- a = _mm256_load_si256 ((__m256i*)sp2);
483- a = _mm256_max_epi32 (a, zero);
484- u = _mm256_min_epi32 (a, max_val_vec);
485-
486- a = _mm256_load_si256 ((__m256i*)sp0 + 1 );
487- a = _mm256_max_epi32 (a, zero);
488- a = _mm256_min_epi32 (a, max_val_vec);
489- a = _mm256_slli_epi32 (a, 16 );
490- u = _mm256_or_si256 (u, a);
491-
492- a = _mm256_load_si256 ((__m256i*)sp1 + 1 );
493- a = _mm256_max_epi32 (a, zero);
494- v = _mm256_min_epi32 (a, max_val_vec);
495-
496- a = _mm256_load_si256 ((__m256i*)sp2 + 1 );
497- a = _mm256_max_epi32 (a, zero);
498- a = _mm256_min_epi32 (a, max_val_vec);
499- a = _mm256_slli_epi32 (a, 16 );
500- v = _mm256_or_si256 (v, a);
501-
502- // start combining using the sse41 method
503- __m256i xt, xu, xv;
504-
505- a = _mm256_shuffle_epi8 (t, m0);
506- b = _mm256_shuffle_epi8 (u, m1);
507- xt = _mm256_or_si256 (a, b);
508-
509- a = _mm256_shuffle_epi8 (t, m2);
510- b = _mm256_shuffle_epi8 (u, m3);
511- a = _mm256_or_si256 (a, b);
512- b = _mm256_shuffle_epi8 (v, m4);
513- xu = _mm256_or_si256 (a, b);
514-
515- a = _mm256_shuffle_epi8 (u, m5);
516- b = _mm256_shuffle_epi8 (v, m6);
517- xv = _mm256_or_si256 (a, b);
518-
519- // reorder them in the correct order
520- t = _mm256_set_epi64x (_mm256_extract_epi64 (xt, 2 ),
521- _mm256_extract_epi64 (xu, 0 ),
522- _mm256_extract_epi64 (xt, 1 ),
523- _mm256_extract_epi64 (xt, 0 ));
524- _mm256_storeu_si256 ((__m256i*)p , t);
525-
526- t = _mm256_set_epi64x (_mm256_extract_epi64 (xv, 0 ),
527- _mm256_extract_epi64 (xu, 1 ),
528- _mm256_extract_epi64 (xu, 2 ),
529- _mm256_extract_epi64 (xt, 3 ));
530- _mm256_storeu_si256 ((__m256i*)p + 1 , t);
531-
532- t = _mm256_set_epi64x (_mm256_extract_epi64 (xv, 3 ),
533- _mm256_extract_epi64 (xv, 2 ),
534- _mm256_extract_epi64 (xu, 3 ),
535- _mm256_extract_epi64 (xv, 1 ));
536- _mm256_storeu_si256 ((__m256i*)p + 2 , t);
537- }
538-
539- int max_val = (1 <<bit_depth) - 1 ;
540- for ( ; count > 0 ; --count)
541- {
542- int val;
543- val = *sp0++;
544- val = val >= 0 ? val : 0 ;
545- val = val <= max_val ? val : max_val;
546- *p++ = be2le ((ui16) val);
547- val = *sp1++;
548- val = val >= 0 ? val : 0 ;
549- val = val <= max_val ? val : max_val;
550- *p++ = be2le ((ui16) val);
551- val = *sp2++;
552- val = val >= 0 ? val : 0 ;
553- val = val <= max_val ? val : max_val;
554- *p++ = be2le ((ui16) val);
555- }
556- }
557321}
0 commit comments