@@ -48,12 +48,8 @@ fn ch_parts_4_rgb_f32_sse<const FMA: bool>(
4848 let rgb_pixel_0 = _mm_loadu_ps ( src_ptr. as_ptr ( ) ) ;
4949 let rgb_pixel_1 = _mm_loadu_ps ( src_ptr. get_unchecked ( 3 ..) . as_ptr ( ) ) ;
5050 let rgb_pixel_2 = _mm_loadu_ps ( src_ptr. get_unchecked ( 6 ..) . as_ptr ( ) ) ;
51- let rgb_pixel_3 = _mm_setr_ps (
52- * src_ptr. get_unchecked ( 9 ) ,
53- * src_ptr. get_unchecked ( 10 ) ,
54- * src_ptr. get_unchecked ( 11 ) ,
55- 0. ,
56- ) ;
51+ let mut rgb_pixel_3 = _mm_loadu_ps ( src_ptr. get_unchecked ( 8 ..) . as_ptr ( ) ) ;
52+ rgb_pixel_3 = _mm_shuffle_ps :: < { shuffle ( 0 , 3 , 2 , 1 ) } > ( rgb_pixel_3, rgb_pixel_3) ;
5753
5854 let acc = _mm_prefer_fma_ps :: < FMA > ( store_0, rgb_pixel_0, weight0) ;
5955 let acc = _mm_prefer_fma_ps :: < FMA > ( acc, rgb_pixel_1, weight1) ;
@@ -81,22 +77,14 @@ fn ch_parts_4_rgb_f32_avx<const FMA: bool>(
8177 let rgb_pixel_0_0 = _mm_loadu_ps ( src_ptr0. as_ptr ( ) ) ;
8278 let rgb_pixel_0_1 = _mm_loadu_ps ( src_ptr0. get_unchecked ( 3 ..) . as_ptr ( ) ) ;
8379 let rgb_pixel_0_2 = _mm_loadu_ps ( src_ptr0. get_unchecked ( 6 ..) . as_ptr ( ) ) ;
84- let rgb_pixel_0_3 = _mm_setr_ps (
85- * src_ptr0. get_unchecked ( 9 ) ,
86- * src_ptr0. get_unchecked ( 10 ) ,
87- * src_ptr0. get_unchecked ( 11 ) ,
88- 0. ,
89- ) ;
80+ let mut rgb_pixel_0_3 = _mm_loadu_ps ( src_ptr0. get_unchecked ( 8 ..) . as_ptr ( ) ) ;
81+ rgb_pixel_0_3 = _mm_shuffle_ps :: < { shuffle ( 0 , 3 , 2 , 1 ) } > ( rgb_pixel_0_3, rgb_pixel_0_3) ;
9082
9183 let rgb_pixel_1_0 = _mm_loadu_ps ( src_ptr1. as_ptr ( ) ) ;
9284 let rgb_pixel_1_1 = _mm_loadu_ps ( src_ptr1. get_unchecked ( 3 ..) . as_ptr ( ) ) ;
9385 let rgb_pixel_1_2 = _mm_loadu_ps ( src_ptr1. get_unchecked ( 6 ..) . as_ptr ( ) ) ;
94- let rgb_pixel_1_3 = _mm_setr_ps (
95- * src_ptr1. get_unchecked ( 9 ) ,
96- * src_ptr1. get_unchecked ( 10 ) ,
97- * src_ptr1. get_unchecked ( 11 ) ,
98- 0. ,
99- ) ;
86+ let mut rgb_pixel_1_3 = _mm_loadu_ps ( src_ptr1. get_unchecked ( 8 ..) . as_ptr ( ) ) ;
87+ rgb_pixel_1_3 = _mm_shuffle_ps :: < { shuffle ( 0 , 3 , 2 , 1 ) } > ( rgb_pixel_1_3, rgb_pixel_1_3) ;
10088
10189 let rgb_pixel_0 =
10290 _mm256_insertf128_ps :: < 1 > ( _mm256_castps128_ps256 ( rgb_pixel_0_0) , rgb_pixel_1_0) ;
@@ -132,20 +120,12 @@ fn ch_parts_2_rgb_f32_avx<const FMA: bool>(
132120 let orig1 = _mm_loadu_ps ( src_ptr1. as_ptr ( ) ) ;
133121
134122 let rgb_pixel_0_0 = orig0;
135- let rgb_pixel_0_1 = _mm_setr_ps (
136- * src_ptr0. get_unchecked ( 3 ) ,
137- * src_ptr0. get_unchecked ( 4 ) ,
138- * src_ptr0. get_unchecked ( 5 ) ,
139- 0. ,
140- ) ;
123+ let mut rgb_pixel_0_1 = _mm_loadu_ps ( src_ptr0. get_unchecked ( 2 ..) . as_ptr ( ) ) ;
124+ rgb_pixel_0_1 = _mm_shuffle_ps :: < { shuffle ( 0 , 3 , 2 , 1 ) } > ( rgb_pixel_0_1, rgb_pixel_0_1) ;
141125
142126 let rgb_pixel_1_0 = orig1;
143- let rgb_pixel_1_1 = _mm_setr_ps (
144- * src_ptr1. get_unchecked ( 3 ) ,
145- * src_ptr1. get_unchecked ( 4 ) ,
146- * src_ptr1. get_unchecked ( 5 ) ,
147- 0. ,
148- ) ;
127+ let mut rgb_pixel_1_1 = _mm_loadu_ps ( src_ptr1. get_unchecked ( 2 ..) . as_ptr ( ) ) ;
128+ rgb_pixel_1_1 = _mm_shuffle_ps :: < { shuffle ( 0 , 3 , 2 , 1 ) } > ( rgb_pixel_1_1, rgb_pixel_1_1) ;
149129
150130 let rgb_pixel_0 =
151131 _mm256_insertf128_ps :: < 1 > ( _mm256_castps128_ps256 ( rgb_pixel_0_0) , rgb_pixel_1_0) ;
@@ -172,12 +152,8 @@ fn ch_parts_2_rgb_f32<const FMA: bool>(
172152
173153 let orig1 = _mm_loadu_ps ( src_ptr. as_ptr ( ) ) ;
174154 let rgb_pixel_0 = orig1;
175- let rgb_pixel_1 = _mm_setr_ps (
176- * src_ptr. get_unchecked ( 3 ) ,
177- * src_ptr. get_unchecked ( 4 ) ,
178- * src_ptr. get_unchecked ( 5 ) ,
179- 0. ,
180- ) ;
155+ let mut rgb_pixel_1 = _mm_loadu_ps ( src_ptr. get_unchecked ( 2 ..) . as_ptr ( ) ) ;
156+ rgb_pixel_1 = _mm_shuffle_ps :: < { shuffle ( 0 , 3 , 2 , 1 ) } > ( rgb_pixel_1, rgb_pixel_1) ;
181157
182158 let mut acc = _mm_prefer_fma_ps :: < FMA > ( store_0, rgb_pixel_0, weight0) ;
183159 acc = _mm_prefer_fma_ps :: < FMA > ( acc, rgb_pixel_1, weight1) ;
0 commit comments