11// =====================================================================
2- // Copyright 2018 (c), Advanced Micro Devices, Inc. All rights reserved.
2+ // Copyright 2020 (c), Advanced Micro Devices, Inc. All rights reserved.
33//
44// Permission is hereby granted, free of charge, to any person obtaining a copy
55// of this software and associated documentation files(the "Software"), to deal
@@ -34,7 +34,8 @@ float cpu_sqrtf(float * pIn) {
3434 return sqrtf (*pIn);
3535}
3636
37- #ifndef _LINUX
37+ #ifdef CMP_USE_XMMINTRIN
38+ #ifndef __linux__
3839// ---------------------------------------------
3940// SSE: Computes square root of a float value
4041// ---------------------------------------------
@@ -45,6 +46,7 @@ float sse_sqrtf( float *pIn ) {
4546 return val.m128_f32 [0 ];
4647}
4748#endif
49+ #endif
4850
4951// -------------------------------------------------
5052// CPU: Computes 1 / (square root of a float value)
@@ -57,16 +59,29 @@ float cpu_rsqf(float *f) {
5759 return 0 .0f ;
5860}
5961
60- #ifndef _LINUX
62+ #ifdef CMP_USE_XMMINTRIN
63+ #ifndef __linux__
6164// -------------------------------------------------
6265// SSE: Computes 1 / (square root of a float value)
6366// -------------------------------------------------
64- float sse_rsqf (float *v) {
67+ #ifdef CMP_USE_RSQ_RSQR
68+ float sse_rsqf (float * v)
69+ {
6570 __m128 val = _mm_load1_ps (v);
66- val = _mm_rsqrt_ss (val);
71+ val = _mm_rsqrt_ss (val);
6772 float frsq = val.m128_f32 [0 ];
68- return (0 .5f * frsq) * (3 .0f - (*v * frsq) * frsq);
73+ return (0 .5f * frsq) * (3 .0f - (*v * frsq) * frsq);
6974};
75+ #else
76+ float sse_rsqf (float *v) {
77+ __m128 val = _mm_set_ss (*v); // Copy float and zero the upper 3 elements
78+ __m128 val1 = _mm_set_ss (1 .0f );
79+ val = _mm_sqrt_ss (val);
80+ val = _mm_div_ss (val1, val);
81+ return ( val.m128_f32 [0 ] );
82+ };
83+ #endif
84+ #endif
7085#endif
7186
7287// ---------------------------------------------
@@ -76,13 +91,15 @@ float cpu_minf(float l1, float r1) {
7691 return (l1 < r1 ? l1 : r1);
7792}
7893
79- #ifndef _LINUX
94+ #ifdef CMP_USE_XMMINTRIN
95+ #ifndef __linux__
8096float sse_minf ( float a, float b ) {
8197 // Branchless SSE min.
8298 _mm_store_ss ( &a, _mm_min_ss (_mm_set_ss (a),_mm_set_ss (b)) );
8399 return a;
84100}
85101#endif
102+ #endif
86103
87104// ---------------------------------------------
88105// CPU: Computes max of two float values
@@ -91,13 +108,15 @@ float cpu_maxf(float l1, float r1) {
91108 return (l1 > r1 ? l1 : r1);
92109}
93110
94- #ifndef _LINUX
111+ #ifdef CMP_USE_XMMINTRIN
112+ #ifndef __linux__
95113float sse_maxf ( float a, float b ) {
96114 // Branchless SSE max.
97115 _mm_store_ss ( &a, _mm_max_ss (_mm_set_ss (a),_mm_set_ss (b)) );
98116 return a;
99117}
100118#endif
119+ #endif
101120
102121// ================================================
103122// Clamp the value in the range [minval .. maxval]
@@ -111,12 +130,14 @@ float cpu_clampf(float value, float minval, float maxval) {
111130 return value;
112131}
113132
114- #ifndef _LINUX
133+ #ifdef CMP_USE_XMMINTRIN
134+ #ifndef __linux__
115135float sse_clampf ( float val, float minval, float maxval ) {
116136 _mm_store_ss ( &val, _mm_min_ss ( _mm_max_ss (_mm_set_ss (val),_mm_set_ss (minval)), _mm_set_ss (maxval) ) );
117137 return val;
118138}
119139#endif
140+ #endif
120141
121142void cpu_averageRGB (unsigned char *src_rgba_block) {
122143 float medianR = 0 .0f , medianG = 0 .0f , medianB = 0 .0f ;
@@ -184,7 +205,8 @@ float cpu_lerp2(CMP_Vec4uc C1, CMP_Vec4uc CA, CMP_Vec4uc CB, CMP_Vec4uc C2, CMP_
184205 return float (min1+min2);
185206}
186207
187- #ifndef _LINUX
208+ #ifdef CMP_USE_XMMINTRIN
209+ #ifndef __linux__
188210float sse_lerp2 (CMP_Vec4uc C1, CMP_Vec4uc CA, CMP_Vec4uc CB, CMP_Vec4uc C2, CMP_MATH_BYTE *encode1, CMP_MATH_BYTE *encode2) {
189211 // Initial Setup
190212 __m128 iC1, iC2, iCA, iCB; // Load auchars into _m128
@@ -301,6 +323,7 @@ void cmp_set_fma3_features() {
301323 cmp_lerp2 = fma_lerp2;
302324}
303325#endif
326+ #endif
304327
305328
306329void cmp_set_cpu_features () {
@@ -313,7 +336,9 @@ void cmp_set_cpu_features() {
313336 cmp_sqrtf = cpu_sqrtf;
314337}
315338
316- #ifndef _LINUX
339+
340+ #ifdef CMP_USE_XMMINTRIN
341+ #ifndef __linux__
317342void cmp_set_sse2_features () {
318343 cmp_clampf = sse_clampf;
319344 cmp_lerp2 = sse_lerp2;
@@ -323,6 +348,7 @@ void cmp_set_sse2_features() {
323348 cmp_sqrtf = sse_sqrtf;
324349}
325350#endif
351+ #endif
326352
327353// ---------------------------------
328354// User Interface to the CMP_MATH
0 commit comments