Skip to content

Commit 4dda0e2

Browse files
committed
Small util function perf bumps
1 parent 97fc7a0 commit 4dda0e2

File tree

1 file changed

+25
-13
lines changed

1 file changed

+25
-13
lines changed

include/FastNoise/Generators/Utils.inl

Lines changed: 25 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -57,7 +57,16 @@ namespace FastNoise
5757
float32v a = u * FS::SelectHighBit( index, float32v( 2 ), float32v( kRoot3f ) );
5858
float32v b = v ^ FS::Cast<float>( ( index >> 30 ) << 31 );
5959

60-
return FS::MaskedAdd( index >= int32v( 0 ), a, b ) ^ FS::Cast<float>( ( index >> 28 ) << 31 );
60+
if constexpr( SIMD & FastSIMD::FeatureFlag::x86 )
61+
{
62+
auto indexNegativeMask = FS::Cast<FS::Mask<32, false>>( index >> 31 );
63+
64+
return FS::InvMaskedAdd( indexNegativeMask, a, b ) ^ FS::Cast<float>( ( index >> 28 ) << 31 );
65+
}
66+
else
67+
{
68+
return FS::MaskedAdd( index >= int32v( 0 ), a, b ) ^ FS::Cast<float>( ( index >> 28 ) << 31 );
69+
}
6170
}
6271
}
6372

@@ -454,8 +463,11 @@ namespace FastNoise
454463

455464
if constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F )
456465
{
457-
//indexFacetBasisWithPermute2 = FS::NativeExec<int32v>( FS_BIND_INTRINSIC( _mm512_rol_epi32 ), indexFacetBasisWithPermute2, 2 );
466+
#if defined( _MSC_VER ) && !defined( __clang__ )
467+
indexFacetBasisWithPermute2 = FS::NativeExec<int32v>( FS_BIND_INTRINSIC( _mm512_rol_epi32 ), indexFacetBasisWithPermute2, std::integral_constant<int, 2>() );
468+
#else
458469
indexFacetBasisWithPermute2 = FS::NativeExec<int32v>( FS_BIND_INTRINSIC( _mm512_rolv_epi32 ), indexFacetBasisWithPermute2, int32v( 2 ) );
470+
#endif
459471

460472
const auto tableA_gX = FS::Constant<float>( kComponentA, kComponentA, kComponentC, kComponentC, -kComponentA, -kComponentA, kComponentC, kComponentC, kComponentA, kComponentA, kComponentC, kComponentC, -kComponentA, -kComponentA, kComponentC, kComponentC );
461473
const auto tableA_gY = FS::Constant<float>( kComponentC, kComponentB, kComponentA, kComponentA, kComponentC, kComponentB, -kComponentA, -kComponentA, kComponentC, -kComponentB, kComponentA, kComponentA, kComponentC, -kComponentB, -kComponentA, -kComponentA );
@@ -489,22 +501,22 @@ namespace FastNoise
489501
float32v sign0 = FS::Cast<float>( indexFacetBasisWithPermute2 << 31 );
490502
float32v sign1 = FS::Cast<float>( ( indexFacetBasisWithPermute2 << 30 ) & int32v( 1 << 31 ) );
491503

492-
auto notYZ = indexFacetBasisWithPermute2 >= int32v( 0 );
493-
auto notXY = ( indexFacetBasisWithPermute2 << 1 ) >= int32v( 0 );
504+
auto notYZ = indexFacetBasisWithPermute2;
505+
auto notXY = indexFacetBasisWithPermute2 << 1;
494506

495-
float32v valueA_gX = FS::Select( notYZ, float32v( kComponentA ) ^ sign0, float32v( kComponentC ) );
496-
float32v valueA_gY = FS::Select( notYZ & notXY, float32v( kComponentC ), FS::Select( notXY, float32v( kComponentA ) ^ sign0, float32v( kComponentB ) ^ sign1 ) );
497-
float32v valueA_gZ = FS::Select( notXY, float32v( kComponentB ) ^ sign1, float32v( kComponentC ) );
507+
float32v valueA_gX = FS::SelectHighBit( notYZ, float32v( kComponentC ), float32v( kComponentA ) ^ sign0 );
508+
float32v valueA_gY = FS::SelectHighBit( notYZ | notXY, FS::SelectHighBit( notXY, float32v( kComponentB ) ^ sign1, float32v( kComponentA ) ^ sign0 ), float32v( kComponentC ) );
509+
float32v valueA_gZ = FS::SelectHighBit( notXY, float32v( kComponentC ), float32v( kComponentB ) ^ sign1 );
498510
float32v valueA = FS::FMulAdd( valueA_gZ, fZ, FS::FMulAdd( fY, valueA_gY, fX * valueA_gX ) );
499511

500-
float32v valueB_gX = FS::Select( notYZ, float32v( kComponentB ) ^ sign0, float32v( kComponentC ) );
501-
float32v valueB_gY = FS::Select( notYZ & notXY, float32v( kComponentC ), FS::Select( notXY, float32v( kComponentB ) ^ sign0, float32v( kComponentA ) ^ sign1 ) );
502-
float32v valueB_gZ = FS::Select( notXY, float32v( kComponentA ) ^ sign1, float32v( kComponentC ) );
512+
float32v valueB_gX = FS::SelectHighBit( notYZ, float32v( kComponentC ), float32v( kComponentB ) ^ sign0 );
513+
float32v valueB_gY = FS::SelectHighBit( notYZ | notXY, FS::SelectHighBit( notXY, float32v( kComponentA ) ^ sign1, float32v( kComponentB ) ^ sign0 ), float32v( kComponentC ) );
514+
float32v valueB_gZ = FS::SelectHighBit( notXY, float32v( kComponentC ), float32v( kComponentA ) ^ sign1 );
503515
float32v valueB = FS::FMulAdd( valueB_gZ, fZ, FS::FMulAdd( fY, valueB_gY, fX * valueB_gX ) );
504516

505-
float32v valueC_gX = FS::Select( notYZ, float32v( kComponentsDE ) ^ sign0, float32v( kComponentF ) );
506-
float32v valueC_gY = FS::Select( notYZ & notXY, float32v( kComponentF ), FS::Select( notXY, float32v( kComponentsDE ) ^ sign0, float32v( kComponentsDE ) ^ sign1 ) );
507-
float32v valueC_gZ = FS::Select( notXY, float32v( kComponentsDE ) ^ sign1, float32v( kComponentF ) );
517+
float32v valueC_gX = FS::SelectHighBit( notYZ, float32v( kComponentF ), float32v( kComponentsDE ) ^ sign0 );
518+
float32v valueC_gY = FS::SelectHighBit( notYZ | notXY, FS::SelectHighBit( notXY, float32v( kComponentsDE ) ^ sign1, float32v( kComponentsDE ) ^ sign0 ), float32v( kComponentF ) );
519+
float32v valueC_gZ = FS::SelectHighBit( notXY, float32v( kComponentF ), float32v( kComponentsDE ) ^ sign1 );
508520
valueC = FS::FMulAdd( valueC_gZ, fZ, FS::FMulAdd( fY, valueC_gY, fX * valueC_gX ) );
509521

510522
valueAB = FS::SelectHighBit( indexPermutation2HighBit, valueB, valueA );

0 commit comments

Comments
 (0)