Skip to content

Commit bcb38b1

Browse files
authored
Ambient Occlusion Optimizations (#4099)
* Use Hilbert Index instead of Blue Noise for GTAO, same as the original paper, allows us to have better quality with less work, GTAOPasses::DenoiseSpatial does two pixels for one thread * Force Float16 for GTAO, good speedups on modern GPUs, default behaviour from reference implementation, do explicit casts to suppress warnings * More accurate MSAAUtils::GetSampleIndex https://files.facepunch.com/sampavlovic/1b2611b1/image.psd.png * adjustments * reduce blending for non-edge pixels to preserve more details instead of using the raw amount at edges * Early out here, no good way to have a non-uniform size dispatch yet, should still keep 2x spatial speedup * Remove unused comment * update shaders
1 parent 279f8e0 commit bcb38b1

File tree

4 files changed

+85
-63
lines changed

4 files changed

+85
-63
lines changed

engine/Sandbox.Engine/Scene/Components/PostProcessing/Effects/AmbientOcclusion.cs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -124,20 +124,20 @@ GTAOConstants GetGTAOConstants()
124124
consts.EffectFalloffRange = GetWeighted( x => x.FalloffRange, 1.0f );
125125
consts.DenoiseBlurBeta = 1.2f; // Used only on Spatial denoising
126126

127-
consts.NoiseIndex = DenoiseMode == DenoiseModes.Temporal ? Frame % 64 : 0;
127+
consts.NoiseIndex = DenoiseMode == DenoiseModes.Temporal ? Frame : 0;
128128
consts.ThinOccluderCompensation = ThinCompensation;
129-
consts.FinalValuePower = GetWeighted( x => x.Intensity, 1.0f ) * 5.0f;
129+
consts.FinalValuePower = GetWeighted( x => x.Intensity, 1.0f ) * 10.0f;
130130

131131
switch ( UserQuality )
132132
{
133133
case 1:
134134
consts.TAABlendAmount = 0.95f;
135135
break;
136136
case 2:
137-
consts.TAABlendAmount = 0.9f;
137+
consts.TAABlendAmount = 0.95f;
138138
break;
139139
case 3:
140-
consts.TAABlendAmount = 0.8f;
140+
consts.TAABlendAmount = 0.95f;
141141
break;
142142
}
143143
return consts;

game/addons/base/Assets/shaders/common/thirdparty/XeGTAO.hlsl

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ void XeGTAO_DecodeVisibilityBentNormal( const uint packedValue, out lpfloat visi
191191
visibility = decoded.w;
192192
}
193193

194-
void XeGTAO_OutputWorkingTerm( const uint2 pixCoord, lpfloat visibility, lpfloat3 bentNormal, RWTexture2D<lpfloat> outWorkingAOTerm )
194+
void XeGTAO_OutputWorkingTerm( const uint2 pixCoord, lpfloat visibility, lpfloat3 bentNormal, RWTexture2D<float> outWorkingAOTerm )
195195
{
196196
//visibility = saturate( visibility / lpfloat(XE_GTAO_OCCLUSION_TERM_SCALE) );
197197
#ifdef XE_GTAO_COMPUTE_BENT_NORMALS
@@ -239,12 +239,12 @@ lpfloat3x3 XeGTAO_RotFromToMatrix( lpfloat3 from, lpfloat3 to )
239239
}
240240

241241
void XeGTAO_MainPass( const uint2 pixCoord, lpfloat sliceCount, lpfloat stepsPerSlice, const lpfloat2 localNoise, lpfloat3 viewspaceNormal, const GTAOConstants consts,
242-
Texture2D<lpfloat> sourceViewspaceDepth, SamplerState depthSampler, RWTexture2D<lpfloat> outWorkingAOTerm, RWTexture2D<lpfloat> outWorkingEdges )
242+
Texture2D<float> sourceViewspaceDepth, SamplerState depthSampler, RWTexture2D<float> outWorkingAOTerm, RWTexture2D<float> outWorkingEdges )
243243
{
244244
float2 normalizedScreenPos = (pixCoord + 0.5.xx) * consts.ViewportPixelSize;
245245

246-
lpfloat4 valuesUL = sourceViewspaceDepth.GatherRed( depthSampler, float2( pixCoord * consts.ViewportPixelSize ) );
247-
lpfloat4 valuesBR = sourceViewspaceDepth.GatherRed( depthSampler, float2( pixCoord * consts.ViewportPixelSize ), int2( 1, 1 ) );
246+
lpfloat4 valuesUL = (lpfloat4)sourceViewspaceDepth.GatherRed( depthSampler, float2( pixCoord * consts.ViewportPixelSize ) );
247+
lpfloat4 valuesBR = (lpfloat4)sourceViewspaceDepth.GatherRed( depthSampler, float2( pixCoord * consts.ViewportPixelSize ), int2( 1, 1 ) );
248248

249249
// viewspace Z at the center
250250
lpfloat viewspaceZ = valuesUL.y; //sourceViewspaceDepth.SampleLevel( depthSampler, normalizedScreenPos, 0 ).x;
@@ -610,7 +610,7 @@ lpfloat XeGTAO_ClampDepth( float depth )
610610
}
611611

612612
groupshared lpfloat g_scratchDepths[8][8];
613-
void XeGTAO_PrefilterDepths16x16( uint2 dispatchThreadID /*: SV_DispatchThreadID*/, uint2 groupThreadID /*: SV_GroupThreadID*/, const GTAOConstants consts, Texture2D sourceNDCDepth, SamplerState depthSampler, RWTexture2D<lpfloat> outDepth0, RWTexture2D<lpfloat> outDepth1, RWTexture2D<lpfloat> outDepth2, RWTexture2D<lpfloat> outDepth3, RWTexture2D<lpfloat> outDepth4 )
613+
void XeGTAO_PrefilterDepths16x16( uint2 dispatchThreadID /*: SV_DispatchThreadID*/, uint2 groupThreadID /*: SV_GroupThreadID*/, const GTAOConstants consts, Texture2D sourceNDCDepth, SamplerState depthSampler, RWTexture2D<float> outDepth0, RWTexture2D<float> outDepth1, RWTexture2D<float> outDepth2, RWTexture2D<float> outDepth3, RWTexture2D<float> outDepth4 )
614614
{
615615
// MIP 0
616616
const uint2 baseCoord = dispatchThreadID;
@@ -727,7 +727,9 @@ void XeGTAO_DecodeGatherPartial( const uint4 packedValue, out AOTermType outDeco
727727
#endif
728728
}
729729

730-
lpfloat XeGTAO_Denoise( const uint2 pixCoordBase, const GTAOConstants consts, Texture2D sourceAOTerm, Texture2D<lpfloat> sourceEdges, SamplerState texSampler )
730+
// Returns two AO values: .x for pixCoordBase, .y for pixCoordBase + int2(1,0)
731+
// Each thread processes 2 horizontal pixels as a performance optimization (shared gather reads)
732+
lpfloat2 XeGTAO_Denoise( const uint2 pixCoordBase, const GTAOConstants consts, Texture2D sourceAOTerm, Texture2D<float> sourceEdges, SamplerState texSampler )
731733
{
732734
const lpfloat blurAmount = (lpfloat)consts.DenoiseBlurBeta;
733735
const lpfloat diagWeight = 0.85 * 0.5;
@@ -741,14 +743,14 @@ lpfloat XeGTAO_Denoise( const uint2 pixCoordBase, const GTAOConstants consts, Te
741743

742744
// gather edge and visibility quads, used later
743745
const float2 gatherCenter = float2( pixCoordBase.x, pixCoordBase.y ) * consts.ViewportPixelSize;
744-
lpfloat4 edgesQ0 = sourceEdges.GatherRed( texSampler, gatherCenter, int2( 0, 0 ) );
745-
lpfloat4 edgesQ1 = sourceEdges.GatherRed( texSampler, gatherCenter, int2( 2, 0 ) );
746-
lpfloat4 edgesQ2 = sourceEdges.GatherRed( texSampler, gatherCenter, int2( 1, 2 ) );
746+
lpfloat4 edgesQ0 = (lpfloat4)sourceEdges.GatherRed( texSampler, gatherCenter, int2( 0, 0 ) );
747+
lpfloat4 edgesQ1 = (lpfloat4)sourceEdges.GatherRed( texSampler, gatherCenter, int2( 2, 0 ) );
748+
lpfloat4 edgesQ2 = (lpfloat4)sourceEdges.GatherRed( texSampler, gatherCenter, int2( 1, 2 ) );
747749

748-
lpfloat4 visQ0 = ( sourceAOTerm.GatherRed( texSampler, gatherCenter, int2( 0, 0 ) ) );
749-
lpfloat4 visQ1 = ( sourceAOTerm.GatherRed( texSampler, gatherCenter, int2( 2, 0 ) ) );
750-
lpfloat4 visQ2 = ( sourceAOTerm.GatherRed( texSampler, gatherCenter, int2( 0, 2 ) ) );
751-
lpfloat4 visQ3 = ( sourceAOTerm.GatherRed( texSampler, gatherCenter, int2( 2, 2 ) ) );
750+
lpfloat4 visQ0 = (lpfloat4)sourceAOTerm.GatherRed( texSampler, gatherCenter, int2( 0, 0 ) );
751+
lpfloat4 visQ1 = (lpfloat4)sourceAOTerm.GatherRed( texSampler, gatherCenter, int2( 2, 0 ) );
752+
lpfloat4 visQ2 = (lpfloat4)sourceAOTerm.GatherRed( texSampler, gatherCenter, int2( 0, 2 ) );
753+
lpfloat4 visQ3 = (lpfloat4)sourceAOTerm.GatherRed( texSampler, gatherCenter, int2( 2, 2 ) );
752754

753755
for( int side = 0; side < 2; side++ )
754756
{
@@ -810,7 +812,7 @@ lpfloat XeGTAO_Denoise( const uint2 pixCoordBase, const GTAOConstants consts, Te
810812
aoTerm[side] = sum / sumWeight;
811813

812814
}
813-
return aoTerm[0];
815+
return lpfloat2(aoTerm[0], aoTerm[1]);
814816
}
815817

816818

game/addons/base/Assets/shaders/common/utils/MSAAUtils.hlsl

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,24 +4,33 @@
44
#include "common/classes/Depth.hlsl"
55
class MSAAUtils
66
{
7-
// Gets the correct sample index on the lane for the given pixel position and UV coordinates.
8-
// This is used to composite a non-MSAA texture in a MSAA buffer.
9-
// You can use either QuadReadLaneAt or Texture2D::GatherRed to get the correct sample.
7+
// Gets the gather lane whose texel center is closest to the current MSAA sample position.
8+
// Used to composite a non-MSAA texture into a MSAA buffer: first filters to texels that
9+
// match our depth, then picks the spatially nearest one via branchless paired tournament.
1010
static int GetSampleIndex( float4 vPositionSs, float2 uv )
1111
{
12-
float4 v4Depths = g_tDepthChain.GatherRed( g_sBilinearClamp, uv.xy );
13-
float4 vDepthDiffs = abs( vPositionSs.zzzz - v4Depths.xyzw );
12+
float4 depths = g_tDepthChain.GatherRed( g_sBilinearClamp, uv );
13+
float4 depthDiffs = abs( vPositionSs.z - depths );
1414

15-
// Find the minimum depth difference
16-
float minDepthDiff = min(min(vDepthDiffs.x, vDepthDiffs.y), min(vDepthDiffs.z, vDepthDiffs.w));
15+
// Only consider lanes within epsilon of the closest depth match
16+
float minDiff = min( min( depthDiffs.x, depthDiffs.y ), min( depthDiffs.z, depthDiffs.w ) );
17+
float4 valid = step( depthDiffs, minDiff + 1e-5 );
1718

18-
// Explicit vector comparison to find the closest actual index.
19-
int4 indices = int4(0, 1, 2, 3);
20-
int4 matchIndices = indices * (vDepthDiffs == minDepthDiff);
21-
int selectedIndex = max(max(matchIndices.x, matchIndices.y), max(matchIndices.z, matchIndices.w));
19+
// Squared sub-pixel distance from fragment to each gather texel center
20+
// Texel centers: lane0(0.25,0.25) lane1(0.75,0.25) lane2(0.25,0.75) lane3(0.75,0.75)
21+
float2 dx = frac( vPositionSs.x ) - float2( 0.25, 0.75 );
22+
float2 dy = frac( vPositionSs.y ) - float2( 0.25, 0.75 );
23+
float4 distSq = dx.xyxy * dx.xyxy + dy.xxyy * dy.xxyy;
2224

23-
// Return the index value for the first matching
24-
return selectedIndex;
25+
// Large penalty knocks out depth-mismatched lanes
26+
float4 scores = distSq + ( 1.0 - valid ) * 1e8;
27+
28+
// Branchless argmin: paired tournament builds a 2-bit index
29+
float2 sel = step( scores.yw + 1e-6, scores.xz ); // per-pair winner (bit0 candidates)
30+
float2 best = lerp( scores.xz, scores.yw, sel ); // per-pair best score
31+
float pair = step( best.y + 1e-6, best.x ); // winning pair (bit1)
32+
33+
return (int)( lerp( sel.x, sel.y, pair ) + pair * 2.0 );
2534
}
2635
};
2736

game/addons/base/Assets/shaders/gtao_cs.shader

Lines changed: 43 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,6 @@ MODES
77

88
CS
99
{
10-
11-
#define XE_GTAO_USE_HALF_FLOAT_PRECISION 0 // dxc has a compiler bug doing dot products with half precision, unsure if much perf gain on desktop
12-
#define XE_GTAO_FP32_DEPTHS 1
13-
1410
#include "common.fxc"
1511
#include "postprocess/shared.hlsl"
1612

@@ -34,29 +30,27 @@ CS
3430
// input output textures for the first pass (XeGTAO_PrefilterDepths16x16)
3531
#if ( D_PASS == 0)
3632
Texture2DMS<float> g_srcRawDepth < Attribute("RawDepth"); > ; // source depth buffer data (in NDC space in DirectX)
37-
RWTexture2D<lpfloat> g_outWorkingDepthMIP0 < Attribute("WorkingDepthMIP0"); > ; // output viewspace depth MIP (these are views into g_srcWorkingDepth MIP levels)
38-
RWTexture2D<lpfloat> g_outWorkingDepthMIP1 < Attribute("WorkingDepthMIP1"); > ; // output viewspace depth MIP (these are views into g_srcWorkingDepth MIP levels)
39-
RWTexture2D<lpfloat> g_outWorkingDepthMIP2 < Attribute("WorkingDepthMIP2"); > ; // output viewspace depth MIP (these are views into g_srcWorkingDepth MIP levels)
40-
RWTexture2D<lpfloat> g_outWorkingDepthMIP3 < Attribute("WorkingDepthMIP3"); > ; // output viewspace depth MIP (these are views into g_srcWorkingDepth MIP levels)
41-
RWTexture2D<lpfloat> g_outWorkingDepthMIP4 < Attribute("WorkingDepthMIP4"); > ; // output viewspace depth MIP (these are views into g_srcWorkingDepth MIP levels)
33+
RWTexture2D<float> g_outWorkingDepthMIP0 < Attribute("WorkingDepthMIP0"); > ; // output viewspace depth MIP (these are views into g_srcWorkingDepth MIP levels)
34+
RWTexture2D<float> g_outWorkingDepthMIP1 < Attribute("WorkingDepthMIP1"); > ; // output viewspace depth MIP (these are views into g_srcWorkingDepth MIP levels)
35+
RWTexture2D<float> g_outWorkingDepthMIP2 < Attribute("WorkingDepthMIP2"); > ; // output viewspace depth MIP (these are views into g_srcWorkingDepth MIP levels)
36+
RWTexture2D<float> g_outWorkingDepthMIP3 < Attribute("WorkingDepthMIP3"); > ; // output viewspace depth MIP (these are views into g_srcWorkingDepth MIP levels)
37+
RWTexture2D<float> g_outWorkingDepthMIP4 < Attribute("WorkingDepthMIP4"); > ; // output viewspace depth MIP (these are views into g_srcWorkingDepth MIP levels)
4238
#endif
4339

4440
// input output textures for the second pass (XeGTAO_MainPass)
45-
Texture2D<lpfloat> g_srcWorkingDepth < Attribute("WorkingDepth"); > ; // viewspace depth with MIPs, output by XeGTAO_PrefilterDepths16x16 and consumed by XeGTAO_MainPass
46-
RWTexture2D<lpfloat> g_outWorkingAOTerm < Attribute("WorkingAOTerm"); > ; // output AO term (includes bent normals if enabled - packed as R11G11B10 scaled by AO)
47-
RWTexture2D<lpfloat> g_outWorkingEdges < Attribute("WorkingEdges"); > ; // output depth-based edges used by the denoiser
41+
Texture2D<float> g_srcWorkingDepth < Attribute("WorkingDepth"); > ; // viewspace depth with MIPs, output by XeGTAO_PrefilterDepths16x16 and consumed by XeGTAO_MainPass
42+
RWTexture2D<float> g_outWorkingAOTerm < Attribute("WorkingAOTerm"); > ; // output AO term (includes bent normals if enabled - packed as R11G11B10 scaled by AO)
43+
RWTexture2D<float> g_outWorkingEdges < Attribute("WorkingEdges"); > ; // output depth-based edges used by the denoiser
4844

4945
// input output textures for the third pass
5046
Texture2D g_srcWorkingAOTerm < Attribute("WorkingAOTerm"); > ; // coming from previous pass
51-
Texture2D<lpfloat> g_srcWorkingEdges < Attribute("WorkingEdges"); > ; // coming from previous pass
52-
RWTexture2D<lpfloat> g_outAO < Attribute("FinalAOTerm"); >; // final AO term - just 'visibility' or 'visibility + bent normals'
47+
Texture2D<float> g_srcWorkingEdges < Attribute("WorkingEdges"); > ; // coming from previous pass
48+
RWTexture2D<float> g_outAO < Attribute("FinalAOTerm"); >; // final AO term - just 'visibility' or 'visibility + bent normals'
5349
Texture2D g_prevAO < Attribute("FinalAOTermPrev"); >;
5450

5551
SamplerState PointClamp < Filter( POINT ); AddressU( CLAMP ); AddressV( CLAMP ); AddressW( CLAMP ); >;
5652
SamplerState BilinearClamp < Filter( MIN_MAG_MIP_LINEAR ); AddressU( CLAMP ); AddressV( CLAMP ); AddressW( CLAMP ); >;
5753

58-
Texture2D g_tBlueNoise < Attribute( "BlueNoise" ); >;
59-
6054
//-------------------------------------------------------------------------------------------------------------------
6155

6256
enum GTAOPasses
@@ -98,7 +92,7 @@ CS
9892
//-------------------------------------------------------------------------------------------------------------------
9993
lpfloat3 LoadNormal( int2 pos )
10094
{
101-
lpfloat3 viewnormal = Vector3WsToVs( Normals::Sample( pos ) );
95+
lpfloat3 viewnormal = (lpfloat3)Vector3WsToVs( Normals::Sample( pos ) );
10296
viewnormal.z = -viewnormal.z;
10397

10498
return viewnormal;
@@ -121,26 +115,32 @@ CS
121115
}
122116
else if (D_PASS == GTAOPasses::MainPass )
123117
{
124-
const lpfloat2 localNoise = g_tBlueNoise[ ( vDispatchId.xy + ( sGTAOConsts.NoiseIndex * float2( 1325, 4125 ) ) ) % 128 ].xy; // Blue noise texture
118+
// Hilbert R2 quasi-random sequence for spatiotemporal noise (better than blue noise for this use case)
119+
// See: https://www.shadertoy.com/view/3tB3z3, https://github.com/GameTechDev/XeGTAO
120+
uint hilbertIndex = HilbertIndex( vDispatchId.x, vDispatchId.y );
121+
hilbertIndex += 288 * ( sGTAOConsts.NoiseIndex % 64 ); // 288 found empirically for best results with XE_HILBERT_LEVEL 6
122+
const lpfloat2 localNoise = lpfloat2( frac( 0.5 + hilbertIndex * float2( 0.75487766624669276005, 0.5698402909980532659114 ) ) );
125123
const lpfloat3 viewspaceNormal = LoadNormal( vDispatchId.xy );
126124

127125
lpfloat sliceCount;
128126
lpfloat stepsPerSlice;
129127

128+
// Sample counts matched to reference XeGTAO implementation
129+
// Lower counts are compensated by better noise (Hilbert R2) and temporal denoising
130130
if (D_QUALITY == 0)
131131
{
132-
sliceCount = 3; // Low quality
133-
stepsPerSlice = 3;
132+
sliceCount = 1; // Low quality
133+
stepsPerSlice = 2;
134134
}
135135
else if (D_QUALITY == 1)
136136
{
137-
sliceCount = 4; // Medium quality
138-
stepsPerSlice = 4;
137+
sliceCount = 2; // Medium quality
138+
stepsPerSlice = 2;
139139
}
140140
else if (D_QUALITY == 2)
141141
{
142-
sliceCount = 7; // High quality
143-
stepsPerSlice = 7;
142+
sliceCount = 3; // High quality
143+
stepsPerSlice = 3;
144144
}
145145

146146
XeGTAO_MainPass
@@ -159,24 +159,35 @@ CS
159159
}
160160
else if (D_PASS == GTAOPasses::DenoiseSpatial )
161161
{
162-
g_outAO[vDispatchId] = XeGTAO_Denoise
162+
// Each thread processes 2 horizontal pixels (XeGTAO optimization: shared gather reads)
163+
// Dispatch must be at half-width to match
164+
const uint2 pixCoordBase = vDispatchId * uint2( 2, 1 );
165+
166+
// Early out here, no good way to have a non uniform size dispatch yet
167+
if( pixCoordBase.x >= sGTAOConsts.ViewportSize.x || pixCoordBase.y >= sGTAOConsts.ViewportSize.y )
168+
return;
169+
170+
lpfloat2 aoTerms = XeGTAO_Denoise
163171
(
164-
vDispatchId, // const uint2 pixCoordBase
165-
sGTAOConsts, // const GTAOConstants consts
166-
g_srcWorkingAOTerm, // Texture2D<uint> sourceAOTerm
172+
pixCoordBase, // const uint2 pixCoordBase
173+
sGTAOConsts, // const GTAOConstants consts
174+
g_srcWorkingAOTerm, // Texture2D sourceAOTerm
167175
g_srcWorkingEdges, // Texture2D<lpfloat> sourceEdges
168-
PointClamp // SamplerState texSampler
176+
BilinearClamp // SamplerState texSampler
169177
);
178+
179+
g_outAO[pixCoordBase] = aoTerms.x;
180+
g_outAO[pixCoordBase + uint2(1, 0)] = aoTerms.y;
170181
}
171182
else if( D_PASS == GTAOPasses::DenoiseTemporal )
172183
{
184+
float taaBlendAmount = sGTAOConsts.TAABlendAmount;
173185
if( g_srcWorkingEdges[ vDispatchId ] < 0.5 )
174186
{
175-
g_outAO[vDispatchId] = g_srcWorkingAOTerm[ vDispatchId ].x;
176-
return;
187+
taaBlendAmount *= 0.5; // reduce blending for non-edge pixels to preserve more details
177188
}
178189

179-
g_outAO[vDispatchId] = Motion::TemporalFilter( vDispatchId.xy, g_srcWorkingAOTerm, g_prevAO, g_GTAOConsts.TAABlendAmount ).r;
190+
g_outAO[vDispatchId] = Motion::TemporalFilter( vDispatchId.xy, g_srcWorkingAOTerm, g_prevAO, taaBlendAmount ).r;
180191
}
181192
}
182193
}

0 commit comments

Comments
 (0)