-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathism.comp
139 lines (106 loc) · 5.14 KB
/
ism.comp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#version 430
#extension GL_ARB_shading_language_include : require
#include </data/shaders/common/random.glsl>
#include </data/shaders/common/floatpacking.glsl>
#include </data/shaders/ism/ism_utils.glsl>
// Each work group runs 128 invocations along x.
layout (local_size_x = 128) in;

// Number of entries in the VPL uniform buffer and in the per-segment point counters.
const int totalVplCount = 1024;

// Packed VPLs. main() reads xyz as the VPL position and decodes w with
// unpack3SNFromFloat() into the VPL normal.
layout (std140, binding = 0) uniform packedVplBuffer_
{
    vec4 vplPositionNormalBuffer[totalVplCount];
};

// Per-segment point counts; main() uses atomicCounter[gl_WorkGroupID.x] as the
// number of points its work group has to process. This shader only reads it
// (presumably it is filled by an earlier pass -- confirm against the host code).
layout (shared, binding = 0) buffer atomicBuffer_
{
    uint[totalVplCount] atomicCounter;
};

// Render target: one packed uint per texel, updated with imageAtomicMin().
layout (r32ui, binding = 0) coherent uniform uimage2D softrenderBuffer;
// Point data: xyz is the point position, w is a packed normal + radius
// (decoded in main() with unpack4UNFromFloat()).
layout (rgba32f, binding = 1) restrict readonly uniform imageBuffer pointBuffer;

uniform ivec2 viewport;                         // not read in this file
uniform float zFar;                             // forwarded to paraboloid_project()
uniform bool usePushPull = true;                // not read in this file
uniform int vplStartIndex = 0;                  // first VPL of the active range
uniform int vplEndIndex = totalVplCount;        // one past the last VPL of the active range
uniform bool scaleISMs = false;                 // ISM indices are relative to vplStartIndex
uniform bool pointsOnlyIntoScaledISMs = false;  // restrict sampling to the active VPL range

// Values derived from the uniforms above.
int vplCount = vplEndIndex - vplStartIndex;
// Number of segments the point buffer is divided into.
int sampledVplCount = pointsOnlyIntoScaledISMs ? vplCount : totalVplCount;
int ismCount = (scaleISMs) ? vplCount : totalVplCount;
// Smallest power of two p with p * p >= ismCount, i.e. 2^ceil(log2(ismCount) / 2);
// passed to paraboloid_project() together with the ISM index.
// Computed with integer arithmetic: pow()/log2() are only approximate in GLSL,
// so truncating their float result can be off by one or by a whole power of two.
// ceil(log2(n)) == findMSB(n - 1) + 1 for n >= 1, and ceil(b / 2) == (b + 1) / 2.
// Non-positive counts are clamped to 1 (log2 was undefined for them).
int ismIndices1d = 1 << ((findMSB(max(ismCount, 1) - 1) + 2) / 2);
int vplIdOffset = pointsOnlyIntoScaledISMs ? vplStartIndex : 0;

const float infinity = 1. / 0.;                 // not read in this file

// Number of VPLs each work group caches and tests its points against.
const int maxVplTestCount = 16; // don't make this greater than local_size_x
shared vec4[maxVplTestCount] vpls;              // cached packed VPLs
shared int[maxVplTestCount] vplIDs;             // index into vplPositionNormalBuffer per cached VPL

// Maximum number of cached VPLs a single point is rendered into.
const int maxVplCollectCount = 4; // don't make this greater than maxVplTestCount
// Per-invocation scratch list of accepted cache slots.
// this is not a local array since that was slower (perhaps put into global memory)
shared int[gl_WorkGroupSize.x * maxVplCollectCount] usedVplIDs;
// Renders the points of one point-buffer segment (one segment per work group)
// into the paraboloid maps of up to maxVplCollectCount VPLs per point.
//
// Steps:
//   1. The first maxVplTestCount invocations cache consecutive VPLs, starting at
//      the work group id (wrapping around), in shared memory.
//   2. Every invocation walks the segment's points with a stride of the
//      work-group size.
//   3. Per point, cached VPLs that pass the two facing tests are collected and
//      the point is projected and written with imageAtomicMin() for each of them.
void main()
{
    // Valid segments are 0 .. vplCount - 1 in this mode, so group vplCount must
    // not run either (hence >=, not >). This also stops every group when
    // vplCount == 0, before the modulo / division by vplCount below.
    // The condition is uniform per work group, so returning ahead of barrier() is safe.
    if (pointsOnlyIntoScaledISMs && (gl_WorkGroupID.x >= vplCount))
        return;

    // cache a bunch of vpls into shared memory. store their IDs into vplIDs
    if (gl_LocalInvocationID.x < maxVplTestCount) {
        int index = int(gl_WorkGroupID.x + gl_LocalInvocationID.x);
        index %= totalVplCount;
        if (pointsOnlyIntoScaledISMs) {
            // remap into the active range [vplStartIndex, vplEndIndex)
            index %= vplCount;
            index += vplStartIndex;
        }
        vplIDs[gl_LocalInvocationID.x] = index;
        vpls[gl_LocalInvocationID.x] = vplPositionNormalBuffer[index];
    }
    // Make the shared-memory writes visible first, then synchronize the group.
    memoryBarrierShared();
    barrier();

    // Loop invariants: read the buffers / image sizes once instead of per iteration.
    uint pointCount = atomicCounter[gl_WorkGroupID.x];
    uint segmentStart = gl_WorkGroupID.x * uint(imageSize(pointBuffer).x / sampledVplCount);
    vec2 targetSize = vec2(imageSize(softrenderBuffer).xy);
    int slotBase = int(gl_LocalInvocationID.x) * maxVplCollectCount;

    // for each point of this segment, strided by the work-group size
    for (uint pointIdInISM = gl_LocalInvocationID.x; pointIdInISM < pointCount; pointIdInISM += gl_WorkGroupSize.x)
    {
        vec4 read = imageLoad(pointBuffer, int(segmentStart + pointIdInISM));
        vec3 position = read.xyz;

        // w packs the point normal (xyz) and radius (w); the normal is remapped
        // with * 2 - 1 and the radius is scaled by 25.
        vec4 normalRadiusUnpacked = unpack4UNFromFloat(read.w);
        vec3 pointNormal = normalRadiusUnpacked.xyz * 2.0 - 1.0;
        float pointRadius = normalRadiusUnpacked.w * 25;

        // gather up to maxVplCollectCount vpls that pass culling
        int found = 0;
        for (int i = 0; i < maxVplTestCount; i++) {
            vec4 vpl = vpls[i];
            vec3 vplNormal2 = unpack3SNFromFloat(vpl.w);
            vec3 positionRelativeToCamera = position - vpl.xyz;

            // reject points behind the VPL and points facing away from it
            bool cull = dot(vplNormal2, positionRelativeToCamera) < 0 || dot(pointNormal, -positionRelativeToCamera) < 0;
            if (!cull && found < maxVplCollectCount) {
                usedVplIDs[slotBase + found] = i;
                found++;
            }
        }
        // no barrier needed, usedVplIDs is read only from the thread that wrote it

        // for each found vpl, render
        for (int i = 0; i < found; i++)
        {
            int localVplID = usedVplIDs[slotBase + i];
            int globalVplID = vplIDs[localVplID];

            // reconstruct vpl. saving the reconstructed stuff in arrays in the gather step was slower.
            vec4 vpl = vpls[localVplID];
            vec3 vplNormal2 = unpack3SNFromFloat(vpl.w);
            vec3 positionRelativeToCamera = position - vpl.xyz;

            // paraboloid projection
            float distToCamera = length(positionRelativeToCamera);
            float ismIndex = scaleISMs ? float(globalVplID) - vplStartIndex : globalVplID;
            vec3 v = paraboloid_project(positionRelativeToCamera, distToCamera, vplNormal2, zFar, ismIndex, ismIndices1d, true);

            // NOTE(review): normalV is never read below. It is kept because the
            // purity of paraboloid_project() cannot be verified from this file;
            // candidate for removal.
            vec3 normalPositionRelativeToCamera = positionRelativeToCamera + pointNormal * 0.1;
            float normalDist = length(normalPositionRelativeToCamera);
            vec3 normalV = paraboloid_project(normalPositionRelativeToCamera, normalDist, vplNormal2, zFar, ismIndex, ismIndices1d, true);

            v.xy *= targetSize;
            v.z *= float(1 << 24);

            // Pack: upper 24 bits hold the depth, lower 8 bits a radius term, so
            // that imageAtomicMin() keeps the value with the smallest depth.
            // NOTE(review): assumes v.z is in [0, 1) before scaling; a value >= 1
            // would overflow the 24-bit field after the shift -- confirm the
            // output range of paraboloid_project().
            uint currentDepthValue = uint(v.z) << 8;
            currentDepthValue |= uint(pointRadius * 10 / sqrt(float(maxVplCollectCount)));
            imageAtomicMin(softrenderBuffer, ivec2(v.xy), currentDepthValue);
        }
    }
}