data/shaders/ism/ism.comp

#version 430

#extension GL_ARB_shading_language_include : require
#include </data/shaders/common/random.glsl>
#include </data/shaders/common/floatpacking.glsl>
#include </data/shaders/ism/ism_utils.glsl>


// One-dimensional work groups of 128 invocations. main() strides over a bucket's points in
// steps of gl_WorkGroupSize.x, and the first maxVplTestCount invocations also load the VPL cache.
layout (local_size_x = 128) in;


// Upper bound on the number of VPLs: sizes the VPL and counter arrays and is the default
// value of vplEndIndex.
const int totalVplCount = 1024;
// One vec4 per VPL. main() reads xyz as the VPL position and decodes w with
// unpack3SNFromFloat() to obtain the VPL normal.
layout (std140, binding = 0) uniform packedVplBuffer_
{
    vec4 vplPositionNormalBuffer[totalVplCount];
};

// Per-bucket point counts: main() reads atomicCounter[gl_WorkGroupID.x] as the number of
// points its work group has to process. The name suggests an earlier pass fills it with
// atomic increments -- TODO confirm against the producer.
// NOTE(review): `shared` here is the interface-block memory layout qualifier (not
// workgroup-shared storage), so member offsets/strides are implementation-defined; the
// host presumably queries them or relies on tight packing -- verify, or consider std430.
layout (shared, binding = 0) buffer atomicBuffer_
{
	uint[totalVplCount] atomicCounter;
};

// Render target of the software rasterizer. main() writes one uint per splat with
// imageAtomicMin: projected depth in the upper 24 bits, a radius value in the lower 8 bits.
layout (r32ui, binding = 0) coherent uniform uimage2D softrenderBuffer;
// Read-only point data, one rgba32f texel per point: xyz is the position, w is a float that
// unpack4UNFromFloat() expands into the point normal (xyz) and radius (w).
layout (rgba32f, binding = 1) restrict readonly uniform imageBuffer pointBuffer;

// Not referenced by main() in this file.
uniform ivec2 viewport;
// Forwarded unchanged to paraboloid_project(); presumably the far distance used to
// normalize depth -- TODO confirm in ism_utils.glsl.
uniform float zFar;

// Not referenced by main() in this file.
uniform bool usePushPull = true;

// VPL index range [vplStartIndex, vplEndIndex); its length becomes vplCount.
uniform int vplStartIndex = 0;
uniform int vplEndIndex = totalVplCount;
// When true, ismCount shrinks from totalVplCount to vplCount and main() numbers the ISMs
// relative to vplStartIndex.
uniform bool scaleISMs = false;
// When true, the point buffer is treated as vplCount buckets (instead of totalVplCount),
// work groups past that range exit early, and cached VPL indices are wrapped into
// [vplStartIndex, vplEndIndex).
uniform bool pointsOnlyIntoScaledISMs = false;

// Values derived from the uniforms above.

// Number of VPLs in the selected range [vplStartIndex, vplEndIndex).
int vplCount = vplEndIndex - vplStartIndex;
// Number of equally sized buckets the point buffer is divided into (one bucket per work group).
int sampledVplCount = pointsOnlyIntoScaledISMs ? vplCount : totalVplCount;
// Number of ISMs that have to fit into the render target.
int ismCount = (scaleISMs) ? vplCount : totalVplCount;
// Smallest power of two s with s * s >= ismCount, i.e. 2^ceil(log2(ismCount) / 2); passed to
// paraboloid_project() (presumably as the number of ISM tiles per atlas row -- TODO confirm).
// Computed with integer bit operations: findMSB(n - 1) + 1 == ceil(log2(n)) for n >= 1, so
// the result is exact and does not depend on the precision of pow()/log2() or on int()
// truncation. Counts <= 0 are clamped to 1 and yield a 1x1 grid.
int ismIndices1d = 1 << ((findMSB(max(ismCount, 1) - 1) + 2) / 2);
// First VPL id covered when only the scaled ISMs receive points; not referenced by main().
int vplIdOffset = pointsOnlyIntoScaledISMs ? vplStartIndex : 0;

// Positive infinity produced by a float division by zero. Not referenced by main() in this file.
const float infinity = 1. / 0.;

// Number of candidate VPLs each work group caches in shared memory and tests against every point.
const int maxVplTestCount = 16; // don't make this greater than local_size_x
// Cached VPL records (xyz = position, w = packed normal), written once per work group in main().
shared vec4[maxVplTestCount] vpls;
// Global VPL index of each cached record, parallel to vpls.
shared int[maxVplTestCount] vplIDs;


// Maximum number of cached VPLs a single point is rendered into.
const int maxVplCollectCount = 4; // don't make this greater than maxVplTestCount
// Per-invocation scratch list: invocation t owns the maxVplCollectCount consecutive entries
// starting at t * maxVplCollectCount. Despite the name, the entries are indices into
// vpls/vplIDs, not global VPL ids.
// Kept in shared memory because a local array measured slower (it was perhaps placed in global memory).
shared int[gl_WorkGroupSize.x * maxVplCollectCount] usedVplIDs;


// Splats one bucket of points into the imperfect shadow maps (ISMs) of a small window of VPLs.
//
// Each work group owns the bucket with index gl_WorkGroupID.x:
//   1. The first maxVplTestCount invocations cache maxVplTestCount consecutive VPLs (starting
//      at the work group index, wrapped into the valid range) in shared memory.
//   2. The invocations stride over the bucket's points. Every point is tested against the
//      cached VPLs and the first maxVplCollectCount VPLs that pass culling are collected.
//   3. For every collected VPL the point is projected with paraboloid_project() and written
//      to softrenderBuffer with imageAtomicMin, so the smallest packed depth wins per texel.
void main()
{
    // Valid buckets are 0 .. vplCount - 1 in this mode, so work group `vplCount` must exit as
    // well (hence >=, not >). An empty or inverted VPL range also exits here, which keeps
    // the modulo and the division by vplCount below well defined. The condition is uniform
    // across the work group, so returning before barrier() is safe.
    if (pointsOnlyIntoScaledISMs && (vplCount <= 0 || gl_WorkGroupID.x >= uint(vplCount)))
        return;

    // cache a bunch of vpls into shared memory. store their IDs into vplIDs
    if (gl_LocalInvocationID.x < maxVplTestCount) {
        int index = int(gl_WorkGroupID.x + gl_LocalInvocationID.x);
        index %= totalVplCount;
        if (pointsOnlyIntoScaledISMs) {
            // restrict the window to the selected range [vplStartIndex, vplEndIndex)
            index %= vplCount;
            index += vplStartIndex;
        }
        vplIDs[gl_LocalInvocationID.x] = index;
        vpls[gl_LocalInvocationID.x] = vplPositionNormalBuffer[index];
    }

    barrier();
    memoryBarrierShared();

    // Values that stay fixed for the whole invocation; read them once instead of per iteration.
    uint pointCount = atomicCounter[gl_WorkGroupID.x];
    // The point buffer is split into sampledVplCount equally sized buckets.
    uint bucketStart = gl_WorkGroupID.x * uint(imageSize(pointBuffer).x / sampledVplCount);
    // First entry of this invocation's private range in usedVplIDs.
    uint slotBase = gl_LocalInvocationID.x * maxVplCollectCount;
    vec2 renderTargetSize = vec2(imageSize(softrenderBuffer).xy);

    // for each point: invocation t handles points t, t + gl_WorkGroupSize.x, ...
    for (uint pointIdInISM = gl_LocalInvocationID.x; pointIdInISM < pointCount; pointIdInISM += gl_WorkGroupSize.x)
    {
        vec4 read = imageLoad(pointBuffer, int(bucketStart + pointIdInISM));

        vec3 position = read.xyz;
        float g_normalRadius = read.w;
        vec4 normalRadiusUnpacked = unpack4UNFromFloat(g_normalRadius);
        normalRadiusUnpacked.xyz = normalRadiusUnpacked.xyz * 2.0 - 1.0; // [0, 1] -> [-1, 1]
        normalRadiusUnpacked.w = normalRadiusUnpacked.w * 25;

        vec3 pointNormal = normalRadiusUnpacked.xyz;
        float pointRadius = normalRadiusUnpacked.w;

        // Radius value stored in the low 8 bits of every splat of this point.
        uint radiusBits = uint(pointRadius * 10 / sqrt(float(maxVplCollectCount)));

        // gather up to maxVplCollectCount vpls that pass culling; stop once all slots are used
        int found = 0;
        for (int i = 0; i < maxVplTestCount && found < maxVplCollectCount; i++) {
            vec4 vpl = vpls[i];
            vec3 vplPosition = vpl.xyz;
            vec3 vplNormal2 = unpack3SNFromFloat(vpl.w);

            vec3 positionRelativeToCamera = position.xyz - vplPosition;

            // cull points behind the vpl's hemisphere and points facing away from the vpl
            bool cull = dot(vplNormal2, positionRelativeToCamera) < 0 || dot(pointNormal, -positionRelativeToCamera) < 0;

            if (!cull) {
                usedVplIDs[slotBase + found] = i;
                found++;
            }
        }

        // no barrier needed, usedVplIDs is read only from the thread that wrote it

        // for each found vpl, render
        for (int i = 0; i < found; i++)
        {
            int localVplID = usedVplIDs[slotBase + i];
            int globalVplID = vplIDs[localVplID];
            // reconstruct vpl. saving the reconstructed stuff in arrays in the gather step was slower.
            vec4 vpl = vpls[localVplID];
            vec3 vplPosition = vpl.xyz;
            vec3 vplNormal2 = unpack3SNFromFloat(vpl.w);

            vec3 positionRelativeToCamera = position.xyz - vplPosition;
            // paraboloid projection
            float distToCamera = length(positionRelativeToCamera);
            // with scaled ISMs the atlas only holds the selected range, so number ISMs from vplStartIndex
            float ismIndex = scaleISMs ? float(globalVplID - vplStartIndex) : float(globalVplID);
            vec3 v = paraboloid_project(positionRelativeToCamera, distToCamera, vplNormal2, zFar, ismIndex, ismIndices1d, true);

            // NOTE(review): normalV is never read. Kept because paraboloid_project() is defined
            // outside this file; remove these three lines once it is confirmed to be side-effect free.
            vec3 normalPositionRelativeToCamera = positionRelativeToCamera + pointNormal * 0.1;
            float normalDist = length(normalPositionRelativeToCamera);
            vec3 normalV = paraboloid_project(normalPositionRelativeToCamera, normalDist, vplNormal2, zFar, ismIndex, ismIndices1d, true);

            v.xy *= renderTargetSize;
            v.z *= float(1 << 24);

            // Pack depth into the upper 24 bits and the radius value into the lower 8 bits.
            // NOTE(review): assumes paraboloid_project() returns z in [0, 1); a z of exactly 1.0
            // would overflow the 24-bit field and wrap to depth 0 -- TODO confirm.
            uint currentDepthValue = uint(v.z) << 8;
            currentDepthValue |= radiusBits;
            imageAtomicMin(softrenderBuffer, ivec2(v.xy), currentDepthValue);
        }
    }
}