Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions configs/render/3dgut.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,4 @@ splat: # 3DGUT-specific settings
# rendering
k_buffer_size: 0 # 0 means unsorted
global_z_order: true
fine_grained_load_balancing: false
37 changes: 37 additions & 0 deletions threedgut_tracer/include/3dgut/kernels/cuda/common/rayPayload.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,43 @@ __device__ __inline__ RayPayloadT initializeRay(const threedgut::RenderParameter
return ray;
}


// Initialize ray based on given pixel coordinates (load-balanced mode)
template <typename RayPayloadT>
__device__ __inline__ RayPayloadT initializeRayPerPixel(const threedgut::RenderParameters& params,
const tcnn::uvec2& pixel,
const tcnn::vec3* __restrict__ sensorRayOriginPtr,
const tcnn::vec3* __restrict__ sensorRayDirectionPtr,
const tcnn::mat4x3& sensorToWorldTransform) {
RayPayloadT ray;
ray.flags = RayPayloadT::Default;

if ((pixel.x >= params.resolution.x) || (pixel.y >= params.resolution.y)) {
return ray;
}

ray.idx = pixel.x + params.resolution.x * pixel.y;
ray.hitT = 0.0f;
ray.transmittance = 1.0f;
ray.features = tcnn::vec<RayPayloadT::FeatDim>::zero();

ray.origin = sensorToWorldTransform * tcnn::vec4(sensorRayOriginPtr[ray.idx], 1.0f);
ray.direction = tcnn::mat3(sensorToWorldTransform) * sensorRayDirectionPtr[ray.idx];

ray.tMinMax = params.objectAABB.ray_intersect(ray.origin, ray.direction);
ray.tMinMax.x = fmaxf(ray.tMinMax.x, 0.0f);

if (ray.tMinMax.y > ray.tMinMax.x) {
ray.flags |= RayPayloadT::Valid | RayPayloadT::Alive;
}

#if GAUSSIAN_ENABLE_HIT_COUNT
ray.hitN = 0;
#endif

return ray;
}

template <typename TRayPayload>
__device__ __inline__ void finalizeRay(const TRayPayload& ray,
const threedgut::RenderParameters& params,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,183 @@ struct GUTKBufferRenderer : Params {
}
}

// Fine-grained balanced forward rendering: Gaussian-wise parallelism with warp-level optimization
template <typename TRay>
static inline __device__ void evalForwardNoKBufferBalanced(
const threedgut::RenderParameters& params,
TRay& ray,
const tcnn::uvec2* __restrict__ sortedTileRangeIndicesPtr,
const uint32_t* __restrict__ sortedTileParticleIdxPtr,
const tcnn::vec2* __restrict__ particlesProjectedPositionPtr,
const tcnn::vec4* __restrict__ particlesProjectedConicOpacityPtr,
const float* __restrict__ particlesGlobalDepthPtr,
const float* __restrict__ particlesPrecomputedFeaturesPtr,
const tcnn::uvec2& tile,
const tcnn::uvec2& tileGrid,
const int laneId,
threedgut::MemoryHandles parameters,
tcnn::vec2* __restrict__ particlesProjectedPositionGradPtr = nullptr,
tcnn::vec4* __restrict__ particlesProjectedConicOpacityGradPtr = nullptr,
float* __restrict__ particlesGlobalDepthGradPtr = nullptr,
float* __restrict__ particlesPrecomputedFeaturesGradPtr = nullptr,
threedgut::MemoryHandles parametersGradient = {}) {

using namespace threedgut;

// Get tile data: each warp processes particles from a single 16x16 tile
const uint32_t tileIdx = tile.y * tileGrid.x + tile.x;
const tcnn::uvec2 tileParticleRangeIndices = sortedTileRangeIndicesPtr[tileIdx];

uint32_t tileNumParticlesToProcess = tileParticleRangeIndices.y - tileParticleRangeIndices.x;

// Setup feature buffers based on rendering mode
const TFeaturesVec* particleFeaturesBuffer =
Params::PerRayParticleFeatures ? nullptr :
reinterpret_cast<const TFeaturesVec*>(particlesPrecomputedFeaturesPtr);
TFeaturesVec* particleFeaturesGradientBuffer =
(Params::PerRayParticleFeatures || !Backward) ? nullptr :
reinterpret_cast<TFeaturesVec*>(particlesPrecomputedFeaturesGradPtr);

// Initialize particle system
Particles particles;
particles.initializeDensity(parameters);
if constexpr (Backward) {
particles.initializeDensityGradient(parametersGradient);
}
particles.initializeFeatures(parameters);
if constexpr (Backward && Params::PerRayParticleFeatures) {
particles.initializeFeaturesGradient(parametersGradient);
}

static_assert(Params::KHitBufferSize == 0, "evalForwardNoKBufferBalanced only supports K=0 (no hit buffer). Use evalKBuffer for K>0 cases.");

// Warp-aligned processing: round up to multiple of WarpSize to avoid divergence
constexpr uint32_t WarpSize = GUTParameters::Tiling::WarpSize; // 32 threads per warp
uint32_t alignedParticleCount = ((tileNumParticlesToProcess + WarpSize - 1) / WarpSize) * WarpSize;

// Main loop: Gaussian-wise parallelism - WarpSize threads process Gaussians, single ray
for (uint32_t j = laneId; j < alignedParticleCount; j += WarpSize) {
if (!ray.isAlive()) break;

float hitAlpha = 0.0f;
float hitT = 0.0f;
TFeaturesVec hitFeatures = TFeaturesVec::zero();
bool validHit = false;

// Step 1: Each thread tests one Gaussian intersection
if (j < tileNumParticlesToProcess) {
const uint32_t toProcessSortedIndex = tileParticleRangeIndices.x + j;
const uint32_t particleIdx = sortedTileParticleIdxPtr[toProcessSortedIndex];

if (particleIdx != GUTParameters::InvalidParticleIdx) {
auto densityParams = particles.fetchDensityParameters(particleIdx);

if (particles.densityHit(ray.origin,
ray.direction,
densityParams,
hitAlpha,
hitT) &&
(hitT > ray.tMinMax.x) &&
(hitT < ray.tMinMax.y)) {

validHit = true;

// Get Gaussian features
if constexpr (Params::PerRayParticleFeatures) {
hitFeatures = particles.featuresFromBuffer(particleIdx, ray.direction);
} else {
hitFeatures = tcnn::max(particleFeaturesBuffer[particleIdx], 0.f);
}
}
}
}

// Skip if no hits in this warp batch
constexpr uint32_t WarpMask = GUTParameters::Tiling::WarpMask; // 0xFFFFFFFF for full warp
if (__all_sync(WarpMask, !validHit)) continue;

// Step 2: Compute per-thread transmittance contribution
float localTransmittance = validHit ? (1.0f - hitAlpha) : 1.0f;

// Step 3: Warp-level prefix scan for cumulative transmittance
for (uint32_t offset = 1; offset < WarpSize; offset <<= 1) {
float n = __shfl_up_sync(WarpMask, localTransmittance, offset);
if (laneId >= offset) {
localTransmittance *= n;
}
}

// Get overall batch transmittance impact
float batchTransmittance = __shfl_sync(WarpMask, localTransmittance, WarpSize - 1);
float newTransmittance = ray.transmittance * batchTransmittance;

// Step 4: Early termination detection - find exact termination point
unsigned int earlyTerminationMask = __ballot_sync(WarpMask,
validHit && (ray.transmittance * localTransmittance) < Particles::MinTransmittanceThreshold);

bool shouldTerminate = false;
int terminationLane = -1;

if (earlyTerminationMask) {
terminationLane = __ffs(earlyTerminationMask) - 1; // Find first terminating lane
shouldTerminate = true;
ray.kill();
}

// Step 5: Warp reduction for feature accumulation
TFeaturesVec accumulatedFeatures = TFeaturesVec::zero();
float accumulatedHitT = 0.0f;
uint32_t accumulatedHitCount = 0;

// Only accumulate contributions before (and including) termination point
bool shouldContribute = validHit && (!shouldTerminate || laneId <= terminationLane);

if (shouldContribute) {
// Use precomputed prefix transmittance, excluding current particle
float prefixTransmittance = (laneId > 0) ?
(localTransmittance / (1.0f - hitAlpha)) : 1.0f;
float particleTransmittance = ray.transmittance * prefixTransmittance;
float hitWeight = hitAlpha * particleTransmittance;

// Compute weighted contributions
for (int featIdx = 0; featIdx < Particles::FeaturesDim; ++featIdx) {
accumulatedFeatures[featIdx] = hitFeatures[featIdx] * hitWeight;
}
accumulatedHitT = hitT * hitWeight;
accumulatedHitCount = (hitWeight > 0.0f) ? 1 : 0;
}

// Step 6: Warp-level reduction (tree-based sum)
for (int featIdx = 0; featIdx < Particles::FeaturesDim; ++featIdx) {
for (uint32_t offset = WarpSize / 2; offset > 0; offset >>= 1) {
accumulatedFeatures[featIdx] += __shfl_down_sync(WarpMask, accumulatedFeatures[featIdx], offset);
}
}

for (uint32_t offset = WarpSize / 2; offset > 0; offset >>= 1) {
accumulatedHitT += __shfl_down_sync(WarpMask, accumulatedHitT, offset);
accumulatedHitCount += __shfl_down_sync(WarpMask, accumulatedHitCount, offset);
}

// Step 7: Only lane 0 updates ray state (avoid race conditions)
if (laneId == 0) {
for (int featIdx = 0; featIdx < Particles::FeaturesDim; ++featIdx) {
ray.features[featIdx] += accumulatedFeatures[featIdx];
}
ray.hitT += accumulatedHitT;
ray.countHit(accumulatedHitCount);
}

// Step 8: Update ray transmittance
ray.transmittance = newTransmittance;

// Break on early termination
if (shouldTerminate) {
break;
}
}
}

template <typename TRay>
static inline __device__ void evalBackwardNoKBuffer(TRay& ray,
Particles& particles,
Expand Down
100 changes: 100 additions & 0 deletions threedgut_tracer/include/3dgut/kernels/cuda/renderers/gutRenderer.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,106 @@ __global__ void render(threedgut::RenderParameters params,
finalizeRay(ray, params, sensorRayOriginPtr, worldHitCountPtr, worldHitDistancePtr, radianceDensityPtr, sensorToWorldTransform);
}

// Fine-grained load balancing rendering kernel: static allocation per virtual tile
__global__ void renderBalanced(threedgut::RenderParameters params,
const tcnn::uvec2* __restrict__ sortedTileRangeIndicesPtr,
const uint32_t* __restrict__ sortedTileDataPtr,
const tcnn::vec3* __restrict__ sensorRayOriginPtr,
const tcnn::vec3* __restrict__ sensorRayDirectionPtr,
tcnn::mat4x3 sensorToWorldTransform,
float* __restrict__ worldHitCountPtr,
float* __restrict__ worldHitDistancePtr,
tcnn::vec4* __restrict__ radianceDensityPtr,
const tcnn::vec2* __restrict__ particlesProjectedPositionPtr,
const tcnn::vec4* __restrict__ particlesProjectedConicOpacityPtr,
const float* __restrict__ particlesGlobalDepthPtr,
const float* __restrict__ particlesPrecomputedFeaturesPtr,
const uint64_t* __restrict__ parameterMemoryHandles,
const tcnn::uvec2 tileGrid) {

// Static allocation: each block handles one virtual tile
using namespace threedgut;
constexpr uint32_t virtualTilesPerTile = GUTParameters::Tiling::VirtualTilesPerTile;
const uint32_t virtualTileId = blockIdx.x;

// Calculate total virtual tiles across all original tiles
const uint32_t totalVirtualTiles = tileGrid.x * tileGrid.y * virtualTilesPerTile;

// Boundary check
if (virtualTileId >= totalVirtualTiles) return;

// Map virtual tile back to original tile coordinates and local position
const uint32_t originalTileId = virtualTileId / virtualTilesPerTile;
const uint32_t virtualTileInOriginal = virtualTileId % virtualTilesPerTile;

const uint32_t originalTileX = originalTileId % tileGrid.x;
const uint32_t originalTileY = originalTileId / tileGrid.x;

// Map virtual tile to pixel coordinates within original tile
constexpr uint32_t virtualTilesPerTileX = GUTParameters::Tiling::VirtualTilesPerTileX;
constexpr uint32_t virtualTileX = GUTParameters::Tiling::VirtualTileX;
constexpr uint32_t virtualTileY = GUTParameters::Tiling::VirtualTileY;
constexpr uint32_t warpSize = GUTParameters::Tiling::WarpSize;

const uint32_t virtualTileXPos = virtualTileInOriginal % virtualTilesPerTileX; // 0-7
const uint32_t virtualTileYPos = virtualTileInOriginal / virtualTilesPerTileX; // 0-7

// Calculate base pixel coordinates for this virtual tile
const uint32_t basePixelX = virtualTileXPos * virtualTileX; // 0,2,4,6,8,10,12,14
const uint32_t basePixelY = virtualTileYPos * virtualTileY; // 0,2,4,6,8,10,12,14

// Warp-level processing: each warp handles one pixel in virtual tile
const uint32_t warpId = threadIdx.x / warpSize;
const uint32_t laneId = threadIdx.x & (warpSize - 1);

// Each block processes 1 virtual tile = virtualTileSize pixels, each warp handles 1 pixel
constexpr uint32_t virtualTileSize = GUTParameters::Tiling::VirtualTileSize;
constexpr uint32_t blockX = GUTParameters::Tiling::BlockX;
constexpr uint32_t blockY = GUTParameters::Tiling::BlockY;

if (warpId < virtualTileSize) { // virtualTileSize warps per block (1 warp per pixel)
// Arrange pixels in row-major order within virtualTileX x virtualTileY region
// warp 0-3 maps to pixels: (0,0),(1,0),(0,1),(1,1) for 2x2 virtual tile
const uint32_t pixelOffsetX = warpId % virtualTileX;
const uint32_t pixelOffsetY = warpId / virtualTileX;

const uint32_t pixelLocalX = basePixelX + pixelOffsetX;
const uint32_t pixelLocalY = basePixelY + pixelOffsetY;

const tcnn::uvec2 pixel = {
originalTileX * blockX + pixelLocalX,
originalTileY * blockY + pixelLocalY
};

// Initialize ray for current pixel
auto ray = initializeRayPerPixel<TGUTRenderer::TRayPayload>(
params, pixel, sensorRayOriginPtr, sensorRayDirectionPtr, sensorToWorldTransform);

// Warp-level parallel rendering using original tile's particle data
const tcnn::uvec2 originalTile = {originalTileX, originalTileY};

TGUTRenderer::evalForwardNoKBufferBalanced(params,
ray,
sortedTileRangeIndicesPtr,
sortedTileDataPtr,
particlesProjectedPositionPtr,
particlesProjectedConicOpacityPtr,
particlesGlobalDepthPtr,
particlesPrecomputedFeaturesPtr,
originalTile,
tileGrid,
laneId, // warp lane for parallel processing
{parameterMemoryHandles});

// Write final results to output buffers
// Only lane 0 should write, as only it has accumulated the correct values
if (laneId == 0) {
finalizeRay(ray, params, sensorRayOriginPtr, worldHitCountPtr,
worldHitDistancePtr, radianceDensityPtr, sensorToWorldTransform);
}
}
}

__global__ void renderBackward(threedgut::RenderParameters params,
const tcnn::uvec2* __restrict__ sortedTileRangeIndicesPtr,
const uint32_t* __restrict__ sortedTileDataPtr,
Expand Down
11 changes: 11 additions & 0 deletions threedgut_tracer/include/3dgut/renderer/gutRendererParameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,17 @@ struct GUTParameters {
static constexpr uint32_t WarpSize = 32;
static constexpr uint32_t NumWarps = BlockSize / WarpSize;
static constexpr uint32_t WarpMask = 0xFFFFFFFFU;

// Fine-grained load balancing parameters - base dimensions
static constexpr uint32_t VirtualTileX = 2; // virtual tile width in pixels
static constexpr uint32_t VirtualTileY = 2; // virtual tile height in pixels
// Derived constants from base dimensions
static constexpr uint32_t VirtualTileSize = VirtualTileX * VirtualTileY; // 4 pixels per virtual tile
static constexpr uint32_t VirtualTilesPerTileX = BlockX / VirtualTileX; // 8 virtual tiles per row
static constexpr uint32_t VirtualTilesPerTileY = BlockY / VirtualTileY; // 8 virtual tiles per column
static constexpr uint32_t VirtualTilesPerTile = VirtualTilesPerTileX * VirtualTilesPerTileY; // 64 total
static constexpr uint32_t FineGrainedWarpsPerBlock = VirtualTileSize; // 4 warps per block (1 per pixel)
static constexpr uint32_t FineGrainedThreadsPerBlock = FineGrainedWarpsPerBlock * WarpSize; // 128 threads
};

static constexpr uint32_t InvalidParticleIdx = -1U;
Expand Down
1 change: 1 addition & 0 deletions threedgut_tracer/setup_3dgut.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def to_cpp_bool(value):
f"-DGAUSSIAN_N_ROLLING_SHUTTER_ITERATIONS={conf.render.splat.n_rolling_shutter_iterations}",
f"-DGAUSSIAN_K_BUFFER_SIZE={conf.render.splat.k_buffer_size}",
f"-DGAUSSIAN_GLOBAL_Z_ORDER={to_cpp_bool(conf.render.splat.global_z_order)}",
f"-DFINE_GRAINED_LOAD_BALANCING={to_cpp_bool(getattr(conf.render.splat, 'fine_grained_load_balancing', False))}",
# -- Unscented Transform --
f"-DGAUSSIAN_UT_ALPHA={ut_alpha}",
f"-DGAUSSIAN_UT_BETA={ut_beta}",
Expand Down
Loading