Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -124,4 +124,7 @@ compile_commands.json
data
results

!examples/benchmarks/compression/results/
!examples/benchmarks/compression/results/

gsplat/hip/
*.hip
4 changes: 2 additions & 2 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[submodule "gsplat/cuda/csrc/third_party/glm"]
path = gsplat/cuda/csrc/third_party/glm
[submodule "third_party/glm"]
path = third_party/glm
url = https://github.com/g-truc/glm.git
2 changes: 1 addition & 1 deletion gsplat/cuda/_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -595,7 +595,7 @@ def rasterize_to_pixels(
assert colors.shape == image_dims + (N, channels), colors.shape
assert opacities.shape == image_dims + (N,), opacities.shape
if backgrounds is not None:
assert backgrounds.shape == image_dims + (channels,), backgrounds.shape
assert backgrounds.numel() == math.prod(image_dims + (channels,)), f"{backgrounds.shape=} != {(image_dims + (channels,))=}"
backgrounds = backgrounds.contiguous()
if masks is not None:
assert masks.shape == isect_offsets.shape, masks.shape
Expand Down
16 changes: 12 additions & 4 deletions gsplat/cuda/csrc/IntersectTile.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@
#include "Intersect.h"
#include "Utils.cuh"

#if defined(__HIP__)
template <typename T>
using DB = hipcub::DoubleBuffer<T>;
#else
template <typename T>
using DB = cub::DoubleBuffer<T>;
#endif

namespace gsplat {

namespace cg = cooperative_groups;
Expand Down Expand Up @@ -307,10 +315,10 @@ void radix_sort_double_buffer(
}

// Create a set of DoubleBuffers to wrap pairs of device pointers
cub::DoubleBuffer<int64_t> d_keys(
DB<int64_t> d_keys(
isect_ids.data_ptr<int64_t>(), isect_ids_sorted.data_ptr<int64_t>()
);
cub::DoubleBuffer<int32_t> d_values(
DB<int32_t> d_values(
flatten_ids.data_ptr<int32_t>(), flatten_ids_sorted.data_ptr<int32_t>()
);
CUB_WRAPPER(
Expand Down Expand Up @@ -356,10 +364,10 @@ void segmented_radix_sort_double_buffer(
}

// Create a set of DoubleBuffers to wrap pairs of device pointers
cub::DoubleBuffer<int64_t> d_keys(
DB<int64_t> d_keys(
isect_ids.data_ptr<int64_t>(), isect_ids_sorted.data_ptr<int64_t>()
);
cub::DoubleBuffer<int32_t> d_values(
DB<int32_t> d_values(
flatten_ids.data_ptr<int32_t>(), flatten_ids_sorted.data_ptr<int32_t>()
);
// image dimensions are contiguous in the isect_ids,
Expand Down
2 changes: 1 addition & 1 deletion gsplat/cuda/csrc/Null.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ namespace gsplat {
// function will be called.
void launch_null_kernel(const at::Tensor input, at::Tensor output);

} // namespace gsplat
} // namespace gsplat
2 changes: 1 addition & 1 deletion gsplat/cuda/csrc/Projection.h
Original file line number Diff line number Diff line change
Expand Up @@ -280,4 +280,4 @@ void launch_projection_ut_3dgs_fused_kernel(
at::optional<at::Tensor> compensations // [C, N] optional
);

} // namespace gsplat
} // namespace gsplat
2 changes: 1 addition & 1 deletion gsplat/cuda/csrc/Projection2DGS.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -89,4 +89,4 @@ inline __device__ void compute_ray_transforms_aabb_vjp(
v_t += v_M[2];
}

} // namespace gsplat
} // namespace gsplat
28 changes: 22 additions & 6 deletions gsplat/cuda/csrc/Projection2DGSFused.cu
Original file line number Diff line number Diff line change
Expand Up @@ -433,22 +433,33 @@ __global__ void projection_2dgs_fused_bwd_kernel(
// #if __CUDA_ARCH__ >= 700
// write out results with warp-level reduction
auto warp = cg::tiled_partition<32>(cg::this_thread_block());
#if FOR_HIP
auto warp_group_g = warp; // Not used, just here to not error in the if-statements.
#else
auto warp_group_g = cg::labeled_partition(warp, gid);
#endif

if (v_means != nullptr) {
#if !FOR_HIP
warpSum(v_mean, warp_group_g);
if (warp_group_g.thread_rank() == 0) {
#endif

if (FOR_HIP || warp_group_g.thread_rank() == 0) {
v_means += bid * N * 3 + gid * 3;
#pragma unroll
#pragma unroll
for (uint32_t i = 0; i < 3; i++) {
gpuAtomicAdd(v_means + i, v_mean[i]);
}
}
}

// Directly output gradients w.r.t. the quaternion and scale
#if !FOR_HIP
warpSum(v_quat, warp_group_g);
warpSum(v_scale, warp_group_g);
if (warp_group_g.thread_rank() == 0) {
#endif

if (FOR_HIP || warp_group_g.thread_rank() == 0) {
v_quats += bid * N * 4 + gid * 4;
v_scales += bid * N * 3 + gid * 3;
gpuAtomicAdd(v_quats, v_quat[0]);
Expand All @@ -460,14 +471,19 @@ __global__ void projection_2dgs_fused_bwd_kernel(
}

if (v_viewmats != nullptr) {
#if FOR_HIP
auto warp_group_c = warp; // Not used, just here to not error in the if-statements below.
#else
auto warp_group_c = cg::labeled_partition(warp, cid);
warpSum(v_R, warp_group_c);
warpSum(v_t, warp_group_c);
if (warp_group_c.thread_rank() == 0) {
#endif

if (FOR_HIP || warp_group_c.thread_rank() == 0) {
v_viewmats += bid * C * 16 + cid * 16;
#pragma unroll
#pragma unroll
for (uint32_t i = 0; i < 3; i++) {
#pragma unroll
#pragma unroll
for (uint32_t j = 0; j < 3; j++) {
gpuAtomicAdd(v_viewmats + i * 4 + j, v_R[j][i]);
}
Expand Down
30 changes: 23 additions & 7 deletions gsplat/cuda/csrc/Projection2DGSPacked.cu
Original file line number Diff line number Diff line change
Expand Up @@ -414,21 +414,32 @@ __global__ void projection_2dgs_packed_bwd_kernel(
// write out results with dense layout
// #if __CUDA_ARCH__ >= 700
// write out results with warp-level reduction
#if FOR_HIP
auto warp_group_g = warp; // Not used, just here to not error in the if-statements.
#else
auto warp_group_g = cg::labeled_partition(warp, gid);
#endif

if (v_means != nullptr) {
#if !FOR_HIP
warpSum(v_mean, warp_group_g);
if (warp_group_g.thread_rank() == 0) {
#endif

if (FOR_HIP || warp_group_g.thread_rank() == 0) {
v_means += bid * N * 3 + gid * 3;
#pragma unroll
#pragma unroll
for (uint32_t i = 0; i < 3; i++) {
gpuAtomicAdd(v_means + i, v_mean[i]);
}
}
}
// Directly output gradients w.r.t. the quaternion and scale
#if !FOR_HIP
warpSum(v_quat, warp_group_g);
warpSum(v_scale, warp_group_g);
if (warp_group_g.thread_rank() == 0) {
#endif

if (FOR_HIP || warp_group_g.thread_rank() == 0) {
v_quats += bid * N * 4 + gid * 4;
v_scales += bid * N * 3 + gid * 3;
gpuAtomicAdd(v_quats, v_quat[0]);
Expand All @@ -441,14 +452,19 @@ __global__ void projection_2dgs_packed_bwd_kernel(
}

if (v_viewmats != nullptr) {
#if FOR_HIP
auto warp_group_c = warp; // Not used, just here to not error in the if-statements.
#else
auto warp_group_c = cg::labeled_partition(warp, cid);
warpSum(v_R, warp_group_c);
warpSum(v_t, warp_group_c);
if (warp_group_c.thread_rank() == 0) {
#endif

if (FOR_HIP || warp_group_c.thread_rank() == 0) {
v_viewmats += bid * C * 16 + cid * 16;
#pragma unroll
#pragma unroll
for (uint32_t i = 0; i < 3; i++) {
#pragma unroll
#pragma unroll
for (uint32_t j = 0; j < 3; j++) {
gpuAtomicAdd(v_viewmats + i * 4 + j, v_R[j][i]);
}
Expand Down Expand Up @@ -528,4 +544,4 @@ void launch_projection_2dgs_packed_bwd_kernel(
);
}

} // namespace gsplat
} // namespace gsplat
34 changes: 27 additions & 7 deletions gsplat/cuda/csrc/ProjectionEWA3DGSFused.cu
Original file line number Diff line number Diff line change
Expand Up @@ -469,21 +469,32 @@ __global__ void projection_ewa_3dgs_fused_bwd_kernel(
// #if __CUDA_ARCH__ >= 700
// write out results with warp-level reduction
auto warp = cg::tiled_partition<32>(cg::this_thread_block());
#if FOR_HIP
auto warp_group_g = warp; // Not used, just here to not error in the if-statements.
#else
auto warp_group_g = cg::labeled_partition(warp, gid);
#endif

if (v_means != nullptr) {
#if !FOR_HIP
warpSum(v_mean, warp_group_g);
if (warp_group_g.thread_rank() == 0) {
#endif

if (FOR_HIP || warp_group_g.thread_rank() == 0) {
v_means += bid * N * 3 + gid * 3;
#pragma unroll
#pragma unroll
for (uint32_t i = 0; i < 3; i++) {
gpuAtomicAdd(v_means + i, v_mean[i]);
}
}
}
if (v_covars != nullptr) {
// Output gradients w.r.t. the covariance matrix
#if !FOR_HIP
warpSum(v_covar, warp_group_g);
if (warp_group_g.thread_rank() == 0) {
#endif

if (FOR_HIP || warp_group_g.thread_rank() == 0) {
v_covars += bid * N * 6 + gid * 6;
gpuAtomicAdd(v_covars, v_covar[0][0]);
gpuAtomicAdd(v_covars + 1, v_covar[0][1] + v_covar[1][0]);
Expand All @@ -498,9 +509,13 @@ __global__ void projection_ewa_3dgs_fused_bwd_kernel(
vec4 v_quat(0.f);
vec3 v_scale(0.f);
quat_scale_to_covar_vjp(quat, scale, rotmat, v_covar, v_quat, v_scale);

#if !FOR_HIP
warpSum(v_quat, warp_group_g);
warpSum(v_scale, warp_group_g);
if (warp_group_g.thread_rank() == 0) {
#endif

if (FOR_HIP || warp_group_g.thread_rank() == 0) {
v_quats += bid * N * 4 + gid * 4;
v_scales += bid * N * 3 + gid * 3;
gpuAtomicAdd(v_quats, v_quat[0]);
Expand All @@ -513,14 +528,19 @@ __global__ void projection_ewa_3dgs_fused_bwd_kernel(
}
}
if (v_viewmats != nullptr) {
#if FOR_HIP
auto warp_group_c = warp; // Not used, just here to not error in the if-statements below.
#else
auto warp_group_c = cg::labeled_partition(warp, cid);
warpSum(v_R, warp_group_c);
warpSum(v_t, warp_group_c);
if (warp_group_c.thread_rank() == 0) {
#endif

if (FOR_HIP || warp_group_c.thread_rank() == 0) {
v_viewmats += bid * C * 16 + cid * 16;
#pragma unroll
#pragma unroll
for (uint32_t i = 0; i < 3; i++) { // rows
#pragma unroll
#pragma unroll
for (uint32_t j = 0; j < 3; j++) { // cols
gpuAtomicAdd(v_viewmats + i * 4 + j, v_R[j][i]);
}
Expand Down
35 changes: 27 additions & 8 deletions gsplat/cuda/csrc/ProjectionEWA3DGSPacked.cu
Original file line number Diff line number Diff line change
Expand Up @@ -591,21 +591,32 @@ __global__ void projection_ewa_3dgs_packed_bwd_kernel(
// write out results with dense layout
// #if __CUDA_ARCH__ >= 700
// write out results with warp-level reduction
#if FOR_HIP
auto warp_group_g = warp; // Not used, just here to not error in the if-statements.
#else
auto warp_group_g = cg::labeled_partition(warp, gid);
#endif

if (v_means != nullptr) {
#if !FOR_HIP
warpSum(v_mean, warp_group_g);
if (warp_group_g.thread_rank() == 0) {
#endif

if (FOR_HIP || warp_group_g.thread_rank() == 0) {
v_means += bid * N * 3 + gid * 3;
#pragma unroll
#pragma unroll
for (uint32_t i = 0; i < 3; i++) {
gpuAtomicAdd(v_means + i, v_mean[i]);
}
}
}
if (v_covars != nullptr) {
// Directly output gradients w.r.t. the covariance
#if !FOR_HIP
warpSum(v_covar, warp_group_g);
if (warp_group_g.thread_rank() == 0) {
#endif

if (FOR_HIP || warp_group_g.thread_rank() == 0) {
v_covars += bid * N * 6 + gid * 6;
gpuAtomicAdd(v_covars, v_covar[0][0]);
gpuAtomicAdd(v_covars + 1, v_covar[0][1] + v_covar[1][0]);
Expand All @@ -622,9 +633,12 @@ __global__ void projection_ewa_3dgs_packed_bwd_kernel(
quat_scale_to_covar_vjp(
quat, scale, rotmat, v_covar, v_quat, v_scale
);
#if !FOR_HIP
warpSum(v_quat, warp_group_g);
warpSum(v_scale, warp_group_g);
if (warp_group_g.thread_rank() == 0) {
#endif

if (FOR_HIP || warp_group_g.thread_rank() == 0) {
v_quats += bid * N * 4 + gid * 4;
v_scales += bid * N * 3 + gid * 3;
gpuAtomicAdd(v_quats, v_quat[0]);
Expand All @@ -639,14 +653,19 @@ __global__ void projection_ewa_3dgs_packed_bwd_kernel(
}
// v_viewmats is always in dense layout
if (v_viewmats != nullptr) {
#if FOR_HIP
auto warp_group_c = warp; // Not used, just here to not error in the if-statements.
#else
auto warp_group_c = cg::labeled_partition(warp, cid);
warpSum(v_R, warp_group_c);
warpSum(v_t, warp_group_c);
if (warp_group_c.thread_rank() == 0) {
#endif

if (FOR_HIP || warp_group_c.thread_rank() == 0) {
v_viewmats += bid * C * 16 + cid * 16;
#pragma unroll
#pragma unroll
for (uint32_t i = 0; i < 3; i++) { // rows
#pragma unroll
#pragma unroll
for (uint32_t j = 0; j < 3; j++) { // cols
gpuAtomicAdd(v_viewmats + i * 4 + j, v_R[j][i]);
}
Expand Down Expand Up @@ -756,4 +775,4 @@ void launch_projection_ewa_3dgs_packed_bwd_kernel(
);
}

} // namespace gsplat
} // namespace gsplat
2 changes: 1 addition & 1 deletion gsplat/cuda/csrc/Rasterization.h
Original file line number Diff line number Diff line change
Expand Up @@ -274,4 +274,4 @@ void launch_rasterize_to_pixels_from_world_3dgs_bwd_kernel(
at::Tensor v_opacities // [..., C, N] or [nnz]
) ;

} // namespace gsplat
} // namespace gsplat
2 changes: 1 addition & 1 deletion gsplat/cuda/csrc/RasterizeToIndices2DGS.cu
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ void launch_rasterize_to_indices_2dgs_kernel(
// channels into the kernel functions and avoid necessary global memory
// writes. This requires moving the channel padding from python to C side.
if (cudaFuncSetAttribute(
rasterize_to_indices_2dgs_kernel<float>,
(const void*)rasterize_to_indices_2dgs_kernel<float>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
shmem_size
) != cudaSuccess) {
Expand Down
2 changes: 1 addition & 1 deletion gsplat/cuda/csrc/RasterizeToIndices3DGS.cu
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ void launch_rasterize_to_indices_3dgs_kernel(
// channels into the kernel functions and avoid necessary global memory
// writes. This requires moving the channel padding from python to C side.
if (cudaFuncSetAttribute(
rasterize_to_indices_3dgs_kernel<float>,
(const void*)rasterize_to_indices_3dgs_kernel<float>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
shmem_size
) != cudaSuccess) {
Expand Down
Loading