nerfstudio-project · pxl-th · Sep 4, 2025 · Sep 12, 2025 · Sep 12, 2025
diff --git a/.gitignore b/.gitignore
@@ -124,4 +124,7 @@ compile_commands.json
 data
 results
 
-!examples/benchmarks/compression/results/
+!examples/benchmarks/compression/results/
+
+gsplat/hip/
+*.hip
diff --git a/.gitmodules b/.gitmodules
@@ -1,3 +1,3 @@
-[submodule "gsplat/cuda/csrc/third_party/glm"]
-	path = gsplat/cuda/csrc/third_party/glm
+[submodule "third_party/glm"]
+	path = third_party/glm
 	url = https://github.com/g-truc/glm.git
diff --git a/gsplat/cuda/_wrapper.py b/gsplat/cuda/_wrapper.py
@@ -595,7 +595,7 @@ def rasterize_to_pixels(
         assert colors.shape == image_dims + (N, channels), colors.shape
         assert opacities.shape == image_dims + (N,), opacities.shape
     if backgrounds is not None:
-        assert backgrounds.shape == image_dims + (channels,), backgrounds.shape
+        assert backgrounds.numel() == math.prod(image_dims + (channels,)), f"{backgrounds.shape=} != {(image_dims + (channels,))=}"
         backgrounds = backgrounds.contiguous()
     if masks is not None:
         assert masks.shape == isect_offsets.shape, masks.shape

diff --git a/gsplat/cuda/csrc/IntersectTile.cu b/gsplat/cuda/csrc/IntersectTile.cu
@@ -11,6 +11,14 @@
 #include "Intersect.h"
 #include "Utils.cuh"
 
+#if defined(__HIP__) // __HIP__ defined: DB<T> aliases hipcub::DoubleBuffer<T>
+template <typename T>
+using DB = hipcub::DoubleBuffer<T>;
+#else // __HIP__ not defined: DB<T> aliases cub::DoubleBuffer<T>
+template <typename T>
+using DB = cub::DoubleBuffer<T>;
+#endif // defined(__HIP__)
+
 namespace gsplat {
 
 namespace cg = cooperative_groups;
@@ -307,10 +315,10 @@ void radix_sort_double_buffer(
     }
 
     // Create a set of DoubleBuffers to wrap pairs of device pointers
-    cub::DoubleBuffer<int64_t> d_keys(
+    DB<int64_t> d_keys(
         isect_ids.data_ptr<int64_t>(), isect_ids_sorted.data_ptr<int64_t>()
     );
-    cub::DoubleBuffer<int32_t> d_values(
+    DB<int32_t> d_values(
         flatten_ids.data_ptr<int32_t>(), flatten_ids_sorted.data_ptr<int32_t>()
     );
     CUB_WRAPPER(
@@ -356,10 +364,10 @@ void segmented_radix_sort_double_buffer(
     }
 
     // Create a set of DoubleBuffers to wrap pairs of device pointers
-    cub::DoubleBuffer<int64_t> d_keys(
+    DB<int64_t> d_keys(
         isect_ids.data_ptr<int64_t>(), isect_ids_sorted.data_ptr<int64_t>()
     );
-    cub::DoubleBuffer<int32_t> d_values(
+    DB<int32_t> d_values(
         flatten_ids.data_ptr<int32_t>(), flatten_ids_sorted.data_ptr<int32_t>()
     );
     // image dimensions are contiguous in the isect_ids, 

diff --git a/gsplat/cuda/csrc/Null.h b/gsplat/cuda/csrc/Null.h
@@ -19,4 +19,4 @@ namespace gsplat {
 // function will be called.
 void launch_null_kernel(const at::Tensor input, at::Tensor output);
 
-} // namespace gsplat
+} // namespace gsplat
diff --git a/gsplat/cuda/csrc/Projection.h b/gsplat/cuda/csrc/Projection.h
@@ -280,4 +280,4 @@ void launch_projection_ut_3dgs_fused_kernel(
     at::optional<at::Tensor> compensations // [C, N] optional
 );
 
-} // namespace gsplat
+} // namespace gsplat
diff --git a/gsplat/cuda/csrc/Projection2DGS.cuh b/gsplat/cuda/csrc/Projection2DGS.cuh
@@ -89,4 +89,4 @@ inline __device__ void compute_ray_transforms_aabb_vjp(
     v_t += v_M[2];
 }
 
-} // namespace gsplat
+} // namespace gsplat
diff --git a/gsplat/cuda/csrc/Projection2DGSFused.cu b/gsplat/cuda/csrc/Projection2DGSFused.cu
@@ -433,22 +433,33 @@ __global__ void projection_2dgs_fused_bwd_kernel(
     // #if __CUDA_ARCH__ >= 700
     // write out results with warp-level reduction
     auto warp = cg::tiled_partition<32>(cg::this_thread_block());
+    #if FOR_HIP
+    auto warp_group_g = warp; // HIP placeholder: never read (FOR_HIP short-circuits the checks below); declared only so they compile.
+    #else
     auto warp_group_g = cg::labeled_partition(warp, gid);
+    #endif
+
     if (v_means != nullptr) {
+        #if !FOR_HIP
         warpSum(v_mean, warp_group_g);
-        if (warp_group_g.thread_rank() == 0) {
+        #endif
+
+        if (FOR_HIP || warp_group_g.thread_rank() == 0) {
             v_means += bid * N * 3 + gid * 3;
-#pragma unroll
+            #pragma unroll
             for (uint32_t i = 0; i < 3; i++) {
                 gpuAtomicAdd(v_means + i, v_mean[i]);
             }
         }
     }
 
     // Directly output gradients w.r.t. the quaternion and scale
+    #if !FOR_HIP
     warpSum(v_quat, warp_group_g);
     warpSum(v_scale, warp_group_g);
-    if (warp_group_g.thread_rank() == 0) {
+    #endif
+
+    if (FOR_HIP || warp_group_g.thread_rank() == 0) {
         v_quats += bid * N * 4 + gid * 4;
         v_scales += bid * N * 3 + gid * 3;
         gpuAtomicAdd(v_quats, v_quat[0]);
@@ -460,14 +471,19 @@ __global__ void projection_2dgs_fused_bwd_kernel(
     }
 
     if (v_viewmats != nullptr) {
+        #if FOR_HIP
+        auto warp_group_c = warp; // HIP placeholder: never read (FOR_HIP short-circuits the check below); declared only so it compiles.
+        #else
         auto warp_group_c = cg::labeled_partition(warp, cid);
         warpSum(v_R, warp_group_c);
         warpSum(v_t, warp_group_c);
-        if (warp_group_c.thread_rank() == 0) {
+        #endif
+
+        if (FOR_HIP || warp_group_c.thread_rank() == 0) {
             v_viewmats += bid * C * 16 + cid * 16;
-#pragma unroll
+            #pragma unroll
             for (uint32_t i = 0; i < 3; i++) {
-#pragma unroll
+                #pragma unroll
                 for (uint32_t j = 0; j < 3; j++) {
                     gpuAtomicAdd(v_viewmats + i * 4 + j, v_R[j][i]);
                 }

diff --git a/gsplat/cuda/csrc/Projection2DGSPacked.cu b/gsplat/cuda/csrc/Projection2DGSPacked.cu
@@ -414,21 +414,32 @@ __global__ void projection_2dgs_packed_bwd_kernel(
         // write out results with dense layout
         // #if __CUDA_ARCH__ >= 700
         // write out results with warp-level reduction
+        #if FOR_HIP
+        auto warp_group_g = warp; // HIP placeholder: never read (FOR_HIP short-circuits the checks below); declared only so they compile.
+        #else
         auto warp_group_g = cg::labeled_partition(warp, gid);
+        #endif
+
         if (v_means != nullptr) {
+            #if !FOR_HIP
             warpSum(v_mean, warp_group_g);
-            if (warp_group_g.thread_rank() == 0) {
+            #endif
+
+            if (FOR_HIP || warp_group_g.thread_rank() == 0) {
                 v_means += bid * N * 3 + gid * 3;
-#pragma unroll
+                #pragma unroll
                 for (uint32_t i = 0; i < 3; i++) {
                     gpuAtomicAdd(v_means + i, v_mean[i]);
                 }
             }
         }
         // Directly output gradients w.r.t. the quaternion and scale
+        #if !FOR_HIP
         warpSum(v_quat, warp_group_g);
         warpSum(v_scale, warp_group_g);
-        if (warp_group_g.thread_rank() == 0) {
+        #endif
+
+        if (FOR_HIP || warp_group_g.thread_rank() == 0) {
             v_quats += bid * N * 4 + gid * 4;
             v_scales += bid * N * 3 + gid * 3;
             gpuAtomicAdd(v_quats, v_quat[0]);
@@ -441,14 +452,19 @@ __global__ void projection_2dgs_packed_bwd_kernel(
     }
 
     if (v_viewmats != nullptr) {
+        #if FOR_HIP
+        auto warp_group_c = warp; // HIP placeholder: never read (FOR_HIP short-circuits the check below); declared only so it compiles.
+        #else
         auto warp_group_c = cg::labeled_partition(warp, cid);
         warpSum(v_R, warp_group_c);
         warpSum(v_t, warp_group_c);
-        if (warp_group_c.thread_rank() == 0) {
+        #endif
+
+        if (FOR_HIP || warp_group_c.thread_rank() == 0) {
             v_viewmats += bid * C * 16 + cid * 16;
-#pragma unroll
+            #pragma unroll
             for (uint32_t i = 0; i < 3; i++) {
-#pragma unroll
+                #pragma unroll
                 for (uint32_t j = 0; j < 3; j++) {
                     gpuAtomicAdd(v_viewmats + i * 4 + j, v_R[j][i]);
                 }
@@ -528,4 +544,4 @@ void launch_projection_2dgs_packed_bwd_kernel(
         );
 }
 
-} // namespace gsplat
+} // namespace gsplat
diff --git a/gsplat/cuda/csrc/ProjectionEWA3DGSFused.cu b/gsplat/cuda/csrc/ProjectionEWA3DGSFused.cu
@@ -469,21 +469,32 @@ __global__ void projection_ewa_3dgs_fused_bwd_kernel(
     // #if __CUDA_ARCH__ >= 700
     // write out results with warp-level reduction
     auto warp = cg::tiled_partition<32>(cg::this_thread_block());
+    #if FOR_HIP
+    auto warp_group_g = warp; // HIP placeholder: never read (FOR_HIP short-circuits the checks below); declared only so they compile.
+    #else
     auto warp_group_g = cg::labeled_partition(warp, gid);
+    #endif
+
     if (v_means != nullptr) {
+        #if !FOR_HIP
         warpSum(v_mean, warp_group_g);
-        if (warp_group_g.thread_rank() == 0) {
+        #endif
+
+        if (FOR_HIP || warp_group_g.thread_rank() == 0) {
             v_means += bid * N * 3 + gid * 3;
-#pragma unroll
+            #pragma unroll
             for (uint32_t i = 0; i < 3; i++) {
                 gpuAtomicAdd(v_means + i, v_mean[i]);
             }
         }
     }
     if (v_covars != nullptr) {
         // Output gradients w.r.t. the covariance matrix
+        #if !FOR_HIP
         warpSum(v_covar, warp_group_g);
-        if (warp_group_g.thread_rank() == 0) {
+        #endif
+
+        if (FOR_HIP || warp_group_g.thread_rank() == 0) {
             v_covars += bid * N * 6 + gid * 6;
             gpuAtomicAdd(v_covars, v_covar[0][0]);
             gpuAtomicAdd(v_covars + 1, v_covar[0][1] + v_covar[1][0]);
@@ -498,9 +509,13 @@ __global__ void projection_ewa_3dgs_fused_bwd_kernel(
         vec4 v_quat(0.f);
         vec3 v_scale(0.f);
         quat_scale_to_covar_vjp(quat, scale, rotmat, v_covar, v_quat, v_scale);
+
+        #if !FOR_HIP
         warpSum(v_quat, warp_group_g);
         warpSum(v_scale, warp_group_g);
-        if (warp_group_g.thread_rank() == 0) {
+        #endif
+
+        if (FOR_HIP || warp_group_g.thread_rank() == 0) {
             v_quats += bid * N * 4 + gid * 4;
             v_scales += bid * N * 3 + gid * 3;
             gpuAtomicAdd(v_quats, v_quat[0]);
@@ -513,14 +528,19 @@ __global__ void projection_ewa_3dgs_fused_bwd_kernel(
         }
     }
     if (v_viewmats != nullptr) {
+        #if FOR_HIP
+        auto warp_group_c = warp; // HIP placeholder: never read (FOR_HIP short-circuits the check below); declared only so it compiles.
+        #else
         auto warp_group_c = cg::labeled_partition(warp, cid);
         warpSum(v_R, warp_group_c);
         warpSum(v_t, warp_group_c);
-        if (warp_group_c.thread_rank() == 0) {
+        #endif
+
+        if (FOR_HIP || warp_group_c.thread_rank() == 0) {
             v_viewmats += bid * C * 16 + cid * 16;
-#pragma unroll
+            #pragma unroll
             for (uint32_t i = 0; i < 3; i++) { // rows
-#pragma unroll
+                #pragma unroll
                 for (uint32_t j = 0; j < 3; j++) { // cols
                     gpuAtomicAdd(v_viewmats + i * 4 + j, v_R[j][i]);
                 }

diff --git a/gsplat/cuda/csrc/ProjectionEWA3DGSPacked.cu b/gsplat/cuda/csrc/ProjectionEWA3DGSPacked.cu
@@ -591,21 +591,32 @@ __global__ void projection_ewa_3dgs_packed_bwd_kernel(
         // write out results with dense layout
         // #if __CUDA_ARCH__ >= 700
         // write out results with warp-level reduction
+        #if FOR_HIP
+        auto warp_group_g = warp; // HIP placeholder: never read (FOR_HIP short-circuits the checks below); declared only so they compile.
+        #else
         auto warp_group_g = cg::labeled_partition(warp, gid);
+        #endif
+
         if (v_means != nullptr) {
+            #if !FOR_HIP
             warpSum(v_mean, warp_group_g);
-            if (warp_group_g.thread_rank() == 0) {
+            #endif
+
+            if (FOR_HIP || warp_group_g.thread_rank() == 0) {
                 v_means += bid * N * 3 + gid * 3;
-#pragma unroll
+                #pragma unroll
                 for (uint32_t i = 0; i < 3; i++) {
                     gpuAtomicAdd(v_means + i, v_mean[i]);
                 }
             }
         }
         if (v_covars != nullptr) {
             // Directly output gradients w.r.t. the covariance
+            #if !FOR_HIP
             warpSum(v_covar, warp_group_g);
-            if (warp_group_g.thread_rank() == 0) {
+            #endif
+
+            if (FOR_HIP || warp_group_g.thread_rank() == 0) {
                 v_covars += bid * N * 6 + gid * 6;
                 gpuAtomicAdd(v_covars, v_covar[0][0]);
                 gpuAtomicAdd(v_covars + 1, v_covar[0][1] + v_covar[1][0]);
@@ -622,9 +633,12 @@ __global__ void projection_ewa_3dgs_packed_bwd_kernel(
             quat_scale_to_covar_vjp(
                 quat, scale, rotmat, v_covar, v_quat, v_scale
             );
+            #if !FOR_HIP
             warpSum(v_quat, warp_group_g);
             warpSum(v_scale, warp_group_g);
-            if (warp_group_g.thread_rank() == 0) {
+            #endif
+
+            if (FOR_HIP || warp_group_g.thread_rank() == 0) {
                 v_quats += bid * N * 4 + gid * 4;
                 v_scales += bid * N * 3 + gid * 3;
                 gpuAtomicAdd(v_quats, v_quat[0]);
@@ -639,14 +653,19 @@ __global__ void projection_ewa_3dgs_packed_bwd_kernel(
     }
     // v_viewmats is always in dense layout
     if (v_viewmats != nullptr) {
+        #if FOR_HIP
+        auto warp_group_c = warp; // HIP placeholder: never read (FOR_HIP short-circuits the check below); declared only so it compiles.
+        #else
         auto warp_group_c = cg::labeled_partition(warp, cid);
         warpSum(v_R, warp_group_c);
         warpSum(v_t, warp_group_c);
-        if (warp_group_c.thread_rank() == 0) {
+        #endif
+
+        if (FOR_HIP || warp_group_c.thread_rank() == 0) {
             v_viewmats += bid * C * 16 + cid * 16;
-#pragma unroll
+            #pragma unroll
             for (uint32_t i = 0; i < 3; i++) { // rows
-#pragma unroll
+                #pragma unroll
                 for (uint32_t j = 0; j < 3; j++) { // cols
                     gpuAtomicAdd(v_viewmats + i * 4 + j, v_R[j][i]);
                 }
@@ -756,4 +775,4 @@ void launch_projection_ewa_3dgs_packed_bwd_kernel(
     );
 }
 
-} // namespace gsplat
+} // namespace gsplat
diff --git a/gsplat/cuda/csrc/Rasterization.h b/gsplat/cuda/csrc/Rasterization.h
@@ -274,4 +274,4 @@ void launch_rasterize_to_pixels_from_world_3dgs_bwd_kernel(
     at::Tensor v_opacities   // [..., C, N] or [nnz]
 ) ;
 
-} // namespace gsplat
+} // namespace gsplat
diff --git a/gsplat/cuda/csrc/RasterizeToIndices2DGS.cu b/gsplat/cuda/csrc/RasterizeToIndices2DGS.cu
@@ -253,7 +253,7 @@ void launch_rasterize_to_indices_2dgs_kernel(
     // channels into the kernel functions and avoid necessary global memory
     // writes. This requires moving the channel padding from python to C side.
     if (cudaFuncSetAttribute(
-            rasterize_to_indices_2dgs_kernel<float>,
+            (const void*)rasterize_to_indices_2dgs_kernel<float>,
             cudaFuncAttributeMaxDynamicSharedMemorySize,
             shmem_size
         ) != cudaSuccess) {

diff --git a/gsplat/cuda/csrc/RasterizeToIndices3DGS.cu b/gsplat/cuda/csrc/RasterizeToIndices3DGS.cu
@@ -214,7 +214,7 @@ void launch_rasterize_to_indices_3dgs_kernel(
     // channels into the kernel functions and avoid necessary global memory
     // writes. This requires moving the channel padding from python to C side.
     if (cudaFuncSetAttribute(
-            rasterize_to_indices_3dgs_kernel<float>,
+            (const void*)rasterize_to_indices_3dgs_kernel<float>,
             cudaFuncAttributeMaxDynamicSharedMemorySize,
             shmem_size
         ) != cudaSuccess) {