[BUG FIX] Fix BVH build's radix sort. (Genesis-Embodied-AI#1305)

Libero0809 · YilingQiao · commit e1cc023fc14d · 2025-08-21T20:55:52.000-07:00
diff --git a/genesis/engine/bvh.py b/genesis/engine/bvh.py
@@ -1,6 +1,7 @@
 import genesis as gs
 import taichi as ti
 from genesis.repr_base import RBC
+import numpy as np
 
 
 @ti.data_oriented
@@ -157,19 +158,27 @@ class Node:
         # Nodes of the BVH, first n_aabbs - 1 are internal nodes, last n_aabbs are leaf nodes
         self.nodes = self.Node.field(shape=(self.n_batches, self.n_aabbs * 2 - 1))
         # Whether an internal node has been visited during traversal
-        self.internal_node_visited = ti.field(ti.u8, shape=(self.n_batches, self.n_aabbs - 1))
+        self.internal_node_active = ti.field(ti.u1, shape=(self.n_batches, self.n_aabbs - 1))
+        self.internal_node_ready = ti.field(ti.u1, shape=(self.n_batches, self.n_aabbs - 1))
+        self.updated = ti.field(ti.u1, shape=())
 
         # Query results, vec3 of batch id, self id, query id
         self.query_result = ti.field(gs.ti_ivec3, shape=(self.max_n_query_results))
         # Count of query results
         self.query_result_count = ti.field(ti.i32, shape=())
 
-    @ti.kernel
     def build(self):
         """
         Build the BVH from the axis-aligned bounding boxes (AABBs).
         """
+        self.compute_aabb_centers_and_scales()
+        self.compute_morton_codes()
+        self.radix_sort_morton_codes()
+        self.build_radix_tree()
+        self.compute_bounds()
 
+    @ti.kernel
+    def compute_aabb_centers_and_scales(self):
         for i_b, i_a in ti.ndrange(self.n_batches, self.n_aabbs):
             self.aabb_centers[i_b, i_a] = (self.aabbs[i_b, i_a].min + self.aabbs[i_b, i_a].max) / 2
 
@@ -184,14 +193,9 @@ def build(self):
         for i_b in ti.ndrange(self.n_batches):
             scale = self.aabb_max[i_b] - self.aabb_min[i_b]
             for i in ti.static(range(3)):
-                self.scale[i_b][i] = ti.select(scale[i] > 1e-7, 1.0 / scale[i], 1)
+                self.scale[i_b][i] = ti.select(scale[i] > gs.EPS, 1.0 / scale[i], 1.0)
 
-        self.compute_morton_codes()
-        self.radix_sort_morton_codes()
-        self.build_radix_tree()
-        self.compute_bounds()
-
-    @ti.func
+    @ti.kernel
     def compute_morton_codes(self):
         """
         Compute the Morton codes for each AABB.
@@ -223,38 +227,43 @@ def expand_bits(self, v):
         v = (v * ti.u32(0x00000005)) & ti.u32(0x49249249)
         return v
 
-    @ti.func
     def radix_sort_morton_codes(self):
         """
         Radix sort the morton codes, using 8 bits at a time.
         """
-        for i in ti.static(range(8)):
-            # Clear histogram
-            for i_b, j in ti.ndrange(self.n_batches, 256):
-                self.hist[i_b, j] = 0
+        for i in range(8):
+            self._kernel_radix_sort_morton_codes_one_round(i)
 
-            # Fill histogram
-            for i_b, i_a in ti.ndrange(self.n_batches, self.n_aabbs):
+    @ti.kernel
+    def _kernel_radix_sort_morton_codes_one_round(self, i: int):
+        # Clear histogram
+        self.hist.fill(0)
+
+        # Fill histogram
+        for i_b in range(self.n_batches):
+            # Sequential over the AABBs of a batch so that equal digits keep their relative order (stable sort)
+            # TODO: Parallelize; keeping the sort stable would require processing in groups, may not be worth it
+            for i_a in range(self.n_aabbs):
                 code = (self.morton_codes[i_b, i_a] >> (i * 8)) & 0xFF
                 self.offset[i_b, i_a] = ti.atomic_add(self.hist[i_b, ti.i32(code)], 1)
 
-            # Compute prefix sum
-            for i_b in ti.ndrange(self.n_batches):
-                self.prefix_sum[i_b, 0] = 0
-                for j in range(1, 256):  # sequential prefix sum
-                    self.prefix_sum[i_b, j] = self.prefix_sum[i_b, j - 1] + self.hist[i_b, j - 1]
+        # Compute prefix sum
+        for i_b in ti.ndrange(self.n_batches):
+            self.prefix_sum[i_b, 0] = 0
+            for j in range(1, 256):  # sequential prefix sum
+                self.prefix_sum[i_b, j] = self.prefix_sum[i_b, j - 1] + self.hist[i_b, j - 1]
 
-            # Reorder morton codes
-            for i_b, i_a in ti.ndrange(self.n_batches, self.n_aabbs):
-                code = (self.morton_codes[i_b, i_a] >> (i * 8)) & 0xFF
-                idx = ti.i32(self.offset[i_b, i_a] + self.prefix_sum[i_b, ti.i32(code)])
-                self.tmp_morton_codes[i_b, idx] = self.morton_codes[i_b, i_a]
+        # Reorder morton codes
+        for i_b, i_a in ti.ndrange(self.n_batches, self.n_aabbs):
+            code = (self.morton_codes[i_b, i_a] >> (i * 8)) & 0xFF
+            idx = ti.i32(self.offset[i_b, i_a] + self.prefix_sum[i_b, ti.i32(code)])
+            self.tmp_morton_codes[i_b, idx] = self.morton_codes[i_b, i_a]
 
-            # Swap the temporary and original morton codes
-            for i_b, i_a in ti.ndrange(self.n_batches, self.n_aabbs):
-                self.morton_codes[i_b, i_a] = self.tmp_morton_codes[i_b, i_a]
+        # Swap the temporary and original morton codes
+        for i_b, i_a in ti.ndrange(self.n_batches, self.n_aabbs):
+            self.morton_codes[i_b, i_a] = self.tmp_morton_codes[i_b, i_a]
 
-    @ti.func
+    @ti.kernel
     def build_radix_tree(self):
         """
         Build the radix tree from the sorted morton codes.
@@ -321,31 +330,51 @@ def delta(self, i, j, i_b):
                     break
         return result
 
-    @ti.func
     def compute_bounds(self):
         """
         Compute the bounds of the BVH nodes.
 
-        Starts from the leaf nodes and works upwards.
+        Starts from the leaf nodes and works upwards layer by layer.
         """
-        for i_b, i in ti.ndrange(self.n_batches, self.n_aabbs - 1):
-            self.internal_node_visited[i_b, i] = ti.u8(0)
+        self._kernel_compute_bounds_init()
+        while self.updated[None]:
+            self._kernel_compute_bounds_one_layer()
+
+    @ti.kernel
+    def _kernel_compute_bounds_init(self):
+        self.updated[None] = True
+        self.internal_node_active.fill(0)
+        self.internal_node_ready.fill(0)
 
         for i_b, i in ti.ndrange(self.n_batches, self.n_aabbs):
             idx = ti.i32(self.morton_codes[i_b, i])
             self.nodes[i_b, i + self.n_aabbs - 1].bound.min = self.aabbs[i_b, idx].min
             self.nodes[i_b, i + self.n_aabbs - 1].bound.max = self.aabbs[i_b, idx].max
+            parent_idx = self.nodes[i_b, i + self.n_aabbs - 1].parent
+            if parent_idx != -1:
+                self.internal_node_active[i_b, parent_idx] = 1
 
-            cur_idx = self.nodes[i_b, i + self.n_aabbs - 1].parent
-            while cur_idx != -1:
-                visited = ti.u1(ti.atomic_or(self.internal_node_visited[i_b, cur_idx], ti.u8(1)))
-                if not visited:
-                    break
-                left_bound = self.nodes[i_b, self.nodes[i_b, cur_idx].left].bound
-                right_bound = self.nodes[i_b, self.nodes[i_b, cur_idx].right].bound
-                self.nodes[i_b, cur_idx].bound.min = ti.min(left_bound.min, right_bound.min)
-                self.nodes[i_b, cur_idx].bound.max = ti.max(left_bound.max, right_bound.max)
-                cur_idx = self.nodes[i_b, cur_idx].parent
+    @ti.kernel
+    def _kernel_compute_bounds_one_layer(self):
+        self.updated[None] = False
+        for i_b, i in ti.ndrange(self.n_batches, self.n_aabbs - 1):
+            if self.internal_node_active[i_b, i] == 0:
+                continue
+            left_bound = self.nodes[i_b, self.nodes[i_b, i].left].bound
+            right_bound = self.nodes[i_b, self.nodes[i_b, i].right].bound
+            self.nodes[i_b, i].bound.min = ti.min(left_bound.min, right_bound.min)
+            self.nodes[i_b, i].bound.max = ti.max(left_bound.max, right_bound.max)
+            parent_idx = self.nodes[i_b, i].parent
+            if parent_idx != -1:
+                self.internal_node_ready[i_b, parent_idx] = 1
+            self.internal_node_active[i_b, i] = 0
+            self.updated[None] = True
+
+        for i_b, i in ti.ndrange(self.n_batches, self.n_aabbs - 1):
+            if self.internal_node_ready[i_b, i] == 0:
+                continue
+            self.internal_node_active[i_b, i] = 1
+            self.internal_node_ready[i_b, i] = 0
 
     @ti.kernel
     def query(self, aabbs: ti.template()):
diff --git a/genesis/engine/coupler.py b/genesis/engine/coupler.py
@@ -649,6 +649,7 @@ def __init__(
         self._n_linesearch_iterations = options.n_linesearch_iterations
         self._linesearch_c = options.linesearch_c
         self._linesearch_tau = options.linesearch_tau
+        self.default_deformable_g = 1.0e8  # default g of the deformable side (harmonically averaged with rigid_g)
 
     def build(self) -> None:
         self._B = self.sim._B
@@ -698,6 +699,7 @@ def init_fem_fields(self):
         self.max_fem_floor_contact_pairs = fem_solver.n_surfaces * fem_solver._B
         self.n_fem_floor_contact_pairs = ti.field(gs.ti_int, shape=())
         self.fem_floor_contact_pairs = self.fem_floor_contact_pair_type.field(shape=(self.max_fem_floor_contact_pairs,))
+
         # Lookup table for marching tetrahedra edges
         kMarchingTetsEdgeTable_np = np.array(
             [
@@ -934,15 +936,12 @@ def fem_floor_detection(self, f: ti.i32):
             )
             self.fem_floor_contact_pairs[i_c].barycentric = barycentric
 
-            C = ti.static(1.0e8)
-            deformable_g = C
             rigid_g = self.fem_pressure_gradient[i_b, i_e].z
             # TODO A better way to handle corner cases where pressure and pressure gradient are ill defined
             if total_area < gs.EPS or rigid_g < gs.EPS:
                 self.fem_floor_contact_pairs[i_c].active = 0
                 continue
-            g = 1.0 / (1.0 / deformable_g + 1.0 / rigid_g)  # harmonic average
-            deformable_k = total_area * C
+            g = self.default_deformable_g * rigid_g / (self.default_deformable_g + rigid_g)  # harmonic average
             rigid_k = total_area * g
             rigid_phi0 = -pressure / g
             rigid_fn0 = total_area * pressure
diff --git a/genesis/engine/materials/FEM/elastic.py b/genesis/engine/materials/FEM/elastic.py
@@ -82,6 +82,7 @@ def build_linear_corotated(self, fem_solver):
 
     @ti.func
     def pre_compute_linear_corotated(self, J, F, i_e, i_b):
+        # Obtain R from the SVD instead of calling `R, P = ti.polar_decompose(F)` since `P` is not needed here
         U, S, V = ti.svd(F)
         R = U @ V.transpose()
         self.R[i_b, i_e] = R
diff --git a/genesis/engine/solvers/fem_solver.py b/genesis/engine/solvers/fem_solver.py
@@ -528,6 +528,32 @@ def _func_compute_ele_energy(self, f: ti.i32):
 
                 self.elements_el_energy[i_b, i_e].energy += 0.5 * damping_beta_over_dt * St_x_diff.dot(H_St_x_diff)
 
+            # add linearized damping energy
+            if self._damping_beta > gs.EPS:
+                damping_beta_over_dt = self._damping_beta / self._substep_dt
+                i_v = self.elements_i[i_e].el2v
+                S = ti.Matrix.zero(gs.ti_float, 4, 3)
+                B = self.elements_i[i_e].B
+                S[:3, :] = B
+                S[3, :] = -B[0, :] - B[1, :] - B[2, :]
+
+                x_diff = ti.Vector.zero(gs.ti_float, 12)
+                for i in ti.static(range(4)):
+                    x_diff[i * 3 : i * 3 + 3] = (
+                        self.elements_v[f + 1, i_v[i], i_b].pos - self.elements_v[f, i_v[i], i_b].pos
+                    )
+                St_x_diff = ti.Vector.zero(gs.ti_float, 9)
+                for i, j in ti.static(ti.ndrange(3, 4)):
+                    St_x_diff[i * 3 : i * 3 + 3] += S[j, i] * x_diff[j * 3 : j * 3 + 3]
+
+                H_St_x_diff = ti.Vector.zero(gs.ti_float, 9)
+                for i, j in ti.static(ti.ndrange(3, 3)):
+                    H_St_x_diff[i * 3 : i * 3 + 3] += (
+                        self.elements_el_hessian[i_b, i, j, i_e] @ St_x_diff[j * 3 : j * 3 + 3]
+                    )
+
+                self.elements_el_energy[i_b, i_e].energy += 0.5 * damping_beta_over_dt * St_x_diff.dot(H_St_x_diff)
+
     @ti.kernel
     def accumulate_vertex_force_preconditioner(self, f: ti.i32):
         damping_alpha_dt = self._damping_alpha * self._substep_dt
diff --git a/tests/test_bvh.py b/tests/test_bvh.py
@@ -13,16 +13,16 @@
 def lbvh():
     """Fixture for a LBVH tree"""
 
-    n_aabbs = 20
+    n_aabbs = 500
     n_batches = 10
     aabb = AABB(n_batches=n_batches, n_aabbs=n_aabbs)
-    min = np.random.rand(n_batches, n_aabbs, 3).astype(np.float32)
+    min = np.random.rand(n_batches, n_aabbs, 3).astype(np.float32) * 20.0
     max = min + np.random.rand(n_batches, n_aabbs, 3).astype(np.float32)
 
     aabb.aabbs.min.from_numpy(min)
     aabb.aabbs.max.from_numpy(max)
 
-    lbvh = LBVH(aabb)
+    lbvh = LBVH(aabb, max_n_query_result_per_aabb=32)
     lbvh.build()
     return lbvh
 
@@ -70,6 +70,7 @@ def test_expand_bits():
         ), f"Expected {str_expanded_x}, got {''.join(f'00{bit}' for bit in str_x)}"
 
 
+@pytest.mark.parametrize("backend", [gs.cpu, gs.gpu])
 def test_build_tree(lbvh):
     nodes = lbvh.nodes.to_numpy()
     n_aabbs = lbvh.n_aabbs
@@ -116,13 +117,18 @@ def test_build_tree(lbvh):
                 assert_allclose(parent_max, parent_max_expected, atol=1e-6, rtol=1e-5)
 
 
+@pytest.mark.parametrize("backend", [gs.cpu, gs.gpu])
 def test_query(lbvh):
     aabbs = lbvh.aabbs
 
     # Query the tree
     lbvh.query(aabbs)
 
     query_result_count = lbvh.query_result_count.to_numpy()
+    if query_result_count > lbvh.max_n_query_results:
+        raise ValueError(
+            f"Query result count {query_result_count} exceeds max_n_query_results {lbvh.max_n_query_results}"
+        )
     query_result = lbvh.query_result.to_numpy()
 
     n_aabbs = lbvh.n_aabbs