diff --git a/genesis/engine/bvh.py b/genesis/engine/bvh.py
index bf84e107e4..79b5c273c3 100644
--- a/genesis/engine/bvh.py
+++ b/genesis/engine/bvh.py
@@ -235,12 +235,16 @@ def compute_morton_codes(self):
             self.morton_codes[i_b, i_a] = ti.Vector([morton_code, i_a], dt=ti.u32)
 
     @ti.func
-    def expand_bits(self, v):
+    def expand_bits(self, v: ti.u32) -> ti.u32:
         """
         Expands a 10-bit integer into 30 bits by inserting 2 zeros before each bit.
         """
         v = (v * ti.u32(0x00010001)) & ti.u32(0xFF0000FF)
-        v = (v * ti.u32(0x00000101)) & ti.u32(0x0F00F00F)
+        # Use shift-and-or here: the u32 multiply wraps around, which triggers Taichi's debug overflow warning.
+        # Same result as v = (v * ti.u32(0x00000101)) & ti.u32(0x0F00F00F); the top byte is masked before the shift.
+        # The performance difference is negligible.
+        # See https://github.com/Genesis-Embodied-AI/Genesis/pull/1560 for details
+        v = (v | ((v & 0x00FFFFFF) << 8)) & 0x0F00F00F
         v = (v * ti.u32(0x00000011)) & ti.u32(0xC30C30C3)
         v = (v * ti.u32(0x00000005)) & ti.u32(0x49249249)
         return v
@@ -351,21 +355,21 @@ def build_radix_tree(self):
 
             delta_min = self.delta(i, i - d, i_b)
             l_max = ti.u32(2)
-            while self.delta(i, i + l_max * d, i_b) > delta_min:
+            while self.delta(i, i + ti.i32(l_max) * d, i_b) > delta_min:
                 l_max *= 2
             l = ti.u32(0)
 
             t = l_max // 2
             while t > 0:
-                if self.delta(i, i + (l + t) * d, i_b) > delta_min:
+                if self.delta(i, i + ti.i32(l + t) * d, i_b) > delta_min:
                     l += t
                 t //= 2
-            j = i + l * d
+            j = i + ti.i32(l) * d
             delta_node = self.delta(i, j, i_b)
             s = ti.u32(0)
             t = (l + 1) // 2
             while t > 0:
-                if self.delta(i, i + (s + t) * d, i_b) > delta_node:
+                if self.delta(i, i + ti.i32(s + t) * d, i_b) > delta_node:
                     s += t
                 t = ti.select(t > 1, (t + 1) // 2, 0)
 
@@ -378,7 +382,7 @@ def build_radix_tree(self):
             self.nodes[i_b, ti.i32(right)].parent = i
 
     @ti.func
-    def delta(self, i, j, i_b):
+    def delta(self, i: ti.i32, j: ti.i32, i_b: ti.i32):
         """
         Compute the longest common prefix (LCP) of the morton codes of two AABBs.
         """
@@ -386,9 +390,9 @@ def delta(self, i, j, i_b):
         if j >= 0 and j < self.n_aabbs:
             result = 64
             for i_bit in range(2):
-                x = self.morton_codes[i_b, ti.i32(i)][i_bit] ^ self.morton_codes[i_b, ti.i32(j)][i_bit]
+                x = self.morton_codes[i_b, i][i_bit] ^ self.morton_codes[i_b, j][i_bit]
                 for b in range(32):
-                    if x & (1 << (31 - b)):
+                    if x & (ti.u32(1) << (31 - b)):
                         result = b + 32 * i_bit
                         break
                 if result != 64:
diff --git a/genesis/engine/couplers/sap_coupler.py b/genesis/engine/couplers/sap_coupler.py
index 2fb0d10c4b..a832b586be 100644
--- a/genesis/engine/couplers/sap_coupler.py
+++ b/genesis/engine/couplers/sap_coupler.py
@@ -655,7 +655,7 @@ def compute_inertia_elastic_gradient_alpha(self, i_step: ti.i32):
         for i_b, i_v in ti.ndrange(self._B, self.fem_solver.n_vertices):
             if not self.batch_linesearch_active[i_b]:
                 continue
-            self.linesearch_state.dell_dalpha[i_b] += dp[i_b, i_v].dot(v[i_b, i_v] - v_star[i_step + 1, i_b, i_v])
+            self.linesearch_state.dell_dalpha[i_b] += dp[i_b, i_v].dot(v[i_b, i_v] - v_star[i_step + 1, i_v, i_b])
 
     @ti.kernel
     def compute_inertia_elastic_hessian_alpha(self):
@@ -679,7 +679,7 @@ def compute_inertia_elastic_energy_alpha(self, i_step: ti.i32, energy: ti.templa
         for i_b, i_v in ti.ndrange(self._B, self.fem_solver.n_vertices):
             if not self.batch_linesearch_active[i_b]:
                 continue
-            energy[i_b] += alpha[i_b] * dp[i_b, i_v].dot(v[i_b, i_v] - v_star[i_step + 1, i_b, i_v])
+            energy[i_b] += alpha[i_b] * dp[i_b, i_v].dot(v[i_b, i_v] - v_star[i_step + 1, i_v, i_b])
 
     def prepare_search_direction_data(self):
         self.prepare_inertia_elastic_search_direction_data()