@@ -235,12 +235,16 @@ def compute_morton_codes(self):
235235 self .morton_codes [i_b , i_a ] = ti .Vector ([morton_code , i_a ], dt = ti .u32 )
236236
237237 @ti .func
238- def expand_bits (self , v ) :
238+ def expand_bits (self , v : ti . u32 ) -> ti . u32 :
239239 """
240240 Expands a 10-bit integer into 30 bits by inserting 2 zeros before each bit.
241241 """
242242 v = (v * ti .u32 (0x00010001 )) & ti .u32 (0xFF0000FF )
243- v = (v * ti .u32 (0x00000101 )) & ti .u32 (0x0F00F00F )
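243+ # Shift-and-OR form of v = (v * ti.u32(0x00000101)) & ti.u32(0x0F00F00F), which yields the same result.
244+ # The multiplication can overflow 32 bits, which makes Taichi emit an overflow warning in debug mode;
245+ # masking off the top byte before shifting left by 8 avoids that. The performance difference is negligible.
246+ # See https://github.com/Genesis-Embodied-AI/Genesis/pull/1560 for details.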
247+ v = (v | ((v & 0x00FFFFFF ) << 8 )) & 0x0F00F00F
244248 v = (v * ti .u32 (0x00000011 )) & ti .u32 (0xC30C30C3 )
245249 v = (v * ti .u32 (0x00000005 )) & ti .u32 (0x49249249 )
246250 return v
@@ -277,8 +281,8 @@ def _kernel_radix_sort_morton_codes_one_round(self, i: int):
277281
278282 # Reorder morton codes
279283 for i_b , i_a in ti .ndrange (self .n_batches , self .n_aabbs ):
280- code = ( self .morton_codes [i_b , i_a ][1 - (i // 4 )] >> ((i % 4 ) * 8 )) & 0xFF
281- idx = ti .i32 (self .offset [i_b , i_a ] + self .prefix_sum [i_b , ti . i32 ( code ) ])
284+ code = ti . i32 (( self .morton_codes [i_b , i_a ][1 - (i // 4 )] >> ((i % 4 ) * 8 )) & 0xFF )
285+ idx = ti .i32 (self .offset [i_b , i_a ] + self .prefix_sum [i_b , code ])
282286 self .tmp_morton_codes [i_b , idx ] = self .morton_codes [i_b , i_a ]
283287
284288 # Swap the temporary and original morton codes
@@ -351,21 +355,21 @@ def build_radix_tree(self):
351355
352356 delta_min = self .delta (i , i - d , i_b )
353357 l_max = ti .u32 (2 )
354- while self .delta (i , i + l_max * d , i_b ) > delta_min :
358+ while self .delta (i , i + ti . i32 ( l_max ) * d , i_b ) > delta_min :
355359 l_max *= 2
356360 l = ti .u32 (0 )
357361
358362 t = l_max // 2
359363 while t > 0 :
360- if self .delta (i , i + (l + t ) * d , i_b ) > delta_min :
364+ if self .delta (i , i + ti . i32 (l + t ) * d , i_b ) > delta_min :
361365 l += t
362366 t //= 2
363- j = i + l * d
367+ j = i + ti . i32 ( l ) * d
364368 delta_node = self .delta (i , j , i_b )
365369 s = ti .u32 (0 )
366370 t = (l + 1 ) // 2
367371 while t > 0 :
368- if self .delta (i , i + (s + t ) * d , i_b ) > delta_node :
372+ if self .delta (i , i + ti . i32 (s + t ) * d , i_b ) > delta_node :
369373 s += t
370374 t = ti .select (t > 1 , (t + 1 ) // 2 , 0 )
371375
@@ -378,17 +382,17 @@ def build_radix_tree(self):
378382 self .nodes [i_b , ti .i32 (right )].parent = i
379383
380384 @ti .func
381- def delta (self , i , j , i_b ):
385+ def delta (self , i : ti . i32 , j : ti . i32 , i_b : ti . i32 ):
382386 """
383387 Compute the longest common prefix (LCP) of the morton codes of two AABBs.
384388 """
385389 result = - 1
386390 if j >= 0 and j < self .n_aabbs :
387391 result = 64
388392 for i_bit in range (2 ):
389- x = self .morton_codes [i_b , ti . i32 ( i ) ][i_bit ] ^ self .morton_codes [i_b , ti . i32 ( j ) ][i_bit ]
393+ x = self .morton_codes [i_b , i ][i_bit ] ^ self .morton_codes [i_b , j ][i_bit ]
390394 for b in range (32 ):
391- if x & (1 << (31 - b )):
395+ if x & (ti . u32 ( 1 ) << (31 - b )):
392396 result = b + 32 * i_bit
393397 break
394398 if result != 64 :
@@ -441,14 +445,15 @@ def _kernel_compute_bounds_one_layer(self) -> ti.i32:
441445
442446 return is_done
443447
444- @ti .kernel
448+ @ti .func
445449 def query (self , aabbs : ti .template ()):
446450 """
447451 Query the BVH for intersections with the given AABBs.
448452
449453 The results are stored in the query_result field.
450454 """
451455 self .query_result_count [None ] = 0
456+ overflow = False
452457
453458 n_querys = aabbs .shape [1 ]
454459 for i_b , i_q in ti .ndrange (self .n_batches , n_querys ):
@@ -470,6 +475,8 @@ def query(self, aabbs: ti.template()):
470475 idx = ti .atomic_add (self .query_result_count [None ], 1 )
471476 if idx < self .max_n_query_results :
472477 self .query_result [idx ] = gs .ti_ivec3 (i_b , i_a , i_q ) # Store the AABB index
478+ else :
479+ overflow = True
473480 else :
474481 # Push children onto the stack
475482 if node .right != - 1 :
@@ -479,6 +486,8 @@ def query(self, aabbs: ti.template()):
479486 query_stack [stack_depth ] = node .left
480487 stack_depth += 1
481488
489+ return overflow
490+
482491
483492@ti .data_oriented
484493class FEMSurfaceTetLBVH (LBVH ):
@@ -499,10 +508,13 @@ def filter(self, i_a, i_q):
499508
500509 This is used to avoid self-collisions in FEM surface tets.
501510
502- i_a: index of the found AABB
503- i_q: index of the query AABB
511+ Parameters
512+ ----------
513+ i_a:
514+ index of the found AABB
515+ i_q:
516+ index of the query AABB
504517 """
505-
506518 result = i_a >= i_q
507519 i_av = self .fem_solver .elements_i [self .fem_solver .surface_elements [i_a ]].el2v
508520 i_qv = self .fem_solver .elements_i [self .fem_solver .surface_elements [i_q ]].el2v
0 commit comments