Genesis-Embodied-AI
diff --git a/‎.github/CONTRIBUTING.md‎
Lines changed: 1 addition & 1 deletion b/‎.github/CONTRIBUTING.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/linux-gpu.yml‎
Lines changed: 10 additions & 4 deletions b/‎.github/workflows/linux-gpu.yml‎
Lines changed: 10 additions & 4 deletions
diff --git a/‎examples/drone/hover_env.py‎
Lines changed: 1 addition & 1 deletion b/‎examples/drone/hover_env.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎genesis/__init__.py‎
Lines changed: 11 additions & 1 deletion b/‎genesis/__init__.py‎
Lines changed: 11 additions & 1 deletion
diff --git a/‎genesis/engine/bvh.py‎
Lines changed: 73 additions & 44 deletions b/‎genesis/engine/bvh.py‎
Lines changed: 73 additions & 44 deletions
diff --git a/‎genesis/engine/coupler.py‎
Lines changed: 3 additions & 4 deletions b/‎genesis/engine/coupler.py‎
Lines changed: 3 additions & 4 deletions
diff --git a/‎genesis/engine/entities/hybrid_entity.py‎
Lines changed: 2 additions & 2 deletions b/‎genesis/engine/entities/hybrid_entity.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎genesis/engine/entities/rigid_entity/rigid_entity.py‎
Lines changed: 1 addition & 1 deletion b/‎genesis/engine/entities/rigid_entity/rigid_entity.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎genesis/engine/materials/FEM/elastic.py‎
Lines changed: 2 additions & 1 deletion b/‎genesis/engine/materials/FEM/elastic.py‎
Lines changed: 2 additions & 1 deletion
@@ -61,7 +61,7 @@ Thank you for your interest in contributing to Genesis! We welcome contributions
 - (Optional) You can run CI tests locally to ensure you pass the online CI checks.
 
   ```bash
-  python -m unittest discover tests
+  pytest -v --forked -m required ./tests
   ```
 
 - In the title of your Pull Request, please include [BUG FIX], [FEATURE] or [MISC] to indicate the purpose.
 
@@ -35,6 +35,12 @@ jobs:
 
           mkdir -p "${HOME}/.cache"
 
+          # Prefer an idle node of the target partition, if any.
+          # Exactly one node name is selected: NODELIST is expanded unquoted in the srun
+          # command below, so it must stay a single word (multi-line sinfo output would be
+          # word-split and the extra node names would be parsed as the command to run).
+          NODELIST=""
+          IDLE_NODE=$(sinfo -h -N -p hpc-mid -o "%N %t" | awk '$2 == "idle" && !found {print $1; found = 1}')
+          if [[ -n "$IDLE_NODE" ]]; then
+            NODELIST="--nodelist=$IDLE_NODE"
+          fi
+
           srun \
             --container-image="/mnt/data/images/genesis-v${GENESIS_IMAGE_VER}.sqsh" \
             --container-mounts=\
@@ -44,7 +50,7 @@ jobs:
             --export=\
           HF_TOKEN="${HF_TOKEN}",\
           NVIDIA_DRIVER_CAPABILITIES=all \
-            --partition=hpc-mid --nodes=1 --gpus=1 --time="${TIMEOUT_MINUTES}" \
+            --partition=hpc-mid ${NODELIST} --nodes=1 --time="${TIMEOUT_MINUTES}" \
             --job-name=${SLURM_JOB_NAME} \
             bash -c "
               pip install -e '.[dev,render]' && \
@@ -69,16 +75,16 @@ jobs:
           "${{ github.workspace }}":/root/workspace \
             --no-container-mount-home --container-workdir=/root/workspace \
             --export=${SLURM_ENV_VARS} \
-            --partition=hpc-mid --exclusive --nodes=1 --gpus=1 --time="${TIMEOUT_MINUTES}" \
+            --partition=hpc-mid --exclusive --nodes=1 --time="${TIMEOUT_MINUTES}" \
             --job-name=${SLURM_JOB_NAME} \
             bash -c "
               : # sudo apt install -y tmate && \
               tmate -S /tmp/tmate.sock new-session -d && \
               tmate -S /tmp/tmate.sock wait tmate-ready && \
               tmate -S /tmp/tmate.sock display -p '#{tmate_ssh}'
               pip install -e '.[dev,render]' && \
-              pytest --print -x -m 'benchmarks' --backend gpu ./tests && \
-              cp 'speed_test.txt' '/mnt/data/artifacts/speed_test_${SLURM_JOB_NAME}.txt'
+              pytest --print -x -m 'benchmarks' ./tests && \
+              cat speed_test*.txt > '/mnt/data/artifacts/speed_test_${SLURM_JOB_NAME}.txt'
               : # tmate -S /tmp/tmate.sock wait tmate-exit
             "
 
 
@@ -134,7 +134,7 @@ def step(self, actions):
         self.drone.set_propellels_rpm((1 + exec_actions * 0.8) * 14468.429183500699)
         # update target pos
         if self.target is not None:
-            self.target.set_pos(self.commands, zero_velocity=True, envs_idx=list(range(self.num_envs)))
+            self.target.set_pos(self.commands, zero_velocity=True)
         self.scene.step()
 
         # update buffers
 
@@ -50,6 +50,7 @@ def init(
     backend=None,
     theme="dark",
     logger_verbose_time=False,
+    performance_mode: bool = False,  # True: compilation up to 6x slower (GJK), but runs ~1-5% faster
 ):
     # Consider Genesis as initialized right away
     global _initialized
@@ -172,6 +173,12 @@ def init(
         torch.backends.cudnn.benchmark = False
         logger.info("Beware running Genesis in debug mode dramatically reduces runtime speed.")
 
+    if not performance_mode:
+        logger.info(
+            "Consider setting 'performance_mode=True' in production to maximise runtime speed, if significantly "
+            "increasing compilation time is not a concern."
+        )
+
     if seed is not None:
         global SEED
         SEED = seed
@@ -197,6 +204,7 @@ def init(
             force_scalarize_matrix=True,
             # Turning off 'advanced_optimization' is causing issues on MacOS
             advanced_optimization=True,
+            cfg_optimization=performance_mode,
             fast_math=not debug,
             default_ip=ti_int,
             default_fp=ti_float,
@@ -291,6 +299,7 @@ def _display_greeting(INFO_length):
     wave_width = max(0, min(38, wave_width))
     bar_width = wave_width * 2 + 9
     wave = ("┈┉" * wave_width)[:wave_width]
+    global logger
     logger.info(f"~<╭{'─'*(bar_width)}╮>~")
     logger.info(f"~<│{wave}>~ ~~~~<Genesis>~~~~ ~<{wave}│>~")
     logger.info(f"~<╰{'─'*(bar_width)}╯>~")
@@ -314,9 +323,10 @@ def _custom_excepthook(exctype, value, tb):
     print("".join(traceback.format_exception(exctype, value, tb)))
 
     # Log the exception right before exit if possible
+    global logger
     try:
         logger.error(f"{exctype.__name__}: {value}")
-    except AttributeError:
+    except (AttributeError, NameError):
         # Logger may not be configured at this point
         pass
 
 
@@ -1,6 +1,7 @@
 import genesis as gs
 import taichi as ti
 from genesis.repr_base import RBC
+import numpy as np
 
 
 @ti.data_oriented
@@ -157,19 +158,27 @@ class Node:
         # Nodes of the BVH, first n_aabbs - 1 are internal nodes, last n_aabbs are leaf nodes
         self.nodes = self.Node.field(shape=(self.n_batches, self.n_aabbs * 2 - 1))
         # Flags for the layer-by-layer bounds update: 'active' = process this layer, 'ready' = process next layer
-        self.internal_node_visited = ti.field(ti.u8, shape=(self.n_batches, self.n_aabbs - 1))
+        self.internal_node_active = ti.field(ti.u1, shape=(self.n_batches, self.n_aabbs - 1))
+        self.internal_node_ready = ti.field(ti.u1, shape=(self.n_batches, self.n_aabbs - 1))
+        self.updated = ti.field(ti.u1, shape=())
 
         # Query results, vec3 of batch id, self id, query id
         self.query_result = ti.field(gs.ti_ivec3, shape=(self.max_n_query_results))
         # Count of query results
         self.query_result_count = ti.field(ti.i32, shape=())
 
-    @ti.kernel
     def build(self):
         """
         Build the BVH from the axis-aligned bounding boxes (AABBs).
+
+        Runs the build stages one after another from Python: AABB centers and scales, Morton codes, radix sort of
+        the Morton codes, radix tree construction, and finally the bounds of the BVH nodes.
         """
+        self.compute_aabb_centers_and_scales()
+        self.compute_morton_codes()
+        self.radix_sort_morton_codes()
+        self.build_radix_tree()
+        self.compute_bounds()
 
+    @ti.kernel
+    def compute_aabb_centers_and_scales(self):
         for i_b, i_a in ti.ndrange(self.n_batches, self.n_aabbs):
             self.aabb_centers[i_b, i_a] = (self.aabbs[i_b, i_a].min + self.aabbs[i_b, i_a].max) / 2
 
@@ -184,14 +193,9 @@ def build(self):
         for i_b in ti.ndrange(self.n_batches):
             scale = self.aabb_max[i_b] - self.aabb_min[i_b]
             for i in ti.static(range(3)):
-                self.scale[i_b][i] = ti.select(scale[i] > 1e-7, 1.0 / scale[i], 1)
+                self.scale[i_b][i] = ti.select(scale[i] > gs.EPS, 1.0 / scale[i], 1.0)
 
-        self.compute_morton_codes()
-        self.radix_sort_morton_codes()
-        self.build_radix_tree()
-        self.compute_bounds()
-
-    @ti.func
+    @ti.kernel
     def compute_morton_codes(self):
         """
         Compute the Morton codes for each AABB.
@@ -223,38 +227,43 @@ def expand_bits(self, v):
         v = (v * ti.u32(0x00000005)) & ti.u32(0x49249249)
         return v
 
-    @ti.func
     def radix_sort_morton_codes(self):
         """
         Radix sort the morton codes, using 8 bits at a time.
         """
-        for i in ti.static(range(8)):
-            # Clear histogram
-            for i_b, j in ti.ndrange(self.n_batches, 256):
-                self.hist[i_b, j] = 0
+        for i in range(8):
+            self._kernel_radix_sort_morton_codes_one_round(i)
 
-            # Fill histogram
-            for i_b, i_a in ti.ndrange(self.n_batches, self.n_aabbs):
+    @ti.kernel
+    def _kernel_radix_sort_morton_codes_one_round(self, i: int):
+        # Clear histogram
+        self.hist.fill(0)
+
+        # Fill histogram
+        for i_b in range(self.n_batches):
+            # The AABBs of a batch are processed sequentially so that the sort remains stable.
+            # TODO: Parallelize over AABBs. This requires processing the data in groups to keep the sort stable,
+            # so it may not be worth it.
+            for i_a in range(self.n_aabbs):
                 code = (self.morton_codes[i_b, i_a] >> (i * 8)) & 0xFF
                 self.offset[i_b, i_a] = ti.atomic_add(self.hist[i_b, ti.i32(code)], 1)
 
-            # Compute prefix sum
-            for i_b in ti.ndrange(self.n_batches):
-                self.prefix_sum[i_b, 0] = 0
-                for j in range(1, 256):  # sequential prefix sum
-                    self.prefix_sum[i_b, j] = self.prefix_sum[i_b, j - 1] + self.hist[i_b, j - 1]
+        # Compute prefix sum
+        for i_b in ti.ndrange(self.n_batches):
+            self.prefix_sum[i_b, 0] = 0
+            for j in range(1, 256):  # sequential prefix sum
+                self.prefix_sum[i_b, j] = self.prefix_sum[i_b, j - 1] + self.hist[i_b, j - 1]
 
-            # Reorder morton codes
-            for i_b, i_a in ti.ndrange(self.n_batches, self.n_aabbs):
-                code = (self.morton_codes[i_b, i_a] >> (i * 8)) & 0xFF
-                idx = ti.i32(self.offset[i_b, i_a] + self.prefix_sum[i_b, ti.i32(code)])
-                self.tmp_morton_codes[i_b, idx] = self.morton_codes[i_b, i_a]
+        # Reorder morton codes
+        for i_b, i_a in ti.ndrange(self.n_batches, self.n_aabbs):
+            code = (self.morton_codes[i_b, i_a] >> (i * 8)) & 0xFF
+            idx = ti.i32(self.offset[i_b, i_a] + self.prefix_sum[i_b, ti.i32(code)])
+            self.tmp_morton_codes[i_b, idx] = self.morton_codes[i_b, i_a]
 
-            # Swap the temporary and original morton codes
-            for i_b, i_a in ti.ndrange(self.n_batches, self.n_aabbs):
-                self.morton_codes[i_b, i_a] = self.tmp_morton_codes[i_b, i_a]
+        # Swap the temporary and original morton codes
+        for i_b, i_a in ti.ndrange(self.n_batches, self.n_aabbs):
+            self.morton_codes[i_b, i_a] = self.tmp_morton_codes[i_b, i_a]
 
-    @ti.func
+    @ti.kernel
     def build_radix_tree(self):
         """
         Build the radix tree from the sorted morton codes.
@@ -321,31 +330,51 @@ def delta(self, i, j, i_b):
                     break
         return result
 
-    @ti.func
     def compute_bounds(self):
         """
         Compute the bounds of the BVH nodes.
 
-        Starts from the leaf nodes and works upwards.
+        Starts from the leaf nodes and works upwards layer by layer.
         """
-        for i_b, i in ti.ndrange(self.n_batches, self.n_aabbs - 1):
-            self.internal_node_visited[i_b, i] = ti.u8(0)
+        self._kernel_compute_bounds_init()
+        while self.updated[None]:
+            self._kernel_compute_bounds_one_layer()
+
+    @ti.kernel
+    def _kernel_compute_bounds_init(self):
+        """
+        Initialize the layer-by-layer computation of the BVH node bounds.
+
+        Sets the `updated` flag, clears the `internal_node_active` and `internal_node_ready` flags, copies each AABB
+        into its leaf node, and marks the parent of every leaf node as active.
+        """
+        # Set to True so that the `while self.updated[None]` loop in `compute_bounds` runs at least one pass
+        self.updated[None] = True
+        self.internal_node_active.fill(0)
+        self.internal_node_ready.fill(0)
 
         for i_b, i in ti.ndrange(self.n_batches, self.n_aabbs):
             # NOTE(review): presumably the lower 32 bits of the sorted morton code hold the AABB index - confirm
             idx = ti.i32(self.morton_codes[i_b, i])
             # Leaf nodes are stored after the n_aabbs - 1 internal nodes
             self.nodes[i_b, i + self.n_aabbs - 1].bound.min = self.aabbs[i_b, idx].min
             self.nodes[i_b, i + self.n_aabbs - 1].bound.max = self.aabbs[i_b, idx].max
+            parent_idx = self.nodes[i_b, i + self.n_aabbs - 1].parent
+            # A parent index of -1 means that the node has no parent to activate
+            if parent_idx != -1:
+                self.internal_node_active[i_b, parent_idx] = 1
 
-            cur_idx = self.nodes[i_b, i + self.n_aabbs - 1].parent
-            while cur_idx != -1:
-                visited = ti.u1(ti.atomic_or(self.internal_node_visited[i_b, cur_idx], ti.u8(1)))
-                if not visited:
-                    break
-                left_bound = self.nodes[i_b, self.nodes[i_b, cur_idx].left].bound
-                right_bound = self.nodes[i_b, self.nodes[i_b, cur_idx].right].bound
-                self.nodes[i_b, cur_idx].bound.min = ti.min(left_bound.min, right_bound.min)
-                self.nodes[i_b, cur_idx].bound.max = ti.max(left_bound.max, right_bound.max)
-                cur_idx = self.nodes[i_b, cur_idx].parent
+    @ti.kernel
+    def _kernel_compute_bounds_one_layer(self):
+        """
+        Run one pass of the bounds computation over the internal nodes.
+
+        Every active internal node gets the union of the bounds of its left and right children, flags its parent as
+        ready, and is deactivated. Ready nodes are then turned into active nodes for the next pass. The `updated`
+        flag ends up True if at least one node was processed during this pass, False otherwise.
+        """
+        # Reset the flag; it is set again below as soon as one active node is processed
+        self.updated[None] = False
+        for i_b, i in ti.ndrange(self.n_batches, self.n_aabbs - 1):
+            if self.internal_node_active[i_b, i] == 0:
+                continue
+            # NOTE(review): a node becomes active as soon as one of its children has been processed, so the other
+            # child's bound may not be final yet. The node is flagged ready again once that child is processed,
+            # so it looks like it gets recomputed in a later pass - confirm.
+            left_bound = self.nodes[i_b, self.nodes[i_b, i].left].bound
+            right_bound = self.nodes[i_b, self.nodes[i_b, i].right].bound
+            self.nodes[i_b, i].bound.min = ti.min(left_bound.min, right_bound.min)
+            self.nodes[i_b, i].bound.max = ti.max(left_bound.max, right_bound.max)
+            parent_idx = self.nodes[i_b, i].parent
+            # Mark the parent as ready rather than active, so that it is only picked up by the next pass
+            if parent_idx != -1:
+                self.internal_node_ready[i_b, parent_idx] = 1
+            self.internal_node_active[i_b, i] = 0
+            self.updated[None] = True
+
+        # Promote the nodes flagged as ready during this pass to active for the next pass
+        for i_b, i in ti.ndrange(self.n_batches, self.n_aabbs - 1):
+            if self.internal_node_ready[i_b, i] == 0:
+                continue
+            self.internal_node_active[i_b, i] = 1
+            self.internal_node_ready[i_b, i] = 0
 
     @ti.kernel
     def query(self, aabbs: ti.template()):
 
@@ -649,6 +649,7 @@ def __init__(
         self._n_linesearch_iterations = options.n_linesearch_iterations
         self._linesearch_c = options.linesearch_c
         self._linesearch_tau = options.linesearch_tau
+        self.default_deformable_g = 1.0e8  # default deformable geometry size
 
     def build(self) -> None:
         self._B = self.sim._B
@@ -698,6 +699,7 @@ def init_fem_fields(self):
         self.max_fem_floor_contact_pairs = fem_solver.n_surfaces * fem_solver._B
         self.n_fem_floor_contact_pairs = ti.field(gs.ti_int, shape=())
         self.fem_floor_contact_pairs = self.fem_floor_contact_pair_type.field(shape=(self.max_fem_floor_contact_pairs,))
+
         # Lookup table for marching tetrahedra edges
         kMarchingTetsEdgeTable_np = np.array(
             [
@@ -934,15 +936,12 @@ def fem_floor_detection(self, f: ti.i32):
             )
             self.fem_floor_contact_pairs[i_c].barycentric = barycentric
 
-            C = ti.static(1.0e8)
-            deformable_g = C
             rigid_g = self.fem_pressure_gradient[i_b, i_e].z
             # TODO A better way to handle corner cases where pressure and pressure gradient are ill defined
             if total_area < gs.EPS or rigid_g < gs.EPS:
                 self.fem_floor_contact_pairs[i_c].active = 0
                 continue
-            g = 1.0 / (1.0 / deformable_g + 1.0 / rigid_g)  # harmonic average
-            deformable_k = total_area * C
+            g = self.default_deformable_g * rigid_g / (self.default_deformable_g + rigid_g)  # harmonic average
             rigid_k = total_area * g
             rigid_phi0 = -pressure / g
             rigid_fn0 = total_area * pressure
 
@@ -431,8 +431,8 @@ def _kernel_update_soft_part_mpm(self, f: ti.i32):
                 acc = vel_d / dt_for_rigid_acc
                 frc_vel = mass_real * acc
                 frc_ang = (x_pos - link.COM).cross(frc_vel)
-                self._solver_rigid.links_state[link_idx, i_b].cfrc_ext_vel += frc_vel
-                self._solver_rigid.links_state[link_idx, i_b].cfrc_ext_ang += frc_ang
+                self._solver_rigid.links_state[link_idx, i_b].cfrc_applied_vel += frc_vel
+                self._solver_rigid.links_state[link_idx, i_b].cfrc_applied_ang += frc_ang
 
                 # rigid-to-soft coupling # NOTE: this may lead to unstable feedback loop
                 self._solver_soft.particles[f_, i_global, i_b].vel += vel_d * self.material.soft_dv_coef
 
@@ -2748,7 +2748,7 @@ def get_contacts(self, with_entity=None, exclude_self_contact=False):
         if self._solver.n_envs == 0:
             contacts_info = {key: value[valid_mask] for key, value in contacts_info.items()}
         else:
-            contacts_info = {key: value[:, valid_mask] for key, value in contacts_info.items()}
+            contacts_info["valid_mask"] = valid_mask
 
         contacts_info["force_a"] = -contacts_info["force"]
         contacts_info["force_b"] = +contacts_info["force"]
 
@@ -82,6 +82,7 @@ def build_linear_corotated(self, fem_solver):
 
     @ti.func
     def pre_compute_linear_corotated(self, J, F, i_e, i_b):
+        # Computing Polar Decomposition instead of calling `R, P = ti.polar_decompose(F)` since `P` is not needed here
         U, S, V = ti.svd(F)
         R = U @ V.transpose()
         self.R[i_b, i_e] = R
@@ -431,7 +432,7 @@ def compute_energy_gradient_hessian_linear_corotated(self, mu, lam, J, F, actu,
         for i, k in ti.static(ti.ndrange(3, 3)):
             hessian_field[i_b, i, i, i_e][k, k] = mu
 
-        for i, j, alpha, beta in ti.static(ti.ndrange(3, 3, 3, 3)):
+        for i, j, alpha, beta in ti.ndrange(3, 3, 3, 3):
             hessian_field[i_b, j, beta, i_e][i, alpha] += mu * R[i, beta] * R[alpha, j] + lam * R[alpha, beta] * R[i, j]
 
         return energy, gradient