Skip to content

Commit 2b7b9bc

Browse files
committed
Leverage GsTaichi zero-copy in data accessors.
1 parent e7558e4 commit 2b7b9bc

File tree

8 files changed

+148
-79
lines changed

8 files changed

+148
-79
lines changed

genesis/__init__.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
backend: gs_backend | None = None
4242
use_ndarray: bool | None = None
4343
use_fastcache: bool | None = None
44+
use_zerocopy: bool | None = None
4445
EPS: float | None = None
4546

4647

@@ -117,7 +118,7 @@ def init(
117118
backend = gs_backend.cpu
118119

119120
# Configure GsTaichi fast cache and array type
120-
global use_ndarray, use_fastcache
121+
global use_ndarray, use_fastcache, use_zerocopy
121122
is_ndarray_disabled = (os.environ.get("GS_ENABLE_NDARRAY") or ("0" if sys.platform == "darwin" else "1")) == "0"
122123
if use_ndarray is None:
123124
_use_ndarray = not (is_ndarray_disabled or performance_mode)
@@ -136,6 +137,15 @@ def init(
136137
raise_exception("Genesis previously initialized. GsTaichi fast cache mode cannot be disabled anymore.")
137138
use_ndarray, use_fastcache = _use_ndarray, _use_fastcache
138139

140+
# Unlike dynamic vs static array mode, and fastcache, zero-copy can be toggled on/off between inits without issue
141+
_use_zerocopy = int(os.environ["GS_ENABLE_ZEROCOPY"]) if "GS_ENABLE_ZEROCOPY" in os.environ else None
142+
if use_ndarray and backend in (gs_backend.cpu, gs_backend.cuda):
143+
if _use_zerocopy is None:
144+
_use_zerocopy = True
145+
elif _use_zerocopy:
146+
raise_exception("Zero-copy is only supported by GsTaichi dynamic array mode on the CPU and CUDA backends.")
147+
use_zerocopy = _use_zerocopy
148+
139149
# Define the right dtypes in accordance with selected backend and precision
140150
global ti_float, np_float, tc_float
141151
if precision == "32":

genesis/engine/entities/rigid_entity/rigid_entity.py

Lines changed: 34 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -560,6 +560,7 @@ def _build(self):
560560

561561
self._n_qs = self.n_qs
562562
self._n_dofs = self.n_dofs
563+
self._n_geoms = self.n_geoms
563564
self._is_built = True
564565

565566
verts_start = 0
@@ -575,6 +576,8 @@ def _build(self):
575576
self._free_verts_idx_local = torch.cat(free_verts_idx_local)
576577
if fixed_verts_idx_local:
577578
self._fixed_verts_idx_local = torch.cat(fixed_verts_idx_local)
579+
self._n_free_verts = len(self._free_verts_idx_local)
580+
self._n_fixed_verts = len(self._fixed_verts_idx_local)
578581

579582
self._geoms = self.geoms
580583
self._vgeoms = self.vgeoms
@@ -2015,23 +2018,36 @@ def get_verts(self):
20152018
verts : torch.Tensor, shape (n_envs, n_verts, 3)
20162019
The vertices of the entity.
20172020
"""
2018-
self._solver.update_verts_for_geoms(range(self.geom_start, self.geom_end))
2021+
self._solver.update_verts_for_geoms(slice(self.geom_start, self.geom_end))
20192022

2020-
tensor = torch.empty((self._solver._B, self.n_verts, 3), dtype=gs.tc_float, device=gs.device)
2021-
has_fixed_verts, has_free_vertices = len(self._fixed_verts_idx_local) > 0, len(self._free_verts_idx_local) > 0
2022-
if has_fixed_verts:
2023-
_kernel_get_fixed_verts(
2024-
tensor, self._fixed_verts_idx_local, self._fixed_verts_state_start, self._solver.fixed_verts_state
2025-
)
2026-
if has_free_vertices:
2027-
# FIXME: Get around some bug in gstaichi when using gstaichi with metal backend
2028-
must_copy = gs.backend == gs.metal and has_fixed_verts
2029-
tensor_free = torch.zeros_like(tensor) if must_copy else tensor
2030-
_kernel_get_free_verts(
2031-
tensor_free, self._free_verts_idx_local, self._free_verts_state_start, self._solver.free_verts_state
2032-
)
2033-
if must_copy:
2034-
tensor += tensor_free
2023+
n_fixed_verts, n_free_vertices = self._n_fixed_verts, self._n_free_verts
2024+
tensor = torch.empty((self._solver._B, n_fixed_verts + n_free_vertices, 3), dtype=gs.tc_float, device=gs.device)
2025+
2026+
if n_fixed_verts > 0:
2027+
if gs.use_zerocopy:
2028+
fixed_verts_state = ti_to_torch(self._solver.fixed_verts_state.pos)
2029+
tensor[:, self._fixed_verts_idx_local] = fixed_verts_state[
2030+
self._fixed_verts_state_start : self._fixed_verts_state_start + n_fixed_verts
2031+
]
2032+
else:
2033+
_kernel_get_fixed_verts(
2034+
tensor, self._fixed_verts_idx_local, self._fixed_verts_state_start, self._solver.fixed_verts_state
2035+
)
2036+
if n_free_vertices > 0:
2037+
if gs.use_zerocopy:
2038+
free_verts_state = ti_to_torch(self._solver.free_verts_state.pos, transpose=True)
2039+
tensor[:, self._free_verts_idx_local] = free_verts_state[
2040+
:, self._free_verts_state_start : self._free_verts_state_start + n_free_vertices
2041+
]
2042+
else:
2043+
# FIXME: Get around some bug in gstaichi when using gstaichi with metal backend
2044+
must_copy = gs.backend == gs.metal and n_fixed_verts > 0
2045+
tensor_free = torch.zeros_like(tensor) if must_copy else tensor
2046+
_kernel_get_free_verts(
2047+
tensor_free, self._free_verts_idx_local, self._free_verts_state_start, self._solver.free_verts_state
2048+
)
2049+
if must_copy:
2050+
tensor += tensor_free
20352051

20362052
if self._solver.n_envs == 0:
20372053
tensor = tensor.squeeze(0)
@@ -2840,6 +2856,8 @@ def n_dofs(self):
28402856
@property
28412857
def n_geoms(self):
28422858
"""The number of `RigidGeom` in the entity."""
2859+
if self._is_built:
2860+
return self._n_geoms
28432861
return sum(link.n_geoms for link in self._links)
28442862

28452863
@property

genesis/engine/entities/rigid_entity/rigid_link.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,7 @@ def get_verts(self):
305305
"""
306306
Get the vertices of the link's collision body (concatenation of all `link.geoms`) in the world frame.
307307
"""
308-
self._solver.update_verts_for_geoms(range(self.geom_start, self.geom_end))
308+
self._solver.update_verts_for_geoms(slice(self.geom_start, self.geom_end))
309309

310310
if self.is_fixed and not self._entity._batch_fixed_verts:
311311
tensor = torch.empty((self.n_verts, 3), dtype=gs.tc_float, device=gs.device)

genesis/engine/simulator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
from .solvers.base_solver import Solver
4545

4646

47-
RATE_CHECK_ERRNO = 10
47+
RATE_CHECK_ERRNO = 10 if not gs.use_zerocopy else 1
4848

4949

5050
@ti.data_oriented

genesis/engine/solvers/rigid/rigid_solver_decomp.py

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -894,7 +894,7 @@ def substep(self):
894894
)
895895

896896
def check_errno(self):
897-
match kernel_get_errno(self._errno):
897+
match ti_to_torch(self._errno):
898898
case 1:
899899
max_collision_pairs_broad = self.collider._collider_info.max_collision_pairs_broad[None]
900900
gs.raise_exception(
@@ -1362,8 +1362,10 @@ def _sanitize_1D_io_variables(
13621362
_inputs_idx = torch.as_tensor(inputs_idx, dtype=gs.tc_int, device=gs.device).contiguous()
13631363
if _inputs_idx is not inputs_idx:
13641364
gs.logger.debug(ALLOCATE_TENSOR_WARNING)
1365-
_inputs_idx = torch.atleast_1d(_inputs_idx)
1366-
if _inputs_idx.ndim != 1:
1365+
_inputs_ndim = _inputs_idx.ndim
1366+
if _inputs_ndim == 0:
1367+
_inputs_idx = _inputs_idx[None]
1368+
elif _inputs_ndim > 1:
13671369
gs.raise_exception(f"Expecting 1D tensor for `{idx_name}`.")
13681370
if not ((0 <= _inputs_idx).all() or (_inputs_idx < input_size).all()):
13691371
gs.raise_exception(f"`{idx_name}` is out-of-range.")
@@ -1372,19 +1374,23 @@ def _sanitize_1D_io_variables(
13721374
_tensor = torch.as_tensor(tensor, dtype=gs.tc_float, device=gs.device).contiguous()
13731375
if _tensor is not tensor:
13741376
gs.logger.debug(ALLOCATE_TENSOR_WARNING)
1375-
tensor = _tensor.unsqueeze(0) if batched and self.n_envs and _tensor.ndim == 1 else _tensor
1376-
1377+
tensor_ndim = _tensor.ndim
1378+
if batched and self.n_envs and tensor_ndim == 1:
1379+
tensor = _tensor.unsqueeze(0)
1380+
tensor_ndim += 1
1381+
else:
1382+
tensor = _tensor
13771383
if tensor.shape[-1] != len(inputs_idx):
13781384
gs.raise_exception(f"Last dimension of the input tensor does not match length of `{idx_name}`.")
13791385

13801386
if batched:
13811387
if self.n_envs == 0:
1382-
if tensor.ndim != 1:
1388+
if tensor_ndim != 1:
13831389
gs.raise_exception(
13841390
f"Invalid input shape: {tensor.shape}. Expecting a 1D tensor for non-parallelized scene."
13851391
)
13861392
else:
1387-
if tensor.ndim == 2:
1393+
if tensor_ndim == 2:
13881394
if tensor.shape[0] != len(envs_idx):
13891395
gs.raise_exception(
13901396
f"Invalid input shape: {tensor.shape}. First dimension of the input tensor does not match "
@@ -1395,7 +1401,7 @@ def _sanitize_1D_io_variables(
13951401
f"Invalid input shape: {tensor.shape}. Expecting a 2D tensor for scene with parallelized envs."
13961402
)
13971403
else:
1398-
if tensor.ndim != 1:
1404+
if tensor_ndim != 1:
13991405
gs.raise_exception("Expecting 1D output tensor.")
14001406
return tensor, _inputs_idx, envs_idx
14011407

@@ -2382,6 +2388,11 @@ def set_drone_rpm(self, n_propellers, propellers_link_idxs, propellers_rpm, prop
23822388
)
23832389

23842390
def update_verts_for_geoms(self, geoms_idx):
2391+
if gs.use_zerocopy:
2392+
verts_updated = ti_to_torch(self.geoms_state.verts_updated, transpose=False)
2393+
if verts_updated[geoms_idx].all():
2394+
return
2395+
23852396
_, geoms_idx, _ = self._sanitize_1D_io_variables(
23862397
None, geoms_idx, self.n_geoms, None, idx_name="geoms_idx", skip_allocation=True, unsafe=False
23872398
)
@@ -6957,8 +6968,3 @@ def kernel_set_geoms_friction(
69576968
ti.loop_config(serialize=ti.static(static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL))
69586969
for i_g_ in ti.ndrange(geoms_idx.shape[0]):
69596970
geoms_info.friction[geoms_idx[i_g_]] = friction[i_g_]
6960-
6961-
6962-
@ti.kernel(fastcache=gs.use_fastcache)
6963-
def kernel_get_errno(errno: array_class.V_ANNOTATION) -> ti.i32:
6964-
return errno[None]

genesis/utils/array_class.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
import math
22
import dataclasses
3-
from functools import partial
3+
from functools import partial, wraps
44

55
import gstaichi as ti
66
import numpy as np
7+
import torch
78

89
import genesis as gs
910

@@ -12,12 +13,20 @@
1213
gs.raise_exception("Genesis hasn't been initialized. Did you call `gs.init()`?")
1314

1415

15-
V_ANNOTATION = ti.types.ndarray() if gs.use_ndarray else ti.template
16-
V = ti.ndarray if gs.use_ndarray else ti.field
17-
V_VEC = ti.Vector.ndarray if gs.use_ndarray else ti.Vector.field
18-
V_MAT = ti.Matrix.ndarray if gs.use_ndarray else ti.Matrix.field
16+
def build_tensor_type(tensor_type):
17+
@wraps(tensor_type)
18+
def _tensor_type_wrapper(*args, **kwargs):
19+
tensor = tensor_type(*args, **kwargs)
20+
try:
21+
# dlpack does not hold alive the original memory, so no need to track lifetime in tensor deleter
22+
tensor._tc = torch.utils.dlpack.from_dlpack(tensor.to_dlpack())
23+
except RuntimeError as e:
24+
raise RuntimeError(f"Zero-copy is not supported for backend '{gs.backend}'.") from e
25+
return tensor
1926

20-
DATA_ORIENTED = partial(dataclasses.dataclass, frozen=True) if gs.use_ndarray else ti.data_oriented
27+
if gs.use_zerocopy:
28+
return _tensor_type_wrapper
29+
return tensor_type
2130

2231

2332
def maybe_shape(shape, is_on):
@@ -59,6 +68,11 @@ def __init__(self, *args, **kwargs):
5968
return super().__new__(cls, name, bases, namespace)
6069

6170

71+
V_ANNOTATION = ti.types.ndarray() if gs.use_ndarray else ti.template
72+
V = build_tensor_type(ti.ndarray if gs.use_ndarray else ti.field)
73+
V_VEC = build_tensor_type(ti.Vector.ndarray if gs.use_ndarray else ti.Vector.field)
74+
V_MAT = build_tensor_type(ti.Matrix.ndarray if gs.use_ndarray else ti.Matrix.field)
75+
DATA_ORIENTED = partial(dataclasses.dataclass, frozen=True) if gs.use_ndarray else ti.data_oriented
6276
BASE_METACLASS = type if gs.use_ndarray else AutoInitMeta
6377

6478

genesis/utils/misc.py

Lines changed: 53 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -581,39 +581,55 @@ def ti_to_python(
581581
# Get metadata
582582
ti_data_meta = _get_ti_metadata(value)
583583

584-
# Extract value as a whole.
585-
# Note that this is usually much faster than using a custom kernel to extract a slice.
586-
# The implementation is based on `taichi.lang.(ScalarField | MatrixField).to_torch`.
587-
is_metal = gs.device.type == "mps"
588-
out_dtype = _to_torch_type_fast(ti_data_meta.dtype) if to_torch else _to_numpy_type_fast(ti_data_meta.dtype)
589-
data_type = type(value)
590-
if issubclass(data_type, (ti.ScalarField, ti.ScalarNdarray)):
591-
if to_torch:
592-
out = torch.zeros(ti_data_meta.shape, dtype=out_dtype, device="cpu" if is_metal else gs.device)
593-
else:
594-
out = np.zeros(ti_data_meta.shape, dtype=out_dtype)
595-
TO_EXT_ARR_FAST_MAP[data_type](value, out)
596-
elif issubclass(data_type, ti.MatrixField):
597-
as_vector = value.m == 1
598-
shape_ext = (value.n,) if as_vector else (value.n, value.m)
599-
if to_torch:
600-
out = torch.empty(ti_data_meta.shape + shape_ext, dtype=out_dtype, device="cpu" if is_metal else gs.device)
601-
else:
602-
out = np.zeros(ti_data_meta.shape + shape_ext, dtype=out_dtype)
603-
TO_EXT_ARR_FAST_MAP[data_type](value, out, as_vector)
604-
elif issubclass(data_type, (ti.VectorNdarray, ti.MatrixNdarray)):
605-
layout_is_aos = 1
606-
as_vector = issubclass(data_type, ti.VectorNdarray)
607-
shape_ext = (value.n,) if as_vector else (value.n, value.m)
608-
if to_torch:
609-
out = torch.empty(ti_data_meta.shape + shape_ext, dtype=out_dtype, device="cpu" if is_metal else gs.device)
584+
use_zerocopy = gs.use_zerocopy
585+
if gs.use_zerocopy:
586+
# Leverage zero-copy if enabled
587+
try:
588+
out = value._tc
589+
if not to_torch:
590+
out = tensor_to_array(out)
591+
except AttributeError:
592+
gs.logger.debug("Zero-copy memory sharing not available for this tensor. Falling back to copy mode.")
593+
use_zerocopy = False
594+
595+
if not use_zerocopy:
596+
# Extract value as a whole.
597+
# Note that this is usually much faster than using a custom kernel to extract a slice.
598+
# The implementation is based on `taichi.lang.(ScalarField | MatrixField).to_torch`.
599+
is_metal = gs.device.type == "mps"
600+
out_dtype = _to_torch_type_fast(ti_data_meta.dtype) if to_torch else _to_numpy_type_fast(ti_data_meta.dtype)
601+
data_type = type(value)
602+
if issubclass(data_type, (ti.ScalarField, ti.ScalarNdarray)):
603+
if to_torch:
604+
out = torch.zeros(ti_data_meta.shape, dtype=out_dtype, device="cpu" if is_metal else gs.device)
605+
else:
606+
out = np.zeros(ti_data_meta.shape, dtype=out_dtype)
607+
TO_EXT_ARR_FAST_MAP[data_type](value, out)
608+
elif issubclass(data_type, ti.MatrixField):
609+
as_vector = value.m == 1
610+
shape_ext = (value.n,) if as_vector else (value.n, value.m)
611+
if to_torch:
612+
out = torch.empty(
613+
ti_data_meta.shape + shape_ext, dtype=out_dtype, device="cpu" if is_metal else gs.device
614+
)
615+
else:
616+
out = np.zeros(ti_data_meta.shape + shape_ext, dtype=out_dtype)
617+
TO_EXT_ARR_FAST_MAP[data_type](value, out, as_vector)
618+
elif issubclass(data_type, (ti.VectorNdarray, ti.MatrixNdarray)):
619+
layout_is_aos = 1
620+
as_vector = issubclass(data_type, ti.VectorNdarray)
621+
shape_ext = (value.n,) if as_vector else (value.n, value.m)
622+
if to_torch:
623+
out = torch.empty(
624+
ti_data_meta.shape + shape_ext, dtype=out_dtype, device="cpu" if is_metal else gs.device
625+
)
626+
else:
627+
out = np.zeros(ti_data_meta.shape + shape_ext, dtype=out_dtype)
628+
TO_EXT_ARR_FAST_MAP[ti.MatrixNdarray](value, out, layout_is_aos, as_vector)
610629
else:
611-
out = np.zeros(ti_data_meta.shape + shape_ext, dtype=out_dtype)
612-
TO_EXT_ARR_FAST_MAP[ti.MatrixNdarray](value, out, layout_is_aos, as_vector)
613-
else:
614-
gs.raise_exception(f"Unsupported type '{type(value)}'.")
615-
if to_torch and is_metal:
616-
out = out.to(gs.device)
630+
gs.raise_exception(f"Unsupported type '{type(value)}'.")
631+
if to_torch and is_metal:
632+
out = out.to(gs.device)
617633

618634
# Transpose if necessary and requested.
619635
# Note that it is worth transposing here before slicing, as it preserve row-major memory alignment in case of
@@ -645,7 +661,7 @@ def extract_slice(
645661
"""
646662
# Make sure that the user-arguments are valid if requested
647663
if not unsafe:
648-
if value.ndim == 1 and col_mask is not None:
664+
if col_mask is not None and value.ndim == 1:
649665
gs.raise_exception("Cannot specify column mask for 1D tensor.")
650666
for i, mask in enumerate((row_mask, col_mask)):
651667
if mask is None or isinstance(mask, slice):
@@ -739,6 +755,8 @@ def ti_to_torch(
739755
unsafe (bool, optional): Whether to skip validity check of the masks.
740756
"""
741757
tensor = ti_to_python(value, transpose, to_torch=True)
758+
if row_mask is None and col_mask is None:
759+
return tensor
742760

743761
ti_data_meta = _get_ti_metadata(value)
744762
if len(ti_data_meta.shape) < 2:
@@ -771,6 +789,8 @@ def ti_to_numpy(
771789
unsafe (bool, optional): Whether to skip validity check of the masks.
772790
"""
773791
tensor = ti_to_python(value, transpose, to_torch=False)
792+
if row_mask is None and col_mask is None:
793+
return tensor
774794

775795
ti_data_meta = _get_ti_metadata(value)
776796
if len(ti_data_meta.shape) < 2:

0 commit comments

Comments
 (0)