haosulab · StoneT2000 · Oct 22, 2025 · Oct 22, 2025 · Oct 22, 2025 · Oct 22, 2025
diff --git a/docs/metadata/robot.json b/docs/metadata/robot.json
@@ -71,10 +71,6 @@
     "quality": "A",
     "name": "Standard Open Arm 100 (SO100)"
   },
-  "stompy": {
-    "quality": "C",
-    "name": "Stompy"
-  },
   "humanoid": {
     "quality": "A",
     "name": "Mujoco Humanoid"

diff --git a/mani_skill/agents/__init__.py b/mani_skill/agents/__init__.py
@@ -1,3 +1,4 @@
+# pyright: reportUnusedImport=false
 from .base_agent import BaseAgent
 from .multi_agent import MultiAgent
 from .registration import REGISTERED_AGENTS, register_agent
diff --git a/mani_skill/agents/base_agent.py b/mani_skill/agents/base_agent.py
@@ -10,10 +10,7 @@
 from gymnasium import spaces
 
 from mani_skill import format_path
-from mani_skill.agents.controllers.pd_joint_pos import (
-    PDJointPosController,
-    PDJointPosControllerConfig,
-)
+from mani_skill.agents.controllers.pd_joint_pos import PDJointPosControllerConfig
 from mani_skill.sensors.base_sensor import BaseSensor, BaseSensorConfig
 from mani_skill.utils import assets, download_asset, sapien_utils
 from mani_skill.utils.logging_utils import logger
@@ -62,7 +59,7 @@ class BaseAgent:
     """unique identifier string of this"""
     urdf_path: Union[str, None] = None
     """path to the .urdf file describe the agent's geometry and visuals. One of urdf_path or mjcf_path must be provided."""
-    urdf_config: Union[str, dict, None] = None
+    urdf_config: Union[dict, None] = None
     """Optional provide a urdf_config to further modify the created articulation"""
     mjcf_path: Union[str, None] = None
     """path to a MJCF .xml file defining a robot. This will only load the articulation defined in the XML and nothing else.
@@ -88,7 +85,7 @@ def __init__(
         scene: ManiSkillScene,
         control_freq: int,
         control_mode: Optional[str] = None,
-        agent_idx: Optional[str] = None,
+        agent_idx: Optional[int] = None,
         initial_pose: Optional[Union[sapien.Pose, Pose]] = None,
         build_separate: bool = False,
     ):
@@ -246,7 +243,7 @@ def control_mode(self):
         """Get the currently activated controller uid."""
         return self._control_mode
 
-    def set_control_mode(self, control_mode: str = None):
+    def set_control_mode(self, control_mode: Optional[str] = None):
         """Sets the controller to an pre-existing controller of this agent.
         This does not reset the controller. If given control mode is None, will set to the default control mode."""
         if control_mode is None:
@@ -265,14 +262,17 @@ def set_control_mode(self, control_mode: str = None):
                 if "balance_passive_force" in config:
                     balance_passive_force = config.pop("balance_passive_force")
                 self.controllers[control_mode] = CombinedController(
-                    config,
-                    self.robot,
-                    self._control_freq,
+                    configs=config,
+                    articulation=self.robot,
                     scene=self.scene,
+                    control_freq=self._control_freq,
                 )
             else:
                 self.controllers[control_mode] = config.controller_cls(
-                    config, self.robot, self._control_freq, scene=self.scene
+                    config=config,
+                    articulation=self.robot,
+                    scene=self.scene,
+                    control_freq=self._control_freq,
                 )
             self.controllers[control_mode].set_drive_property()
             if balance_passive_force:
@@ -352,7 +352,7 @@ def get_controller_state(self):
         """
         return self.controller.get_state()
 
-    def set_controller_state(self, state: Array):
+    def set_controller_state(self, state: dict):
         """
         Set the state of the controller.
         """
@@ -387,15 +387,15 @@ def set_state(self, state: dict, ignore_controller=False):
 
         if not ignore_controller and "controller" in state:
             self.set_controller_state(state["controller"])
-        if self.device.type == "cuda":
+        if self.scene.gpu_sim_enabled:
             self.scene._gpu_apply_all()
-            self.scene.px.gpu_update_articulation_kinematics()
+            self.scene.px.gpu_update_articulation_kinematics()  # pyright: ignore[reportAttributeAccessIssue]
             self.scene._gpu_fetch_all()
 
     # -------------------------------------------------------------------------- #
     # Other
     # -------------------------------------------------------------------------- #
-    def reset(self, init_qpos: torch.Tensor = None):
+    def reset(self, init_qpos: Optional[torch.Tensor] = None):
         """
         Reset the robot to a clean state with zero velocity and forces.
 

diff --git a/mani_skill/agents/base_real_agent.py b/mani_skill/agents/base_real_agent.py
@@ -1,10 +1,7 @@
 from dataclasses import dataclass
 from typing import Optional
 
-import numpy as np
-
 from mani_skill.agents.base_agent import BaseAgent
-from mani_skill.envs.sapien_env import BaseEnv
 from mani_skill.sensors.base_sensor import BaseSensorConfig
 from mani_skill.utils.structs.types import Array
 
@@ -18,11 +15,11 @@ class BaseRealAgent:
         sensor_configs (dict[str, BaseSensorConfig]): the sensor configs to create the agent with.
     """
 
+    _sim_agent: BaseAgent
+
     def __init__(self, sensor_configs: dict[str, BaseSensorConfig] = dict()):
         self.sensor_configs = sensor_configs
 
-        self._sim_agent: BaseAgent = None
-
         @dataclass
         class RealRobot:
             agent: BaseRealAgent
@@ -126,7 +123,7 @@ def get_sensor_data(self, sensor_names: Optional[list[str]] = None):
         """
         raise NotImplementedError
 
-    def get_sensor_params(self, sensor_names: list[str] = None):
+    def get_sensor_params(self, sensor_names: Optional[list[str]] = None):
         """
         Get the parameters of the desired sensors based on the given sensor names. If sensor_names is None then all sensor parameters should be returned. The expected format for cameras is in line with the simulation's
         format is:

diff --git a/mani_skill/agents/controllers/__init__.py b/mani_skill/agents/controllers/__init__.py
@@ -1,4 +1,5 @@
 # isort: off
+# pyright: reportUnusedImport=false
 from .pd_joint_pos import (
     PDJointPosController,
     PDJointPosControllerConfig,

diff --git a/mani_skill/agents/controllers/base_controller.py b/mani_skill/agents/controllers/base_controller.py
@@ -1,11 +1,9 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, Optional, cast
 
 import numpy as np
-import sapien
-import sapien.physx as physx
 import torch
 from gymnasium import spaces
 from gymnasium.vector.utils import batch_space
@@ -32,11 +30,13 @@ class BaseController:
     """active joints controlled"""
     active_joint_indices: torch.Tensor
     """indices of active joints controlled. Equivalent to [x.active_index for x in self.joints]"""
-    action_space: spaces.Space
+
+    # NOTE (stao): We currently only support box action spaces. Eventually we may support others
+    action_space: spaces.Box
     """the action space of the controller, which by default has a batch dimension. This is typically already normalized as well."""
-    single_action_space: spaces.Space
+    single_action_space: spaces.Box
     """The unbatched version of the action space which is also typically already normalized by this class"""
-    _original_single_action_space: spaces.Space
+    _original_single_action_space: spaces.Box
     """The unbatched, original action space without any additional processing like normalization"""
 
     sets_target_qpos: bool
@@ -48,9 +48,9 @@ def __init__(
         self,
         config: "ControllerConfig",
         articulation: Articulation,
+        scene: ManiSkillScene,
         control_freq: int,
         sim_freq: Optional[int] = None,
-        scene: ManiSkillScene = None,
     ):
         self.config = config
         self.articulation = articulation
@@ -72,8 +72,8 @@ def __init__(
             self._clip_and_scale_action_space()
         self.action_space = self.single_action_space
         if self.scene.num_envs > 1:
-            self.action_space = batch_space(
-                self.single_action_space, n=self.scene.num_envs
+            self.action_space = cast(
+                spaces.Box, batch_space(self.single_action_space, n=self.scene.num_envs)
             )
 
     @property
@@ -122,8 +122,7 @@ def set_drive_property(self):
     def reset(self):
         """Resets the controller to an initial state. This is called upon environment creation and each environment reset"""
 
-    def _preprocess_action(self, action: Array):
-        # TODO(jigu): support discrete action
+    def _preprocess_action(self, action: torch.Tensor):
         if self.scene.num_envs > 1:
             action_dim = self.action_space.shape[1]
         else:
@@ -168,7 +167,7 @@ def _clip_and_scale_action_space(self):
         self.action_space_low = common.to_tensor(low, device=self.device)
         self.action_space_high = common.to_tensor(high, device=self.device)
 
-    def _clip_and_scale_action(self, action):
+    def _clip_and_scale_action(self, action: torch.Tensor):
         return gym_utils.clip_and_scale_action(
             action, self.action_space_low, self.action_space_high
         )
@@ -191,13 +190,16 @@ class ControllerConfig:
 # Composite controllers
 # -------------------------------------------------------------------------- #
 class DictController(BaseController):
+    single_action_space: spaces.Dict
+    action_space: spaces.Dict
+
     def __init__(
         self,
         configs: dict[str, ControllerConfig],
         articulation: Articulation,
+        scene: ManiSkillScene,
         control_freq: int,
-        sim_freq: int = None,
-        scene: ManiSkillScene = None,
+        sim_freq: Optional[int] = None,
     ):
         self.scene = scene
         self.configs = configs
@@ -207,15 +209,20 @@ def __init__(
         self.controllers: dict[str, BaseController] = dict()
         for uid, config in configs.items():
             self.controllers[uid] = config.controller_cls(
-                config, articulation, control_freq, sim_freq=sim_freq, scene=scene
+                config=config,
+                articulation=articulation,
+                scene=scene,
+                control_freq=control_freq,
+                sim_freq=sim_freq,
             )
         self._initialize_action_space()
         self._initialize_joints()
 
         self.action_space = self.single_action_space
         if self.scene.num_envs > 1:
-            self.action_space = batch_space(
-                self.single_action_space, n=self.scene.num_envs
+            self.action_space = cast(
+                spaces.Dict,
+                batch_space(self.single_action_space, n=self.scene.num_envs),
             )
 
         self.sets_target_qpos = False
@@ -239,10 +246,11 @@ def _initialize_action_space(self):
 
     def _initialize_joints(self):
         self.joints = []
-        self.active_joint_indices = []
+        active_joint_indices = []  # per-sub-controller index tensors, joined once below
         for controller in self.controllers.values():
             self.joints.extend(controller.joints)
-            self.active_joint_indices.extend(controller.active_joint_indices)
+            active_joint_indices.append(controller.active_joint_indices)  # keep each index tensor whole instead of iterating it element-wise
+        self.active_joint_indices = torch.cat(active_joint_indices)  # single 1-D tensor, ordered like self.joints
 
     def set_drive_property(self):
         for controller in self.controllers.values():
@@ -266,7 +274,7 @@ def get_state(self) -> dict:
 
     def set_state(self, state: dict):
         for uid, controller in self.controllers.items():
-            controller.set_state(state.get(uid))
+            controller.set_state(state.get(uid, {}))
 
     def from_qpos(self, qpos: Array):
         """Tries to generate the corresponding action given a full robot qpos.
@@ -306,13 +314,15 @@ def __repr__(self):
 
 
 class CombinedController(DictController):
-
     """A flat/combined view of multiple controllers."""
 
+    single_action_space: spaces.Box
+    action_space: spaces.Box
+
     def _initialize_action_space(self):
         super()._initialize_action_space()
         self.single_action_space, self.action_mapping = flatten_action_spaces(
-            self.single_action_space.spaces
+            cast(spaces.Dict, self.single_action_space).spaces
         )
 
     def set_action(self, action: np.ndarray):
@@ -337,7 +347,7 @@ def to_action_dict(self, action: np.ndarray):
         assert action.shape == (action_dim,), (action.shape, action_dim)
 
         action_dict = {}
-        for uid, controller in self.controllers.items():
+        for uid in self.controllers.keys():
             start, end = self.action_mapping[uid]
             action_dict[uid] = action[start:end]
         return action_dict

diff --git a/mani_skill/agents/controllers/pd_base_vel.py b/mani_skill/agents/controllers/pd_base_vel.py
@@ -1,8 +1,6 @@
 import numpy as np
 import torch
-
 from gymnasium import spaces
-from mani_skill.utils.structs.types import Array
 
 from .pd_joint_vel import PDJointVelController, PDJointVelControllerConfig
 
@@ -15,7 +13,7 @@ def _initialize_action_space(self):
         assert len(self.joints) >= 3, len(self.joints)
         super()._initialize_action_space()
 
-    def set_action(self, action: Array):
+    def set_action(self, action: torch.Tensor):
         action = self._preprocess_action(action)
         # Convert to ego-centric action
         # Assume the 3rd DoF stands for orientation
@@ -45,7 +43,7 @@ def _initialize_action_space(self):
         high = np.float32(np.broadcast_to(self.config.upper, 2))
         self.single_action_space = spaces.Box(low, high, dtype=np.float32)
 
-    def set_action(self, action: Array):
+    def set_action(self, action: torch.Tensor):
         action = self._preprocess_action(action)
         # action[:, 0] should correspond to forward vel
         # action[:, 1] should correspond to rotation vel