Several minor improvements to the manipulation examples

yun-long · yun-long · commit 72eb15034fa0 · 2025-08-12T23:22:08.000-07:00
diff --git a/examples/manipulation/behavior_cloning.py b/examples/manipulation/behavior_cloning.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 from collections import deque
+from collections.abc import Iterator
 
 from torch.utils.tensorboard import SummaryWriter
 
@@ -37,7 +38,7 @@ def __init__(self, env, cfg: dict, teacher: nn.Module, device: str = "cpu"):
             img_shape=rgb_shape,
             state_dim=self._cfg["policy"]["action_head"]["state_obs_dim"],
             action_dim=action_dim,
-            device=self._device,
+            device=device,
         )
 
         # Training state
@@ -63,18 +64,18 @@ def learn(self, num_learning_iterations: int, log_dir: str) -> None:
             num_batches = 0
 
             start_time = time.time()
-            generator = self._buffer.get_batches(self._cfg.get("mini_batches_size", 32), self._cfg["num_epochs"])
+            generator = self._buffer.get_batches(self._cfg.get("num_mini_batches", 4), self._cfg["num_epochs"])
             for batch in generator:
                 # Forward pass for both action and pose prediction
-                pred_action = self._policy(batch["rgb_obs"].float(), batch["robot_pose"].float())
-                pred_left_pose, pred_right_pose = self._policy.predict_pose(batch["rgb_obs"].float())
+                pred_action = self._policy(batch["rgb_obs"], batch["robot_pose"])
+                pred_left_pose, pred_right_pose = self._policy.predict_pose(batch["rgb_obs"])
 
                 # Compute action prediction loss
-                action_loss = F.mse_loss(pred_action, batch["actions"].float())
+                action_loss = F.mse_loss(pred_action, batch["actions"])
 
                 # Compute pose estimation loss (position + orientation)
-                pose_left_loss = self._compute_pose_loss(pred_left_pose, batch["object_poses"].float())
-                pose_right_loss = self._compute_pose_loss(pred_right_pose, batch["object_poses"].float())
+                pose_left_loss = self._compute_pose_loss(pred_left_pose, batch["object_poses"])
+                pose_right_loss = self._compute_pose_loss(pred_right_pose, batch["object_poses"])
                 pose_loss = pose_left_loss + pose_right_loss
 
                 # Combined loss with weights
@@ -227,7 +228,7 @@ def load_finetuned_model(self, path: str) -> None:
 
 
 class ExperienceBuffer:
-    """Experience buffer."""
+    """A first-in-first-out buffer for experience replay."""
 
     def __init__(
         self,
@@ -238,20 +239,20 @@ def __init__(
         action_dim: int,
         device: str = "cpu",
     ):
+        self._num_envs = num_envs
+        self._max_size = max_size
         self._img_shape = img_shape
         self._state_dim = state_dim
         self._action_dim = action_dim
-        self._num_envs = num_envs
-        self._max_size = max_size
         self._device = device
+        self._ptr = 0
         self._size = 0
-        self._ptr = 0  # pointer to the next free slot in the buffer
 
-        # Initialize buffers
-        self._rgb_obs = torch.zeros(max_size, num_envs, *img_shape, device=device)
-        self._robot_pose = torch.zeros(max_size, num_envs, state_dim, device=device)
-        self._object_poses = torch.zeros(max_size, num_envs, 7, device=device)
-        self._actions = torch.zeros(max_size, num_envs, action_dim, device=device)
+        # Buffers for data
+        self._rgb_obs = torch.empty(max_size, num_envs, *img_shape, dtype=torch.float32, device=device)
+        self._robot_pose = torch.empty(max_size, num_envs, state_dim, dtype=torch.float32, device=device)
+        self._object_poses = torch.empty(max_size, num_envs, 7, dtype=torch.float32, device=device)
+        self._actions = torch.empty(max_size, num_envs, action_dim, dtype=torch.float32, device=device)
 
     def add(
         self,
@@ -261,42 +262,38 @@ def add(
         actions: torch.Tensor,
     ) -> None:
         """Add experience to buffer."""
-        ptr = self._ptr % self._max_size
-        self._rgb_obs[ptr].copy_(rgb_obs)
-        self._robot_pose[ptr].copy_(robot_pose)
-        self._object_poses[ptr].copy_(object_poses)
-        self._actions[ptr].copy_(actions)
-        self._ptr = self._ptr + 1
+        self._ptr = (self._ptr + 1) % self._max_size
+        self._rgb_obs[self._ptr] = rgb_obs
+        self._robot_pose[self._ptr] = robot_pose
+        self._object_poses[self._ptr] = object_poses
+        self._actions[self._ptr] = actions
         self._size = min(self._size + 1, self._max_size)
 
-    def get_batches(self, mini_batches_size: int, num_epochs: int):
+    def get_batches(self, num_mini_batches: int, num_epochs: int) -> Iterator[dict[str, torch.Tensor]]:
         """Generate batches for training."""
-        buffer_size = self._size * self._num_envs
-        indices = torch.randperm(buffer_size, device=self._device)
         # calculate the size of each mini-batch
-        num_batches = min(buffer_size // mini_batches_size, 10)
+        batch_size = self._size // num_mini_batches
         for _ in range(num_epochs):
-            for batch_idx in range(num_batches):
-                start = batch_idx * mini_batches_size
-                end = start + mini_batches_size
-                mb_indices = indices[start:end]
+            indices = torch.randperm(self._size)
+            for batch_idx in range(0, self._size, batch_size):
+                batch_indices = indices[batch_idx : batch_idx + batch_size]
 
                 # Yield a mini-batch of data
-                batch = {
-                    "rgb_obs": self._rgb_obs.view(-1, *self._img_shape)[mb_indices],
-                    "robot_pose": self._robot_pose.view(-1, self._state_dim)[mb_indices],
-                    "object_poses": self._object_poses.view(-1, 7)[mb_indices],
-                    "actions": self._actions.view(-1, self._action_dim)[mb_indices],
+                yield {
+                    "rgb_obs": self._rgb_obs[batch_indices].view(-1, *self._img_shape),
+                    "robot_pose": self._robot_pose[batch_indices].view(-1, self._state_dim),
+                    "object_poses": self._object_poses[batch_indices].view(-1, 7),
+                    "actions": self._actions[batch_indices].view(-1, self._action_dim),
                 }
-                yield batch
 
     def clear(self) -> None:
         """Clear the buffer."""
         self._rgb_obs.zero_()
         self._robot_pose.zero_()
+        self._object_poses.zero_()
         self._actions.zero_()
-        self._size = 0
         self._ptr = 0
+        self._size = 0
 
     def is_full(self) -> bool:
         """Check if buffer is full."""
@@ -343,9 +340,7 @@ def __init__(self, config: dict, action_dim: int):
         pose_mlp_cfg["output_dim"] = 7
         self.pose_mlp = self._build_mlp(pose_mlp_cfg)
 
-        # Force float32 for better performance
-        self.float()
-
+    @staticmethod
     def _build_cnn(self, config: dict) -> nn.Sequential:
         """Build CNN encoder for grayscale images."""
         layers = []
@@ -372,7 +367,8 @@ def _build_cnn(self, config: dict) -> nn.Sequential:
 
         return nn.Sequential(*layers)
 
-    def _build_mlp(self, config: dict) -> nn.Sequential:
+    @staticmethod
+    def _build_mlp(config: dict) -> nn.Sequential:
         mlp_input_dim = config["input_dim"]
         layers = []
         for hidden_dim in config["hidden_dims"]:
@@ -393,11 +389,6 @@ def get_features(self, rgb_obs: torch.Tensor) -> torch.Tensor:
 
     def forward(self, rgb_obs: torch.Tensor, state_obs: torch.Tensor | None = None) -> dict:
         """Forward pass with shared stereo encoder for rgb images."""
-        # Ensure float32 for better performance
-        rgb_obs = rgb_obs.float()
-        if state_obs is not None:
-            state_obs = state_obs.float()
-
         # Get features
         left_features, right_features = self.get_features(rgb_obs)
 
@@ -417,8 +408,6 @@ def forward(self, rgb_obs: torch.Tensor, state_obs: torch.Tensor | None = None)
 
     def predict_pose(self, rgb_obs: torch.Tensor) -> torch.Tensor:
         """Predict pose from rgb images and state observations."""
-        # Ensure float32 for better performance
-        rgb_obs = rgb_obs.float()
         left_features, right_features = self.get_features(rgb_obs)
         left_pose = self.pose_mlp(left_features)
         right_pose = self.pose_mlp(right_features)
diff --git a/examples/manipulation/grasp_env.py b/examples/manipulation/grasp_env.py
@@ -1,13 +1,16 @@
 import torch
 import math
 from typing import Literal
+
 import genesis as gs
 from genesis.utils.geom import (
     xyz_to_quat,
     transform_quat_by_quat,
     transform_by_quat,
 )
 
+MAX_DEPTH = 10.0
+
 
 class GraspEnv:
     def __init__(
@@ -223,9 +226,9 @@ def get_observations(self) -> tuple[torch.Tensor, dict]:
         #
         obs_components = [
             finger_pos - obj_pos,  # 3D position difference
-            finger_quat,  # current orientation (4D quaternion)
+            finger_quat,  # current orientation (w, x, y, z)
             obj_pos,  # goal position
-            obj_quat,  # goal orientation (4D quaternion)
+            obj_quat,  # goal orientation (w, x, y, z)
         ]
         obs_tensor = torch.cat(obs_components, dim=-1)
         self.extras["observations"]["critic"] = obs_tensor
@@ -237,27 +240,25 @@ def rescale_action(self, action: torch.Tensor) -> torch.Tensor:
 
     def get_depth_image(self, normalize: bool = True) -> torch.Tensor:
         # Render depth image from the camera
-        _, depth, _, _ = self.batch_cam.render(rgb=False, depth=True)
+        _, depth, _, _ = self.batch_cam.render(rgb=False, depth=True, segmentation=False, normal=False)
         depth = depth.permute(0, 3, 1, 2)  # shape (B, 1, H, W)
         if normalize:
-            depth = torch.clamp(depth, min=0.0, max=10)
-            depth = (depth - 0.0) / (10.0 - 0.0)  # normalize to [0, 1]
+            depth = torch.clamp(depth, min=0.0, max=MAX_DEPTH)
+            depth = (depth - 0.0) / (MAX_DEPTH - 0.0)  # normalize to [0, 1]
         return depth
 
     def get_stereo_rgb_images(self, normalize: bool = True) -> torch.Tensor:
-        rgb_left, _, _, _ = self.left_cam.render(rgb=True, depth=False)
-        rgb_right, _, _, _ = self.right_cam.render(rgb=True, depth=False)
+        rgb_left, _, _, _ = self.left_cam.render(rgb=True, depth=False, segmentation=False, normal=False)
+        rgb_right, _, _, _ = self.right_cam.render(rgb=True, depth=False, segmentation=False, normal=False)
 
         # Convert to proper format
         rgb_left = rgb_left.permute(0, 3, 1, 2)[:, :3]  # shape (B, 3, H, W)
         rgb_right = rgb_right.permute(0, 3, 1, 2)[:, :3]  # shape (B, 3, H, W)
 
         # Normalize if requested
         if normalize:
-            rgb_left = torch.clamp(rgb_left, min=0.0, max=255.0)
-            rgb_left = (rgb_left - 0.0) / (255.0 - 0.0)
-            rgb_right = torch.clamp(rgb_right, min=0.0, max=255.0)
-            rgb_right = (rgb_right - 0.0) / (255.0 - 0.0)
+            rgb_left = torch.clamp(rgb_left, min=0.0, max=255.0) / 255.0
+            rgb_right = torch.clamp(rgb_right, min=0.0, max=255.0) / 255.0
 
         # Concatenate left and right rgb images along channel dimension
         # Result: [B, 6, H, W] where channel 0 is left rgb, channel 1 is right rgb
diff --git a/examples/manipulation/grasp_train.py b/examples/manipulation/grasp_train.py
@@ -29,7 +29,7 @@ def get_train_cfg(exp_name, max_iterations):
             "class_name": "PPO",
             "clip_param": 0.2,
             "desired_kl": 0.01,
-            "entropy_coef": 0.00,
+            "entropy_coef": 0.0,
             "gamma": 0.99,
             "lam": 0.95,
             "learning_rate": 0.0003,
@@ -72,7 +72,7 @@ def get_train_cfg(exp_name, max_iterations):
         "num_steps_per_env": 24,
         "learning_rate": 0.001,
         "num_epochs": 5,
-        "mini_batches_size": 512,
+        "num_mini_batches": 10,
         "max_grad_norm": 1.0,
         # Network architecture
         "policy": {
@@ -210,5 +210,10 @@ def main():
 
 """
 # training
-python examples/manipulation/grasp_train.py
+
+# To train the RL policy:
+python examples/manipulation/grasp_train.py --stage=rl
+
+# To train the BC policy (requires the RL policy to be trained first):
+python examples/manipulation/grasp_train.py --stage=bc
 """