[rollout,docs] fix: improve error message (volcengine#4682) and docstrings (volcengine#1345) (volcengine#4729)

yurekami · claude · boren-ms · commit c04c345e2b8e · 2025-12-30T19:28:44.000Z
## Summary This PR contains two contributions: ### 1. Fix for Issue volcengine#4682 - Informative error message for `generate_sequences` - **Problem:** `vLLMAsyncRollout.generate_sequences()` raised a bare `NotImplementedError`, leaving users confused when running generation scripts - **Root cause:** The vLLM SPMD (sync) mode was retired in PR volcengine#4411, but the generation workflow (`main_generation.py`) still expects a synchronous `generate_sequences()` method - **Fix:** Added an informative error message explaining: - Sync mode was retired in PR volcengine#4411 - Users should use the async server interface (`vLLMReplica`, `AsyncLLMServerManager`) - Alternative: use `HFRollout` for synchronous generation - Links to issue volcengine#4682 for details - Also updated `generation.yaml` config comments to document the limitation ### 2. Documentation improvement for Issue volcengine#1345 - Google-style docstrings in `device.py` Standardized all function docstrings in `verl/utils/device.py` to follow Google-style documentation format: - `is_torch_npu_available()`: Added detailed description and return type - `get_visible_devices_keyword()`: Clarified purpose and return values - `get_device_name()`: Improved description of supported devices - `get_torch_device()`: Documented fallback behavior - `get_device_id()`: Concise description with example - `get_nccl_backend()`: Explained HCCL vs NCCL selection - `set_expandable_segments()`: Added OOM context and Note section - `auto_set_ascend_device_name()`: Documented NPU auto-configuration - `get_device_capability()`: Added proper type hints and description ## Test plan - [x] Python syntax verification passed for all modified files - [ ] CI tests should pass (no functional changes, only error messages and docstrings) Fixes volcengine#4682 Contributes to volcengine#1345 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: yurekami <yurekami@users.noreply.github.com> Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/verl/trainer/config/generation.yaml b/verl/trainer/config/generation.yaml
@@ -16,7 +16,11 @@ model:
 rollout:
   _target_: verl.workers.config.RolloutConfig
   name: vllm
-  mode: sync # sync: LLM, async: AsyncLLM
+  # NOTE: 'sync' mode was removed in PR #4411. Only 'async' mode is supported.
+  # WARNING: The main_generation.py workflow is currently broken for vLLM async rollout
+  # as it requires synchronous generate_sequences() which vLLMAsyncRollout doesn't support.
+  # See issue #4682 for discussion and workarounds.
+  mode: async
   temperature: 1.0
   top_k: 50 # 0 for hf rollout, -1 for vllm rollout
   top_p: 0.7
diff --git a/verl/utils/device.py b/verl/utils/device.py
@@ -16,7 +16,14 @@
 
 
 def is_torch_npu_available() -> bool:
-    """Check the availability of NPU"""
+    """Check if Ascend NPU is available for PyTorch operations.
+
+    Attempts to detect NPU availability by checking for the torch.npu module
+    and its is_available() function.
+
+    Returns:
+        bool: True if NPU is available, False otherwise.
+    """
     try:
         if hasattr(torch, "npu") and callable(getattr(torch.npu, "is_available", None)):
             return torch.npu.is_available()
@@ -30,18 +37,26 @@ def is_torch_npu_available() -> bool:
 
 
 def get_visible_devices_keyword() -> str:
-    """Function that gets visible devices keyword name.
+    """Get the environment variable name for visible device selection.
+
+    Returns the appropriate environment variable name based on the available
+    accelerator type (CUDA or Ascend NPU).
+
     Returns:
-        'CUDA_VISIBLE_DEVICES' or `ASCEND_RT_VISIBLE_DEVICES`
+        str: 'CUDA_VISIBLE_DEVICES' if CUDA is available,
+            'ASCEND_RT_VISIBLE_DEVICES' otherwise.
     """
     return "CUDA_VISIBLE_DEVICES" if is_cuda_available else "ASCEND_RT_VISIBLE_DEVICES"
 
 
 def get_device_name() -> str:
-    """Function that gets the torch.device based on the current machine.
-    This currently only supports CPU, CUDA, NPU.
+    """Get the device type string based on available accelerators.
+
+    Detects the available accelerator and returns the corresponding PyTorch
+    device type string. Currently supports CUDA, Ascend NPU, and CPU.
+
     Returns:
-        device
+        str: Device type string ('cuda', 'npu', or 'cpu').
     """
     if is_cuda_available:
         device = "cuda"
@@ -52,10 +67,15 @@ def get_device_name() -> str:
     return device
 
 
-def get_torch_device() -> any:
-    """Return the corresponding torch attribute based on the device type string.
+def get_torch_device():
+    """Get the PyTorch device module for the current accelerator.
+
+    Returns the torch device namespace (e.g., torch.cuda, torch.npu) based on
+    the detected accelerator type. Falls back to torch.cuda if the namespace
+    is not found.
+
     Returns:
-        module: The corresponding torch device namespace, or torch.cuda if not found.
+        module: The PyTorch device module (torch.cuda, torch.npu, etc.).
     """
     device_name = get_device_name()
     try:
@@ -66,17 +86,22 @@ def get_torch_device() -> any:
 
 
 def get_device_id() -> int:
-    """Return current device id based on the device type.
+    """Get the index of the current accelerator device.
+
     Returns:
-        device index
+        int: The current device index (e.g., 0 for 'cuda:0').
     """
     return get_torch_device().current_device()
 
 
 def get_nccl_backend() -> str:
-    """Return nccl backend type based on the device type.
+    """Get the distributed communication backend based on device type.
+
+    Returns the appropriate collective communication backend for the
+    detected accelerator (HCCL for Ascend NPU, NCCL for CUDA).
+
     Returns:
-        nccl backend type string.
+        str: Backend name ('hccl' for NPU, 'nccl' for CUDA/default).
     """
     if is_npu_available:
         return "hccl"
@@ -86,15 +111,32 @@ def get_nccl_backend() -> str:
 
 
 def set_expandable_segments(enable: bool) -> None:
-    """Enable or disable expandable segments for cuda.
+    """Configure CUDA memory allocator expandable segments setting.
+
+    Expandable segments can help avoid out-of-memory (OOM) errors by allowing
+    the memory allocator to expand existing memory segments rather than
+    allocating new ones.
+
     Args:
-        enable (bool): Whether to enable expandable segments. Used to avoid OOM.
+        enable: If True, enable expandable segments. If False, disable them.
+
+    Note:
+        This function only has an effect when CUDA is available.
     """
     if is_cuda_available:
         torch.cuda.memory._set_allocator_settings(f"expandable_segments:{enable}")
 
 
-def auto_set_ascend_device_name(config):
+def auto_set_ascend_device_name(config) -> None:
+    """Automatically configure device name for Ascend NPU environments.
+
+    If running on an Ascend NPU system, this function ensures the trainer
+    device configuration is set to 'npu'. Logs a warning if the config
+    was set to a different device type.
+
+    Args:
+        config: Configuration object with trainer.device attribute.
+    """
     if config and config.trainer and config.trainer.device:
         if is_torch_npu_available():
             if config.trainer.device != "npu":
@@ -106,7 +148,16 @@ def auto_set_ascend_device_name(config):
             config.trainer.device = "npu"
 
 
-def get_device_capability(device_id: int = 0) -> tuple[int, int]:
+def get_device_capability(device_id: int = 0) -> tuple[int | None, int | None]:
+    """Get the compute capability of a CUDA device.
+
+    Args:
+        device_id: The CUDA device index to query. Defaults to 0.
+
+    Returns:
+        tuple: A tuple of (major, minor) compute capability version,
+            or (None, None) if CUDA is not available.
+    """
     major, minor = None, None
     if is_cuda_available:
         major, minor = torch.cuda.get_device_capability(device_id)
diff --git a/verl/workers/rollout/vllm_rollout/vllm_rollout.py b/verl/workers/rollout/vllm_rollout/vllm_rollout.py
@@ -270,8 +270,22 @@ async def update_weights(self, weights: Generator[tuple[str, torch.Tensor], None
                 model.load_weights(weights)
 
     def generate_sequences(self, prompts: DataProto) -> DataProto:
-        """Batch generate sequences in sync mode."""
-        raise NotImplementedError
+        """Batch generate sequences in sync mode.
+
+        Note: vLLMAsyncRollout uses async server mode and does not support synchronous
+        generation. Since SPMD mode was retired (PR #4411), the generation workflow
+        should use the async server interface instead.
+
+        Raises:
+            NotImplementedError: Always raised as sync generation is not supported.
+        """
+        raise NotImplementedError(
+            "vLLMAsyncRollout does not support synchronous generate_sequences(). "
+            "The vLLM SPMD mode was retired in PR #4411. For batch generation, "
+            "please use the async server interface via vLLMReplica and AsyncLLMServerManager, "
+            "or use HFRollout for synchronous generation. "
+            "See https://github.com/volcengine/verl/issues/4682 for more details."
+        )
 
     # ==================== server mode public methods ====================