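Remove refit_ipc_memory_ratio from RolloutConfig

Drop the get_total_available_bytes helper and compute total_available_bytes
in the TRT-LLM rollout's update_weights from update_weights_bucket_megabytes
instead.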

Superjomn · Superjomn · commit f3d5d86fe726 · 2026-01-15T17:28:44.000+08:00
diff --git a/verl/workers/config/rollout.py b/verl/workers/config/rollout.py
@@ -189,7 +189,6 @@ class RolloutConfig(BaseConfig):
     custom: Optional[dict] = None
 
     update_weights_bucket_megabytes: int = 512
-    refit_ipc_memory_ratio: float = 0.5
 
     skip_rollout: bool = False
 
diff --git a/verl/workers/rollout/trtllm_rollout/trtllm_rollout.py b/verl/workers/rollout/trtllm_rollout/trtllm_rollout.py
@@ -30,7 +30,6 @@
 from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
 from torch.multiprocessing.reductions import reduce_tensor
 
-from verl.utils.device import get_torch_device
 from verl.workers.config import HFModelConfig, RolloutConfig
 from verl.workers.rollout.base import BaseRollout
 from verl.workers.rollout.utils import is_valid_ipv6_address
@@ -46,17 +45,6 @@
 DEFAULT_MAX_WAIT_TIME = 300.0
 
 
-def get_total_available_bytes(pg: dist.ProcessGroup, rank: int, ratio: float, message: str = "") -> int:
-    mem_allocated = get_torch_device().memory_allocated()
-    mem_reserved = get_torch_device().memory_reserved()
-    mem_free, mem_total = get_torch_device().mem_get_info()
-    mem_free = mem_free + mem_reserved - mem_allocated
-    mem_free = torch.tensor(mem_free)
-    dist.all_reduce(mem_free, op=dist.ReduceOp.MIN, group=pg)
-    mem_free = mem_free.item()
-    return int(mem_free * ratio)
-
-
 def device_id_to_physical_device_id(id: int) -> int:
     """Convert a logical device ID to a physical device ID considering CUDA_VISIBLE_DEVICES."""
     if "CUDA_VISIBLE_DEVICES" in os.environ:
@@ -409,12 +397,7 @@ async def update_weights(self, weights: Generator[tuple[str, torch.Tensor], None
         if self.is_leader_rank:
             await self._init_server_adapter()
 
-        total_available_bytes = await asyncio.to_thread(
-            get_total_available_bytes,
-            self.hybrid_device_mesh["exclude_dp"].get_group(),
-            self.hybrid_device_mesh["exclude_dp"].get_local_rank(),
-            self.config.refit_ipc_memory_ratio,
-        )
+        total_available_bytes = int(self.config.update_weights_bucket_megabytes) * 1024 * 1024
 
         try:
             device_uuid = get_device_uuid(self.gpu_id)