Skip to content

Commit 3f79afb

Browse files
committed
Release Python garbage and the PyTorch CUDA allocator cache before rollout wake-up
1 parent 0334716 commit 3f79afb

File tree

2 files changed

+13
-0
lines changed

2 files changed

+13
-0
lines changed

verl/workers/fsdp_workers.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
"""
1717

1818
import datetime
19+
import gc
1920
import json
2021
import logging
2122
import os
@@ -2006,6 +2007,11 @@ def compute_rm_score(self, data: DataProto):
20062007
class AsyncActorRolloutRefWorker(ActorRolloutRefWorker):
20072008
@register(dispatch_mode=Dispatch.DIRECT_ROLLOUT_METHOD)
async def wake_up(self):
    """Wake this async worker up into rollout mode.

    Before handing the GPU over to the rollout engine, run the Python
    garbage collector and release cached-but-unused CUDA allocator blocks
    so the rollout engine can claim as much free device memory as possible.

    Returns:
        bool: always ``True``, signalling that wake-up completed.
    """

    def _gpu_mem_report() -> str:
        # Single formatter shared by the before/after log lines so the two
        # reports cannot drift apart (the original duplicated this f-string).
        return (
            f"gpu_mem: {torch.cuda.memory_allocated() / 1024**3:.2f} GB, "
            f"gpu_cache: {torch.cuda.memory_reserved() / 1024**3:.2f} GB"
        )

    print(f"AsyncActorRolloutRefWorker wake_up before gc, rank: {self.rank}, {_gpu_mem_report()}")
    # Drop unreachable Python objects first so any tensors they pinned are
    # actually freed before we ask the allocator to return its cache.
    gc.collect()
    torch.cuda.empty_cache()
    print(f"AsyncActorRolloutRefWorker wake_up after gc, rank: {self.rank}, {_gpu_mem_report()}")

    # NOTE(review): presumably switches the worker/engine into rollout mode —
    # defined outside this hunk, confirm against the enclosing class.
    await self.rollout_mode()
    return True
20112017

verl/workers/rollout/trtllm_rollout/trtllm_async_server.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
# limitations under the License.
1414
import asyncio
1515
import logging
16+
import gc
17+
import torch
1618
import os
1719
from typing import Any, Optional
1820

@@ -183,6 +185,11 @@ async def generate(
183185
return TokenOutput(token_ids=token_ids, log_probs=log_probs)
184186

185187
async def wake_up(self):
188+
print(f"TRTLLMHttpServer wake_up, replica_rank: {self.replica_rank}, gpu_mem: {torch.cuda.memory_allocated() / 1024**3:.2f} GB, gpu_cache: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
189+
gc.collect()
190+
torch.cuda.empty_cache()
191+
print(f"TRTLLMHttpServer wake_up after gc, replica_rank: {self.replica_rank}, gpu_mem: {torch.cuda.memory_allocated() / 1024**3:.2f} GB, gpu_cache: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
192+
186193
if self.rollout_mode == RolloutMode.HYBRID:
187194
# Call all workers to switch between trainer mode and rollout mode.
188195
await asyncio.gather(*[worker.wake_up.remote() for worker in self.workers])

0 commit comments

Comments
 (0)