Skip to content

Commit 3f79afb

Browse files
committed
Release Python garbage and the PyTorch CUDA allocator cache before rollout wake-up
1 parent 0334716 commit 3f79afb

File tree

2 files changed

+13
-0
lines changed

2 files changed

+13
-0
lines changed

verl/workers/fsdp_workers.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
"""
1717

1818
import datetime
19+
import gc
1920
import json
2021
import logging
2122
import os
@@ -2006,6 +2007,11 @@ def compute_rm_score(self, data: DataProto):
20062007
class AsyncActorRolloutRefWorker(ActorRolloutRefWorker):
20072008
@register(dispatch_mode=Dispatch.DIRECT_ROLLOUT_METHOD)
async def wake_up(self):
    """Wake this async worker up into rollout mode.

    Before handing the GPU over to the rollout engine, run the Python
    garbage collector and release cached-but-unused CUDA allocator blocks
    so the rollout engine can claim as much free device memory as possible.

    Returns:
        bool: always ``True``, signalling that wake-up completed.
    """

    def _gpu_mem_report() -> str:
        # Single formatter shared by the before/after log lines so the two
        # reports cannot drift apart (the original duplicated this f-string).
        return (
            f"gpu_mem: {torch.cuda.memory_allocated() / 1024**3:.2f} GB, "
            f"gpu_cache: {torch.cuda.memory_reserved() / 1024**3:.2f} GB"
        )

    print(f"AsyncActorRolloutRefWorker wake_up before gc, rank: {self.rank}, {_gpu_mem_report()}")
    # Drop unreachable Python objects first so any tensors they pinned are
    # actually freed before we ask the allocator to return its cache.
    gc.collect()
    torch.cuda.empty_cache()
    print(f"AsyncActorRolloutRefWorker wake_up after gc, rank: {self.rank}, {_gpu_mem_report()}")

    # NOTE(review): presumably switches the worker/engine into rollout mode —
    # defined outside this hunk, confirm against the enclosing class.
    await self.rollout_mode()
    return True
20112017

verl/workers/rollout/trtllm_rollout/trtllm_async_server.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
# limitations under the License.
1414
import asyncio
1515
import logging
16+
import gc
17+
import torch
1618
import os
1719
from typing import Any, Optional
1820

@@ -183,6 +185,11 @@ async def generate(
183185
return TokenOutput(token_ids=token_ids, log_probs=log_probs)
184186

185187
async def wake_up(self):
188+
print(f"TRTLLMHttpServer wake_up, replica_rank: {self.replica_rank}, gpu_mem: {torch.cuda.memory_allocated() / 1024**3:.2f} GB, gpu_cache: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
189+
gc.collect()
190+
torch.cuda.empty_cache()
191+
print(f"TRTLLMHttpServer wake_up after gc, replica_rank: {self.replica_rank}, gpu_mem: {torch.cuda.memory_allocated() / 1024**3:.2f} GB, gpu_cache: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
192+
186193
if self.rollout_mode == RolloutMode.HYBRID:
187194
# Call all workers to switch between trainer mode and rollout mode.
188195
await asyncio.gather(*[worker.wake_up.remote() for worker in self.workers])

0 commit comments

Comments
 (0)