Skip to content

Commit fc3f7fb

Browse files
committed
fix: handle runtime error when getting memory pool
1 parent 302bafd commit fc3f7fb

File tree

1 file changed

+8
-1
lines changed

1 file changed

+8
-1
lines changed

checkpoint_engine/ps.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -920,6 +920,9 @@ def unregister_checkpoint(self, checkpoint_name: str):
920920
)
921921
return
922922

923+
# TODO: currently, we just mark the shared memory pool as unused when unregistering.
924+
# Physically releasing the shared memory pool is not supported yet.
925+
# We may add logic to unregister the shared memory pool in the future if necessary.
923926
if checkpoint_name == self._current_shared_memory_pool_user:
924927
self._current_shared_memory_pool_user = ""
925928
return
@@ -945,14 +948,18 @@ def gather_metas(self, checkpoint_name: str):
945948
self.init_process_group()
946949
assert dist.is_initialized(), "process group is not initialized"
947950
metas_lst: list[DataToGather | None] = [None for _ in range(self._world_size)] # type: ignore
951+
try:
952+
memory_pool = self._get_memory_pool(checkpoint_name)
953+
except RuntimeError:
954+
memory_pool = []
948955
metas = DataToGather(
949956
memory_buffer_metas_list=[
950957
MemoryBufferMetas(
951958
metas=x.metas,
952959
ptr=x.buffer.data_ptr(),
953960
size=x.size,
954961
)
955-
for x in (self._get_memory_pool(checkpoint_name) or [])
962+
for x in memory_pool
956963
],
957964
p2p_store_addr=None if self._p2p_store is None else self._p2p_store.addr,
958965
host_ip=get_ip(),

0 commit comments

Comments
 (0)