Skip to content

Commit b976022

Browse files
committed
fix: handle runtime error when getting memory pool
1 parent 9b644df commit b976022

File tree

1 file changed

+8
-1
lines changed

1 file changed

+8
-1
lines changed

checkpoint_engine/ps.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -930,6 +930,9 @@ def unregister_checkpoint(self, checkpoint_name: str):
930930
)
931931
return
932932

933+
# TODO: currently, we just mark the shared memory pool as unused when unregistering.
934+
# Physically releasing the shared memory pool is not supported yet.
935+
# We may add unregister shared memory pool logic in the future if necessary.
933936
if checkpoint_name == self._current_shared_memory_pool_user:
934937
self._current_shared_memory_pool_user = ""
935938
return
@@ -955,14 +958,18 @@ def gather_metas(self, checkpoint_name: str):
955958
self.init_process_group()
956959
assert dist.is_initialized(), "process group is not initialized"
957960
metas_lst: list[DataToGather | None] = [None for _ in range(self._world_size)] # type: ignore
961+
try:
962+
memory_pool = self._get_memory_pool(checkpoint_name)
963+
except RuntimeError:
964+
memory_pool = []
958965
metas = DataToGather(
959966
memory_buffer_metas_list=[
960967
MemoryBufferMetas(
961968
metas=x.metas,
962969
ptr=x.buffer.data_ptr(),
963970
size=x.size,
964971
)
965-
for x in (self._get_memory_pool(checkpoint_name) or [])
972+
for x in memory_pool
966973
],
967974
p2p_store_addr=None if self._p2p_store is None else self._p2p_store.addr,
968975
host_ip=get_ip(),

0 commit comments

Comments
 (0)