File tree Expand file tree Collapse file tree 1 file changed +8
-1
lines changed
Expand file tree Collapse file tree 1 file changed +8
-1
lines changed Original file line number Diff line number Diff line change @@ -930,6 +930,9 @@ def unregister_checkpoint(self, checkpoint_name: str):
930930 )
931931 return
932932
933+ # TODO: unregistering currently only marks the shared memory pool as unused.
934+ # Physically releasing the shared memory pool is not supported yet;
935+ # logic to actually release it may be added in the future if necessary.
933936 if checkpoint_name == self ._current_shared_memory_pool_user :
934937 self ._current_shared_memory_pool_user = ""
935938 return
@@ -955,14 +958,18 @@ def gather_metas(self, checkpoint_name: str):
955958 self .init_process_group ()
956959 assert dist .is_initialized (), "process group is not initialized"
957960 metas_lst : list [DataToGather | None ] = [None for _ in range (self ._world_size )] # type: ignore
961+ try :
962+ memory_pool = self ._get_memory_pool (checkpoint_name )
963+ except RuntimeError :
964+ memory_pool = []
958965 metas = DataToGather (
959966 memory_buffer_metas_list = [
960967 MemoryBufferMetas (
961968 metas = x .metas ,
962969 ptr = x .buffer .data_ptr (),
963970 size = x .size ,
964971 )
965- for x in ( self . _get_memory_pool ( checkpoint_name ) or [])
972+ for x in memory_pool
966973 ],
967974 p2p_store_addr = None if self ._p2p_store is None else self ._p2p_store .addr ,
968975 host_ip = get_ip (),
You can’t perform that action at this time.
0 commit comments