File tree Expand file tree Collapse file tree 1 file changed +8
-1
lines changed
Expand file tree Collapse file tree 1 file changed +8
-1
lines changed Original file line number Diff line number Diff line change @@ -920,6 +920,9 @@ def unregister_checkpoint(self, checkpoint_name: str):
920920 )
921921 return
922922
923+ # TODO: Currently, unregistering only marks the shared memory pool as unused.
924+ # Physically releasing the shared memory pool is not supported yet.
925+ # We may add logic to actually release the shared memory pool in the future if necessary.
923926 if checkpoint_name == self ._current_shared_memory_pool_user :
924927 self ._current_shared_memory_pool_user = ""
925928 return
@@ -945,14 +948,18 @@ def gather_metas(self, checkpoint_name: str):
945948 self .init_process_group ()
946949 assert dist .is_initialized (), "process group is not initialized"
947950 metas_lst : list [DataToGather | None ] = [None for _ in range (self ._world_size )] # type: ignore
951+ try :
952+ memory_pool = self ._get_memory_pool (checkpoint_name )
953+ except RuntimeError :
954+ memory_pool = []
948955 metas = DataToGather (
949956 memory_buffer_metas_list = [
950957 MemoryBufferMetas (
951958 metas = x .metas ,
952959 ptr = x .buffer .data_ptr (),
953960 size = x .size ,
954961 )
955- for x in ( self . _get_memory_pool ( checkpoint_name ) or [])
962+ for x in memory_pool
956963 ],
957964 p2p_store_addr = None if self ._p2p_store is None else self ._p2p_store .addr ,
958965 host_ip = get_ip (),
You can’t perform that action at this time.
0 commit comments