1 file changed: +5 -2 lines changed
@@ -100,8 +100,9 @@ def update_weights(
100100 update_method : Literal ["broadcast" , "p2p" , "all" ] = "broadcast" ,
101101 uds : str | None = None ,
102102):
103- ps .register_checkpoint (checkpoint_name , files = checkpoint_files , named_tensors = named_tensors )
104103 ps .init_process_group ()
104+ dist .barrier ()
105+ ps .register_checkpoint (checkpoint_name , files = checkpoint_files , named_tensors = named_tensors )
105106 check_vllm_ready (endpoint , inference_parallel_size , uds )
106107 dist .barrier ()
107108 with timer ("Gather metas" ):
@@ -173,7 +174,9 @@ def join(
173174 args .uds ,
174175 )
175176 else :
176- if os .path .exists (os .path .join (args .checkpoint_path , "model.safetensors.index.json" )):
177+ if os .path .exists (
178+ os .path .join (args .checkpoint_path , "model.safetensors.index.json" )
179+ ) and not args .checkpoint_path .startswith ("/dev/shm/" ): # noqa: S108
177180 named_tensors = split_tensors (args .checkpoint_path , rank , world_size )
178181 checkpoint_files = []
179182 else :
You can’t perform that action at this time.
0 commit comments