Skip to content

Commit 7e19c8a

Browse files
committed
feat: inplace-pin-memory needs a synchronization barrier
1 parent 9d4efa5 commit 7e19c8a

File tree

1 file changed

+5
-2
lines changed

1 file changed

+5
-2
lines changed

examples/update.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -100,8 +100,9 @@ def update_weights(
100100
update_method: Literal["broadcast", "p2p", "all"] = "broadcast",
101101
uds: str | None = None,
102102
):
103-
ps.register_checkpoint(checkpoint_name, files=checkpoint_files, named_tensors=named_tensors)
104103
ps.init_process_group()
104+
dist.barrier()
105+
ps.register_checkpoint(checkpoint_name, files=checkpoint_files, named_tensors=named_tensors)
105106
check_vllm_ready(endpoint, inference_parallel_size, uds)
106107
dist.barrier()
107108
with timer("Gather metas"):
@@ -173,7 +174,9 @@ def join(
173174
args.uds,
174175
)
175176
else:
176-
if os.path.exists(os.path.join(args.checkpoint_path, "model.safetensors.index.json")):
177+
if os.path.exists(
178+
os.path.join(args.checkpoint_path, "model.safetensors.index.json")
179+
) and not args.checkpoint_path.startswith("/dev/shm/"): # noqa: S108
177180
named_tensors = split_tensors(args.checkpoint_path, rank, world_size)
178181
checkpoint_files = []
179182
else:

0 commit comments

Comments
 (0)