Skip to content

Commit 490c222

Browse files
authored
fix destroy process group error when using p2p update (#30)
1 parent 9dcbf49 commit 490c222

File tree

1 file changed

+3
-1
lines changed

1 file changed

+3
-1
lines changed

checkpoint_engine/ps.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -792,13 +792,15 @@ def update(
792792
self.init_process_group()
793793
self._update_per_bucket(checkpoint_name, req_func)
794794
else:
795-
if self._rank not in ranks:
795+
if not self._auto_pg and self._rank not in ranks:
796796
return
797797
if self._auto_pg:
798798
if dist.is_initialized():
799799
dist.destroy_process_group()
800800
# HACK: wait 2s to ensure destroy is finished
801801
time.sleep(2)
802+
if self._rank not in ranks:
803+
return
802804
self.init_process_group_for_ranks(ranks)
803805
self._update_per_bucket_p2p(checkpoint_name, req_func, ranks)
804806
if self._auto_pg:

0 commit comments

Comments
 (0)