1 parent 0def0b8 commit 75c8052
1 file changed
test/distributed/test_c10d_nccl.py
@@ -639,6 +639,14 @@ def _helper_test_extra_cuda_context_by_memory(self):
         """
         device = torch.device(f"cuda:{self.rank:d}")
         x = torch.empty((1,), device=device)
+
+        # We need this barrier to ensure that all nodes have completed init_process_group.
+        # If rank=0 gets a mem snapshot before other nodes have finished init_process_group,
+        # then we artificially see a bump in memory usage. As per the following comment,
+        # we are going to be moving away from this function:
+        # https://github.com/pytorch/pytorch/pull/154174#discussion_r2105065931
+        c10d.barrier()
+
         # Rank 0 takes a snapshot before collective -- this snapshot should have
         # included rank 0's own context.
         if self.rank == 0:
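The race the barrier closes: rank 0 reads device memory to establish a "before collective" baseline, but without synchronization it may do so while peer ranks are still inside `init_process_group`, so their later allocations show up as an artificial bump in the delta. Below is a minimal standalone sketch of that pattern, not the actual test harness: it assumes a `torchrun`-style launch (`RANK`, `WORLD_SIZE`, `MASTER_ADDR`, `MASTER_PORT` in the environment), uses `torch.cuda.mem_get_info` as a stand-in for whatever snapshot mechanism the helper really uses, and uses `all_reduce` as a placeholder collective.

```python
import os

import torch
import torch.distributed as c10d


def main() -> None:
    rank = int(os.environ["RANK"])
    # env:// init; torchrun supplies the rendezvous environment variables.
    c10d.init_process_group(backend="nccl")
    device = torch.device(f"cuda:{rank:d}")
    torch.cuda.set_device(device)
    x = torch.empty((1,), device=device)  # forces CUDA context creation on this rank

    # Without this barrier, rank 0 may take its baseline reading while peer
    # ranks are still inside init_process_group, inflating the later delta.
    c10d.barrier()

    if rank == 0:
        # (free, total) device memory; total - free covers every CUDA context
        # and NCCL allocation resident on the device at this point.
        free_before, total = torch.cuda.mem_get_info(device)
        used_before = total - free_before

    c10d.all_reduce(x)  # placeholder for the collective under test
    torch.cuda.synchronize(device)

    if rank == 0:
        free_after, total = torch.cuda.mem_get_info(device)
        used_after = total - free_after
        # The test's underlying assertion: the collective should not have
        # spawned an extra CUDA context, so usage should not jump here.
        print(f"memory delta after collective: {used_after - used_before} bytes")

    c10d.destroy_process_group()


if __name__ == "__main__":
    main()
```

Run with something like `torchrun --nproc-per-node=2 sketch.py` on a multi-GPU host; the barrier guarantees every rank has finished process-group setup before rank 0's baseline is taken.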