
Commit b3b4e9b

Add barrier around dataset processor for race condition
1 parent 29a34ed commit b3b4e9b

File tree: 5 files changed (+66 −16 lines)

DGraph/CommunicatorBase.py

Lines changed: 3 additions & 0 deletions

@@ -26,3 +26,6 @@ def get_rank(self) -> int:

     def get_world_size(self) -> int:
         raise NotImplementedError
+
+    def barrier(self):
+        raise NotImplementedError
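For context, barrier() joins get_rank() and get_world_size() in the abstract communicator interface, so every concrete backend is expected to override it. Below is a minimal sketch of what a CPU-only subclass built on torch.distributed's gloo backend could look like; the class name, import path, and constructor are illustrative and not part of this commit:

```python
import torch.distributed as dist

from DGraph.CommunicatorBase import CommunicatorBase  # module path assumed from the file above


class GlooCommunicator(CommunicatorBase):
    """Illustrative backend; only the methods touched by this commit are shown."""

    def __init__(self):
        if not dist.is_initialized():
            dist.init_process_group(backend="gloo")

    def get_rank(self) -> int:
        return dist.get_rank()

    def get_world_size(self) -> int:
        return dist.get_world_size()

    def barrier(self):
        # Block until every rank in the process group reaches this point.
        dist.barrier()
```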

DGraph/data/ogbn_datasets.py

Lines changed: 14 additions & 3 deletions

@@ -174,9 +174,20 @@ def __init__(
         self._rank = self.comm_object.get_rank()
         self._world_size = self.comm_object.get_world_size()

-        self.dataset = NodePropPredDataset(
-            name=dname,
-        )
+        comm_object.barrier()
+        if comm_object.get_rank() == 0:
+            self.dataset = NodePropPredDataset(
+                name=dname,
+            )
+        # Block until the dataset is loaded on rank 0
+        comm_object.barrier()
+        # Load the dataset on all other ranks; this reuses the data
+        # already downloaded and processed by rank 0
+        if comm_object.get_rank() != 0:
+            self.dataset = NodePropPredDataset(
+                name=dname,
+            )
+
         graph_data, labels = self.dataset[0]

         self.split_idx = self.dataset.get_idx_split()
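The change follows the usual "download on rank 0 only" pattern: all ranks synchronize, rank 0 downloads and preprocesses the OGB dataset, and the other ranks wait at a second barrier before constructing the same dataset object from the files rank 0 has already written. Without the barriers, several ranks could race to download and preprocess into the same directory. A standalone sketch of the same pattern using torch.distributed directly, assuming a shared filesystem and an already-initialized process group:

```python
import torch.distributed as dist
from ogb.nodeproppred import NodePropPredDataset


def load_dataset_once(name: str = "ogbn-products") -> NodePropPredDataset:
    # Only rank 0 downloads and preprocesses the raw files.
    if dist.get_rank() == 0:
        dataset = NodePropPredDataset(name=name)
    # Everyone blocks until rank 0 has finished writing the processed data.
    dist.barrier()
    if dist.get_rank() != 0:
        # Re-reads the processed files left by rank 0; no second download happens.
        dataset = NodePropPredDataset(name=name)
    return dataset
```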

DGraph/distributed/nccl/NCCLBackendEngine.py

Lines changed: 5 additions & 0 deletions

@@ -512,6 +512,11 @@ def __init__(self, ranks_per_graph=-1, *args, **kwargs):
         if not NCCLBackendEngine._is_initialized:
             self.init_process_group(ranks_per_graph)

+    def barrier(self) -> None:
+        if not dist.is_initialized():
+            raise RuntimeError("NCCL backend engine is not initialized")
+        dist.barrier()
+
     def init_process_group(self, ranks_per_graph=-1, *args, **kwargs):
         if not dist.is_initialized():
             dist.init_process_group(backend="nccl", *args, **kwargs)
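One caveat when exercising this path: with the NCCL backend, dist.barrier() executes as a collective on a CUDA device, so each process should have selected its GPU before calling it (the experiment scripts do this with torch.cuda.set_device). A hedged usage sketch, assuming the job is launched with torchrun and that the module path mirrors the file path above:

```python
import os

import torch
import torch.distributed as dist

from DGraph.distributed.nccl.NCCLBackendEngine import NCCLBackendEngine

local_rank = int(os.environ["LOCAL_RANK"])  # set by torchrun
torch.cuda.set_device(local_rank)           # NCCL collectives run on this device

engine = NCCLBackendEngine()  # initializes the nccl process group if needed
engine.barrier()              # every rank blocks here until all ranks arrive

# The device can also be pinned explicitly on the underlying collective:
dist.barrier(device_ids=[local_rank])
```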

experiments/OGB/GCN.py

Lines changed: 18 additions & 0 deletions

@@ -14,6 +14,7 @@
 import torch
 import torch.nn as nn
 import torch.distributed as dist
+from DGraph.utils.TimingReport import TimingReport


 class ConvLayer(nn.Module):
@@ -54,24 +55,41 @@ def forward(
         num_local_nodes = node_features.size(1)
         _src_indices = edge_index[:, 0, :]
         _dst_indices = edge_index[:, 1, :]
+        TimingReport.start("pre-processing")
         _src_rank_mappings = torch.cat(
             [rank_mapping[0].unsqueeze(0), rank_mapping[0].unsqueeze(0)], dim=0
         )
         _dst_rank_mappings = torch.cat(
             [rank_mapping[0].unsqueeze(0), rank_mapping[1].unsqueeze(0)], dim=0
         )
+        TimingReport.stop("pre-processing")
+        TimingReport.start("Gather_1")
         x = self.comm.gather(
             node_features, _dst_indices, _dst_rank_mappings, cache=gather_cache
         )
+        TimingReport.stop("Gather_1")
+        TimingReport.start("Conv_1")
         x = self.conv1(x)
+        TimingReport.stop("Conv_1")
+        TimingReport.start("Scatter_1")
         x = self.comm.scatter(
             x, _src_indices, _src_rank_mappings, num_local_nodes, cache=scatter_cache
         )
+        TimingReport.stop("Scatter_1")
+        TimingReport.start("Gather_2")
         x = self.comm.gather(x, _dst_indices, _dst_rank_mappings, cache=gather_cache)
+        TimingReport.stop("Gather_2")
+        TimingReport.start("Conv_2")
         x = self.conv2(x)
+        TimingReport.stop("Conv_2")
+        TimingReport.start("Scatter_2")
         x = self.comm.scatter(
             x, _src_indices, _src_rank_mappings, num_local_nodes, cache=scatter_cache
         )
+        TimingReport.stop("Scatter_2")
+        TimingReport.start("Final_FC")
         x = self.fc(x)
+        TimingReport.stop("Final_FC")
+
         # x = self.softmax(x)
         return x
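DGraph.utils.TimingReport itself is not part of this commit, so only its interface can be inferred from the call sites: init(comm), start(name), stop(name), add_time(name, ms), and a _timers dict that main.py serializes to JSON. The sketch below is one plausible implementation of that interface, written purely for illustration; the real utility may differ:

```python
from time import perf_counter


class TimingReport:
    """Process-wide named timers; durations are accumulated in milliseconds."""

    _timers: dict = {}  # name -> list of recorded durations (ms)
    _starts: dict = {}  # name -> perf_counter() value of the currently open interval
    _comm = None

    @classmethod
    def init(cls, comm):
        # Keep the communicator around, e.g. to tag reports with the rank later.
        cls._comm = comm
        cls._timers = {}

    @classmethod
    def start(cls, name: str) -> None:
        cls._starts[name] = perf_counter()

    @classmethod
    def stop(cls, name: str) -> None:
        elapsed_ms = (perf_counter() - cls._starts.pop(name)) * 1000
        cls.add_time(name, elapsed_ms)

    @classmethod
    def add_time(cls, name: str, elapsed_ms: float) -> None:
        cls._timers.setdefault(name, []).append(elapsed_ms)
```

Note that start/stop in this sketch measure host-side wall-clock time around asynchronous GPU work; the per-epoch GPU timing in the training loop still relies on CUDA events, as shown in main.py below.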

experiments/OGB/main.py

Lines changed: 26 additions & 13 deletions

@@ -38,6 +38,8 @@
 )
 import numpy as np
 import os
+from DGraph.utils.TimingReport import TimingReport
+import json


 class SingleProcessDummyCommunicator(CommunicatorBase):
@@ -81,6 +83,10 @@ def rank_cuda_device(self):
         device = torch.cuda.current_device()
         return device

+    def barrier(self):
+        # No-op for single process
+        pass
+

 def _run_experiment(
     dataset,
@@ -98,7 +104,7 @@ def _run_experiment(
     torch.cuda.set_device(local_rank)
     device = torch.cuda.current_device()
     model = GCN(
-        in_channels=100, hidden_dims=hidden_dims, num_classes=num_classes, comm=comm
+        in_channels=128, hidden_dims=hidden_dims, num_classes=num_classes, comm=comm
     )
     rank = comm.get_rank()
     model = model.to(device)
@@ -126,7 +132,7 @@ def _run_experiment(
     print(f"Rank: {rank} Mapping: {rank_mappings.shape}")
     print(f"Rank: {rank} Node Features: {node_features.shape}")
    print(f"Rank: {rank} Edge Indices: {edge_indices.shape}")
-    dist.barrier()
+    comm.barrier()
     criterion = torch.nn.CrossEntropyLoss()

     train_mask = dataset.graph_obj.get_local_mask("train", rank)
@@ -152,14 +158,13 @@ def _run_experiment(
     scatter_cache_file = f"{cache_prefix}_scatter_cache_{world_size}_{rank}.pt"
     gather_cache_file = f"{cache_prefix}_gather_cache_{world_size}_{rank}.pt"

+    # if os.path.exists(scatter_cache_file):
+    #     print(f"Rank: {rank} Loading scatter cache from {scatter_cache_file}")
+    #     scatter_cache = torch.load(scatter_cache_file, weights_only=False)
+    # else:
+    #     print(f"Rank: {rank} Scatter cache not found, generating new cache")
+    #     print(f"Rank: {rank} Cache file: {scatter_cache_file}")

-    if os.path.exists(scatter_cache_file):
-        print(f"Rank: {rank} Loading scatter cache from {scatter_cache_file}")
-        scatter_cache = torch.load(scatter_cache_file, weights_only=False)
-    else:
-        print(f"Rank: {rank} Scatter cache not found, generating new cache")
-        print(f"Rank: {rank} Cache file: {scatter_cache_file}")
-
     if os.path.exists(gather_cache_file):
         print(f"Rank: {rank} Loading gather cache from {gather_cache_file}")
         gather_cache = torch.load(gather_cache_file, weights_only=False)
@@ -227,7 +232,9 @@ def _run_experiment(
             assert rank != rank
             assert value.shape[0] == scatter_cache.gather_recv_comm_vector
         end_time = perf_counter()
-        print(f"Rank: {rank} Cache Generation Time: {end_time - start_time:.4f} s")
+        elapsed_time_in_ms = (end_time - start_time) * 1000
+        print(f"Rank: {rank} Cache Generation Time: {elapsed_time_in_ms:.4f} ms")
+        TimingReport.add_time("cache_generation_time", elapsed_time_in_ms)

         with open(f"{log_prefix}_gather_cache_{world_size}_{rank}.pt", "wb") as f:
             torch.save(gather_cache, f)
@@ -237,7 +244,7 @@ def _run_experiment(

     training_times = []
     for i in range(epochs):
-        dist.barrier()
+        comm.barrier()
         torch.cuda.synchronize()
         start_time = torch.cuda.Event(enable_timing=True)
         end_time = torch.cuda.Event(enable_timing=True)
@@ -254,7 +261,7 @@ def _run_experiment(
         dist_print_ephemeral(f"Epoch {i} \t Loss: {loss.item()}", rank)
         optimizer.step()

-        dist.barrier()
+        comm.barrier()
         end_time.record(stream)
         torch.cuda.synchronize()
         training_times.append(start_time.elapsed_time(end_time))
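The epoch loop that these two hunks touch times each iteration with CUDA events; switching from dist.barrier() to comm.barrier() keeps that pattern backend-agnostic. The sequence matters: ranks align at the barrier and the host synchronizes with the GPU before the start event is recorded, and again before elapsed_time is read, so a slow rank does not inflate another rank's measurement. A stripped-down sketch of the pattern (train_step is a placeholder for the forward/backward/optimizer step):

```python
import torch


def time_epoch_ms(comm, train_step) -> float:
    """Return the measured duration of one epoch in milliseconds."""
    comm.barrier()               # align all ranks before the timer starts
    torch.cuda.synchronize()     # drain work already queued on the GPU
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    stream = torch.cuda.current_stream()
    start.record(stream)
    train_step()                 # one epoch of forward/backward/step
    comm.barrier()               # wait for the slowest rank
    end.record(stream)
    torch.cuda.synchronize()     # ensure the end event has actually completed
    return start.elapsed_time(end)  # milliseconds between the two events
```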
@@ -362,6 +369,7 @@ def main(
         node_rank_placement_file, weights_only=False
     )

+    TimingReport.init(comm)
     safe_create_dir(log_dir, comm.get_rank())
     training_dataset = DistributedOGBWrapper(
         f"ogbn-{dataset}",
@@ -377,7 +385,7 @@ def main(
     validation_accuracies = np.zeros((runs, epochs))
     world_size = comm.get_world_size()

-    dist.barrier()
+    comm.barrier()
     print(f"Running experiment with {world_size} processes on dataset {dataset}")
     print(f"Using cache: {use_cache}")

@@ -397,6 +405,11 @@ def main(
         validation_trajectores[i] = val_traj
         validation_accuracies[i] = val_accuracy

+    write_experiment_log(
+        json.dumps(TimingReport._timers),
+        f"{log_dir}/timing_report_world_size_{world_size}_cache_{use_cache}.json",
+        comm.get_rank(),
+    )
     visualize_trajectories(
         training_trajectores,
         "Training Loss",
