vllm-project · bbrowning · Dec 12, 2025 · bbrowning · Dec 16, 2025
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -273,9 +273,10 @@ def custom_all_reduce(self, input: torch.Tensor) -> torch.Tensor | None:
             if torch.cuda.is_current_stream_capturing():
                 return self.all_reduce(input, registered=True)
             else:
-                # If warm up, mimic the allocation pattern since custom
-                # allreduce is out-of-place.
-                return torch.empty_like(input)
+                # During warmup, we need to run the all_reduce operation so
+                # that the allocation, copy, and collective behaviors
+                # are consistent between warmup, capture, and replay.
+                return self.all_reduce(input, registered=False)
         else:
             # Note: outside of cuda graph context, custom allreduce incurs a
             # cost of cudaMemcpy, which should be small (<=1% of overall