ubuntu@gb200-0:~/fireworks/symm-mem-recipes$ export GLOO_SOCKET_IFNAME=enp50s0 # helps TCP rendezvous on the right NIC
torchrun \
--nproc-per-node 4 --nnodes 2 --node_rank 0 \
--rdzv-endpoint 172.27.59.217:29500 \
--no_python python3 symm_mem_all_reduce.py --impl multimem_all_reduce --impl one_shot_all_reduce --impl two_shot_all_reduce
/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
import pynvml # type: ignore[import]
W1002 16:38:18.315000 223460 torch/distributed/run.py:803]
W1002 16:38:18.315000 223460 torch/distributed/run.py:803] *****************************************
W1002 16:38:18.315000 223460 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W1002 16:38:18.315000 223460 torch/distributed/run.py:803] *****************************************
/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
import pynvml # type: ignore[import]
/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
import pynvml # type: ignore[import]
/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
import pynvml # type: ignore[import]
/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
import pynvml # type: ignore[import]
Benchmarking two_shot_all_reduce...
[rank1]: Traceback (most recent call last):
[rank1]: File "/home/ubuntu/fireworks/symm-mem-recipes/symm_mem_all_reduce.py", line 123, in <module>
[rank1]: main()
[rank1]: File "/usr/local/lib/python3.12/dist-packages/click/core.py", line 1442, in __call__
[rank1]: return self.main(*args, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/usr/local/lib/python3.12/dist-packages/click/core.py", line 1363, in main
[rank1]: rv = self.invoke(ctx)
[rank1]: ^^^^^^^^^^^^^^^^
[rank1]: File "/usr/local/lib/python3.12/dist-packages/click/core.py", line 1226, in invoke
[rank1]: return ctx.invoke(self.callback, **ctx.params)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/usr/local/lib/python3.12/dist-packages/click/core.py", line 794, in invoke
[rank1]: return callback(*args, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/home/ubuntu/fireworks/symm-mem-recipes/symm_mem_all_reduce.py", line 117, in main
[rank1]: benchmark(device, impl, msg_sz_bytes)
[rank1]: File "/home/ubuntu/fireworks/symm-mem-recipes/symm_mem_all_reduce.py", line 70, in benchmark
[rank1]: symm_mem.rendezvous(msg, dist.group.WORLD.group_name)
[rank1]: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/_symmetric_memory/__init__.py", line 1734, in rendezvous
[rank1]: return _SymmetricMemory.rendezvous(tensor, group_name)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: RuntimeError: CUDASymmetricMemoryAllocator::rendezvous: detected allocations from overlapping devices from different ranks.
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/ubuntu/fireworks/symm-mem-recipes/symm_mem_all_reduce.py", line 123, in <module>
[rank0]: main()
[rank0]: File "/usr/local/lib/python3.12/dist-packages/click/core.py", line 1442, in __call__
[rank0]: return self.main(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/usr/local/lib/python3.12/dist-packages/click/core.py", line 1363, in main
[rank0]: rv = self.invoke(ctx)
[rank0]: ^^^^^^^^^^^^^^^^
[rank0]: File "/usr/local/lib/python3.12/dist-packages/click/core.py", line 1226, in invoke
[rank0]: return ctx.invoke(self.callback, **ctx.params)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/usr/local/lib/python3.12/dist-packages/click/core.py", line 794, in invoke
[rank0]: return callback(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/home/ubuntu/fireworks/symm-mem-recipes/symm_mem_all_reduce.py", line 117, in main
[rank0]: benchmark(device, impl, msg_sz_bytes)
[rank0]: File "/home/ubuntu/fireworks/symm-mem-recipes/symm_mem_all_reduce.py", line 70, in benchmark
[rank0]: symm_mem.rendezvous(msg, dist.group.WORLD.group_name)
[rank0]: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/_symmetric_memory/__init__.py", line 1734, in rendezvous
[rank0]: return _SymmetricMemory.rendezvous(tensor, group_name)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: RuntimeError: CUDASymmetricMemoryAllocator::rendezvous: detected allocations from overlapping devices from different ranks.
[rank3]: Traceback (most recent call last):
[rank3]: File "/home/ubuntu/fireworks/symm-mem-recipes/symm_mem_all_reduce.py", line 123, in <module>
[rank3]: main()
[rank3]: File "/usr/local/lib/python3.12/dist-packages/click/core.py", line 1442, in __call__
[rank3]: return self.main(*args, **kwargs)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/usr/local/lib/python3.12/dist-packages/click/core.py", line 1363, in main
[rank3]: rv = self.invoke(ctx)
[rank3]: ^^^^^^^^^^^^^^^^
[rank3]: File "/usr/local/lib/python3.12/dist-packages/click/core.py", line 1226, in invoke
[rank3]: return ctx.invoke(self.callback, **ctx.params)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/usr/local/lib/python3.12/dist-packages/click/core.py", line 794, in invoke
[rank3]: return callback(*args, **kwargs)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/home/ubuntu/fireworks/symm-mem-recipes/symm_mem_all_reduce.py", line 117, in main
[rank3]: benchmark(device, impl, msg_sz_bytes)
[rank3]: File "/home/ubuntu/fireworks/symm-mem-recipes/symm_mem_all_reduce.py", line 70, in benchmark
[rank3]: symm_mem.rendezvous(msg, dist.group.WORLD.group_name)
[rank3]: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/_symmetric_memory/__init__.py", line 1734, in rendezvous
[rank3]: return _SymmetricMemory.rendezvous(tensor, group_name)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: RuntimeError: CUDASymmetricMemoryAllocator::rendezvous: detected allocations from overlapping devices from different ranks.
[rank2]: Traceback (most recent call last):
[rank2]: File "/home/ubuntu/fireworks/symm-mem-recipes/symm_mem_all_reduce.py", line 123, in <module>
[rank2]: main()
[rank2]: File "/usr/local/lib/python3.12/dist-packages/click/core.py", line 1442, in __call__
[rank2]: return self.main(*args, **kwargs)
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank2]: File "/usr/local/lib/python3.12/dist-packages/click/core.py", line 1363, in main
[rank2]: rv = self.invoke(ctx)
[rank2]: ^^^^^^^^^^^^^^^^
[rank2]: File "/usr/local/lib/python3.12/dist-packages/click/core.py", line 1226, in invoke
[rank2]: return ctx.invoke(self.callback, **ctx.params)
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank2]: File "/usr/local/lib/python3.12/dist-packages/click/core.py", line 794, in invoke
[rank2]: return callback(*args, **kwargs)
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank2]: File "/home/ubuntu/fireworks/symm-mem-recipes/symm_mem_all_reduce.py", line 117, in main
[rank2]: benchmark(device, impl, msg_sz_bytes)
[rank2]: File "/home/ubuntu/fireworks/symm-mem-recipes/symm_mem_all_reduce.py", line 70, in benchmark
[rank2]: symm_mem.rendezvous(msg, dist.group.WORLD.group_name)
[rank2]: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/_symmetric_memory/__init__.py", line 1734, in rendezvous
[rank2]: return _SymmetricMemory.rendezvous(tensor, group_name)
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank2]: RuntimeError: CUDASymmetricMemoryAllocator::rendezvous: detected allocations from overlapping devices from different ranks.
Testing on 2 GB200 nodes connected via multi-node NVLink.
Note: this works on a single node; the rendezvous error above appears when running across the two nodes.