Skip to content

Commit 4fc7f18

Browse files
Nemo-RL integration bugfixes for --transformer-impl inference_optimized (#3851)
Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com>
1 parent f8becec commit 4fc7f18

File tree

10 files changed

+259
-231
lines changed

10 files changed

+259
-231
lines changed

megatron/core/inference/contexts/dynamic_context.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -573,7 +573,6 @@ def __init__(self, model_config: TransformerConfig, inference_config: InferenceC
573573

574574
# Allocate GPU state.
575575
self.is_tensor_state_allocated = False
576-
self.is_symmetric_memory_initialized = False
577576
self.initialize_all_tensors()
578577

579578
# Print info.
@@ -2893,11 +2892,3 @@ def get_kvcache_utilization_stats(self) -> dict:
28932892
'total_request_count': int(total_request_count),
28942893
'max_requests': int(self.max_requests),
28952894
}
2896-
2897-
def maybe_initialize_symmetric_memory(self):
2898-
"""
2899-
Initializes symmetric memory for inference, if not already initialized
2900-
"""
2901-
if not self.is_symmetric_memory_initialized:
2902-
parallel_state._set_global_symmetric_memory_buffer()
2903-
self.is_symmetric_memory_initialized = True
Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
3+
"""Lazy-initialized symmetric memory manager for inference.
4+
5+
Provides a registry of SymmetricMemoryBuffer instances keyed by a
6+
user-supplied identifier (e.g. "tp", "ep"). Buffers are created on first
7+
access so that callers never need to worry about initialization ordering
8+
relative to the inference context.
9+
"""
10+
11+
from __future__ import annotations
12+
13+
import operator
14+
from functools import reduce
15+
from typing import Optional
16+
17+
import torch
18+
19+
try:
20+
import torch.distributed._symmetric_memory as symm_mem
21+
22+
HAVE_TORCH_SYMM_MEM = True
23+
except ImportError:
24+
HAVE_TORCH_SYMM_MEM = False
25+
26+
try:
27+
import triton # pylint: disable=unused-import
28+
29+
HAVE_TRITON = True
30+
except ImportError:
31+
HAVE_TRITON = False
32+
33+
34+
class SymmetricMemoryBuffer:
    """Symmetric memory buffer used in inference.

    This buffer is used by mcore-inference's low-latency NVLS all-gather
    and reduce-scatter collectives. When torch symmetric memory or triton
    is unavailable, or the rendezvous fails, the buffer degrades
    gracefully: both ``maybe_get_*`` accessors report unavailability
    (``None`` handle/tensors) instead of raising.
    """

    def __init__(self, size_in_mb, process_group):
        """Allocate and rendezvous a symmetric memory buffer.

        Args:
            size_in_mb: Buffer size in MiB.
            process_group: Process group over which the symmetric-memory
                rendezvous is performed.
        """
        if not HAVE_TORCH_SYMM_MEM or not HAVE_TRITON:
            # Hit when the user is running an older version of torch
            # without symmetric memory support, or triton is not installed.
            self.symm_buffer = None
            self.symm_mem_hdl = None
            return

        num_bytes = int(size_in_mb * 1024 * 1024)  # size in bytes
        try:
            symm_mem.enable_symm_mem_for_group(process_group.group_name)
            self.symm_buffer = symm_mem.empty(num_bytes, dtype=torch.uint8, device='cuda')
            self.symm_mem_hdl = symm_mem.rendezvous(self.symm_buffer, process_group)
        except RuntimeError:
            # If symmetric memory initialization fails, set buffer and handle
            # to None. This can happen if the process group is not fully
            # contained within an NVLink domain.
            self.symm_buffer = None
            self.symm_mem_hdl = None

    @staticmethod
    def _dtype_nbytes(numel, dtype) -> int:
        """Return the byte size of a tensor with ``numel`` elements of ``dtype``."""
        return numel * torch.tensor([], dtype=dtype).element_size()

    def _can_allocate(self, numel, dtype) -> bool:
        """Return whether enough symmetric memory is available for
        ``numel`` elements of ``dtype``. Always False when the buffer is
        in the disabled state."""
        if self.symm_mem_hdl is None:
            return False
        return self._dtype_nbytes(numel, dtype) <= self.symm_buffer.numel()

    def _allocate(self, numel, dtype) -> torch.Tensor:
        """Return a flat sub-tensor of ``numel`` elements of ``dtype`` viewed
        over the start of ``self.symm_buffer``.

        Caller is responsible for checking ``_can_allocate`` first.
        """
        required_bytes = self._dtype_nbytes(numel, dtype)
        return self.symm_buffer[0:required_bytes].view(dtype).view(numel)

    def maybe_get_tensors(self, tensor_specs, alignment=16):
        """Pack multiple tensors contiguously in the symmetric buffer with alignment.

        Each tensor's starting offset is aligned to ``alignment`` bytes
        (default 16 for 128-bit multimem access).

        Args:
            tensor_specs: List of ``(numel, dtype)`` tuples.
            alignment: Byte alignment for each tensor's start offset (default 16).

        Returns:
            ``{"handle": None, "tensors": None}`` if unavailable or there is
            insufficient space. On success,
            ``{"handle": symm_mem_hdl, "tensors": [(raw_byte_view, byte_offset), ...]}``
            where ``raw_byte_view`` is a uint8 slice of the buffer.
        """
        unavailable = {"handle": None, "tensors": None}
        if self.symm_mem_hdl is None:
            return unavailable

        # Compute each tensor's (start_offset, size) with aligned starts.
        slices = []
        current_offset = 0
        for numel, dtype in tensor_specs:
            nbytes = self._dtype_nbytes(numel, dtype)
            aligned_nbytes = ((nbytes + alignment - 1) // alignment) * alignment
            slices.append((current_offset, nbytes))
            current_offset += aligned_nbytes

        # current_offset is now the total packed footprint in bytes.
        if not self._can_allocate(current_offset, torch.uint8):
            return unavailable

        tensors = [
            (self.symm_buffer[offset : offset + nbytes], offset) for offset, nbytes in slices
        ]
        return {"handle": self.symm_mem_hdl, "tensors": tensors}

    def maybe_get_tensor(self, tensor_shape, dtype):
        """Return (potentially) a sub-tensor of the symmetric buffer with the
        given shape and dtype.

        Returns:
            ``{"tensor": None, "handle": None}`` when symmetric memory is
            unavailable or too small; otherwise a dict with the shaped
            ``tensor`` view and the rendezvous ``handle``.
        """
        numel = reduce(operator.mul, tensor_shape, 1)
        # _can_allocate also covers the disabled (symm_mem_hdl is None) state.
        if not self._can_allocate(numel, dtype):
            return {"tensor": None, "handle": None}
        return {
            "tensor": self._allocate(numel, dtype).view(*tensor_shape),
            "handle": self.symm_mem_hdl,
        }
129+
130+
131+
class SymmetricMemoryManager:
    """Registry of lazily-initialized symmetric memory buffers.

    Buffers are keyed by a caller-chosen identifier (e.g. ``"tp"``, ``"ep"``)
    and created the first time that key is requested.

    Usage::

        buf = SymmetricMemoryManager.get_buffer("tp", process_group=tp_group)
        result = buf.maybe_get_tensor(shape, dtype)
    """

    # Registry shared by all callers in the process.
    _buffers: dict[str, SymmetricMemoryBuffer] = {}
    # Fallback buffer size (MiB) when a caller does not pass size_mb.
    _default_size_mb: int = 256

    @classmethod
    def get_buffer(
        cls,
        key: str,
        process_group: Optional[torch.distributed.ProcessGroup] = None,
        size_mb: Optional[int] = None,
    ) -> SymmetricMemoryBuffer:
        """Return the buffer for *key*, creating it on first call.

        Args:
            key: Unique identifier (e.g. "tp", "ep").
            process_group: Required on the first call for a given key.
                Subsequent calls may omit it.
            size_mb: Buffer size in MiB (default 256).
        """
        buffer = cls._buffers.get(key)
        if buffer is None:
            # First access: we need a process group to rendezvous with.
            assert (
                process_group is not None
            ), f"SymmetricMemoryManager: process_group is required on first access for key='{key}'"
            requested_mb = size_mb if size_mb else cls._default_size_mb
            buffer = SymmetricMemoryBuffer(size_in_mb=requested_mb, process_group=process_group)
            cls._buffers[key] = buffer
        return buffer

    @classmethod
    def destroy(cls, key: Optional[str] = None) -> None:
        """Destroy one or all buffers.

        Args:
            key: If provided, destroy only that buffer. Otherwise destroy all.
        """
        if key is None:
            cls._buffers.clear()
        else:
            # Missing keys are ignored, so destroy() is always safe to call.
            cls._buffers.pop(key, None)

    @classmethod
    def is_initialized(cls, key: str) -> bool:
        """Check whether a buffer has been created for *key*."""
        return cls._buffers.get(key) is not None

megatron/core/inference/text_generation_controllers/text_generation_controller.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -580,10 +580,6 @@ def _dynamic_step_context_init(
580580
else:
581581
set_decode_expert_padding(unwrapped_model, False)
582582

583-
# initialize symmetric memory if needed
584-
if model_config.transformer_impl == "inference_optimized":
585-
context.maybe_initialize_symmetric_memory()
586-
587583
if nccl_all_reduce_for_prefill and symmetric_ar_type is not None:
588584
if context.is_decode_only():
589585
# Turn on symmetric all reduce when in decode mode
@@ -1595,11 +1591,6 @@ def dummy_forward(self):
15951591
context = self.inference_wrapped_model.inference_context
15961592
# if no cuda graphs, directly use dummy forward
15971593
if not context.cuda_graph_batch_dimensions_list:
1598-
# initialize symmetric memory if needed
1599-
unwrapped_model = unwrap_model(self.inference_wrapped_model.model)
1600-
model_config = get_model_config(unwrapped_model)
1601-
if model_config.transformer_impl == "inference_optimized":
1602-
context.maybe_initialize_symmetric_memory()
16031594
self.inference_wrapped_model.dummy_forward()
16041595

16051596
# Disable MoE padding for MTP computation

megatron/core/parallel_state.py

Lines changed: 5 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@
1212
import numpy as np
1313
import torch
1414

15-
from .utils import GlobalMemoryBuffer, GlobalSymmetricMemoryBuffer, is_torch_min_version
15+
from megatron.core.inference.symmetric_memory import SymmetricMemoryManager
16+
17+
from .utils import GlobalMemoryBuffer, is_torch_min_version
1618

1719
logger = logging.getLogger(__name__)
1820

@@ -138,9 +140,6 @@
138140
# Memory buffers to avoid dynamic memory allocation
139141
_GLOBAL_MEMORY_BUFFER = None
140142

141-
# Global symmetric memory buffers for inference
142-
_GLOBAL_SYMMETRIC_MEMORY_BUFFER_TP = None
143-
_GLOBAL_SYMMETRIC_MEMORY_BUFFER_EP = None
144143

145144
# List of all process groups
146145
# Used for updating the timeout for all process groups
@@ -2017,62 +2016,18 @@ def _set_global_memory_buffer():
20172016
_GLOBAL_MEMORY_BUFFER = GlobalMemoryBuffer()
20182017

20192018

2020-
def _set_global_symmetric_memory_buffer():
2021-
"""Initialize global buffer."""
2022-
global _GLOBAL_SYMMETRIC_MEMORY_BUFFER_TP, _GLOBAL_SYMMETRIC_MEMORY_BUFFER_EP
2023-
assert (
2024-
_GLOBAL_SYMMETRIC_MEMORY_BUFFER_TP is None
2025-
), "global symmetric memory buffer for TP is already initialized"
2026-
assert (
2027-
_GLOBAL_SYMMETRIC_MEMORY_BUFFER_EP is None
2028-
), "global symmetric memory buffer for EP is already initialized"
2029-
2030-
_GLOBAL_SYMMETRIC_MEMORY_BUFFER_TP = GlobalSymmetricMemoryBuffer(
2031-
size_in_mb=256, # todo: set from an argument?
2032-
process_group=get_tensor_model_parallel_group(),
2033-
)
2034-
2035-
_GLOBAL_SYMMETRIC_MEMORY_BUFFER_EP = GlobalSymmetricMemoryBuffer(
2036-
size_in_mb=256, # todo: set from an argument?
2037-
process_group=get_expert_model_parallel_group(),
2038-
)
2039-
2040-
20412019
def get_global_memory_buffer():
20422020
"""Return the global GlobalMemoryBuffer object"""
20432021
assert _GLOBAL_MEMORY_BUFFER is not None, "global memory buffer is not initialized"
20442022
return _GLOBAL_MEMORY_BUFFER
20452023

20462024

2047-
def get_global_symmetric_memory_buffer_tp():
2048-
"""Return the global GlobalSymmetricMemoryBuffer object"""
2049-
assert (
2050-
_GLOBAL_SYMMETRIC_MEMORY_BUFFER_TP is not None
2051-
), "global symmetric memory buffer is not initialized"
2052-
return _GLOBAL_SYMMETRIC_MEMORY_BUFFER_TP
2053-
2054-
2055-
def get_global_symmetric_memory_buffer_ep():
2056-
"""Return the global GlobalSymmetricMemoryBuffer object"""
2057-
assert (
2058-
_GLOBAL_SYMMETRIC_MEMORY_BUFFER_EP is not None
2059-
), "global symmetric memory buffer is not initialized"
2060-
return _GLOBAL_SYMMETRIC_MEMORY_BUFFER_EP
2061-
2062-
20632025
def destroy_global_memory_buffer():
20642026
"""Sets the global memory buffer to None"""
20652027
global _GLOBAL_MEMORY_BUFFER
20662028
_GLOBAL_MEMORY_BUFFER = None
20672029

20682030

2069-
def destroy_global_symmetric_memory_buffer():
2070-
"""Sets the global symmetric memory buffer to None"""
2071-
global _GLOBAL_SYMMETRIC_MEMORY_BUFFER_TP, _GLOBAL_SYMMETRIC_MEMORY_BUFFER_EP
2072-
_GLOBAL_SYMMETRIC_MEMORY_BUFFER_TP = None
2073-
_GLOBAL_SYMMETRIC_MEMORY_BUFFER_EP = None
2074-
2075-
20762031
def get_all_ranks():
20772032
"""Get caller's rank in tensor-model-parallel, data-parallel, context-parallel,
20782033
pipeline-model-parallel and expert-model-parallel groups."""
@@ -2151,12 +2106,6 @@ def destroy_model_parallel():
21512106
global _GLOBAL_MEMORY_BUFFER
21522107
_GLOBAL_MEMORY_BUFFER = None
21532108

2154-
global _GLOBAL_SYMMETRIC_MEMORY_BUFFER_TP
2155-
_GLOBAL_SYMMETRIC_MEMORY_BUFFER_TP = None
2156-
2157-
global _GLOBAL_SYMMETRIC_MEMORY_BUFFER_EP
2158-
_GLOBAL_SYMMETRIC_MEMORY_BUFFER_EP = None
2159-
21602109
global _DATA_PARALLEL_GROUP_GLOO
21612110
if (
21622111
_DATA_PARALLEL_GROUP_GLOO is not None
@@ -2239,3 +2188,5 @@ def destroy_model_parallel():
22392188

22402189
global _global_process_group_list
22412190
_global_process_group_list = None
2191+
2192+
SymmetricMemoryManager.destroy()

0 commit comments

Comments
 (0)