Skip to content

Commit cb9a792

Browse files
committed
Revert "Use global user buffer when the bucket size does not fit FixedPoolAllocator (#2857)"
This reverts commit afe443b.
1 parent 2050da3 commit cb9a792

File tree

5 files changed

+27
-106
lines changed

5 files changed

+27
-106
lines changed

megatron/core/distributed/distributed_data_parallel_config.py

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -122,16 +122,7 @@ class DistributedDataParallelConfig:
122122
This option will cause additional memory overhead, however, it is necessary for
123123
to register user buffer (nccl_ub=True) for the Megatron FSDP.
124124
This option will be automatically set to True when nccl_ub=True.
125-
"""
126-
127-
fsdp_db_use_persist_buf_on_alloc_fail: bool = False
128-
"""Whether to fall back to persistent buffer when a bucket does not
129-
fit FSDP double buffer size. If true, FSDP will use the persistently
130-
allocated buffer for the bucket that does not fit, it will enable NCCL
131-
user buffer with the cost of more memory usage. If false, FSDP will use
132-
Dynamic memory allocator, NCCL user buffer won't be enabled, which
133-
usually leads to low performance.
134-
"""
125+
"""
135126

136127
fsdp_all_gather_in_start_param_sync: bool = True
137128
"""

megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -119,23 +119,6 @@ class DistributedDataParallelConfig:
119119
This option will be automatically set to True when nccl_ub=True.
120120
"""
121121

122-
fsdp_all_gather_in_start_param_sync: bool = True
123-
"""
124-
If True, use all-gather during the initial Megatron-FSDP parameter
125-
synchronization step. This can increase overlap between the first
126-
parameter all-gather and computation, helping to better hide the
127-
initial communication cost.
128-
"""
129-
130-
fsdp_db_use_persist_buf_on_alloc_fail: bool = False
131-
"""Whether to fall back to persistent buffer when a bucket does not
132-
fit FSDP double buffer size. If true, FSDP will use the persistently
133-
allocated buffer for the bucket that does not fit, it will enable NCCL
134-
user buffer with the cost of more memory usage. If false, FSDP will use
135-
Dynamic memory allocator, NCCL user buffer won't be enabled, which
136-
usually leads to low performance.
137-
"""
138-
139122
outer_dp_sharding_strategy: str = 'no_shard'
140123
"""
141124
Sharding strategy for outer data parallel group in Hybrid Sharded Data Parallel (HSDP) mode.

megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,6 @@ def fully_shard_model(
9898
keep_fp8_transpose_cache: bool = False,
9999
nccl_ub: bool = False,
100100
fsdp_double_buffer: bool = False,
101-
fsdp_db_use_persist_buf_on_alloc_fail: bool = False,
102101
disable_symmetric_registration: bool = False,
103102
enable_fine_grained_param_gather: bool = False,
104103
) -> torch.nn.Module:
@@ -233,10 +232,6 @@ class that schedules the sharding lifecycle of the model parameters and gradient
233232
fsdp_double_buffer (bool):
234233
Whether to use double buffer for FSDP. Defaults to False.
235234
236-
fsdp_db_use_persist_buf_on_alloc_fail (bool):
237-
Whether to fall back to persistent buffer allocator when a bucket does not
238-
fit FSDP double buffer size.
239-
240235
disable_symmetric_registration (bool):
241236
Whether to disable symmetric (window) registration for NCCL UB registration.
242237
This option forces conventional (local) UB registration when nccl_ub is set.
@@ -342,7 +337,6 @@ class that schedules the sharding lifecycle of the model parameters and gradient
342337
keep_fp8_transpose_cache=keep_fp8_transpose_cache, # pylint: disable=C0301
343338
nccl_ub=nccl_ub,
344339
fsdp_double_buffer=fsdp_double_buffer or nccl_ub,
345-
fsdp_db_use_persist_buf_on_alloc_fail=fsdp_db_use_persist_buf_on_alloc_fail,
346340
disable_symmetric_registration=disable_symmetric_registration,
347341
check_for_nan_in_grad=check_for_nan_in_grad,
348342
)
@@ -640,7 +634,6 @@ def fully_shard(
640634
keep_fp8_transpose_cache: bool = False,
641635
nccl_ub: bool = False,
642636
fsdp_double_buffer: bool = False,
643-
fsdp_db_use_persist_buf_on_alloc_fail: bool = False,
644637
disable_symmetric_registration: bool = False,
645638
enable_fine_grained_param_gather: bool = False,
646639
) -> tuple[MegatronFSDP, torch.optim.Optimizer]:
@@ -689,7 +682,6 @@ def fully_shard(
689682
keep_fp8_transpose_cache=keep_fp8_transpose_cache,
690683
nccl_ub=nccl_ub,
691684
fsdp_double_buffer=fsdp_double_buffer,
692-
fsdp_db_use_persist_buf_on_alloc_fail=fsdp_db_use_persist_buf_on_alloc_fail,
693685
disable_symmetric_registration=disable_symmetric_registration,
694686
enable_fine_grained_param_gather=enable_fine_grained_param_gather,
695687
)

megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,6 @@ class MegatronFSDP(torch.nn.Module):
136136
fsdp_double_buffer (bool): Whether to use persistently allocated double buffers
137137
for the temporary memory needed in the FSDP communication. This flag is
138138
automatically set to True when nccl_ub is True.
139-
fsdp_db_use_persist_buf_on_alloc_fail (bool): Whether to fall back to persistent buffer
140-
allocator when a bucket does not fit FSDP double buffer size.
141139
disable_symmetric_registration (bool): Whether to disable symmetric (window) registration
142140
for NCCL userbuffer registration. This option will force to use conventional (local)
143141
userbuffer registration when nccl_ub is set.
@@ -157,7 +155,6 @@ class MegatronFSDP(torch.nn.Module):
157155
... keep_fp8_transpose_cache=False,
158156
... nccl_ub=False,
159157
... fsdp_double_buffer=False,
160-
... fsdp_db_use_persist_buf_on_alloc_fail=False,
161158
... disable_symmetric_registration=False,
162159
... )
163160
"""
@@ -176,7 +173,6 @@ def __init__(
176173
keep_fp8_transpose_cache: bool = False,
177174
nccl_ub: bool = False,
178175
fsdp_double_buffer: bool = False,
179-
fsdp_db_use_persist_buf_on_alloc_fail: bool = False,
180176
disable_symmetric_registration: bool = False,
181177
enable_fine_grained_param_gather_hook: bool = False,
182178
):
@@ -221,7 +217,6 @@ def __init__(
221217
keep_fp8_transpose_cache=keep_fp8_transpose_cache, # pylint: disable=C0301
222218
nccl_ub=nccl_ub,
223219
fsdp_double_buffer=fsdp_double_buffer or nccl_ub,
224-
fsdp_db_use_persist_buf_on_alloc_fail=fsdp_db_use_persist_buf_on_alloc_fail,
225220
disable_symmetric_registration=disable_symmetric_registration,
226221
)
227222
else:

megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py

Lines changed: 26 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -658,13 +658,7 @@ class FixedPoolAllocator(TemporaryBucketAllocator):
658658
deallocation of temporary buffers during FSDP operations.
659659
"""
660660

661-
def __init__(
662-
self,
663-
name: str,
664-
fsdp_param_groups: List["ParameterGroup"],
665-
size: int = 2,
666-
fallback_to_persistent_buffer: bool = False,
667-
):
661+
def __init__(self, name: str, fsdp_param_groups: List["ParameterGroup"], size: int = 2):
668662
self.name = name
669663
self.fsdp_param_groups = fsdp_param_groups
670664
self.size = size # Number of buffers in the pool (default is 2 for double buffering)
@@ -697,29 +691,6 @@ def __init__(
697691
), "Found no FSDP units to use fixed-size buffering"
698692
self.fsdp_double_buffer_units = fsdp_units_to_double_buffer
699693

700-
if torch.distributed.get_rank() == 0:
701-
for bucket_id, param_group in enumerate(fsdp_param_groups):
702-
if (
703-
param_group.fsdp_unit_id == -1
704-
or param_group.fsdp_unit_id is None
705-
or param_group.fsdp_unit_id not in self.fsdp_double_buffer_units
706-
):
707-
logging.info(
708-
f"FSDP unit (id={param_group.fsdp_unit_id}) does not fit "
709-
"in FixedPoolAllocator"
710-
)
711-
if fallback_to_persistent_buffer is False:
712-
logging.info(
713-
"It will fall back to dynamic memory allocator, NCCL user "
714-
"buffer is not supported"
715-
)
716-
else:
717-
logging.info(
718-
"It will be allocated a persistent buffer. If the memory "
719-
"budget is tight, set "
720-
"trainer.strategy.ddp.fsdp_db_use_persist_buf_on_alloc_fail to False."
721-
)
722-
723694
# Initialize buffer group status.
724695
# Each buffer group represents a set of buffers associated with an FSDP unit's bucket group.
725696
self.idle_buffer = [] # List of available (buf_group_id, offset) tuples.
@@ -732,7 +703,6 @@ def __init__(
732703
self.idle_buffer.append((buf_group_id, bucket_offset))
733704

734705
# Fallback allocator used if the fixed pool allocator cannot fulfill a request.
735-
self.fallback_to_persistent_buffer = fallback_to_persistent_buffer
736706
self.backup_allocator = TemporaryBucketAllocator()
737707

738708
def _is_two_bucket_group_equal(self, group_a, group_b):
@@ -785,31 +755,28 @@ def allocate(
785755
f"current using_buffer: {self.using_buffer} \n"
786756
f"current idle_buffer: {self.idle_buffer}"
787757
)
788-
elif self.fallback_to_persistent_buffer is True:
789-
buffer_name = f"{self.name}_not_fit_in_fixed_pool_{bucket_id}_{size}_{dtype}_{device}"
790-
else:
791-
# If the bucket is not eligible for fixed pool buffering, or no buffer is available,
792-
# fall back to dynamic allocation via the backup allocator. This means that we
793-
# will do dynamic memory allocation.
794-
logging.debug(f"[FSDP] Using backup allocator for {bucket_id} {fsdp_unit_id}")
795-
return self.backup_allocator.allocate(
796-
bucket_id=bucket_id, size=size, dtype=dtype, device=device
758+
# Synchronization is required before the allocation for the user buffer
759+
if mem_alloc_context is not None and mem_alloc_context != nullcontext:
760+
# Check if a new buffer allocation is required
761+
if (
762+
self.allocation_tracker.get((buffer_name, dtype), None) is None
763+
or self.allocation_tracker[(buffer_name, dtype)] < size
764+
):
765+
# Requires synchronization for new buffer allocation
766+
self.allocation_tracker[(buffer_name, dtype)] = size
767+
torch.cuda.synchronize()
768+
return Bucket(
769+
data=get_global_memory_buffer().get_tensor(
770+
[size], dtype=dtype, name=buffer_name, mem_alloc_context=mem_alloc_context
771+
)
797772
)
798773

799-
# Use buffer_name to get memory from global memory.
800-
if mem_alloc_context is not None and mem_alloc_context != nullcontext:
801-
# Check if a new buffer allocation is required
802-
if (
803-
self.allocation_tracker.get((buffer_name, dtype), None) is None
804-
or self.allocation_tracker[(buffer_name, dtype)] < size
805-
):
806-
# Requires synchronization for new buffer allocation
807-
self.allocation_tracker[(buffer_name, dtype)] = size
808-
torch.cuda.synchronize()
809-
return Bucket(
810-
data=get_global_memory_buffer().get_tensor(
811-
[size], dtype=dtype, name=buffer_name, mem_alloc_context=mem_alloc_context
812-
)
774+
# If the bucket is not eligible for fixed pool buffering, or no buffer is available,
775+
# fall back to dynamic allocation via the backup allocator. This means that we
776+
# will do dynamic memory allocation.
777+
logging.debug(f"[FSDP] Using backup allocator for {bucket_id} {fsdp_unit_id}")
778+
return self.backup_allocator.allocate(
779+
bucket_id=bucket_id, size=size, dtype=dtype, device=device
813780
)
814781

815782
def _get_gbuf_name(self, buf_group_id: int, bucket_index: int):
@@ -828,10 +795,9 @@ def free(self, bucket_id: int):
828795
self.idle_buffer.append(self.using_buffer[bucket_id])
829796
del self.using_buffer[bucket_id]
830797
return
831-
if self.fallback_to_persistent_buffer is False:
832-
# If not managed by fixed pool allocator, delegate to the backup allocator.
833-
logging.debug(f"[FSDP] Free from the backup allocator for {bucket_id} {fsdp_unit_id}")
834-
self.backup_allocator.free(bucket_id)
798+
# If not managed by fixed pool allocator, delegate to the backup allocator.
799+
logging.debug(f"[FSDP] Free from the backup allocator for {bucket_id} {fsdp_unit_id}")
800+
self.backup_allocator.free(bucket_id)
835801

836802

837803
class DataParallelBuffer:
@@ -1908,21 +1874,15 @@ def _init_each_parameter_group_buffers(self, meta_device_init_fp8_params):
19081874
if self.ddp_config.fsdp_double_buffer and len(self.bucketing_policy.fsdp_unit_modules) > 0:
19091875
UB_BUFFER_NUM = 2
19101876
self.weight_alloc = FixedPoolAllocator(
1911-
name="fsdp_params",
1912-
fsdp_param_groups=self.parameter_groups,
1913-
size=UB_BUFFER_NUM,
1914-
fallback_to_persistent_buffer=self.ddp_config.fsdp_db_use_persist_buf_on_alloc_fail,
1877+
name="fsdp_params", fsdp_param_groups=self.parameter_groups, size=UB_BUFFER_NUM
19151878
)
19161879
self.transpose_weight_alloc = FixedPoolAllocator(
19171880
name="fsdp_fp8_transpose_params",
19181881
fsdp_param_groups=self.parameter_groups,
19191882
size=UB_BUFFER_NUM,
19201883
)
19211884
self.main_grad_alloc = FixedPoolAllocator(
1922-
name="fsdp_grads",
1923-
fsdp_param_groups=self.parameter_groups,
1924-
size=UB_BUFFER_NUM,
1925-
fallback_to_persistent_buffer=self.ddp_config.fsdp_db_use_persist_buf_on_alloc_fail,
1885+
name="fsdp_grads", fsdp_param_groups=self.parameter_groups, size=UB_BUFFER_NUM
19261886
)
19271887
self.double_buf_units = self.weight_alloc.fsdp_double_buffer_units
19281888
else:

0 commit comments

Comments
 (0)