Update pipeline.py: compute BSW inside scan_body instead of carrying it through the scan

mesakhcienet · mesakhcienet · commit 4820092dc238 · 2026-04-10T16:40:52.000+08:00
diff --git a/src/maxtext/layers/pipeline.py b/src/maxtext/layers/pipeline.py
@@ -968,24 +968,11 @@ def init_states(self, inputs):
     """Initializes the pipeline execution state and communication buffers.
 
     This sets up the memory needed to pass activations between pipeline stages
-    (`state_io` and `shift`) and allocates the empty Buffer Sliding Window (BSW)
-    that will hold the gathered FSDP weights.
+    (`state_io` and `shift`). The BSW (Buffer Sliding Window) is not allocated
+    here: `scan_body` rebuilds it locally on every iteration so that, when
+    `scan_body` is wrapped in `jax.checkpoint`, it can be discarded between iterations to prevent OOM.
     """
-    loop_state = super().init_states(inputs)
-
-    weights = nnx.state(self.layers, _is_static_param)
-
-    def get_single_repeat_shape(x):
-      if x is None:
-        return None
-      return jnp.zeros_like(x[0]) if self.config.num_pipeline_repeats > 1 else jnp.zeros_like(x)
-
-    bsw = (
-        jax.tree.map(get_single_repeat_shape, weights),
-        jax.tree.map(get_single_repeat_shape, weights),
-    )
-
-    return loop_state, bsw
+    return super().init_states(inputs)
 
   def gather_microbatch_inputs_vmap(self, xs, ids, ids_dim):
     """Slices out the specific sequence inputs (e.g., positions, segments) for the current microbatch."""
@@ -1371,9 +1358,11 @@ def __call__(
           (self.config.num_pipeline_microbatches, self.pipeline_microbatch_size, self.config.max_target_length)
       )
 
-    loop_state, bsw = self.init_states(inputs)
+    loop_state = self.init_states(inputs)
 
-    # - Full spec (with circular_repeats axis) -> BSW creation via weight_prefetching.
+    # Two spec variants needed:
+    # - Full spec (with circular_repeats axis) -> BSW creation inside scan_body via
+    #   from_all_variables_to_repeat_weights + from_repeat_weights_to_bsw.
     #   from_repeat_weights_to_bsw's derive_stage_weight_partition_specs drops the
     #   first dim (repeat), so the input must still have it.
     # - Stripped logical spec (circular_repeats removed) -> BSW consumption via
@@ -1402,39 +1391,34 @@ def unbox_val(x):
 
     _, layers_params, layers_metrics, layers_mutables = nnx.split(layers_state, _is_static_param, nnx.Intermediate, ...)
 
-    # Pre-populate bsw[1] with iteration-0 weights so the first scan_body
-    # slide (next_bsw[0] = current_bsw[1]) picks up the correct weights.
-    # bsw[0] is a zero placeholder — it is immediately discarded by the slide.
-    init_repeat_weights = self.from_all_variables_to_repeat_weights(layers_params, 0)
-    init_w_curr = self.from_repeat_weights_to_bsw(init_repeat_weights, physical_partition_spec_full)
-    bsw = (bsw[0], init_w_curr)
-
     def scan_body(carry, _):
-      current_loop_state, current_bsw, current_layer_mutables = carry
+      current_loop_state, current_layer_mutables = carry
       # Fold loop_iteration into RNG keys so each scan step gets a unique
       # dropout mask — mirrors Linen's nn.scan(split_rngs={"random": True}).
       iteration = current_loop_state["loop_iteration"]
       advanced_mutables = _advance_rng_state(current_layer_mutables, iteration)
 
-      # 1. Async FSDP Prefetch — only gather NEXT repeat's weights (1 all-gather).
-      # The current repeat's weights are already in current_bsw[1], carried
-      # forward from the previous iteration's prefetch (sliding window).
-      # Use FULL spec - weight_prefetching drops the repeat axis internally via
-      # derive_stage_weight_partition_specs.
-      next_weight = self.weight_prefetching(
-          layers_params, physical_partition_spec_full, current_loop_state["loop_iteration"]
-      )
-      # Sliding window: previous next (current_bsw[1]) becomes current (bsw[0]),
-      # freshly prefetched next_weight becomes next (bsw[1]).
-      next_bsw = (current_bsw[1], next_weight)
-      next_bsw = jax.ad_checkpoint.checkpoint_name(next_bsw, "bsw")
-
-      # 2. Run Forward & State Shift
+      # Compute BOTH current and next weights locally (2 all-gathers per iteration).
+      # BSW is NOT carried through scan — it is a body intermediate that jax.checkpoint
+      # (applied only when set_remat_policy_on_pipeline_iterations is set) can discard
+      # between iterations, preventing OOM. Trade-off: 2 all-gathers/iter instead of 1
+      # (no sliding window); acceptable until REG-1 (nested scan + custom VJP) restores it.
+      cur_repeat_weights = self.from_all_variables_to_repeat_weights(
+          layers_params, iteration)
+      cur_bsw = self.from_repeat_weights_to_bsw(
+          cur_repeat_weights, physical_partition_spec_full)
+      nxt_repeat_weights = self.from_all_variables_to_repeat_weights(
+          layers_params, iteration + 1)
+      nxt_bsw = self.from_repeat_weights_to_bsw(
+          nxt_repeat_weights, physical_partition_spec_full)
+      bsw = (cur_bsw, nxt_bsw)
+
+      # Run Forward & State Shift
       # Use STRIPPED logical spec - run_one_iteration re-derives physical from it,
       # and get_current_weights_from_bsw expects specs without the repeat axis.
       new_loop_state, new_layer_state = self.run_one_iteration(
           current_loop_state,
-          next_bsw,
+          bsw,
           layers_graph,
           layers_metrics,
           advanced_mutables,
@@ -1446,7 +1430,7 @@ def scan_body(carry, _):
       )
 
       _, _, new_layer_metrics, new_layer_mutables = nnx.split(new_layer_state, _is_static_param, nnx.Intermediate, ...)
-      return (new_loop_state, next_bsw, new_layer_mutables), new_layer_metrics
+      return (new_loop_state, new_layer_mutables), new_layer_metrics
 
     if self.config.set_remat_policy_on_pipeline_iterations:
       scan_body = jax.checkpoint(
@@ -1455,16 +1439,16 @@ def scan_body(carry, _):
 
     # Memory Efficient Execution via pure JAX scan
     if self.config.scan_pipeline_iterations:
-      (loop_state, bsw, final_layer_mutables), stacked_metrics = jax.lax.scan(
-          scan_body, (loop_state, bsw, layers_mutables), None, length=total_iterations
+      (loop_state, final_layer_mutables), stacked_metrics = jax.lax.scan(
+          scan_body, (loop_state, layers_mutables), None, length=total_iterations
       )
     else:
-      current_carry = (loop_state, bsw, layers_mutables)
+      current_carry = (loop_state, layers_mutables)
       metrics_history = []
       for _ in range(total_iterations):
         current_carry, step_metrics = scan_body(current_carry, None)
         metrics_history.append(step_metrics)
-      loop_state, bsw, final_layer_mutables = current_carry
+      loop_state, final_layer_mutables = current_carry
       stacked_metrics = jax.tree.map(lambda *xs: jnp.stack(xs), *metrics_history) if metrics_history else layers_metrics
 
     final_layer_state = nnx.State.merge(layers_params, stacked_metrics, final_layer_mutables)