@@ -164,6 +164,35 @@ def _calculate_global_batch_size(train_example: Any) -> int:
   )


+class AccGrad(nnx.Variable):
+  pass
+
+
+class GradientAccumulator(nnx.Module):
+  """Accumulates gradients manually."""
+
+  def __init__(self, model: nnx.Module, wrt: Any):
+    state = nnx.state(model, wrt)
+    self.grads = jax.tree_util.tree_map(
+        lambda x: AccGrad(jnp.zeros_like(x)), state
+    )
+
+  def add(self, grads: Any):
+    def _add(acc, g):
+      acc.value = acc.value + g
+
+    jax.tree_util.tree_map(_add, self.grads, grads)
+
+  def get(self):
+    return jax.tree_util.tree_map(lambda x: x.value, self.grads)
+
+  def reset(self):
+    def _reset(acc):
+      acc.value = jnp.zeros_like(acc.value)
+
+    jax.tree_util.tree_map(_reset, self.grads)
+
+
 class PeftTrainer:
   """PEFT trainer for LoRA. Only LoRA parameters are updated.

@@ -186,7 +215,7 @@ class PeftTrainer:
     data_hooks: The data hooks to use.
   """

-  supports_sequence_packing = False
+  supports_sequence_packing = True

   def __init__(
       self,
@@ -209,14 +238,9 @@ def __init__(
     self.model = model
     self.config = training_config
     self._lora_enabled = utils.is_lora_enabled(self.model)
-    if training_config.gradient_accumulation_steps is not None:
-      optimizer = optax.MultiSteps(
-          optimizer, training_config.gradient_accumulation_steps
-      )
-    if self._lora_enabled:
-      self.optimizer = nnx.Optimizer(self.model, optimizer, wrt=nnx.LoRAParam)
-    else:
-      self.optimizer = nnx.Optimizer(self.model, optimizer, wrt=nnx.Param)
+    wrt_target = nnx.LoRAParam if self._lora_enabled else nnx.Param
+    self.optimizer = nnx.Optimizer(self.model, optimizer, wrt=wrt_target)
+    self.grad_accumulator = GradientAccumulator(self.model, wrt_target)

     self.loss_fn = _default_loss_fn
     self.eval_loss_fn = _default_loss_fn
@@ -329,14 +353,21 @@ def with_gen_model_input_fn(
     return self

   def _train_step(
-      self, model: nnx.Module, optimizer: nnx.Optimizer, inputs: Any
+      self,
+      model: nnx.Module,
+      optimizer: nnx.Optimizer,
+      grad_accumulator: GradientAccumulator,
+      inputs: Any,
+      is_update_step: jax.Array,
   ) -> Tuple[ArrayLike, Any | None, ArrayLike]:
     """Main body for one train step.

     Args:
       model: The model to train.
       optimizer: The optimizer to use.
+      grad_accumulator: The gradient accumulator to use.
       inputs: The training input.
+      is_update_step: Whether to update the model.

     Returns:
       A tuple containing the loss, auxiliary data (or None if has_aux is False),
@@ -350,8 +381,21 @@ def _train_step(
         has_aux=self._has_aux,
     )
     out, grads = grad_fn(model, **inputs)
-    grad_norm = optax.global_norm(grads)
-    optimizer.update(model, grads)
+
+    grad_accumulator.add(grads)
+
+    def apply_updates():
+      acc_grads = grad_accumulator.get()
+      norm = optax.global_norm(acc_grads)
+      optimizer.update(model, acc_grads)
+      grad_accumulator.reset()
+      return norm
+
+    def skip_updates():
+      return jnp.array(0.0, dtype=jnp.float32)
+
+    grad_norm = jax.lax.cond(is_update_step, apply_updates, skip_updates)
+
     if self._has_aux:
       loss, aux = out
       return loss, aux, grad_norm
@@ -397,6 +441,15 @@ def _shard_optimizer(self, mesh: shd.Mesh) -> None:
     )
     nnx.update(self.optimizer, optimizer_sharded_state)

+    wrt_target = nnx.LoRAParam if self._lora_enabled else nnx.Param
+    model_state = nnx.state(self.model, wrt_target)
+    model_pspecs = nnx.get_partition_spec(model_state)
+    accumulator_state = nnx.state(self.grad_accumulator, AccGrad)
+    accumulator_sharded_state = jax.lax.with_sharding_constraint(
+        accumulator_state, model_pspecs
+    )
+    nnx.update(self.grad_accumulator, accumulator_sharded_state)
+
   def jit_train_and_eval_step(
       self, skip_jit: bool = False, cache_nnx_graph: bool = False
   ):
@@ -419,7 +472,7 @@ def jit_train_and_eval_step(
     if self._jitted_train_step_fn is None:
       self._shard_optimizer(pxla.thread_resources.env.physical_mesh)
       self._jitted_train_step_fn = nnx.jit(
-          train_step, donate_argnames=("optimizer",)
+          train_step, donate_argnames=("optimizer", "grad_accumulator")
       )
       self._jitted_eval_step_fn = nnx.jit(eval_step)

@@ -431,7 +484,10 @@ def maybe_cache_and_partial(f, *args):
       return functools.partial(f, *args)

     self._jitted_train_step_fn = maybe_cache_and_partial(
-        self._jitted_train_step_fn, self.model, self.optimizer
+        self._jitted_train_step_fn,
+        self.model,
+        self.optimizer,
+        self.grad_accumulator,
     )
     self._jitted_eval_step_fn = maybe_cache_and_partial(
         self._jitted_eval_step_fn, self.model
@@ -695,6 +751,28 @@ def train(
           perf_constants.MINI_BATCH: mini_batch,
       }

+      self._iter_steps += 1
+
+      is_update_step_val = None
+      if (
+          isinstance(train_example, dict)
+          and "is_update_step" in train_example
+      ):
+        val = train_example["is_update_step"]
+        if val is not None:
+          is_update_step_val = bool(np.asarray(val).item())
+      elif hasattr(train_example, "is_update_step"):
+        val = train_example.is_update_step
+        if val is not None:
+          is_update_step_val = bool(np.asarray(val).item())
+
+      if is_update_step_val is None:
+        is_update_step_val = (
+            self._iter_steps
+            % self.config.get_with_default("gradient_accumulation_steps", 1)
+            == 0
+        )
+
       with self._perf_tracer.span(
           "peft_train_step",
           pxla.thread_resources.env.physical_mesh.devices,
@@ -703,7 +781,10 @@ def train(
           pxla.thread_resources.env.physical_mesh.devices,
           tags=tags,
       ) as span_v2:
-        train_loss, aux, grad_norm = train_step(train_example)
+        train_loss, aux, grad_norm = train_step(
+            train_example,
+            is_update_step=jnp.array(is_update_step_val, dtype=jnp.bool_),
+        )
         span.device_end([train_loss])
         span_v2.async_end([train_loss])

@@ -716,13 +797,8 @@ def train(
       )
       # NB: put this after self._buffer_metrics is important
       self._post_process_train_step(aux)
-      self._iter_steps += 1

-      if (
-          self._iter_steps
-          % self.config.get_with_default("gradient_accumulation_steps", 1)
-          == 0
-      ):
+      if is_update_step_val:
         self._train_steps += 1
         self._write_train_metrics()

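For context on the change above: the diff replaces `optax.MultiSteps` with an explicit `GradientAccumulator` plus a `jax.lax.cond` that applies the optimizer update (and resets the accumulator) only on steps flagged as update steps. The sketch below is a minimal, functional restatement of that accumulate-then-conditionally-apply pattern on plain pytrees; `sgd_update`, the parameter shapes, and the driving loop are hypothetical stand-ins, not the trainer's actual nnx-based implementation.

```python
import jax
import jax.numpy as jnp
import optax

ACCUM_STEPS = 4  # assumed accumulation window


def sgd_update(params, grads, lr=1e-2):
  # Hypothetical stand-in for optimizer.update in the trainer.
  return jax.tree_util.tree_map(lambda p, g: p - lr * g, params, grads)


@jax.jit
def accum_train_step(params, acc_grads, grads, is_update_step):
  # Always fold the fresh micro-batch gradients into the accumulator.
  acc_grads = jax.tree_util.tree_map(lambda a, g: a + g, acc_grads, grads)

  def apply_updates(operands):
    params, acc = operands
    grad_norm = optax.global_norm(acc)          # norm of the *accumulated* grads
    new_params = sgd_update(params, acc)        # apply one optimizer step
    new_acc = jax.tree_util.tree_map(jnp.zeros_like, acc)  # reset accumulator
    return new_params, new_acc, grad_norm

  def skip_updates(operands):
    params, acc = operands
    return params, acc, jnp.array(0.0, dtype=jnp.float32)

  # A traced boolean picks the branch; both branches return the same structure.
  return jax.lax.cond(is_update_step, apply_updates, skip_updates, (params, acc_grads))


params = {"w": jnp.ones((2, 2))}
acc = jax.tree_util.tree_map(jnp.zeros_like, params)
for step in range(1, 9):
  grads = {"w": jnp.full((2, 2), 0.5)}
  flag = jnp.array(step % ACCUM_STEPS == 0, dtype=jnp.bool_)
  params, acc, grad_norm = accum_train_step(params, acc, grads, flag)
```

Keeping the conditional inside the jitted step (rather than branching in Python) means the step function is traced once and the per-step boolean only selects a branch at run time, which is presumably why the trainer passes `is_update_step` as a `jnp.bool_` array rather than a Python bool.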
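The host-side resolution of `is_update_step_val` in `train()` can also be read in isolation: a flag supplied on the batch (dict key or attribute) takes precedence, otherwise the flag falls back to `iter_steps % gradient_accumulation_steps == 0`. A small, self-contained sketch of that fallback logic, with a hypothetical fixed accumulation value instead of the trainer's config lookup and illustrative batch keys:

```python
import numpy as np

GRAD_ACCUM_STEPS = 4  # assumed stand-in for config.get_with_default("gradient_accumulation_steps", 1)


def resolve_is_update_step(train_example, iter_steps):
  # Batch-provided flag wins; otherwise derive it from the iteration counter.
  flag = None
  if isinstance(train_example, dict) and "is_update_step" in train_example:
    val = train_example["is_update_step"]
    if val is not None:
      flag = bool(np.asarray(val).item())
  elif hasattr(train_example, "is_update_step"):
    val = train_example.is_update_step
    if val is not None:
      flag = bool(np.asarray(val).item())
  if flag is None:
    flag = iter_steps % GRAD_ACCUM_STEPS == 0
  return flag


assert resolve_is_update_step({"is_update_step": np.array(True)}, iter_steps=1)
assert not resolve_is_update_step({"input_tokens": np.zeros(4)}, iter_steps=3)
assert resolve_is_update_step({"input_tokens": np.zeros(4)}, iter_steps=4)
```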