speedup trainer for RL

tianshub · The tunix Authors · commit f2dcd33759ab · 2026-02-26T13:58:04.000-08:00
PiperOrigin-RevId: 875877560
diff --git a/tunix/sft/peft_trainer.py b/tunix/sft/peft_trainer.py
@@ -17,6 +17,7 @@
 from collections.abc import Iterable
 import contextlib
 import dataclasses
+import functools
 import time
 from typing import Any, Callable, Concatenate, Dict, List, Optional, ParamSpec, Tuple
 
@@ -376,13 +377,16 @@ def _shard_optimizer(self, mesh: shd.Mesh) -> None:
     )
     nnx.update(self.optimizer, optimizer_sharded_state)
 
-  def jit_train_and_eval_step(self, skip_jit: bool = False):
+  def jit_train_and_eval_step(
+      self, skip_jit: bool = False, cache_nnx_graph: bool = False
+  ):
     """Creates and returns the train and eval step functions.
 
     This function will return the cached ones if available.
 
     Args:
       skip_jit: If True, the train and eval step functions will not be JITed.
+      cache_nnx_graph: If True, the nnx graph will be cached.
 
     Returns:
       A tuple of train and eval step functions.
@@ -391,14 +395,28 @@ def jit_train_and_eval_step(self, skip_jit: bool = False):
     eval_step = self.create_eval_step_fn()
     if skip_jit:
       return train_step, eval_step
-    else:
-      if self._jitted_train_step_fn is None:
-        self._shard_optimizer(pxla.thread_resources.env.physical_mesh)
-        self._jitted_train_step_fn = nnx.jit(
-            train_step, donate_argnames=("optimizer",)
-        )
-        self._jitted_eval_step_fn = nnx.jit(eval_step)
-      return self._jitted_train_step_fn, self._jitted_eval_step_fn
+
+    if self._jitted_train_step_fn is None:
+      self._shard_optimizer(pxla.thread_resources.env.physical_mesh)
+      self._jitted_train_step_fn = nnx.jit(
+          train_step, donate_argnames=("optimizer",)
+      )
+      self._jitted_eval_step_fn = nnx.jit(eval_step)
+
+      def maybe_cache_and_partial(f, *args):
+        if cache_nnx_graph:
+          # wrap with partial so we can access jitted_fn in a consistent way.
+          return functools.partial(nnx.cached_partial(f, *args))
+        else:
+          return functools.partial(f, *args)
+
+      self._jitted_train_step_fn = maybe_cache_and_partial(
+          self._jitted_train_step_fn, self.model, self.optimizer
+      )
+      self._jitted_eval_step_fn = maybe_cache_and_partial(
+          self._jitted_eval_step_fn, self.model
+      )
+    return self._jitted_train_step_fn, self._jitted_eval_step_fn
 
   def _prepare_inputs(self, input_data: Any) -> Any:
     """Override this function for additional input preparation."""
@@ -565,41 +583,28 @@ def train(
       eval_ds: Iterable[Any] | None = None,
       skip_jit: bool = False,
       *,
-      cache_nnx_graph: bool = False,
+      cache_nnx_graph: bool = True,
   ) -> None:
     """Training loop."""
     logging.log_first_n(
         logging.INFO,
         f"Training with mesh: {pxla.thread_resources.env.physical_mesh}",
         1,
     )
-    train_step, eval_step = self.jit_train_and_eval_step(skip_jit)
+    train_step, eval_step = self.jit_train_and_eval_step(
+        skip_jit, cache_nnx_graph
+    )
     if not skip_jit:
-      cache_size = train_step.jitted_fn._cache_size()  # pytype: disable=attribute-error
+      cache_size = train_step.func.jitted_fn._cache_size()  # pytype: disable=attribute-error
       logging.log_if(
           logging.INFO,
           f"Compiled train_step cache size: {cache_size}",
           condition=cache_size not in self._jit_cache,
       )
       self._jit_cache.add(cache_size)
 
-    if cache_nnx_graph:
-      # For performance, cache the nnx graph traversals. However, the training
-      # loop must _not_ modify the model or optimizer graph in this case.  For
-      # example, the distillation trainer mutates the model graph by adding the
-      # distillation loss.
-      partial_train_step = nnx.cached_partial(
-          train_step, self.model, self.optimizer
-      )
-      partial_eval_step = nnx.cached_partial(eval_step, self.model)
-    else:
-      partial_train_step = lambda inputs: train_step(
-          self.model, self.optimizer, inputs
-      )
-      partial_eval_step = lambda inputs: eval_step(self.model, inputs)
-
     if eval_ds:
-      self._run_eval(eval_ds, partial_eval_step)
+      self._run_eval(eval_ds, eval_step)
 
     if self.config.max_steps is not None and self._pbar is None:
       self._pbar = progress_bar.ProgressBar(
@@ -679,7 +684,7 @@ def train(
           with self._perf_tracer.span(
               "peft_train_step", pxla.thread_resources.env.physical_mesh.devices
           ) as span:
-            train_loss, aux = partial_train_step(train_example)
+            train_loss, aux = train_step(train_example)
             span.device_end([train_loss])
 
           current_time = time.perf_counter()
@@ -718,7 +723,7 @@ def train(
                 eval_ds
                 and self._train_steps % self.config.eval_every_n_steps == 0
             ):
-              self._run_eval(eval_ds, partial_eval_step)
+              self._run_eval(eval_ds, eval_step)
 
         self._prof.maybe_deactivate(self._iter_steps)