Skip to content

Commit e03f1bf

Browse files
tianshub (The tunix Authors)
authored and committed
remove unnecessary rollout round
PiperOrigin-RevId: 876437319
1 parent df627a6 commit e03f1bf

File tree

1 file changed

+17
-6
lines changed

1 file changed

+17
-6
lines changed

tunix/rl/experimental/agentic_rl_learner.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -511,10 +511,8 @@ def _batch_to_train_example(
511511
"""
512512
# Create a merged training_input where each field from the original input
513513
# is repeated G times to align with the G completions.
514-
num_generations = self.algo_config.num_generations
515-
prompt_index = batch_results[0].pair_index // num_generations
516-
if mode == rl_cluster_lib.Mode.TRAIN and self._full_batch_size:
517-
expected_step = prompt_index // self._full_batch_size
514+
if mode == rl_cluster_lib.Mode.TRAIN:
515+
expected_step = batch_results[0].group_id // self._full_batch_size
518516
else:
519517
expected_step = self.rl_cluster.global_steps
520518

@@ -710,7 +708,10 @@ def train(
710708
micro_batches_since_last_sync = 0
711709
micro_batches_per_full_batch = full_batch_size // train_micro_batch_size
712710
for train_micro_batch in train_data_gen:
713-
if self.rl_cluster.global_steps >= self._training_config.max_steps:
711+
if (
712+
self._training_config.max_steps
713+
and self.rl_cluster.global_steps >= self._training_config.max_steps
714+
):
714715
logging.info(
715716
"Reached max_steps: %d >= %d",
716717
self.rl_cluster.global_steps,
@@ -825,7 +826,17 @@ def _put_prompts_to_queue(
825826
prompt_queue: The queue to put the batch into.
826827
batch: The batch of prompts (TrainingInputT).
827828
"""
828-
if len(batch["prompts"]) != self._full_batch_size:
829+
if (
830+
self._training_config.max_steps
831+
and self.rl_cluster.global_steps >= self._training_config.max_steps
832+
):
833+
logging.info(
834+
"Reached max_steps: %d >= %d",
835+
self.rl_cluster.global_steps,
836+
self._training_config.max_steps,
837+
)
838+
prompt_queue.put(None)
839+
elif len(batch["prompts"]) != self._full_batch_size:
829840
logging.warning(
830841
"partial batch %d vs %d detected. The rest of the batch will be"
831842
" skipped.",

0 commit comments

Comments (0)