erictang000
diff --git a/.github/workflows/gpu_skyrl_train_megatron.yaml b/.github/workflows/gpu_skyrl_train_megatron.yaml
Lines changed: 2 additions & 2 deletions
diff --git a/skyrl/backends/skyrl_train/training_batch.py b/skyrl/backends/skyrl_train/training_batch.py
Lines changed: 6 additions & 0 deletions
diff --git a/skyrl/backends/skyrl_train/workers/megatron/megatron_model_wrapper.py b/skyrl/backends/skyrl_train/workers/megatron/megatron_model_wrapper.py
Lines changed: 15 additions & 1 deletion
diff --git a/skyrl/backends/skyrl_train/workers/megatron/megatron_worker.py b/skyrl/backends/skyrl_train/workers/megatron/megatron_worker.py
Lines changed: 240 additions & 93 deletions
diff --git a/skyrl/backends/skyrl_train/workers/worker.py b/skyrl/backends/skyrl_train/workers/worker.py
Lines changed: 81 additions & 28 deletions
diff --git a/skyrl/backends/skyrl_train/workers/worker_dispatch.py b/skyrl/backends/skyrl_train/workers/worker_dispatch.py
Lines changed: 45 additions & 0 deletions
@@ -69,5 +69,5 @@ jobs:
         run: |
           COMMIT_SHA="${{ github.event.pull_request.head.sha || github.sha }}"
           JOB_NAME="skyrl-train-gpu-ci-megatron-${COMMIT_SHA:0:7}-${{ github.run_id }}"
-          anyscale job submit -f ci/anyscale_gpu_ci_skyrl_train_megatron.yaml --name "$JOB_NAME" --timeout 7000
-          anyscale job wait --cloud sky-anyscale-aws-us-east-1 --name "$JOB_NAME" --timeout 7000
+          anyscale job submit -f ci/anyscale_gpu_ci_skyrl_train_megatron.yaml --name "$JOB_NAME" --timeout 8000
+          anyscale job wait --cloud sky-anyscale-aws-us-east-1 --name "$JOB_NAME" --timeout 8000
@@ -72,6 +72,12 @@ def __len__(self) -> int:
     def __getitem__(self, index):
+        """Return the element(s) of ``self.tensors`` selected by ``index``.
+
+        Args:
+            index: One of
+                - a ``slice``: returns a new ``TensorList`` over the sliced elements;
+                - a 1-D boolean ``torch.Tensor`` with one entry per element: returns a
+                  ``TensorList`` of the elements whose mask entry is ``True``;
+                - a 0-d integer ``torch.Tensor``: returns the single element at that position;
+                - any other ``torch.Tensor``, or a ``list``/``tuple`` of positions: returns a
+                  ``TensorList`` of the elements at those positions, in the given order;
+                - anything else: forwarded unchanged to ``self.tensors[index]``.
+
+        Raises:
+            IndexError: If a boolean mask is not 1-D or its length differs from
+                ``len(self.tensors)``.
+        """
         if isinstance(index, slice):
             return TensorList(self.tensors[index])
+        if isinstance(index, torch.Tensor):
+            if index.dtype == torch.bool:
+                # A boolean tensor is a mask, not a list of positions. Without this branch
+                # int(True) == 1 / int(False) == 0 would silently pick elements 1 and 0.
+                if index.ndim != 1 or index.numel() != len(self.tensors):
+                    raise IndexError(
+                        f"boolean mask of shape {tuple(index.shape)} does not match "
+                        f"TensorList of length {len(self.tensors)}"
+                    )
+                return TensorList([t for t, keep in zip(self.tensors, index.tolist()) if keep])
+            if index.ndim == 0:
+                return self.tensors[int(index)]
+            return TensorList([self.tensors[int(i)] for i in index.tolist()])
+        if isinstance(index, (list, tuple)):
+            return TensorList([self.tensors[int(i)] for i in index])
         return self.tensors[index]
 
     def to(self, device=None, dtype=None, non_blocking=False):
 
@@ -309,6 +309,11 @@ def loss_func(logits, data):
             rollout_action_logprobs = data["rollout_action_logprobs"]
             action_mask = data.get("action_mask")
             num_microbatches = data.get("num_microbatches")
+            # Number of microbatches carrying real samples (excludes fully-padding
+            # microbatches added by token-based batching). Used to normalize the
+            # KL/entropy terms over real microbatches only. Falls back to
+            # num_microbatches when not provided (no padding microbatches).
+            num_real_microbatches = data.get("num_real_microbatches", num_microbatches)
 
             dp_size = mpu.get_data_parallel_world_size(with_context_parallel=False)
             tp_grp = mpu.get_tensor_model_parallel_group()
@@ -454,7 +459,16 @@ def loss_func(logits, data):
             # NOTE: The KL and entropy loss terms are not pre-scaled,
             # so we just average them across microbatches and DP workers.
             # KL and entropy use Megatron's existing microbatch and CP schedule scaling.
-            loss = policy_loss * grad_sum_correction_factor + (kl_loss_term - entropy_loss_term)
+            # Megatron divides by num_microbatches (which includes fully-padding microbatches
+            # added by token-based batching). Those padding microbatches contribute 0 to
+            # KL/entropy, so dividing by the full count would dilute the regularization by
+            # num_real/num_total. Scale up by num_microbatches/num_real_microbatches so the
+            # terms are averaged over real microbatches only (no-op when there is no padding).
+            kl_entropy_microbatch_scale = num_microbatches / max(1, num_real_microbatches)
+            loss = (
+                policy_loss * grad_sum_correction_factor
+                + (kl_loss_term - entropy_loss_term) * kl_entropy_microbatch_scale
+            )
             unscaled_loss = loss / grad_sum_correction_factor
 
             # Build per-sequence loss_fn_outputs with logprobs.
 
@@ -51,8 +51,10 @@
 )
 from skyrl.backends.skyrl_train.utils.torch_utils import masked_mean
 from skyrl.backends.skyrl_train.workers.worker_utils import (
+    BaseBatchIterator,
     BatchIterator,
     all_reduce_metrics,
+    get_microbatch_iterator,
     reduce_metrics,
 )
 from skyrl.env_vars import (
@@ -757,14 +759,19 @@ def forward_backward(
             :class:`WorkerOutput` with per-sample ``loss_fn_outputs`` and scalar
             ``metrics`` (all-reduced across DP).
         """
-        micro_batch_size = self.cfg.micro_train_batch_size_per_gpu
+        microbatch_iterator = get_microbatch_iterator(
+            data,
+            micro_batch_size=self.cfg.micro_train_batch_size_per_gpu,
+            max_tokens_per_microbatch=self.cfg.max_tokens_per_microbatch,
+        )
         all_metrics = defaultdict(list)
         all_loss_fn_outputs = []  # Handle separately from scalar metrics
 
-        for micro_batch in BatchIterator(data, micro_batch_size, drop_last=False):
-            microbatch_weight = micro_batch_size / len(data)
+        for microbatch in microbatch_iterator:
+            experience = BaseBatchIterator.batch_to_experience(microbatch)
+            microbatch_weight = len(microbatch) / len(data)
             metrics = self._forward_backward_micro(
-                micro_batch, microbatch_weight, loss_fn=loss_fn, loss_fn_config=loss_fn_config
+                experience, microbatch_weight, loss_fn=loss_fn, loss_fn_config=loss_fn_config
             )
 
             # Extract loss_fn_outputs before reduce_metrics (it's not a scalar metric)
@@ -782,6 +789,15 @@ def forward_backward(
         # Reduce across microbatches and all-reduce metrics across DP ranks
         # NOTE: Sum loss metrics because scaling is already applied at the advantage level
         result = reduce_metrics(all_metrics, sum_loss_metrics=sum_loss_metrics)
+
+        # Token-based batching diagnostics: total microbatches this rank ran and how many
+        # were purely-padding (added to equalize the microbatch count across DP ranks).
+        # Added before all-reduce so they are averaged across DP (num_microbatches is
+        # identical on every rank; num_padding_microbatches reports the per-rank average).
+        if self.cfg.max_tokens_per_microbatch > 0:
+            result["num_microbatches"] = float(len(microbatch_iterator))
+            result["num_padding_microbatches"] = float(getattr(microbatch_iterator, "num_padding_microbatches", 0))
+
         dp_group = self.device_mesh.get_group("dp")
         result = all_reduce_metrics(result, self.strategy, group=dp_group, sum_loss_metrics=sum_loss_metrics)
 
@@ -1023,11 +1039,16 @@ def forward(
         """
         if loss_fn is None:
             # Inference forward path: run in micro batches and emit per-sample logprobs.
-            micro_batches = data.chunk(self.cfg.micro_forward_batch_size_per_gpu)
-            outputs = []
-            for micro_batch in micro_batches:
-                outputs.append(self._forward_micro_batch(micro_batch))
-            output = TrainingOutputBatch.cat(outputs)
+            # Uses token-based micro-batching when `max_tokens_per_microbatch > 0`, otherwise
+            # falls back to fixed sample-count chunking. `reorder_and_combine_batches` restores
+            # the original sample order (and strips padding) for the token-based iterator.
+            microbatch_iterator = get_microbatch_iterator(
+                data,
+                micro_batch_size=self.cfg.micro_forward_batch_size_per_gpu,
+                max_tokens_per_microbatch=self.cfg.max_tokens_per_microbatch,
+            )
+            outputs = [self._forward_micro_batch(micro_batch) for micro_batch in microbatch_iterator]
+            output = microbatch_iterator.reorder_and_combine_batches(outputs)
             if output.device is not None and output.device != torch.device("cpu"):
                 output = output.to("cpu")
             row_tensor = output["output"]
@@ -1247,7 +1268,8 @@ def forward_backward(self, data: TrainingInputBatch) -> WorkerOutput:
         """
         Perform forward and backward passes for a batch, handling micro-batching internally.
 
-        The batch is split into micro batches based on micro_train_batch_size_per_gpu.
+        The batch is split into micro batches based on micro_train_batch_size_per_gpu,
+        or by token count if max_tokens_per_microbatch is configured.
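         Gradients accumulate across micro batches. With fixed-size micro batches, gradient scaling happens at optim_step; with token-based batching, each micro batch's loss is pre-scaled by its weight instead.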
 
         Args:
@@ -1257,12 +1279,26 @@ def forward_backward(self, data: TrainingInputBatch) -> WorkerOutput:
             :class:`WorkerOutput` with empty ``loss_fn_outputs`` and scalar
             ``metrics`` (all-reduced across DP).
         """
-        micro_batch_size = self.cfg.micro_train_batch_size_per_gpu
+        use_token_batching = self.cfg.max_tokens_per_microbatch > 0
+        microbatch_iterator = get_microbatch_iterator(
+            data,
+            micro_batch_size=self.cfg.micro_train_batch_size_per_gpu,
+            max_tokens_per_microbatch=self.cfg.max_tokens_per_microbatch,
+        )
         all_metrics = defaultdict(list)
 
-        for micro_batch in BatchIterator(data, micro_batch_size, drop_last=False):
-            metrics = self._forward_backward_micro(micro_batch)
-            self._micro_batches_accumulated += 1
+        for microbatch in microbatch_iterator:
+            experience = BaseBatchIterator.batch_to_experience(microbatch)
+
+            if use_token_batching:
+                # With token-based batching, microbatches may have different sizes.
+                # Scale loss by microbatch_weight so gradients are correctly weighted.
+                microbatch_weight = len(microbatch) / len(data)
+                metrics = self._forward_backward_micro(experience, microbatch_weight=microbatch_weight)
+            else:
+                metrics = self._forward_backward_micro(experience)
+                self._micro_batches_accumulated += 1
+
             for k, v in metrics.items():
                 all_metrics[k].append(v)
 
@@ -1274,14 +1310,17 @@ def forward_backward(self, data: TrainingInputBatch) -> WorkerOutput:
 
         return WorkerOutput(metrics=result)
 
-    def _forward_backward_micro(self, experience: Experience) -> Dict[str, float]:
+    def _forward_backward_micro(
+        self, experience: Experience, microbatch_weight: Optional[float] = None
+    ) -> Dict[str, float]:
         """
         Perform forward and backward pass for one micro batch.
 
-        Loss is NOT scaled here - gradient scaling happens at optim_step time.
-
         Args:
             experience: Experience object for one micro batch
+            microbatch_weight: If provided, scale loss by this weight before backward.
+                Used with token-based batching where microbatches have variable sizes.
+                If None, loss is unscaled (gradient scaling happens at optim_step time).
 
         Returns:
             All-reduced metrics dict for this micro batch
@@ -1313,7 +1352,11 @@ def _forward_backward_micro(self, experience: Experience) -> Dict[str, float]:
                 config=self.cfg.algorithm,
                 loss_mask=loss_mask,
             )
-        # NO loss scaling here - gradient scaling happens at optim_step
+
+        if microbatch_weight is not None:
+            # Token-based batching: scale loss by weight so gradients are properly weighted
+            loss = loss * microbatch_weight
+        # else: NO loss scaling here - gradient scaling happens at optim_step
         self.strategy.backward(loss, self.model, self.optimizer)
 
         status = {
@@ -1333,6 +1376,8 @@ def optim_step(self) -> float:
             The gradient norm (before scaling, after clipping)
         """
         # Scale accumulated gradients by 1/N to get correct average
+        # NOTE: When using token-based batching, loss is pre-scaled by microbatch_weight
+        # in forward_backward, so _micro_batches_accumulated stays 0 and no scaling needed.
         if self._micro_batches_accumulated > 0:
             scale = 1.0 / self._micro_batches_accumulated
             for param in self.model.parameters():
@@ -1381,11 +1426,15 @@ def forward(self, data: TrainingInputBatch) -> WorkerOutput:
         per-sample dict with key ``"values"``.
         """
         # Run in micro batches and emit per-sample values.
-        micro_batches = data.chunk(self.cfg.micro_forward_batch_size_per_gpu)
-        outputs = []
-        for micro_batch in micro_batches:
-            outputs.append(self._forward_micro_batch(micro_batch))
-        output = TrainingOutputBatch.cat(outputs)
+        # Uses token-based micro-batching when `max_tokens_per_microbatch > 0`; otherwise fixed
+        # sample-count chunking. `reorder_and_combine_batches` restores original sample order.
+        microbatch_iterator = get_microbatch_iterator(
+            data,
+            micro_batch_size=self.cfg.micro_forward_batch_size_per_gpu,
+            max_tokens_per_microbatch=self.cfg.max_tokens_per_microbatch,
+        )
+        outputs = [self._forward_micro_batch(micro_batch) for micro_batch in microbatch_iterator]
+        output = microbatch_iterator.reorder_and_combine_batches(outputs)
         if output.device is not None and output.device != torch.device("cpu"):
             output = output.to("cpu")
         row_tensor = output["output"]
@@ -1408,11 +1457,15 @@ def forward(self, data: TrainingInputBatch) -> WorkerOutput:
         per-sample dict with key ``"logprobs"``.
         """
         # Run in micro batches and emit per-sample logprobs.
-        micro_batches = data.chunk(self.cfg.micro_forward_batch_size_per_gpu)
-        outputs = []
-        for micro_batch in micro_batches:
-            outputs.append(self._forward_micro_batch(micro_batch))
-        output = TrainingOutputBatch.cat(outputs)
+        # Uses token-based micro-batching when `max_tokens_per_microbatch > 0`; otherwise fixed
+        # sample-count chunking. `reorder_and_combine_batches` restores original sample order.
+        microbatch_iterator = get_microbatch_iterator(
+            data,
+            micro_batch_size=self.cfg.micro_forward_batch_size_per_gpu,
+            max_tokens_per_microbatch=self.cfg.max_tokens_per_microbatch,
+        )
+        outputs = [self._forward_micro_batch(micro_batch) for micro_batch in microbatch_iterator]
+        output = microbatch_iterator.reorder_and_combine_batches(outputs)
         if output.device is not None and output.device != torch.device("cpu"):
             output = output.to("cpu")
         row_tensor = output["output"]
 
@@ -237,6 +237,51 @@ def forward(
 
         return WorkerOutput.cat(self._actor_groups[model].actor_infos, results)
 
+    def forward_from_staged(
+        self,
+        model: str,
+        chunk_refs: List[ObjectRef],
+        loss_fn: Optional[str] = None,
+        loss_fn_config: Optional[Dict[str, Any]] = None,
+        model_id: Optional[str] = None,
+    ) -> WorkerOutput:
+        """Dispatch ``forward`` to the workers of ``model`` using already-staged chunks.
+
+        The per-DP chunks are expected to have been placed in the object store by
+        :meth:`stage_data`, which keeps their serialization off the dispatch critical path
+        across mini-batches (see :meth:`forward_backward_from_staged`). Since the chunks are
+        built the same way as in :meth:`stage_data`, each rank receives the same partition
+        (and therefore the same microbatch packing) that ``forward_backward`` sees for that
+        mini-batch.
+
+        Args:
+            model: Which model to run: "policy", "critic", or "ref".
+            chunk_refs: One pre-staged ObjectRef per DP rank, as returned by ``stage_data``.
+            loss_fn: Resolved loss function name, if any. When given, the worker computes
+                     the loss and per-sample outputs without a backward pass (no_grad).
+            loss_fn_config: Config overrides for the loss function, if any.
+            model_id: Tinker model_id, if any; picks the LoRA adapter before the forward.
+
+        Returns:
+            A :class:`WorkerOutput` combining the results of all DP ranks.
+        """
+        self._ensure_on_gpu(model, need_optimizer=False, need_model=True)
+        self.ensure_active_adapter(model, model_id)
+
+        # Forward only the optional arguments the caller actually supplied.
+        optional_args = {
+            name: value
+            for name, value in (("loss_fn", loss_fn), ("loss_fn_config", loss_fn_config))
+            if value is not None
+        }
+
+        actor_group = self._actor_groups[model]
+        pending = MeshDispatch.dispatch_from_staged(
+            actor_group.actor_infos,
+            "forward",
+            chunk_refs=chunk_refs,
+            **optional_args,
+        )
+        return WorkerOutput.cat(actor_group.actor_infos, ray.get(pending))
+
     def stage_data(
         self,
         model: str,