Skip to content

Commit fafaa0c

Browse files
tdenejalbericiola
committed
Add RL token throughput and packing metrics
Co-authored-by: Jorge Albericio <jalbericiola@nvidia.com>
1 parent f8becec commit fafaa0c

File tree

4 files changed

+202
-0
lines changed

4 files changed

+202
-0
lines changed

megatron/rl/rl_utils.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,10 @@
5757
get_sequence_packing_tensorboard_metrics,
5858
get_sequence_packing_log_info,
5959
get_default_packed_seq_params,
60+
get_packing_actual_tokens,
61+
get_packing_compute_tokens,
62+
get_packing_efficiency,
63+
get_packing_avg_seq_length,
6064
update_microbatch_calculator,
6165
)
6266
from megatron.rl.agent.api import (
@@ -300,11 +304,22 @@ def __init__(self):
300304
self.last_collection_iteration = 0
301305
self.sequences_this_iteration_on_rank = 0
302306
self.latest_batch_num_sequences = 0
307+
# Derived throughput metrics (set by training_log, read by RLProfiler)
308+
self.tokens_per_sec = None
309+
self.tokens_per_sec_per_gpu = None
310+
self.actual_tokens_per_sec = None
311+
self.actual_tokens_per_sec_per_gpu = None
312+
self.packing_efficiency = None
303313

304314
def reset_iteration_counters(self, iteration):
    """Clear all per-iteration counters and derived throughput metrics.

    Args:
        iteration: The iteration number to record as the last collection
            iteration.
    """
    self.sequences_this_iteration_on_rank = 0
    self.last_collection_iteration = iteration
    # Derived throughput metrics are recomputed by training_log each
    # iteration; clear them so stale values are never reported.
    for metric_name in (
        'tokens_per_sec',
        'tokens_per_sec_per_gpu',
        'actual_tokens_per_sec',
        'actual_tokens_per_sec_per_gpu',
        'packing_efficiency',
    ):
        setattr(self, metric_name, None)
308323

309324
def increment_sequences(self, count):
310325
"""Increment the sequence counter."""

megatron/rl/sequence_packing_utils.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1173,3 +1173,79 @@ def get_sequence_packing_tensorboard_metrics(args):
11731173
metrics['bin-batch-size'] = bin_batch_size
11741174
metrics['consumed-bins'] = args.consumed_train_bins
11751175
return metrics
1176+
1177+
1178+
def get_packing_actual_tokens(packing_context: PackingContext) -> int:
    """Return the count of real (non-padding) tokens packed on this rank.

    Args:
        packing_context: The PackingContext containing packing information.

    Returns:
        Total number of actual tokens across all bins on this rank.
    """
    if packing_context is None or packing_context.packing_info is None:
        return 0

    info = packing_context.packing_info
    lengths = info.seq_lengths

    # Walk every bin assigned to this rank and accumulate the true length
    # of each sequence it contains (bin capacity/padding is excluded).
    total = 0
    for bin_indices in info.bin_seq_indices:
        for seq_idx in bin_indices:
            total += lengths[seq_idx]
    return total
1200+
1201+
1202+
def get_packing_compute_tokens(packing_context: PackingContext) -> int:
    """Return total compute tokens (padding included) packed on this rank.

    Args:
        packing_context: The PackingContext containing packing information.

    Returns:
        Total compute tokens (num_bins * bin_size) on this rank.
    """
    if packing_context is None or packing_context.packed_trajs is None:
        return 0

    # Every bin occupies bin_size token slots regardless of how full it is,
    # so the compute cost is simply rows * columns of the packed tensor.
    num_bins, bin_size = packing_context.packed_trajs.shape[:2]
    return num_bins * bin_size
1216+
1217+
1218+
def get_packing_efficiency(packing_context: PackingContext) -> float:
    """Return the packing efficiency (actual tokens / total capacity).

    Capacity is bins_per_rank * bin_size * data_parallel_world_size.

    Args:
        packing_context: The PackingContext containing packing information.

    Returns:
        Packing efficiency as a float between 0 and 1 (0.0 when there is
        no packing info or zero capacity).
    """
    if packing_context is None or packing_context.packing_info is None:
        return 0.0

    packed = packing_context.packed_trajs
    if packed is None:
        capacity_per_rank = 0
    else:
        capacity_per_rank = packed.shape[0] * packed.shape[1]
    total_capacity = capacity_per_rank * mpu.get_data_parallel_world_size()

    if total_capacity == 0:
        return 0.0

    # NOTE(review): seq_lengths appears to cover every sequence in the
    # global batch (capacity is scaled by DP world size to match) — the
    # numerator is presumably already a cross-rank total; confirm upstream.
    return sum(packing_context.packing_info.seq_lengths) / total_capacity
1240+
1241+
1242+
def get_packing_avg_seq_length(packing_context: PackingContext) -> float:
    """Get the average sequence length across all sequences in the packing context.

    Args:
        packing_context: The PackingContext containing packing information.

    Returns:
        Mean of seq_lengths, or 0.0 when there is no packing info or no
        sequences.
    """
    if packing_context is None or packing_context.packing_info is None:
        return 0.0

    seq_lengths = packing_context.packing_info.seq_lengths
    # A single falsiness check covers both None and empty containers; the
    # previous `or len(seq_lengths) == 0` clause was redundant.
    if not seq_lengths:
        return 0.0

    return sum(seq_lengths) / len(seq_lengths)

megatron/training/training.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2165,6 +2165,66 @@ def training_log(
21652165
total_loss_dict[skipped_iters_key]
21662166
)
21672167
log_string += ' number of nan iterations: {:3d} |'.format(total_loss_dict[nan_iters_key])
2168+
2169+
# Compute tokens/sec metrics for logging
2170+
tokens_per_sec = None
2171+
tokens_per_sec_per_gpu = None
2172+
actual_tokens_per_sec = None
2173+
actual_tokens_per_sec_per_gpu = None
2174+
packing_efficiency = None
2175+
2176+
if hasattr(args, 'seq_length') and args.seq_length > 0:
2177+
# Compute tokens (includes padding for consistency with tensor shapes)
2178+
tokens_per_iteration = batch_size * args.seq_length
2179+
tokens_per_sec = tokens_per_iteration / elapsed_time_per_iteration
2180+
tokens_per_sec_per_gpu = tokens_per_sec / args.world_size
2181+
2182+
# For sequence packing, also compute actual tokens (non-padding)
2183+
if has_rl_utils and getattr(args, 'perform_rl_step', False) and getattr(args, 'rl_use_sequence_packing', False):
2184+
runtime_state = rl_utils.get_rl_runtime_state()
2185+
if runtime_state.packing_context is not None:
2186+
# Get actual tokens from packing context
2187+
actual_tokens = rl_utils.get_packing_actual_tokens(runtime_state.packing_context)
2188+
compute_tokens = rl_utils.get_packing_compute_tokens(runtime_state.packing_context)
2189+
2190+
# Scale to global batch (all DP ranks)
2191+
actual_tokens_global = actual_tokens * mpu.get_data_parallel_world_size()
2192+
2193+
actual_tokens_per_sec = actual_tokens_global / elapsed_time_per_iteration
2194+
actual_tokens_per_sec_per_gpu = actual_tokens_per_sec / args.world_size
2195+
packing_efficiency = rl_utils.get_packing_efficiency(runtime_state.packing_context)
2196+
2197+
# Add tokens/sec to log string
2198+
log_string += f' toks/s: {tokens_per_sec:.0f} |'
2199+
log_string += f' toks/s/gpu: {tokens_per_sec_per_gpu:.0f} |'
2200+
if actual_tokens_per_sec is not None:
2201+
log_string += f' actual_toks/s: {actual_tokens_per_sec:.0f} |'
2202+
log_string += f' actual_toks/s/gpu: {actual_tokens_per_sec_per_gpu:.0f} |'
2203+
log_string += f' packing_eff: {packing_efficiency:.1%} |'
2204+
2205+
# Store derived throughput metrics on RLRuntimeState so that
2206+
# downstream consumers (e.g. RLProfiler) can read them.
2207+
if has_rl_utils and getattr(args, 'perform_rl_step', False):
2208+
runtime_state = rl_utils.get_rl_runtime_state()
2209+
runtime_state.tokens_per_sec = tokens_per_sec
2210+
runtime_state.tokens_per_sec_per_gpu = tokens_per_sec_per_gpu
2211+
runtime_state.actual_tokens_per_sec = actual_tokens_per_sec
2212+
runtime_state.actual_tokens_per_sec_per_gpu = actual_tokens_per_sec_per_gpu
2213+
runtime_state.packing_efficiency = packing_efficiency
2214+
2215+
# Log average (non-padding) sequence length. With sequence packing this
2216+
# shows how long the real sequences are; without packing it equals seq_length
2217+
# (all sequences are padded to the same length) — still useful as a baseline
2218+
# so the metric is always present for comparison.
2219+
if has_rl_utils and getattr(args, 'perform_rl_step', False):
2220+
runtime_state = rl_utils.get_rl_runtime_state()
2221+
packing_ctx = runtime_state.packing_context
2222+
if getattr(args, 'rl_use_sequence_packing', False) and packing_ctx is not None:
2223+
avg_seq_length = rl_utils.get_packing_avg_seq_length(packing_ctx)
2224+
log_string += f' avg_seq_len: {avg_seq_length:.1f} |'
2225+
elif args.log_throughput:
2226+
log_string += f' avg_seq_len: {args.seq_length} |'
2227+
21682228
if should_reset:
21692229
total_loss_dict[advanced_iters_key] = 0
21702230
total_loss_dict[skipped_iters_key] = 0

tests/unit_tests/rl/test_sequence_packing_utils.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,57 @@ def test_compute_packed_inference_logprobs_stats_shape_mismatch():
412412
assert group_stats.mean_piold_to_inf_prob is None
413413

414414

415+
def test_packing_observability_metrics():
    """Test various observability metrics related to sequence packing."""

    # Two bins of capacity 16 holding four sequences of known lengths:
    # bin 0 -> seqs 0 (len 5) and 1 (len 3) = 8 actual tokens;
    # bin 1 -> seqs 2 (len 10) and 3 (len 4) = 14 actual tokens.
    seq_lengths = [5, 3, 10, 4]
    packing_info = sequence_packing_utils.PackingInfo(
        bin_seq_indices=[[0, 1], [2, 3]],
        seq_starts={0: [0, 5], 1: [0, 10]},
        seq_lengths=seq_lengths,
        seq_to_bin_idx=[0, 0, 1, 1],
        packing_algo='fifo',
    )

    num_bins, bin_size = 2, 16
    ctx = sequence_packing_utils.PackingContext(
        bin_size=bin_size,
        packer=None,
        packing_info=packing_info,
        original_generation_masks=None,
        original_trajs=None,
        packed_trajs=torch.zeros(num_bins, bin_size, dtype=torch.long),
        packed_position_ids=None,
        packed_attention_mask=None,
        packed_loss_mask=None,
    )

    # Actual tokens: sum of every seq_length referenced by bin_seq_indices.
    assert sequence_packing_utils.get_packing_actual_tokens(ctx) == 5 + 3 + 10 + 4

    # Compute tokens: bins times bin capacity.
    assert sequence_packing_utils.get_packing_compute_tokens(ctx) == 2 * 16

    # Average sequence length is the plain mean of seq_lengths.
    assert sequence_packing_utils.get_packing_avg_seq_length(ctx) == pytest.approx(22 / 4)

    # Efficiency: total actual tokens over bins_per_rank * bin_size * num_ranks.
    with patch('megatron.core.mpu.get_data_parallel_world_size', return_value=4):
        # total_actual = 22, capacity = 2 * 16 * 4 = 128
        assert sequence_packing_utils.get_packing_efficiency(ctx) == pytest.approx(22 / 128)

    # A missing context yields zero for every metric.
    assert sequence_packing_utils.get_packing_actual_tokens(None) == 0
    assert sequence_packing_utils.get_packing_compute_tokens(None) == 0
    assert sequence_packing_utils.get_packing_efficiency(None) == 0.0
    assert sequence_packing_utils.get_packing_avg_seq_length(None) == 0.0
464+
465+
415466
@pytest.mark.parametrize("num_sequences", [1, 10, 48, 49, 50])
416467
def test_cu_seqlens_size(num_sequences):
417468
"""Test that cu_seqlens always has a fixed size regardless of how many sequences are packed."""

0 commit comments

Comments
 (0)