feat(training): add --balance-by-flops for FLOPs-aware micro-batch partitioning (#44)

nightlessbaron · web-flow · commit 2b9b705bff55 · 2026-06-17T01:15:43.000-07:00
Port of THUDM/slime#2017. Adds a --balance-by-flops flag that replaces token-count Karmarkar-Karp (KK) balancing with FLOPs-weighted KK for both DP rank assignment (_split_train_data_by_dp) and micro-batch packing (get_data_iterator). When the flag is set, DP rank assignment is FLOPs-balanced regardless of --balance-data. Per-sequence weights come from the existing calculate_fwd_flops(), which accounts for the full model architecture (MoE, LoRA, attention projections) rather than the simplified coeff*L+L² estimate used upstream. Requires --use-dynamic-batch-size.
diff --git a/miles/backends/training_utils/data.py b/miles/backends/training_utils/data.py
@@ -7,6 +7,7 @@
 import torch.nn.functional as F
 
 from miles.utils.data import get_minimum_num_micro_batch_size
+from miles.utils.flops_utils import calculate_workloads
 from miles.utils.seqlen_balancing import get_seqlen_balanced_partitions
 from miles.utils.types import RolloutBatch
 
@@ -412,7 +413,11 @@ def _generate_data_iterator(rollout_data, micro_batch_size, micro_batch_indices=
         for i, num_mbs in enumerate(num_microbatches):
             start, end = i * num_local_gbs, (i + 1) * num_local_gbs
             samples = rollout_data["total_lengths"][start:end]
-            partitions = get_seqlen_balanced_partitions(samples, num_mbs, equal_size=False)
+            if getattr(args, "balance_by_flops", False):
+                weights = calculate_workloads(samples, args)
+                partitions = get_seqlen_balanced_partitions(weights, num_mbs, equal_size=False)
+            else:
+                partitions = get_seqlen_balanced_partitions(samples, num_mbs, equal_size=False)
             for j in range(num_mbs):
                 for k in range(len(partitions[j])):
                     partitions[j][k] += start
diff --git a/miles/ray/rollout.py b/miles/ray/rollout.py
@@ -27,6 +27,7 @@
 from miles.rollout.inference_rollout.compatibility import call_rollout_function, load_rollout_function
 from miles.utils import dumper_utils, tracking_utils
 from miles.utils.environ import enable_experimental_rollout_refactor
+from miles.utils.flops_utils import calculate_workloads
 from miles.utils.health_monitor import RolloutHealthMonitor
 from miles.utils.http_utils import (
     _wrap_ipv6,
@@ -856,7 +857,11 @@ def _stat(xs):
         total_lengths = [len(t) for t in data["tokens"]]
         data["total_lengths"] = total_lengths
 
-        if self.args.balance_data:
+        balance_by_flops = getattr(self.args, "balance_by_flops", False)
+        if balance_by_flops:
+            workloads = calculate_workloads(total_lengths, self.args)
+            partitions = get_seqlen_balanced_partitions(workloads, dp_size, equal_size=True)
+        elif self.args.balance_data:
             partitions = get_seqlen_balanced_partitions(total_lengths, dp_size, equal_size=True)
         else:
             partitions = [range(i, len(total_lengths), dp_size) for i in range(dp_size)]
diff --git a/miles/utils/arguments.py b/miles/utils/arguments.py
@@ -685,6 +685,19 @@ def add_data_arguments(parser):
                 ),
             )
 
+            parser.add_argument(
+                "--balance-by-flops",
+                action="store_true",
+                default=False,
+                help=(
+                    "Use FLOPs-based workload estimation for DP rank assignment and micro-batch partitioning "
+                    "via Karmarkar-Karp instead of token-count balancing. FLOPs are computed from the full "
+                    "model config (hidden_size, ffn_hidden_size, MoE experts/topk, LoRA ranks) via "
+                    "calculate_fwd_flops, capturing the quadratic cost of attention. Produces more balanced "
+                    "micro-batches when sequence lengths vary widely. Requires --use-dynamic-batch-size."
+                ),
+            )
+
             parser.add_argument(
                 "--use-dynamic-batch-size",
                 action="store_true",
@@ -1956,6 +1969,9 @@ def miles_validate_args(args):
         if args.log_probs_max_tokens_per_gpu is None:
             args.log_probs_max_tokens_per_gpu = args.max_tokens_per_gpu
 
+    if getattr(args, "balance_by_flops", False):
+        assert args.use_dynamic_batch_size, "--balance-by-flops requires --use-dynamic-batch-size"
+
     if args.eps_clip_high is None:
         args.eps_clip_high = args.eps_clip
 
diff --git a/miles/utils/flops_utils.py b/miles/utils/flops_utils.py
@@ -125,3 +125,13 @@ def calculate_fwd_flops(
         total_flops += calculate_lm_head_flops(seqlen, hidden_size, vocab_size)
 
     return total_flops
+
+
+def calculate_workloads(seqlens, args):
+    """Return per-sequence forward FLOPs for Karmarkar-Karp balancing weights.
+
+    Calls ``calculate_fwd_flops([sl], args)`` once per entry of ``seqlens`` and returns
+    the results as a list in input order, so weight ``i`` belongs to sequence ``i``.
+    Used by ``--balance-by-flops`` for DP rank assignment and micro-batch packing.
+    """
+    return [calculate_fwd_flops([sl], args) for sl in seqlens]