
Commit f92e40c

pp_warmup optimization (#185)
1 parent a86d271 commit f92e40c

4 files changed: +32 -24 lines changed

primus/README_patch.md

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ These arguments are introduced in the Megatron module logic (e.g., training loop
 | `disable_last_saving` | `false` | v0.1.0 | Skip saving the final checkpoint at the last iteration. | NA | Useful for profiling or benchmarking runs. |
 | `no_fp8_weight_transpose_cache` | `false` | v0.2.0 | Disable the FP8 weight transpose cache to save memory. | `megatron.core.extensions.transformer_engine.TELinear`, `megatron.core.extensions.transformer_engine.TELayerNormColumnParallelLinear`, `megatron.core.extensions.transformer_engine.TEDelayedScaling` | May affect performance but reduce memory use. |
 | `decoder_pipeline_manual_split_list` | `null` | v0.2.0 | Enable manual pipeline split in (interleaved) 1F1B pipeline parallelism. | `megatron.core.transformer.transformer_block.get_num_layers_to_build`, `megatron.core.transformer.transformer_layer.get_transformer_layer_offset` | May be deprecated when megatron gets updated. |
-| `attn_warmup` | `false` | v0.2.0 | Add attention fwd/bwd warmup to save iter1's time when pp is used. | NA | Can save much time for pipeline debug. |
+| `pp_warmup` | `false` | v0.2.0 | Add attention/mlp fwd/bwd warmup to save iter1's time when pp degree is large. | NA | Can save much time for pipeline debug. |
 | `dump_pp_data` | `false` | v0.2.0 | Enable dumping pp schedule data for visualization. | `megatron.core.pipeline_parallel.schedules.forward_step`, `megatron.core.pipeline_parallel.schedules.backward_step`, `megatron.core.pipeline_parallel.schedules.forward_backward_pipelining_with_interleaving`, `megatron.core.pipeline_parallel.schedules.forward_backward_pipelining_without_interleaving` | Useful for pipeline schedule visualization. |
 | `disable_profiler_activity_cpu` | `false` | v0.2.0 | Disable CPU activity in torch profiling. | NA | If you only want to trace CUDA kernels and get a smaller trace JSON file, you can enable this option. However, if you plan to run with TraceLen, please do not enable it. |
 | `use_rocm_mem_info` | `false` | v0.2.0 | Logging ROCm memory information in Megatron-LM Trainer | NA | If `use_rocm_mem_info = True`, ROCm memory information will be collected with `rocm-smi` at every iteration. |

primus/configs/modules/megatron/primus_megatron_module.yaml

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ no_fp8_weight_transpose_cache: false
 decoder_pipeline_manual_split_list: null # int list
 
 # perf
-attn_warmup: false # set to true to decrease iter-1 time when using pp
+pp_warmup: false # set to true to decrease iter-1 time when using pp
 
 # tool
 dump_pp_data: false

primus/modules/trainer/megatron/trainer.py

Lines changed: 3 additions & 3 deletions
@@ -1222,16 +1222,16 @@ def run(self, *args, **kwargs):
         one_logger = get_one_logger()
         args = get_args()
 
-        if args.attn_warmup:
-            from .utils import warmup_attn
+        if args.pp_warmup:
+            from .utils import pp_warmup
 
             log_rank_0(
                 "warmup attn on each rank in parallel to decrease "
                 "the first iter time, especially when pp is used"
             )
             timers = get_timers()
             timers("warmup-attn", log_level=0).start(barrier=True)
-            warmup_attn(args, self.config, self.model, self.optimizer)
+            pp_warmup(args, self.config, self.model, self.optimizer)
             timers("warmup-attn").stop()
             timers.log(["warmup-attn"], barrier=True)
 
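Aside: why a one-off warmup shrinks iteration 1 can be seen outside Megatron entirely. Below is a minimal standalone sketch (plain PyTorch, assuming a CUDA device with bf16 support; nothing here is Primus or Megatron code): the first forward/backward through a fresh module pays one-time costs such as kernel selection/autotuning and CUDA allocator growth, and later passes do not. That first-use cost is what the warmup above moves out of training iteration 1, where it would otherwise stack up across pipeline stages.

    import time

    import torch

    # Toy measurement only: time three fwd+bwd passes through the same layer.
    # The first pass is typically much slower than the rest.
    layer = torch.nn.Linear(4096, 4096).cuda().to(torch.bfloat16)
    x = torch.randn(8, 4096, device="cuda", dtype=torch.bfloat16)

    for i in range(3):
        torch.cuda.synchronize()
        t0 = time.perf_counter()
        layer(x).sum().backward()
        torch.cuda.synchronize()
        print(f"fwd+bwd pass {i}: {(time.perf_counter() - t0) * 1e3:.2f} ms")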

primus/modules/trainer/megatron/utils.py

Lines changed: 27 additions & 19 deletions
@@ -218,26 +218,34 @@ def get_transformer_layer_offset_patch(config, vp_stage):
 megatron.core.models.gpt.gpt_layer_specs.get_transformer_layer_offset = get_transformer_layer_offset_patch
 
 
-def warmup_attn(args, config, model, optimizer):
-    if model[0].use_forward_hook:
-        model[0].disable_forward_pre_hook()
-
-    attn = model[0].module.module.decoder.layers[0].self_attention
-    warmup_input = torch.randn(args.seq_length, 1, config.hidden_size, device="cuda", dtype=torch.bfloat16)
-    attention_mask = (
-        torch.tril(torch.ones((args.seq_length, args.seq_length), device="cuda")).unsqueeze(0).unsqueeze(0)
-        == 0
-    )
-
-    warmup_output = attn(warmup_input, attention_mask=attention_mask)
-    warmup_output[0].backward(torch.ones_like(warmup_output[0]))
-
+def pp_warmup(args, config, model, optimizer):
     for model_chunk in model:
-        model_chunk.zero_grad_buffer()
-    optimizer.zero_grad()
-
-    if model[0].use_forward_hook:
-        model[0].enable_forward_pre_hook()
+        with model_chunk.no_sync():
+            if model_chunk.use_forward_hook:
+                model_chunk.disable_forward_pre_hook()
+            dtype = torch.float32
+            if config.bf16:
+                dtype = torch.bfloat16
+            elif config.fp16:
+                dtype = torch.float16
+            seq_len = args.seq_length // args.tensor_model_parallel_size // args.context_parallel_size
+
+            for layer in model_chunk.module.module.decoder.layers:
+                attn_input = torch.randn(seq_len, 1, config.hidden_size, device="cuda", dtype=dtype)
+                attention_mask = (
+                    torch.tril(torch.ones((seq_len, seq_len), device="cuda")).unsqueeze(0).unsqueeze(0) == 0
+                )
+                attn_output = layer.self_attention(attn_input, attention_mask=attention_mask)
+                attn_output[0].backward(torch.ones_like(attn_output[0]))
+
+                mlp_input = torch.randn(seq_len, 1, config.hidden_size, device="cuda", dtype=dtype)
+                mlp_output = layer.mlp(mlp_input)
+                mlp_output[0].backward(torch.ones_like(mlp_output[0]))
+
+            if model_chunk.use_forward_hook:
+                model_chunk.enable_forward_pre_hook()
+    optimizer.zero_grad()
+    torch.cuda.empty_cache()
 
 
 def schedule_wrapper(func):
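For readers outside the Megatron code base, the pattern pp_warmup follows can be reduced to a self-contained sketch. Everything below (ToyBlock, toy_pp_warmup, the sizes) is hypothetical illustration code, not part of this commit: run one dummy forward/backward through each attention-like and MLP-like sub-module, then discard the dummy gradients and free the temporary buffers before real training starts.

    import torch
    import torch.nn as nn


    class ToyBlock(nn.Module):
        """Stand-in for a transformer layer: an attention-like and an MLP-like sub-module."""

        def __init__(self, hidden):
            super().__init__()
            self.attn_proj = nn.Linear(hidden, hidden)
            self.mlp = nn.Sequential(nn.Linear(hidden, 4 * hidden), nn.GELU(), nn.Linear(4 * hidden, hidden))


    def toy_pp_warmup(layers, seq_len, hidden, dtype=torch.bfloat16):
        """Dummy fwd/bwd through every sub-module so one-time costs are paid before iteration 1.

        The real pp_warmup above additionally wraps this in no_sync(), toggles forward
        pre-hooks, picks dtype from the training config, and shards seq_len by TP/CP size.
        """
        for layer in layers:
            for sub in (layer.attn_proj, layer.mlp):
                x = torch.randn(seq_len, 1, hidden, device="cuda", dtype=dtype)
                y = sub(x)
                y.backward(torch.ones_like(y))
            layer.zero_grad(set_to_none=True)  # discard the dummy gradients
        torch.cuda.empty_cache()  # release the temporary buffers


    layers = [ToyBlock(1024).cuda().to(torch.bfloat16) for _ in range(2)]
    toy_pp_warmup(layers, seq_len=128, hidden=1024)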
