Fix TE FP8 padding with tensor parallelism

pzelasko · pzelasko · commit 17e203f503f3 · 2026-06-05T06:31:25.000-07:00
diff --git a/nemo/collections/speechlm2/models/salm_automodel.py b/nemo/collections/speechlm2/models/salm_automodel.py
@@ -179,11 +179,13 @@ def forward(
         te_fp8_config = (automodel_backend_config or {}).get("te_fp8", None)
         original_seq_len = input_embeds.shape[1] if input_embeds.dim() == 3 else input_embeds.shape[0]
         if cache is None and llm_kwargs.get("qkv_format", None) != "thd":
+            tp_size = self.device_mesh["tp"].size() if self._use_tp else 1
             input_embeds, attention_mask, llm_kwargs, original_seq_len = maybe_pad_bshd_inputs_for_te_fp8(
                 te_fp8_config,
                 input_embeds,
                 attention_mask,
                 llm_kwargs,
+                tp_size=tp_size,
             )
         with te_fp8_context(automodel_backend_config):
             out = self.llm(
@@ -462,7 +464,7 @@ def test_step(self, *args: Any, **kwargs: Any):
 
     def backward(self, *args, **kwargs):
         self._setup_moe_fsdp_sync()
-        with loss_parallel(), te_fp8_context(self.cfg.get("automodel_backend", None)):
+        with loss_parallel():
             super().backward(*args, **kwargs)
 
     def on_before_zero_grad(self, optimizer) -> None:
diff --git a/nemo/collections/speechlm2/parts/fp8.py b/nemo/collections/speechlm2/parts/fp8.py
@@ -14,7 +14,7 @@
 
 from collections.abc import Mapping
 from contextlib import nullcontext
-from math import gcd
+from math import gcd, lcm
 from typing import Any
 
 import torch
@@ -129,25 +129,33 @@ def validate_te_fp8_hidden_size(te_fp8_config: Any, hidden_size: int) -> None:
         )
 
 
-def get_te_fp8_bshd_sequence_multiple(batch_size: int) -> int:
-    """Return the minimal sequence-length multiple so B*T is divisible by 8."""
+def get_te_fp8_bshd_sequence_multiple(batch_size: int, tp_size: int = 1) -> int:
+    """Return the smallest sequence multiple so T is divisible by TP and B*T/TP is divisible by 8."""
     if batch_size <= 0:
         raise ValueError(f"batch_size must be positive; got {batch_size}.")
-    return 8 // gcd(batch_size, 8)
+    if tp_size <= 0:
+        raise ValueError(f"tp_size must be positive; got {tp_size}.")
+
+    fp8_multiple = (8 * tp_size) // gcd(batch_size, 8 * tp_size)
+    return lcm(tp_size, fp8_multiple)
 
 
 def maybe_pad_bshd_inputs_for_te_fp8(
     te_fp8_config: Any,
     input_embeds: torch.Tensor,
     attention_mask: torch.Tensor | None,
     llm_kwargs: Mapping[str, Any] | None = None,
+    *,
+    tp_size: int = 1,
 ) -> tuple[torch.Tensor, torch.Tensor | None, dict[str, Any], int]:
     """Pad BSHD LLM inputs for TE FP8 and return the original sequence length.
 
     TE FP8 Linear requires the product of all input dimensions except the last
-    to be divisible by 8 and the last dimension to be divisible by 16. For
-    BSHD inputs this means ``B * T`` must be divisible by 8. Padding is appended
-    on the sequence dimension and can be trimmed from logits after the LLM.
+    to be divisible by 8 and the last dimension to be divisible by 16. With
+    BSHD sequence parallelism, local TE Linear inputs see ``B * T / TP`` rows,
+    so padding must keep ``T`` divisible by ``TP`` and ``B * T / TP`` divisible
+    by 8. Padding is appended on the sequence dimension and can be trimmed from
+    logits after the LLM.
     """
     llm_kwargs = dict(llm_kwargs or {})
     if input_embeds.dim() != 3:
@@ -159,7 +167,7 @@ def maybe_pad_bshd_inputs_for_te_fp8(
     batch_size, seq_len, hidden_size = input_embeds.shape
     validate_te_fp8_hidden_size(te_fp8_config, hidden_size)
 
-    seq_multiple = get_te_fp8_bshd_sequence_multiple(batch_size)
+    seq_multiple = get_te_fp8_bshd_sequence_multiple(batch_size, tp_size=tp_size)
     pad = (-seq_len) % seq_multiple
     if pad == 0:
         return input_embeds, attention_mask, llm_kwargs, original_seq_len
diff --git a/tests/collections/speechlm2/test_fp8.py b/tests/collections/speechlm2/test_fp8.py
@@ -178,6 +178,25 @@ def test_maybe_pad_bshd_inputs_for_te_fp8_noops_without_te_fp8():
     assert original_seq_len == 5
 
 
+@pytest.mark.parametrize(
+    ("batch_size", "tp_size", "expected_multiple"),
+    [
+        (1, 1, 8),
+        (2, 1, 4),
+        (16, 4, 4),
+        (1, 4, 32),
+        (2, 4, 16),
+        (8, 4, 4),
+    ],
+)
+def test_get_te_fp8_bshd_sequence_multiple_accounts_for_tp(batch_size, tp_size, expected_multiple):
+    multiple = fp8.get_te_fp8_bshd_sequence_multiple(batch_size, tp_size=tp_size)
+
+    assert multiple == expected_multiple
+    assert multiple % tp_size == 0
+    assert (batch_size * multiple // tp_size) % 8 == 0
+
+
 def test_maybe_pad_bshd_inputs_for_te_fp8_pads_sequence_tensors():
     input_embeds = torch.ones(2, 5, 16)
     attention_mask = torch.ones(2, 5, dtype=torch.bool)
@@ -200,6 +219,25 @@ def test_maybe_pad_bshd_inputs_for_te_fp8_pads_sequence_tensors():
     assert (llm_kwargs["position_ids"][:, 5:] == 0).all()
 
 
+def test_maybe_pad_bshd_inputs_for_te_fp8_accounts_for_tp():
+    input_embeds = torch.ones(16, 5, 16)
+    attention_mask = torch.ones(16, 5, dtype=torch.bool)
+
+    padded, padded_mask, llm_kwargs, original_seq_len = fp8.maybe_pad_bshd_inputs_for_te_fp8(
+        DictConfig({"recipe": "block"}),
+        input_embeds,
+        attention_mask,
+        tp_size=4,
+    )
+
+    assert original_seq_len == 5
+    assert padded.shape == (16, 8, 16)
+    assert padded.shape[1] % 4 == 0
+    assert (padded.shape[0] * padded.shape[1] // 4) % 8 == 0
+    assert padded_mask.shape == (16, 8)
+    assert llm_kwargs == {}
+
+
 def test_te_fp8_hidden_size_validation():
     te_fp8_config = DictConfig({"recipe": "block"})