NVIDIA · jomitchellnv · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026
@@ -17,10 +17,12 @@
 
 import warnings
 from collections import OrderedDict
+from contextlib import nullcontext
 from typing import ClassVar, Unpack
 
 import torch
 import torch.nn as nn
+import transformer_engine.common.recipe
 import transformer_engine.pytorch
 import transformers
 from transformer_engine.pytorch.attention import InferenceParams
@@ -50,6 +52,7 @@ class NVLlamaConfig(LlamaConfig):
     #   "thd"  = Total tokens (packed/unpadded), Head, Dimension (sequence packing format)
     attn_input_format: str = "thd"
     self_attn_mask_type: str = "padding_causal"
+    layer_precision: list[str | None] | None = None
 
 
 class NVLlamaPreTrainedModel(PreTrainedModel):
@@ -159,11 +162,54 @@ def _init_method(x):
         self.rotary_emb = RotaryPositionEmbedding(config.hidden_size // config.num_attention_heads)
         self.rotary_emb.inv_freq = LlamaRotaryEmbedding(config=config).inv_freq
 
+        self._fp8_recipe: transformer_engine.common.recipe.Recipe | None = None
+        self._fp4_recipe: transformer_engine.common.recipe.Recipe | None = None
+
         self.gradient_checkpointing = False
 
         # Initialize weights and apply final processing
         self.post_init()
 
+    def set_recipes(
+        self,
+        fp8_recipe: transformer_engine.common.recipe.Recipe | None = None,
+        fp4_recipe: transformer_engine.common.recipe.Recipe | None = None,
+    ) -> None:
+        """Attach quantization recipe objects for per-layer autocast.
+
+        Recipes are not serializable and must be set at runtime after model creation
+        and sharding (FSDP/DDP) but before training. The per-layer precision
+        assignments are read from ``self.config.layer_precision``.
+
+        Args:
+            fp8_recipe: The FP8 recipe instance (e.g., MXFP8BlockScaling), or None.
+            fp4_recipe: The FP4 recipe instance (e.g., NVFP4BlockScaling), or None.
+        """
+        self._fp8_recipe = fp8_recipe
+        self._fp4_recipe = fp4_recipe
+
+    def get_layer_autocast(self, layer_number: int):
+        """Return the appropriate TE autocast context manager for a given layer.
+
+        The context interacts with the outer FP8 autocast in the training script:
+        - FP8 layer: nullcontext() -- lets the outer FP8 autocast take effect.
+        - FP4 layer: te.pytorch.autocast(enabled=True, recipe=fp4_recipe) -- overrides to FP4.
+        - BF16 layer: te.pytorch.autocast(enabled=False) -- disables quantized compute.
+
+        Args:
+            layer_number: The 0-indexed layer number.
+
+        Returns:
+            A context manager for the layer's quantization mode.
+        """
+        precision = self.config.layer_precision[layer_number] if self.config.layer_precision is not None else None
+        if precision == "fp8":
+            return nullcontext()
+        elif precision == "fp4":
+            return transformer_engine.pytorch.autocast(enabled=True, recipe=self._fp4_recipe)
+        else:
+            return transformer_engine.pytorch.autocast(enabled=False)
+
     def forward(
         self,
         input_ids: torch.Tensor | None = None,
@@ -240,23 +286,27 @@ def forward(
             if te_rope_emb.dtype == torch.float32:
                 warnings.warn("Rotary embeddings should be in float32 for optimal performance.", UserWarning)
 
-        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
-            if output_hidden_states:
-                all_hidden_states = (*all_hidden_states, hidden_states)
-
-            hidden_states = decoder_layer(
-                hidden_states,
-                attention_mask=None if self.config.attn_input_format == "thd" else attention_mask,
-                rotary_pos_emb=te_rope_emb,
-                inference_params=past_key_values,
-                cu_seqlens_q=kwargs.get("cu_seq_lens_q", None),
-                cu_seqlens_kv=kwargs.get("cu_seq_lens_k", None),
-                cu_seqlens_q_padded=kwargs.get("cu_seq_lens_q_padded", None),
-                cu_seqlens_kv_padded=kwargs.get("cu_seq_lens_k_padded", None),
-                max_seqlen_q=kwargs.get("max_length_q", None),
-                max_seqlen_kv=kwargs.get("max_length_k", None),
-                pad_between_seqs=kwargs.get("pad_between_seqs", None),
-            )
+        # Outer FP8 autocast enables FP8 compute for the decoder stack. Per-layer overrides (FP4, BF16) are handled
+        # by get_layer_autocast(), which nests inside this context.
+        with transformer_engine.pytorch.autocast(enabled=self._fp8_recipe is not None, recipe=self._fp8_recipe):
+            for layer_number, decoder_layer in enumerate(self.layers[: self.config.num_hidden_layers]):
+                if output_hidden_states:
+                    all_hidden_states = (*all_hidden_states, hidden_states)
+
+                with self.get_layer_autocast(layer_number):
+                    hidden_states = decoder_layer(
+                        hidden_states,
+                        attention_mask=None if self.config.attn_input_format == "thd" else attention_mask,
+                        rotary_pos_emb=te_rope_emb,
+                        inference_params=past_key_values,
+                        cu_seqlens_q=kwargs.get("cu_seq_lens_q", None),
+                        cu_seqlens_kv=kwargs.get("cu_seq_lens_k", None),
+                        cu_seqlens_q_padded=kwargs.get("cu_seq_lens_q_padded", None),
+                        cu_seqlens_kv_padded=kwargs.get("cu_seq_lens_k_padded", None),
+                        max_seqlen_q=kwargs.get("max_length_q", None),
+                        max_seqlen_kv=kwargs.get("max_length_k", None),
+                        pad_between_seqs=kwargs.get("pad_between_seqs", None),
+                    )
 
         hidden_states = self.norm(hidden_states)
 

@@ -0,0 +1,243 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-Apache2
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import pickle
+import subprocess
+
+import pytest
+import torch
+from transformer_engine.pytorch.fp8 import check_fp8_support
+
+
+def requires_fp8(func):
+    """Decorator to skip tests that require FP8 support."""
+    fp8_available, reason = check_fp8_support()
+    return pytest.mark.skipif(not fp8_available, reason=f"FP8 is not supported on this GPU: {reason}")(func)
+
+
+requires_multi_gpu = pytest.mark.skipif(
+    not torch.cuda.is_available() or torch.cuda.device_count() < 2,
+    reason="Test requires at least 2 GPUs",
+)
+
+
+@pytest.mark.parametrize("strategy", ["ddp", "fsdp2"])
+@requires_fp8
+def test_single_process_attaches_correct_fp8_recipe(strategy, unused_tcp_port):
+    cmd = [
+        "torchrun",
+        "--nproc_per_node=1",
+        "--rdzv-backend=c10d",
+        f"--rdzv-endpoint=localhost:{unused_tcp_port}",
+        os.path.relpath(__file__),
+        "--strategy",
+        strategy,
+    ]
+
+    result = subprocess.run(
+        cmd,
+        check=False,
+        text=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        timeout=240,
+    )
+    if result.returncode != 0:
+        print(f"STDOUT:\n{result.stdout}")
+        print(f"STDERR:\n{result.stderr}")
+        pytest.fail(f"Command failed with exit code {result.returncode}")
+
+
+@pytest.mark.parametrize("strategy", ["ddp", "fsdp2"])
+@requires_fp8
+@requires_multi_gpu
+def test_multi_process_fp8_recipes_are_synced(strategy, unused_tcp_port):
+    cmd = [
+        "torchrun",
+        "--nproc_per_node=2",
+        "--rdzv-backend=c10d",
+        f"--rdzv-endpoint=localhost:{unused_tcp_port}",
+        os.path.relpath(__file__),
+        "--strategy",
+        strategy,
+    ]
+
+    result = subprocess.run(
+        cmd,
+        check=False,
+        text=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        timeout=240,
+    )
+    if result.returncode != 0:
+        print(f"STDOUT:\n{result.stdout}")
+        print(f"STDERR:\n{result.stderr}")
+        pytest.fail(f"Command failed with exit code {result.returncode}")
+
+
+if __name__ == "__main__":
+    import argparse
+    import enum
+    import os
+    import sys
+    from dataclasses import dataclass, field
+    from pathlib import Path
+
+    # Ensure the model directory is on sys.path for bare module imports.
+    sys.path.insert(0, Path(__file__).resolve().parent.parent.as_posix())
+
+    import torch.distributed as dist
+    from torch.distributed.device_mesh import init_device_mesh
+    from torch.distributed.fsdp import fully_shard
+    from torch.optim import AdamW
+    from transformer_engine.pytorch.fp8 import DelayedScaling, Format
+
+    from modeling_llama_te import NVLlamaConfig, NVLlamaForCausalLM
+
+    def recursive_assert(a, b, path=""):
+        if isinstance(a, dict) and isinstance(b, dict):
+            assert a.keys() == b.keys(), f"Dictionary keys mismatch: {a.keys()} != {b.keys()} at {path}"
+            for k in a:
+                recursive_assert(a[k], b[k], path=f"{path}.{k}")
+        elif isinstance(a, list) and isinstance(b, list):
+            assert len(a) == len(b), f"List lengths mismatch: {len(a)} != {len(b)} at {path}"
+            for i in range(len(a)):
+                recursive_assert(a[i], b[i], path=f"{path}.{i}")
+        elif isinstance(a, torch.Tensor) and isinstance(b, torch.Tensor):
+            torch.testing.assert_close(a, b, msg=f"Tensor mismatch at {path}")
+        else:
+            assert a == b, f"Value mismatch at {path}: {a} != {b}"
+
+    class Strategy(enum.StrEnum):
+        DDP = "ddp"
+        FSDP2 = "fsdp2"
+
+    @dataclass
+    class DistributedConfig:
+        """Class to track distributed ranks."""
+
+        rank: int = field(default_factory=dist.get_rank)
+        local_rank: int = field(default_factory=lambda: int(os.environ["LOCAL_RANK"]))
+        world_size: int = field(default_factory=dist.get_world_size)
+
+        def is_main_process(self) -> bool:
+            """This is the global rank 0 process, to be used for wandb logging, etc."""
+            return self.rank == 0
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--strategy", type=Strategy, default=Strategy.DDP, choices=[Strategy.FSDP2, Strategy.DDP])
+    args = parser.parse_args()
+
+    torch.distributed.init_process_group(backend="nccl")
+    dist_config = DistributedConfig()
+    torch.cuda.set_device(dist_config.local_rank)
+    device_mesh = init_device_mesh(
+        "cuda",
+        mesh_shape=(dist_config.world_size, 1),
+        mesh_dim_names=("dp", "tp"),
+    )
+    device = f"cuda:{dist_config.local_rank}"
+
+    fp8_recipe = DelayedScaling(fp8_format=Format.HYBRID, amax_compute_algo="max", amax_history_len=10)
+
+    config = NVLlamaConfig(
+        hidden_size=256,
+        intermediate_size=512,
+        num_hidden_layers=6,
+        num_attention_heads=8,
+        num_key_value_heads=4,
+        vocab_size=100,
+        dtype=torch.bfloat16,
+    )
+    config.layer_precision = ["fp8"] * config.num_hidden_layers
+    model = NVLlamaForCausalLM(config)
+
+    if args.strategy is Strategy.FSDP2:
+        for layer in model.model.layers:
+            fully_shard(layer, mesh=device_mesh["dp"])
+        fully_shard(model, mesh=device_mesh["dp"])
+        model.to(device)
+
+    elif args.strategy is Strategy.DDP:
+        model.to(device)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model,
+            device_ids=[dist_config.local_rank],
+            output_device=dist_config.local_rank,
+            device_mesh=device_mesh["dp"],
+        )
+
+    optimizer = AdamW(model.parameters())
+
+    # Attach FP8 recipes to the model (layer precision is already on config).
+    llama_model = model.module.model if args.strategy is Strategy.DDP else model.model
+    llama_model.set_recipes(fp8_recipe=fp8_recipe, fp4_recipe=None)
+
+    model.train()
+
+    generator = torch.Generator()
+    generator.manual_seed(torch.distributed.get_rank())
+
+    for _ in range(3):
+        input_data = {
+            "input_ids": torch.randint(0, config.vocab_size, (1, 32), generator=generator),
+            "labels": torch.randint(0, config.vocab_size, (1, 32), generator=generator),
+            "attention_mask": torch.ones(1, 32),
+        }
+        input_data = {k: v.to(torch.cuda.current_device()) for k, v in input_data.items()}
+
+        with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+            outputs = model(**input_data)
+
+        outputs.loss.backward()
+
+    # Access FP8 extra states directly from modules instead of state_dict()
+    # since state_dict() now filters them out for HuggingFace compatibility
+    fp8_extra_states = {}
+    for name, module in model.named_modules():
+        if hasattr(module, "_extra_state") and callable(module._extra_state):
+            extra_state = module._extra_state()
+            if extra_state is not None and len(extra_state) > 0:
+                fp8_extra_states[f"{name}._extra_state"] = extra_state
+
+    # lm_head is BF16, not FP8, so exclude it from FP8 checks
+    fp8_extra_states = {key: val for key, val in fp8_extra_states.items() if "lm_head." not in key}
+
+    # 2 ranks, test to ensure that both ranks have the same FP8 extra states
+    if torch.distributed.get_world_size() == 2:
+        outputs_list = [None] * torch.distributed.get_world_size() if torch.distributed.get_rank() == 0 else None
+        torch.distributed.gather_object(fp8_extra_states, outputs_list, dst=0)
+        if torch.distributed.get_rank() == 0:
+            assert outputs_list is not None
+
+            for key in outputs_list[0]:
+                state_1 = outputs_list[0][key]
+                state_2 = outputs_list[1][key]
+                assert len(state_1) > 0, f"No FP8 extra states for {key}, rank 0"
+                assert len(state_2) > 0, f"No FP8 extra states for {key}, rank 1"
+                dict_1 = pickle.loads(state_1.detach().numpy(force=True).tobytes())
+                dict_2 = pickle.loads(state_2.detach().numpy(force=True).tobytes())
+                recursive_assert(dict_1, dict_2)
+
+    # One rank, test to ensure the correct FP8 extra states are saved
+    if torch.distributed.get_world_size() == 1:
+        for key, val in fp8_extra_states.items():
+            assert len(val) > 0, f"No FP8 extra states for {key}"
+            fp8_meta_dict = pickle.loads(val.detach().numpy(force=True).tobytes())
+            assert fp8_meta_dict["recipe"] == fp8_recipe, f"Recipe mismatch for {key}"
+
+    torch.distributed.destroy_process_group()