NVIDIA
diff --git a/‎bionemo-recipes/models/llama3/modeling_llama_te.py‎
Lines changed: 67 additions & 17 deletions b/‎bionemo-recipes/models/llama3/modeling_llama_te.py‎
Lines changed: 67 additions & 17 deletions
diff --git a/‎bionemo-recipes/recipes/llama3_native_te/fp8_debugging.py‎
Lines changed: 0 additions & 64 deletions b/‎bionemo-recipes/recipes/llama3_native_te/fp8_debugging.py‎
Lines changed: 0 additions & 64 deletions
diff --git a/‎bionemo-recipes/recipes/llama3_native_te/fp8_debugging_stats.yaml‎
Lines changed: 6 additions & 1 deletion b/‎bionemo-recipes/recipes/llama3_native_te/fp8_debugging_stats.yaml‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎bionemo-recipes/recipes/llama3_native_te/hydra_config/defaults.yaml‎
Lines changed: 14 additions & 3 deletions b/‎bionemo-recipes/recipes/llama3_native_te/hydra_config/defaults.yaml‎
Lines changed: 14 additions & 3 deletions
diff --git a/‎bionemo-recipes/recipes/llama3_native_te/modeling_llama_te.py‎
Lines changed: 67 additions & 17 deletions b/‎bionemo-recipes/recipes/llama3_native_te/modeling_llama_te.py‎
Lines changed: 67 additions & 17 deletions
diff --git a/‎bionemo-recipes/recipes/llama3_native_te/perf_logger.py‎
Lines changed: 5 additions & 5 deletions b/‎bionemo-recipes/recipes/llama3_native_te/perf_logger.py‎
Lines changed: 5 additions & 5 deletions
@@ -17,10 +17,12 @@
 
 import warnings
 from collections import OrderedDict
+from contextlib import nullcontext
 from typing import ClassVar, Unpack
 
 import torch
 import torch.nn as nn
+import transformer_engine.common.recipe
 import transformer_engine.pytorch
 import transformers
 from transformer_engine.pytorch.attention import InferenceParams
@@ -50,6 +52,7 @@ class NVLlamaConfig(LlamaConfig):
     #   "thd"  = Total tokens (packed/unpadded), Head, Dimension (sequence packing format)
     attn_input_format: str = "thd"
     self_attn_mask_type: str = "padding_causal"
+    layer_precision: list[str | None] | None = None
 
 
 class NVLlamaPreTrainedModel(PreTrainedModel):
@@ -159,11 +162,54 @@ def _init_method(x):
         self.rotary_emb = RotaryPositionEmbedding(config.hidden_size // config.num_attention_heads)
         self.rotary_emb.inv_freq = LlamaRotaryEmbedding(config=config).inv_freq
 
+        self._fp8_recipe: transformer_engine.common.recipe.Recipe | None = None
+        self._fp4_recipe: transformer_engine.common.recipe.Recipe | None = None
+
         self.gradient_checkpointing = False
 
         # Initialize weights and apply final processing
         self.post_init()
 
+    def set_recipes(
+        self,
+        fp8_recipe: transformer_engine.common.recipe.Recipe | None = None,
+        fp4_recipe: transformer_engine.common.recipe.Recipe | None = None,
+    ) -> None:
+        """Attach quantization recipe objects for per-layer autocast.
+
+        Recipes are not serializable and must be set at runtime after model creation
+        and sharding (FSDP/DDP) but before training. The per-layer precision
+        assignments are read from ``self.config.layer_precision``.
+
+        Both stored recipes are overwritten on every call, so an argument left as
+        None clears any recipe attached earlier. The FP8 recipe drives the outer
+        autocast opened in ``forward``; the FP4 recipe is handed to layers marked
+        "fp4" by ``get_layer_autocast``.
+
+        Args:
+            fp8_recipe: The FP8 recipe instance (e.g., MXFP8BlockScaling), or None.
+            fp4_recipe: The FP4 recipe instance (e.g., NVFP4BlockScaling), or None.
+        """
+        self._fp8_recipe = fp8_recipe
+        self._fp4_recipe = fp4_recipe
+
+    def get_layer_autocast(self, layer_number: int):
+        """Return the appropriate TE autocast context manager for a given layer.
+
+        The returned context nests inside the outer autocast that ``forward`` opens around
+        the decoder stack (that outer autocast is enabled only when an FP8 recipe is set):
+        - "fp8" layer: nullcontext() -- leaves the outer autocast state untouched.
+        - "fp4" layer: te.pytorch.autocast(enabled=True, recipe=fp4_recipe) -- overrides to FP4.
+        - BF16 layer: te.pytorch.autocast(enabled=False) -- disables quantized compute. This is
+          the fallback for every other value, including None entries and the case where
+          ``config.layer_precision`` itself is None.
+
+        NOTE(review): on the "fp4" branch the stored FP4 recipe is passed through as-is, so it
+        is None when ``set_recipes`` was never given one -- confirm how TE treats recipe=None.
+
+        Args:
+            layer_number: The 0-indexed layer number, used to index ``config.layer_precision``.
+
+        Returns:
+            A context manager for the layer's quantization mode.
+        """
+        precision = self.config.layer_precision[layer_number] if self.config.layer_precision is not None else None
+        if precision == "fp8":
+            return nullcontext()
+        elif precision == "fp4":
+            return transformer_engine.pytorch.autocast(enabled=True, recipe=self._fp4_recipe)
+        else:
+            return transformer_engine.pytorch.autocast(enabled=False)
+
     def forward(
         self,
         input_ids: torch.Tensor | None = None,
@@ -240,23 +286,27 @@ def forward(
             if te_rope_emb.dtype == torch.float32:
                 warnings.warn("Rotary embeddings should be in float32 for optimal performance.", UserWarning)
 
-        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
-            if output_hidden_states:
-                all_hidden_states = (*all_hidden_states, hidden_states)
-
-            hidden_states = decoder_layer(
-                hidden_states,
-                attention_mask=None if self.config.attn_input_format == "thd" else attention_mask,
-                rotary_pos_emb=te_rope_emb,
-                inference_params=past_key_values,
-                cu_seqlens_q=kwargs.get("cu_seq_lens_q", None),
-                cu_seqlens_kv=kwargs.get("cu_seq_lens_k", None),
-                cu_seqlens_q_padded=kwargs.get("cu_seq_lens_q_padded", None),
-                cu_seqlens_kv_padded=kwargs.get("cu_seq_lens_k_padded", None),
-                max_seqlen_q=kwargs.get("max_length_q", None),
-                max_seqlen_kv=kwargs.get("max_length_k", None),
-                pad_between_seqs=kwargs.get("pad_between_seqs", None),
-            )
+        # Outer FP8 autocast enables FP8 compute for the decoder stack. Per-layer overrides (FP4, BF16) are handled
+        # by get_layer_autocast(), which nests inside this context.
+        with transformer_engine.pytorch.autocast(enabled=self._fp8_recipe is not None, recipe=self._fp8_recipe):
+            for layer_number, decoder_layer in enumerate(self.layers[: self.config.num_hidden_layers]):
+                if output_hidden_states:
+                    all_hidden_states = (*all_hidden_states, hidden_states)
+
+                with self.get_layer_autocast(layer_number):
+                    hidden_states = decoder_layer(
+                        hidden_states,
+                        attention_mask=None if self.config.attn_input_format == "thd" else attention_mask,
+                        rotary_pos_emb=te_rope_emb,
+                        inference_params=past_key_values,
+                        cu_seqlens_q=kwargs.get("cu_seq_lens_q", None),
+                        cu_seqlens_kv=kwargs.get("cu_seq_lens_k", None),
+                        cu_seqlens_q_padded=kwargs.get("cu_seq_lens_q_padded", None),
+                        cu_seqlens_kv_padded=kwargs.get("cu_seq_lens_k_padded", None),
+                        max_seqlen_q=kwargs.get("max_length_q", None),
+                        max_seqlen_kv=kwargs.get("max_length_k", None),
+                        pad_between_seqs=kwargs.get("pad_between_seqs", None),
+                    )
 
         hidden_states = self.norm(hidden_states)
 
 
@@ -2,7 +2,7 @@ example_fp8_tensor_stat_collection:
     enabled: True
     layers:
         # Match the actual linear layers (no longer only those within attention) that support FP8 stats
-        layer_types: [layernorm_qkv]
+        layer_types: [layernorm_qkv, proj, fc1, fc2]
     transformer_engine:
         LogFp8TensorStats:
             enabled: True
@@ -16,3 +16,8 @@ example_fp8_tensor_stat_collection:
             - tensor: weight
               stats: [underflows%, scale_inv_min, scale_inv_max, mse]
               freq: 10
+        LogTensorStats:
+          enabled: True
+          stats: [max, min, mean, std, l1_norm]
+          tensors: [dgrad, wgrad]
+          freq: 1
@@ -44,6 +44,12 @@ fp8_config:
   quantized_model_init_kwargs:
     enabled: false # If this is set to true, fp8_config.enabled must also be set to true.
 
+fp4_config:
+  enabled: false
+  fp4_recipe: transformer_engine.common.recipe.NVFP4BlockScaling
+  fp4_format: "E2M1"
+  fp4_recipe_kwargs: {}
+
 # Optimizer config
 adamw_kwargs:
   lr: 3e-3
@@ -70,10 +76,15 @@ checkpoint:
 logger:
   frequency: 100
 
-fp8_stats_config:
+quant_stats_config:
   enabled: false
-  fp8_stats_file: ./fp8_debugging_stats.yaml
-  fp8_log_dir: ./log_fp8_stats
+  quant_stats_file: ./fp8_debugging_stats.yaml
+  quant_log_dir: ./log_quant_stats
+
+# Note: fp8_layers and fp4_layers are given as 1-indexed layer numbers; they are converted to 0-indexed at runtime.
+fp8_layers: null
+fp4_layers: null
+use_fp32_master_weights: null
 
 profiler:
   enabled: false
 
@@ -17,10 +17,12 @@
 
 import warnings
 from collections import OrderedDict
+from contextlib import nullcontext
 from typing import ClassVar, Unpack
 
 import torch
 import torch.nn as nn
+import transformer_engine.common.recipe
 import transformer_engine.pytorch
 import transformers
 from transformer_engine.pytorch.attention import InferenceParams
@@ -50,6 +52,7 @@ class NVLlamaConfig(LlamaConfig):
     #   "thd"  = Total tokens (packed/unpadded), Head, Dimension (sequence packing format)
     attn_input_format: str = "thd"
     self_attn_mask_type: str = "padding_causal"
+    layer_precision: list[str | None] | None = None
 
 
 class NVLlamaPreTrainedModel(PreTrainedModel):
@@ -159,11 +162,54 @@ def _init_method(x):
         self.rotary_emb = RotaryPositionEmbedding(config.hidden_size // config.num_attention_heads)
         self.rotary_emb.inv_freq = LlamaRotaryEmbedding(config=config).inv_freq
 
+        self._fp8_recipe: transformer_engine.common.recipe.Recipe | None = None
+        self._fp4_recipe: transformer_engine.common.recipe.Recipe | None = None
+
         self.gradient_checkpointing = False
 
         # Initialize weights and apply final processing
         self.post_init()
 
+    def set_recipes(
+        self,
+        fp8_recipe: transformer_engine.common.recipe.Recipe | None = None,
+        fp4_recipe: transformer_engine.common.recipe.Recipe | None = None,
+    ) -> None:
+        """Attach quantization recipe objects for per-layer autocast.
+
+        Recipes are not serializable and must be set at runtime after model creation
+        and sharding (FSDP/DDP) but before training. The per-layer precision
+        assignments are read from ``self.config.layer_precision``.
+
+        Both stored recipes are overwritten on every call, so an argument left as
+        None clears any recipe attached earlier. The FP8 recipe drives the outer
+        autocast opened in ``forward``; the FP4 recipe is handed to layers marked
+        "fp4" by ``get_layer_autocast``.
+
+        Args:
+            fp8_recipe: The FP8 recipe instance (e.g., MXFP8BlockScaling), or None.
+            fp4_recipe: The FP4 recipe instance (e.g., NVFP4BlockScaling), or None.
+        """
+        self._fp8_recipe = fp8_recipe
+        self._fp4_recipe = fp4_recipe
+
+    def get_layer_autocast(self, layer_number: int):
+        """Return the appropriate TE autocast context manager for a given layer.
+
+        The returned context nests inside the outer autocast that ``forward`` opens around
+        the decoder stack (that outer autocast is enabled only when an FP8 recipe is set):
+        - "fp8" layer: nullcontext() -- leaves the outer autocast state untouched.
+        - "fp4" layer: te.pytorch.autocast(enabled=True, recipe=fp4_recipe) -- overrides to FP4.
+        - BF16 layer: te.pytorch.autocast(enabled=False) -- disables quantized compute. This is
+          the fallback for every other value, including None entries and the case where
+          ``config.layer_precision`` itself is None.
+
+        NOTE(review): on the "fp4" branch the stored FP4 recipe is passed through as-is, so it
+        is None when ``set_recipes`` was never given one -- confirm how TE treats recipe=None.
+
+        Args:
+            layer_number: The 0-indexed layer number, used to index ``config.layer_precision``.
+
+        Returns:
+            A context manager for the layer's quantization mode.
+        """
+        precision = self.config.layer_precision[layer_number] if self.config.layer_precision is not None else None
+        if precision == "fp8":
+            return nullcontext()
+        elif precision == "fp4":
+            return transformer_engine.pytorch.autocast(enabled=True, recipe=self._fp4_recipe)
+        else:
+            return transformer_engine.pytorch.autocast(enabled=False)
+
     def forward(
         self,
         input_ids: torch.Tensor | None = None,
@@ -240,23 +286,27 @@ def forward(
             if te_rope_emb.dtype == torch.float32:
                 warnings.warn("Rotary embeddings should be in float32 for optimal performance.", UserWarning)
 
-        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
-            if output_hidden_states:
-                all_hidden_states = (*all_hidden_states, hidden_states)
-
-            hidden_states = decoder_layer(
-                hidden_states,
-                attention_mask=None if self.config.attn_input_format == "thd" else attention_mask,
-                rotary_pos_emb=te_rope_emb,
-                inference_params=past_key_values,
-                cu_seqlens_q=kwargs.get("cu_seq_lens_q", None),
-                cu_seqlens_kv=kwargs.get("cu_seq_lens_k", None),
-                cu_seqlens_q_padded=kwargs.get("cu_seq_lens_q_padded", None),
-                cu_seqlens_kv_padded=kwargs.get("cu_seq_lens_k_padded", None),
-                max_seqlen_q=kwargs.get("max_length_q", None),
-                max_seqlen_kv=kwargs.get("max_length_k", None),
-                pad_between_seqs=kwargs.get("pad_between_seqs", None),
-            )
+        # Outer FP8 autocast enables FP8 compute for the decoder stack. Per-layer overrides (FP4, BF16) are handled
+        # by get_layer_autocast(), which nests inside this context.
+        with transformer_engine.pytorch.autocast(enabled=self._fp8_recipe is not None, recipe=self._fp8_recipe):
+            for layer_number, decoder_layer in enumerate(self.layers[: self.config.num_hidden_layers]):
+                if output_hidden_states:
+                    all_hidden_states = (*all_hidden_states, hidden_states)
+
+                with self.get_layer_autocast(layer_number):
+                    hidden_states = decoder_layer(
+                        hidden_states,
+                        attention_mask=None if self.config.attn_input_format == "thd" else attention_mask,
+                        rotary_pos_emb=te_rope_emb,
+                        inference_params=past_key_values,
+                        cu_seqlens_q=kwargs.get("cu_seq_lens_q", None),
+                        cu_seqlens_kv=kwargs.get("cu_seq_lens_k", None),
+                        cu_seqlens_q_padded=kwargs.get("cu_seq_lens_q_padded", None),
+                        cu_seqlens_kv_padded=kwargs.get("cu_seq_lens_k_padded", None),
+                        max_seqlen_q=kwargs.get("max_length_q", None),
+                        max_seqlen_kv=kwargs.get("max_length_k", None),
+                        pad_between_seqs=kwargs.get("pad_between_seqs", None),
+                    )
 
         hidden_states = self.norm(hidden_states)
 
 
@@ -91,7 +91,7 @@ def __init__(self, dist_config: DistributedConfig, args: DictConfig, start_step:
         self.grad_acc_step_count = 0
 
         # Whether to step debug_api.step() after each step
-        self.fp8_stats_enabled = args.fp8_stats_config.enabled
+        self.quant_stats_config = args.quant_stats_config.enabled
 
     @nvtx.annotate("PerfLogger.log_micro_step", color="pink")
     def log_micro_step(self, step: int, batch: dict[str, torch.Tensor], outputs: CausalLMOutputWithPast):
@@ -150,7 +150,7 @@ def log_step(
             if self._profiler is not None:
                 self._profiler.step(step)
 
-            if self.fp8_stats_enabled:
+            if self.quant_stats_config:
                 debug_api.step()
 
             if step % self.logging_frequency == 0 and step > 0:
@@ -201,15 +201,15 @@ def log_step(
 
     def finish(self):
         """Finish the logger and close the progress bar."""
+        # quant_stats_config holds the boolean `enabled` flag of args.quant_stats_config.
+        # This check precedes the main-process early return below, so end_debug() runs in
+        # every process that has quant stats enabled, not only the main one.
+        if self.quant_stats_config:
+            debug_api.end_debug()
+
         if not self._dist_config.is_main_process():
             return
 
         wandb.finish()
         self._progress_bar.close()
 
-        if self.fp8_stats_enabled:
-            debug_api.end_debug()
-
 
 class NsightProfiler:
     """Nsight Systems profiler wrapper for performance analysis.