@@ -1660,6 +1660,20 @@ def __init__(
self.optimizer_named_parameters = self._init_optimizer_named_parameters()

self._log_parameter_groups()

# Sync parameter requires_grad with its parameter group.
self._sync_parameter_requires_grad_with_group()

def _sync_parameter_requires_grad_with_group(self):
"""Sync parameter requires_grad attribute to match its parameter group.

Ensures parameter requires_grad attribute follows the parameter group setting,
which is the authoritative source after FSDP initialization.
"""
for group in self.parameter_groups:
group_requires_grad = group.requires_grad
for param in group.params:
param.requires_grad = group_requires_grad

def get_mem_alloc_context(self, groups=None, symmetric=True):
"""
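For readers outside the Megatron codebase, the sketch below illustrates the pattern this hunk adds: the group-level requires_grad flag is treated as the source of truth and pushed down onto each parameter tensor. The ParamGroup dataclass and sync_requires_grad helper are illustrative stand-ins, not Megatron's actual optimizer classes.

# Minimal, self-contained sketch of the sync pattern (hypothetical names, not
# Megatron's real ParamGroup / optimizer types).
from dataclasses import dataclass, field
from typing import List

import torch


@dataclass
class ParamGroup:
    # Group-level flag: the authoritative requires_grad setting for all
    # parameters assigned to this group.
    requires_grad: bool
    params: List[torch.nn.Parameter] = field(default_factory=list)


def sync_requires_grad(groups: List[ParamGroup]) -> None:
    # Push the group-level flag onto every parameter, overriding whatever
    # per-tensor flag a prior (re)initialization may have left behind.
    for group in groups:
        for param in group.params:
            param.requires_grad = group.requires_grad


frozen = ParamGroup(requires_grad=False, params=[torch.nn.Parameter(torch.zeros(4))])
trainable = ParamGroup(requires_grad=True, params=[torch.nn.Parameter(torch.ones(4))])
sync_requires_grad([frozen, trainable])
assert frozen.params[0].requires_grad is False
assert trainable.params[0].requires_grad is True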
2 changes: 1 addition & 1 deletion megatron/core/models/gpt/gpt_model.py
@@ -424,7 +424,7 @@ def _preprocess(
# return this extra tensor
# this is for backwards compatibility with
# legacy unit tests, which break if you
- # return a 6 tuple instead of 5.
+ # return a 7 tuple instead of 6.
preproc_output += (rotary_pos_cos_sin,)

return preproc_output
4 changes: 3 additions & 1 deletion megatron/core/transformer/fsdp_dtensor_checkpoint.py
@@ -45,6 +45,7 @@
from megatron.core import parallel_state
from megatron.core.tensor_parallel.layers import copy_tensor_model_parallel_attributes
from megatron.core.transformer.transformer_layer import TransformerLayer
+ from megatron.core.utils import get_model_config


def get_ep_layer_offset(num_experts: int | None = None) -> int:
@@ -196,7 +197,8 @@ def handle_swiglu_in_state_dict(model, model_state_dict, optimizer_state_dict):
assert HAVE_MEGATRON_FSDP, "This function requires Megatron-FSDP to be installed."

# Extract num_experts from model config for expert parameter processing
- num_experts = model.config.num_moe_experts if hasattr(model, 'config') else None
+ model_config = get_model_config(model)
Contributor review comment on the line above:
Would this cause an error if the model does not have a config? Suggest using model_config = get_model_config(model, allow_none=True) here and modifying the API of get_model_config (the default can be False to match the original code logic).

+ num_experts = getattr(model_config, 'num_moe_experts', None)

def intersection(s1, s2):
# Only works for step=1
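To make the reviewer's suggestion concrete, here is a hedged sketch of what an allow_none flag on get_model_config could look like. This is an assumed implementation for illustration only, not the actual megatron.core.utils.get_model_config, which may unwrap model wrappers differently.

# Illustrative sketch of the suggested API change (assumption, not the real
# megatron.core.utils.get_model_config implementation).
def get_model_config(model, allow_none: bool = False):
    """Return the config attached to ``model``.

    With allow_none=True, a model without a config yields None instead of
    raising, matching the original ``hasattr(model, 'config')`` guard at the
    call site above.
    """
    # Assumption for this sketch: wrapper modules expose the wrapped model
    # via a ``.module`` attribute.
    while hasattr(model, "module"):
        model = model.module
    config = getattr(model, "config", None)
    if config is None and not allow_none:
        raise AttributeError("model has no 'config' attribute")
    return config


# The call site in handle_swiglu_in_state_dict would then read:
#   model_config = get_model_config(model, allow_none=True)
#   num_experts = getattr(model_config, 'num_moe_experts', None)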