@@ -1660,6 +1660,20 @@ def __init__(
self.optimizer_named_parameters = self._init_optimizer_named_parameters()

self._log_parameter_groups()

# Sync parameter requires_grad with its parameter group.
self._sync_parameter_requires_grad_with_group()

def _sync_parameter_requires_grad_with_group(self):
"""Sync parameter requires_grad attribute to match its parameter group.

Ensures parameter requires_grad attribute follows the parameter group setting,
which is the authoritative source after FSDP initialization.
"""
for group in self.parameter_groups:
group_requires_grad = group.requires_grad
for param in group.params:
param.requires_grad = group_requires_grad

def get_mem_alloc_context(self, groups=None, symmetric=True):
"""
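For readers outside the Megatron codebase, the sketch below illustrates the pattern this hunk adds: the group-level requires_grad flag is treated as the source of truth and pushed down onto each parameter tensor. The ParamGroup dataclass and sync_requires_grad helper are illustrative stand-ins, not Megatron's actual optimizer classes.

# Minimal, self-contained sketch of the sync pattern (hypothetical names, not
# Megatron's real ParamGroup / optimizer types).
from dataclasses import dataclass, field
from typing import List

import torch


@dataclass
class ParamGroup:
    # Group-level flag: the authoritative requires_grad setting for all
    # parameters assigned to this group.
    requires_grad: bool
    params: List[torch.nn.Parameter] = field(default_factory=list)


def sync_requires_grad(groups: List[ParamGroup]) -> None:
    # Push the group-level flag onto every parameter, overriding whatever
    # per-tensor flag a prior (re)initialization may have left behind.
    for group in groups:
        for param in group.params:
            param.requires_grad = group.requires_grad


frozen = ParamGroup(requires_grad=False, params=[torch.nn.Parameter(torch.zeros(4))])
trainable = ParamGroup(requires_grad=True, params=[torch.nn.Parameter(torch.ones(4))])
sync_requires_grad([frozen, trainable])
assert frozen.params[0].requires_grad is False
assert trainable.params[0].requires_grad is True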
2 changes: 1 addition & 1 deletion megatron/core/models/gpt/gpt_model.py
@@ -424,7 +424,7 @@ def _preprocess(
# return this extra tensor
# this is for backwards compatibility with
# legacy unit tests, which break if you
- # return a 6 tuple instead of 5.
+ # return a 7 tuple instead of 6.
preproc_output += (rotary_pos_cos_sin,)

return preproc_output
4 changes: 3 additions & 1 deletion megatron/core/transformer/fsdp_dtensor_checkpoint.py
@@ -45,6 +45,7 @@
from megatron.core import parallel_state
from megatron.core.tensor_parallel.layers import copy_tensor_model_parallel_attributes
from megatron.core.transformer.transformer_layer import TransformerLayer
+ from megatron.core.utils import get_model_config


def get_ep_layer_offset(num_experts: int | None = None) -> int:
@@ -196,7 +197,8 @@ def handle_swiglu_in_state_dict(model, model_state_dict, optimizer_state_dict):
assert HAVE_MEGATRON_FSDP, "This function requires Megatron-FSDP to be installed."

# Extract num_experts from model config for expert parameter processing
- num_experts = model.config.num_moe_experts if hasattr(model, 'config') else None
+ model_config = get_model_config(model)
Contributor review comment on the line above:
Would this cause an error if the model does not have a config? Suggest using model_config = get_model_config(model, allow_none=True) here and modifying the API of get_model_config (the default can be False to match the original code logic).

+ num_experts = getattr(model_config, 'num_moe_experts', None)

def intersection(s1, s2):
# Only works for step=1
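To make the reviewer's suggestion concrete, here is a hedged sketch of what an allow_none flag on get_model_config could look like. This is an assumed implementation for illustration only, not the actual megatron.core.utils.get_model_config, which may unwrap model wrappers differently.

# Illustrative sketch of the suggested API change (assumption, not the real
# megatron.core.utils.get_model_config implementation).
def get_model_config(model, allow_none: bool = False):
    """Return the config attached to ``model``.

    With allow_none=True, a model without a config yields None instead of
    raising, matching the original ``hasattr(model, 'config')`` guard at the
    call site above.
    """
    # Assumption for this sketch: wrapper modules expose the wrapped model
    # via a ``.module`` attribute.
    while hasattr(model, "module"):
        model = model.module
    config = getattr(model, "config", None)
    if config is None and not allow_none:
        raise AttributeError("model has no 'config' attribute")
    return config


# The call site in handle_swiglu_in_state_dict would then read:
#   model_config = get_model_config(model, allow_none=True)
#   num_experts = getattr(model_config, 'num_moe_experts', None)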