Commit 1995096

fix

1 parent 9adad4d commit 1995096
2 files changed: +4 -1 lines changed


swift/megatron/arguments/megatron_args.py

Lines changed: 2 additions & 1 deletion
@@ -700,6 +700,7 @@ def _init_mixed_precision(self):
         if self.apply_query_key_layer_scaling:
             os.environ['NVTE_APPLY_QK_LAYER_SCALING'] = '1'
 
+    @staticmethod
     def _init_moe(self):
         if self.moe_router_dtype.lower() == 'none':
             self.moe_router_dtype = None
@@ -774,7 +775,7 @@ def __post_init__(self):
             logger.info('Setting args.tuner_type: lora')
         if self.adapters:
             self._load_adapter_config()
-        self._init_moe()
+        MegatronArguments._init_moe(self)
         self._init_mixed_precision()
 
         self.megatron_extra_kwargs = json_parse_to_dict(self.megatron_extra_kwargs)
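The signature is the point of the fix: as a @staticmethod with an explicit self parameter, _init_moe can be applied to any object that carries the expected attributes, not only a fully constructed MegatronArguments instance whose __post_init__ has run. A minimal sketch of the pattern, using an illustrative Args class in place of MegatronArguments:

from dataclasses import dataclass
from types import SimpleNamespace


@dataclass
class Args:
    moe_router_dtype: str = 'none'

    @staticmethod
    def _init_moe(self):
        # `self` is an ordinary parameter here, so any attribute-bearing
        # object can be passed in, not just an Args instance.
        if self.moe_router_dtype.lower() == 'none':
            self.moe_router_dtype = None


args = Args()
Args._init_moe(args)  # the normal __post_init__-style call

teacher = SimpleNamespace(moe_router_dtype='none')  # built via setattr, no __post_init__
Args._init_moe(teacher)
print(args.moe_router_dtype, teacher.moe_router_dtype)  # -> None None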

swift/megatron/trainers/gkd_trainer.py

Lines changed: 2 additions & 0 deletions
@@ -184,6 +184,8 @@ def _load_teacher_model(self, teacher_model_path: str, model_type: str):
         # which is required by gpt_bridge weight loading logic.
         if megatron_args.moe_grouped_gemm is None:
             megatron_args.moe_grouped_gemm = True
+        # Apply MoE initialization (bypassed when loading teacher via setattr)
+        MegatronArguments._init_moe(megatron_args)
 
         # Restore original EP settings if student is Dense.
         # This allows MoE teacher to use EP > 1 even when student is Dense.
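For context on the comment above: the teacher's args object is assembled field-by-field with setattr, so the dataclass __post_init__ (which normally invokes _init_moe) never runs, and the MoE normalization has to be applied explicitly. A hedged sketch of that situation; build_teacher_args and its body are hypothetical, not the repo's actual loader, the import path is assumed from the file layout, and the real _init_moe may require more fields than shown:

from types import SimpleNamespace

from swift.megatron.arguments.megatron_args import MegatronArguments  # assumed path


def build_teacher_args(overrides: dict):
    # Hypothetical stand-in for the teacher-loading path: fields are copied
    # onto a bare object via setattr, bypassing dataclass construction and
    # therefore __post_init__.
    megatron_args = SimpleNamespace(moe_router_dtype='none')
    for key, value in overrides.items():
        setattr(megatron_args, key, value)
    # Because __post_init__ never ran, apply the MoE normalization by hand:
    MegatronArguments._init_moe(megatron_args)
    return megatron_args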
