Commit 1995096

fix

1 parent 9adad4d commit 1995096
2 files changed: +4 -1 lines changed


swift/megatron/arguments/megatron_args.py

Lines changed: 2 additions & 1 deletion
@@ -700,6 +700,7 @@ def _init_mixed_precision(self):
         if self.apply_query_key_layer_scaling:
             os.environ['NVTE_APPLY_QK_LAYER_SCALING'] = '1'
 
+    @staticmethod
     def _init_moe(self):
         if self.moe_router_dtype.lower() == 'none':
             self.moe_router_dtype = None
@@ -774,7 +775,7 @@ def __post_init__(self):
             logger.info('Setting args.tuner_type: lora')
         if self.adapters:
             self._load_adapter_config()
-        self._init_moe()
+        MegatronArguments._init_moe(self)
         self._init_mixed_precision()
 
         self.megatron_extra_kwargs = json_parse_to_dict(self.megatron_extra_kwargs)
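The signature is the point of the fix: as a @staticmethod with an explicit self parameter, _init_moe can be applied to any object that carries the expected attributes, not only a fully constructed MegatronArguments instance whose __post_init__ has run. A minimal sketch of the pattern, using an illustrative Args class in place of MegatronArguments:

from dataclasses import dataclass
from types import SimpleNamespace


@dataclass
class Args:
    moe_router_dtype: str = 'none'

    @staticmethod
    def _init_moe(self):
        # `self` is an ordinary parameter here, so any attribute-bearing
        # object can be passed in, not just an Args instance.
        if self.moe_router_dtype.lower() == 'none':
            self.moe_router_dtype = None


args = Args()
Args._init_moe(args)  # the normal __post_init__-style call

teacher = SimpleNamespace(moe_router_dtype='none')  # built via setattr, no __post_init__
Args._init_moe(teacher)
print(args.moe_router_dtype, teacher.moe_router_dtype)  # -> None None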

swift/megatron/trainers/gkd_trainer.py

Lines changed: 2 additions & 0 deletions
@@ -184,6 +184,8 @@ def _load_teacher_model(self, teacher_model_path: str, model_type: str):
         # which is required by gpt_bridge weight loading logic.
         if megatron_args.moe_grouped_gemm is None:
             megatron_args.moe_grouped_gemm = True
+        # Apply MoE initialization (bypassed when loading teacher via setattr)
+        MegatronArguments._init_moe(megatron_args)
 
         # Restore original EP settings if student is Dense.
         # This allows MoE teacher to use EP > 1 even when student is Dense.
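For context on the comment above: the teacher's args object is assembled field-by-field with setattr, so the dataclass __post_init__ (which normally invokes _init_moe) never runs, and the MoE normalization has to be applied explicitly. A hedged sketch of that situation; build_teacher_args and its body are hypothetical, not the repo's actual loader, the import path is assumed from the file layout, and the real _init_moe may require more fields than shown:

from types import SimpleNamespace

from swift.megatron.arguments.megatron_args import MegatronArguments  # assumed path


def build_teacher_args(overrides: dict):
    # Hypothetical stand-in for the teacher-loading path: fields are copied
    # onto a bare object via setattr, bypassing dataclass construction and
    # therefore __post_init__.
    megatron_args = SimpleNamespace(moe_router_dtype='none')
    for key, value in overrides.items():
        setattr(megatron_args, key, value)
    # Because __post_init__ never ran, apply the MoE normalization by hand:
    MegatronArguments._init_moe(megatron_args)
    return megatron_args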
