Skip to content

Commit 9a900bf

Browse files
authored
[megatron] support megatron all-router multimodal (modelscope#7951)
1 parent bed4d10 commit 9a900bf

File tree

4 files changed

+41
-22
lines changed

4 files changed

+41
-22
lines changed

swift/megatron/trainers/base.py

Lines changed: 24 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1118,19 +1118,28 @@ def unmerge_lora_adapters(self):
11181118
module.unmerge()
11191119

11201120
@staticmethod
1121-
def _copy_args(output_dir):
1122-
if is_last_rank():
1123-
args_path = os.path.join(os.path.dirname(output_dir), 'args.json')
1124-
if os.path.exists(args_path):
1125-
shutil.copy(args_path, os.path.join(output_dir, 'args.json'))
1121+
def copy_path(src_path: str, tgt_path: str):
1122+
if not is_last_rank():
1123+
return
1124+
if not os.path.exists(src_path):
1125+
raise FileNotFoundError(f'Source path does not exist: {src_path}')
1126+
1127+
if os.path.isfile(src_path):
1128+
os.makedirs(os.path.dirname(tgt_path), exist_ok=True)
1129+
shutil.copy(src_path, tgt_path)
1130+
elif os.path.isdir(src_path):
1131+
shutil.copytree(src_path, tgt_path, dirs_exist_ok=True)
1132+
else:
1133+
raise ValueError(f'Source path is neither a file nor a directory: {src_path}')
11261134

11271135
def save_checkpoint(self, iteration, model, *_args, **kwargs):
11281136
args = get_args()
11291137
output_dir = os.path.join(args.save, f'checkpoint-{iteration}')
11301138
os.makedirs(output_dir, exist_ok=True)
11311139
origin_save = args.save
11321140
args.save = output_dir
1133-
self._copy_args(output_dir)
1141+
args_path = os.path.join(os.path.dirname(output_dir), 'args.json')
1142+
self.copy_path(args_path, os.path.join(output_dir, 'args.json'))
11341143
save_peft_format = args.tuner_type == 'lora' and not args.merge_lora
11351144
if args.save_safetensors and args.no_save_optim:
11361145
model = []
@@ -1142,9 +1151,17 @@ def save_checkpoint(self, iteration, model, *_args, **kwargs):
11421151
# merge-lora does not store lora, lora saving may report an error (Qwen3-VL-Moe)
11431152
if args.tuner_type == 'lora' and args.merge_lora:
11441153
self.merge_lora_adapters()
1154+
origin_output_dir = output_dir
11451155
output_dir = f'{output_dir}-merged'
11461156
os.makedirs(output_dir, exist_ok=True)
1147-
self._copy_args(output_dir)
1157+
for fname in ['latest_checkpointed_iteration.txt', 'args.json']:
1158+
src_path = os.path.join(origin_output_dir, fname)
1159+
self.copy_path(src_path, os.path.join(output_dir, fname))
1160+
# common.pt
1161+
common_path = os.path.join(origin_output_dir, f'iter_{iteration:07d}', 'common.pt')
1162+
tgt_common_path = os.path.join(output_dir, f'iter_{iteration:07d}', 'common.pt')
1163+
os.makedirs(os.path.dirname(tgt_common_path), exist_ok=True)
1164+
self.copy_path(common_path, tgt_common_path)
11481165
self.bridge.save_weights(
11491166
self.unwrapped_models,
11501167
output_dir,

swift/megatron/tuners/lora.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -202,7 +202,10 @@ def update_layer(self, adapter_name, r, *, lora_alpha, lora_dropout, init_lora_w
202202
lora_b.parallel_mode = self.base_layer.parallel_mode # fix moe_shared_expert_overlap
203203
for lora in [lora_a, lora_b]:
204204
if getattr(lora, 'parallel_mode', None) is None and hasattr(lora, 'weight'): # TODO: experts
205-
sequence_parallel = True if isinstance(self.base_layer, TopKRouter) else self.sequence_parallel
205+
if isinstance(self.base_layer, TopKRouter):
206+
sequence_parallel = self.base_layer.weight.sequence_parallel
207+
else:
208+
sequence_parallel = self.sequence_parallel
206209
lora.weight.sequence_parallel = sequence_parallel
207210
self.lora_A[adapter_name] = lora_a
208211
self.lora_B[adapter_name] = lora_b

swift/megatron/utils/utils.py

Lines changed: 13 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -28,10 +28,10 @@
2828
logger = get_logger()
2929

3030

31-
def find_all_linears(model):
31+
def find_all_linears(model, extra_layers=None):
3232

3333
def _cond(name, module):
34-
if name != 'output_layer' and isinstance(
34+
if (extra_layers and isinstance(module, tuple(extra_layers))) or name != 'output_layer' and isinstance(
3535
module, (TELinear, TELayerNormColumnParallelLinear, TEGroupedLinear, nn.Linear)):
3636
return True
3737
return False
@@ -54,6 +54,8 @@ def get_multimodal_target_regex(
5454
freeze_llm: bool = False,
5555
freeze_vit: bool = True,
5656
freeze_aligner: bool = True,
57+
include_embedding: bool = False,
58+
include_router: bool = False,
5759
) -> str:
5860
from ..model import get_megatron_model_meta
5961
megatron_model_meta = get_megatron_model_meta(args.hf_model_type)
@@ -68,6 +70,11 @@ def get_multimodal_target_regex(
6870
if not freeze_aligner:
6971
modules += aligner
7072
assert len(modules) > 0, f'modules: {modules}'
73+
extra_layers = []
74+
if include_embedding:
75+
extra_layers.append(LanguageModelEmbedding)
76+
if include_router:
77+
extra_layers.append(TopKRouter)
7178

7279
res = []
7380
for module in modules:
@@ -80,13 +87,13 @@ def get_multimodal_target_regex(
8087
sub_module = deep_getattr(model, module)
8188
if sub_module is None:
8289
continue
83-
target_modules = find_all_linears(sub_module)
90+
target_modules = find_all_linears(sub_module, extra_layers)
8491
if not target_modules:
8592
continue
8693
target_modules = [tm for tm in target_modules if tm]
8794
target_pattern = rf'.*\.({"|".join(target_modules)})' if target_modules else ''
8895
rejected_pattern = rf'(?!({"|".join(rejected_modules)}))' if rejected_modules else ''
89-
res.append(rf'{rejected_pattern}{module}{target_pattern}')
96+
res.append(rf'{rejected_pattern}{module}(?=\.){target_pattern}')
9097

9198
return rf'^({"|".join(res)})$'
9299

@@ -103,6 +110,8 @@ def get_target_modules(args, model):
103110
freeze_llm=args.freeze_llm,
104111
freeze_vit=args.freeze_vit,
105112
freeze_aligner=args.freeze_aligner,
113+
include_embedding='all-embedding' in target_modules,
114+
include_router='all-router' in target_modules,
106115
)
107116
else:
108117
target_modules.remove('all-linear')

swift/tuners/peft.py

Lines changed: 0 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -86,19 +86,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, subfolder: Optional
8686

8787

8888
def _create_and_replace_hook(self, peft_config, adapter_name, target, *args, **kwargs):
89-
all_supported_names = ('linear', )
90-
all_supported_types = (torch.nn.Embedding, torch.nn.Conv2d, transformers.pytorch_utils.Conv1D, lora.Linear)
91-
target_modules = getattr(peft_config, 'target_modules', None)
92-
target_parameters = getattr(peft_config, 'target_parameters', None)
9389
if target is None:
9490
return
9591

96-
if isinstance(target_modules, str) and not any(
97-
[name in target.__class__.__name__.lower()
98-
for name in all_supported_names]) and not any([isinstance(target, type_)
99-
for type_ in all_supported_types]) and not target_parameters:
100-
return
101-
10292
if target.__class__.__name__ == 'NonDynamicallyQuantizableLinear':
10393
return
10494

0 commit comments

Comments (0)