Skip to content

Commit ae1cd27

Browse files
authored
[model] support GLM-5 (transformers) (modelscope#8066)
1 parent 767fe00 commit ae1cd27

File tree

6 files changed: +26 additions, −11 deletions

docs/source/Instruction/Supported-models-and-datasets.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,7 @@
412412
|[ZhipuAI/GLM-4.7](https://modelscope.cn/models/ZhipuAI/GLM-4.7)|glm4_moe|glm4_7|transformers>=4.54|✔|-|[zai-org/GLM-4.7](https://huggingface.co/zai-org/GLM-4.7)|
413413
|[ZhipuAI/GLM-4.7-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.7-FP8)|glm4_moe|glm4_7|transformers>=4.54|✘|-|[zai-org/GLM-4.7-FP8](https://huggingface.co/zai-org/GLM-4.7-FP8)|
414414
|[ZhipuAI/GLM-4.7-Flash](https://modelscope.cn/models/ZhipuAI/GLM-4.7-Flash)|glm4_moe_lite|glm4_7|transformers>=5.0.0.dev|✔|-|[zai-org/GLM-4.7-Flash](https://huggingface.co/zai-org/GLM-4.7-Flash)|
415+
|[ZhipuAI/GLM-5](https://modelscope.cn/models/ZhipuAI/GLM-5)|glm_moe_dsa|glm4_7|transformers>=5.2.0|✘|-|[zai-org/GLM-5](https://huggingface.co/zai-org/GLM-5)|
415416
|[ZhipuAI/glm-edge-1.5b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-1.5b-chat)|glm_edge|chatglm4|transformers>=4.46|✘|-|[zai-org/glm-edge-1.5b-chat](https://huggingface.co/zai-org/glm-edge-1.5b-chat)|
416417
|[ZhipuAI/glm-edge-4b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-4b-chat)|glm_edge|chatglm4|transformers>=4.46|✘|-|[zai-org/glm-edge-4b-chat](https://huggingface.co/zai-org/glm-edge-4b-chat)|
417418
|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeGeeX2-6B)|codefuse_codegeex2|codefuse|transformers<4.34|✘|coding|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://huggingface.co/codefuse-ai/CodeFuse-CodeGeeX2-6B)|

docs/source_en/Instruction/Supported-models-and-datasets.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,7 @@ The table below introduces the models integrated with ms-swift:
413413
|[ZhipuAI/GLM-4.7](https://modelscope.cn/models/ZhipuAI/GLM-4.7)|glm4_moe|glm4_7|transformers>=4.54|&#x2714;|-|[zai-org/GLM-4.7](https://huggingface.co/zai-org/GLM-4.7)|
414414
|[ZhipuAI/GLM-4.7-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.7-FP8)|glm4_moe|glm4_7|transformers>=4.54|&#x2718;|-|[zai-org/GLM-4.7-FP8](https://huggingface.co/zai-org/GLM-4.7-FP8)|
415415
|[ZhipuAI/GLM-4.7-Flash](https://modelscope.cn/models/ZhipuAI/GLM-4.7-Flash)|glm4_moe_lite|glm4_7|transformers>=5.0.0.dev|&#x2714;|-|[zai-org/GLM-4.7-Flash](https://huggingface.co/zai-org/GLM-4.7-Flash)|
416+
|[ZhipuAI/GLM-5](https://modelscope.cn/models/ZhipuAI/GLM-5)|glm_moe_dsa|glm4_7|transformers>=5.2.0|&#x2718;|-|[zai-org/GLM-5](https://huggingface.co/zai-org/GLM-5)|
416417
|[ZhipuAI/glm-edge-1.5b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-1.5b-chat)|glm_edge|chatglm4|transformers>=4.46|&#x2718;|-|[zai-org/glm-edge-1.5b-chat](https://huggingface.co/zai-org/glm-edge-1.5b-chat)|
417418
|[ZhipuAI/glm-edge-4b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-4b-chat)|glm_edge|chatglm4|transformers>=4.46|&#x2718;|-|[zai-org/glm-edge-4b-chat](https://huggingface.co/zai-org/glm-edge-4b-chat)|
418419
|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeGeeX2-6B)|codefuse_codegeex2|codefuse|transformers<4.34|&#x2718;|coding|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://huggingface.co/codefuse-ai/CodeFuse-CodeGeeX2-6B)|

swift/megatron/trainers/base.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -80,9 +80,12 @@ def __init__(self, args, template: Template):
8080

8181
self.mcore_013 = version.parse(megatron.core.__version__) >= version.parse('0.13.0rc0')
8282
self.callbacks = []
83-
for callback in self.args.callbacks:
83+
for callback in args.callbacks:
8484
self.callbacks.append(megatron_callbacks_map[callback](self))
8585

86+
if args.async_save and args.use_persistent_ckpt_worker:
87+
init_persistent_async_worker()
88+
8689
def _load_checkpoint(self):
8790
args = self.args
8891
if not args.finetune:
@@ -482,28 +485,28 @@ def _prepare_data_iterator(self, train_dataset, val_dataset=None, use_origin_cyc
482485

483486
def train(self, train_dataset, val_dataset):
484487
args = self.args
488+
config = self.config
489+
state = self.state
485490
for m in self.wrapped_models:
486491
m.train()
487492

488493
if args.is_multimodal:
489494
for m in self.unwrapped_models:
490495
self._prepare_vit_gradient_checkpointing(m)
491496

492-
self.config.finalize_model_grads_func = finalize_model_grads
493-
if args.async_save and args.use_persistent_ckpt_worker:
494-
init_persistent_async_worker()
497+
config.grad_scale_func = self.optimizer.scale_loss
498+
config.finalize_model_grads_func = finalize_model_grads
495499

496500
self.call_event('on_train_begin')
497501
train_metrics = {}
498-
if self.args.virtual_pipeline_model_parallel_size is not None:
502+
if args.virtual_pipeline_model_parallel_size is not None:
499503
train_data_iterator, val_data_iterator = [], []
500-
for _ in range(self.args.virtual_pipeline_model_parallel_size):
504+
for _ in range(args.virtual_pipeline_model_parallel_size):
501505
train_it, val_it = self._prepare_data_iterator(train_dataset, val_dataset)
502506
train_data_iterator.append(train_it)
503507
val_data_iterator.append(train_it)
504508
else:
505509
train_data_iterator, val_data_iterator = self._prepare_data_iterator(train_dataset, val_dataset)
506-
state = self.state
507510
while state.iteration < args.train_iters:
508511
self.call_event('on_step_begin')
509512
metrics, grad_norm = self.train_step(train_data_iterator)

swift/megatron/utils/megatron_lm_utils.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -539,7 +539,7 @@ def wrap_model(args, models, wrap_with_ddp: bool = True):
539539
) for (model_chunk_idx, model_chunk) in enumerate(models)
540540
]
541541

542-
# Broadcast params from data parallel src rank to other data parallel ranks.
542+
# Broadcast params from data parallel src rank to other data parallel ranks.
543543
if args.data_parallel_random_init:
544544
for m in models:
545545
m.broadcast_params()
@@ -588,10 +588,7 @@ def unwrap_model(models, module_instances=None):
588588
except ImportError:
589589
pass
590590
if module_instances is None:
591-
from megatron.core.distributed import DistributedDataParallel as DDP
592591
from megatron.core.distributed import TorchFullyShardedDataParallel as torch_FSDP
593-
from megatron.core.transformer.module import Float16Module
594-
595592
module_instances = (DDP, torch_FSDP, Float16Module)
596593

597594
return_list = True

swift/model/constant.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ class LLMModelType:
3131
glm4 = 'glm4'
3232
glm4_moe = 'glm4_moe'
3333
glm4_moe_lite = 'glm4_moe_lite'
34+
glm_moe_dsa = 'glm_moe_dsa'
3435

3536
glm_edge = 'glm_edge'
3637
codefuse_codegeex2 = 'codefuse_codegeex2'

swift/model/models/glm.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,3 +494,15 @@ def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
494494
architectures=['GlmOcrForConditionalGeneration'],
495495
requires=['transformers>=5.0.1dev0'],
496496
))
497+
498+
register_model(
499+
ModelMeta(
500+
LLMModelType.glm_moe_dsa,
501+
[
502+
ModelGroup([
503+
Model('ZhipuAI/GLM-5', 'zai-org/GLM-5'),
504+
], template=TemplateType.glm4_7),
505+
],
506+
architectures=['GlmMoeDsaForCausalLM'],
507+
requires=['transformers>=5.2.0'],
508+
))

Comments (0)