Skip to content

Commit ae1cd27

Browse files
authored
[model] support GLM-5 (transformers) (modelscope#8066)
1 parent 767fe00 commit ae1cd27

File tree

6 files changed: +26 additions, −11 deletions

docs/source/Instruction/Supported-models-and-datasets.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,7 @@
412412
|[ZhipuAI/GLM-4.7](https://modelscope.cn/models/ZhipuAI/GLM-4.7)|glm4_moe|glm4_7|transformers>=4.54|✔|-|[zai-org/GLM-4.7](https://huggingface.co/zai-org/GLM-4.7)|
413413
|[ZhipuAI/GLM-4.7-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.7-FP8)|glm4_moe|glm4_7|transformers>=4.54|✘|-|[zai-org/GLM-4.7-FP8](https://huggingface.co/zai-org/GLM-4.7-FP8)|
414414
|[ZhipuAI/GLM-4.7-Flash](https://modelscope.cn/models/ZhipuAI/GLM-4.7-Flash)|glm4_moe_lite|glm4_7|transformers>=5.0.0.dev|✔|-|[zai-org/GLM-4.7-Flash](https://huggingface.co/zai-org/GLM-4.7-Flash)|
415+
|[ZhipuAI/GLM-5](https://modelscope.cn/models/ZhipuAI/GLM-5)|glm_moe_dsa|glm4_7|transformers>=5.2.0|✘|-|[zai-org/GLM-5](https://huggingface.co/zai-org/GLM-5)|
415416
|[ZhipuAI/glm-edge-1.5b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-1.5b-chat)|glm_edge|chatglm4|transformers>=4.46|✘|-|[zai-org/glm-edge-1.5b-chat](https://huggingface.co/zai-org/glm-edge-1.5b-chat)|
416417
|[ZhipuAI/glm-edge-4b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-4b-chat)|glm_edge|chatglm4|transformers>=4.46|✘|-|[zai-org/glm-edge-4b-chat](https://huggingface.co/zai-org/glm-edge-4b-chat)|
417418
|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeGeeX2-6B)|codefuse_codegeex2|codefuse|transformers<4.34|✘|coding|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://huggingface.co/codefuse-ai/CodeFuse-CodeGeeX2-6B)|

docs/source_en/Instruction/Supported-models-and-datasets.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,7 @@ The table below introduces the models integrated with ms-swift:
413413
|[ZhipuAI/GLM-4.7](https://modelscope.cn/models/ZhipuAI/GLM-4.7)|glm4_moe|glm4_7|transformers>=4.54|&#x2714;|-|[zai-org/GLM-4.7](https://huggingface.co/zai-org/GLM-4.7)|
414414
|[ZhipuAI/GLM-4.7-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.7-FP8)|glm4_moe|glm4_7|transformers>=4.54|&#x2718;|-|[zai-org/GLM-4.7-FP8](https://huggingface.co/zai-org/GLM-4.7-FP8)|
415415
|[ZhipuAI/GLM-4.7-Flash](https://modelscope.cn/models/ZhipuAI/GLM-4.7-Flash)|glm4_moe_lite|glm4_7|transformers>=5.0.0.dev|&#x2714;|-|[zai-org/GLM-4.7-Flash](https://huggingface.co/zai-org/GLM-4.7-Flash)|
416+
|[ZhipuAI/GLM-5](https://modelscope.cn/models/ZhipuAI/GLM-5)|glm_moe_dsa|glm4_7|transformers>=5.2.0|&#x2718;|-|[zai-org/GLM-5](https://huggingface.co/zai-org/GLM-5)|
416417
|[ZhipuAI/glm-edge-1.5b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-1.5b-chat)|glm_edge|chatglm4|transformers>=4.46|&#x2718;|-|[zai-org/glm-edge-1.5b-chat](https://huggingface.co/zai-org/glm-edge-1.5b-chat)|
417418
|[ZhipuAI/glm-edge-4b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-4b-chat)|glm_edge|chatglm4|transformers>=4.46|&#x2718;|-|[zai-org/glm-edge-4b-chat](https://huggingface.co/zai-org/glm-edge-4b-chat)|
418419
|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeGeeX2-6B)|codefuse_codegeex2|codefuse|transformers<4.34|&#x2718;|coding|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://huggingface.co/codefuse-ai/CodeFuse-CodeGeeX2-6B)|

swift/megatron/trainers/base.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -80,9 +80,12 @@ def __init__(self, args, template: Template):
8080

8181
self.mcore_013 = version.parse(megatron.core.__version__) >= version.parse('0.13.0rc0')
8282
self.callbacks = []
83-
for callback in self.args.callbacks:
83+
for callback in args.callbacks:
8484
self.callbacks.append(megatron_callbacks_map[callback](self))
8585

86+
if args.async_save and args.use_persistent_ckpt_worker:
87+
init_persistent_async_worker()
88+
8689
def _load_checkpoint(self):
8790
args = self.args
8891
if not args.finetune:
@@ -482,28 +485,28 @@ def _prepare_data_iterator(self, train_dataset, val_dataset=None, use_origin_cyc
482485

483486
def train(self, train_dataset, val_dataset):
484487
args = self.args
488+
config = self.config
489+
state = self.state
485490
for m in self.wrapped_models:
486491
m.train()
487492

488493
if args.is_multimodal:
489494
for m in self.unwrapped_models:
490495
self._prepare_vit_gradient_checkpointing(m)
491496

492-
self.config.finalize_model_grads_func = finalize_model_grads
493-
if args.async_save and args.use_persistent_ckpt_worker:
494-
init_persistent_async_worker()
497+
config.grad_scale_func = self.optimizer.scale_loss
498+
config.finalize_model_grads_func = finalize_model_grads
495499

496500
self.call_event('on_train_begin')
497501
train_metrics = {}
498-
if self.args.virtual_pipeline_model_parallel_size is not None:
502+
if args.virtual_pipeline_model_parallel_size is not None:
499503
train_data_iterator, val_data_iterator = [], []
500-
for _ in range(self.args.virtual_pipeline_model_parallel_size):
504+
for _ in range(args.virtual_pipeline_model_parallel_size):
501505
train_it, val_it = self._prepare_data_iterator(train_dataset, val_dataset)
502506
train_data_iterator.append(train_it)
503507
val_data_iterator.append(train_it)
504508
else:
505509
train_data_iterator, val_data_iterator = self._prepare_data_iterator(train_dataset, val_dataset)
506-
state = self.state
507510
while state.iteration < args.train_iters:
508511
self.call_event('on_step_begin')
509512
metrics, grad_norm = self.train_step(train_data_iterator)

swift/megatron/utils/megatron_lm_utils.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -539,7 +539,7 @@ def wrap_model(args, models, wrap_with_ddp: bool = True):
539539
) for (model_chunk_idx, model_chunk) in enumerate(models)
540540
]
541541

542-
# Broadcast params from data parallel src rank to other data parallel ranks.
542+
# Broadcast params from data parallel src rank to other data parallel ranks.
543543
if args.data_parallel_random_init:
544544
for m in models:
545545
m.broadcast_params()
@@ -588,10 +588,7 @@ def unwrap_model(models, module_instances=None):
588588
except ImportError:
589589
pass
590590
if module_instances is None:
591-
from megatron.core.distributed import DistributedDataParallel as DDP
592591
from megatron.core.distributed import TorchFullyShardedDataParallel as torch_FSDP
593-
from megatron.core.transformer.module import Float16Module
594-
595592
module_instances = (DDP, torch_FSDP, Float16Module)
596593

597594
return_list = True

swift/model/constant.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ class LLMModelType:
3131
glm4 = 'glm4'
3232
glm4_moe = 'glm4_moe'
3333
glm4_moe_lite = 'glm4_moe_lite'
34+
glm_moe_dsa = 'glm_moe_dsa'
3435

3536
glm_edge = 'glm_edge'
3637
codefuse_codegeex2 = 'codefuse_codegeex2'

swift/model/models/glm.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,3 +494,15 @@ def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
494494
architectures=['GlmOcrForConditionalGeneration'],
495495
requires=['transformers>=5.0.1dev0'],
496496
))
497+
498+
register_model(
499+
ModelMeta(
500+
LLMModelType.glm_moe_dsa,
501+
[
502+
ModelGroup([
503+
Model('ZhipuAI/GLM-5', 'zai-org/GLM-5'),
504+
], template=TemplateType.glm4_7),
505+
],
506+
architectures=['GlmMoeDsaForCausalLM'],
507+
requires=['transformers>=5.2.0'],
508+
))

Comments (0)