Skip to content

Commit fe0cc35

Browse files
authored
[model] Add support for GLM4.7 Flash (#1460)
1 parent b20fe9b commit fe0cc35

File tree

4 files changed

+58
-5
lines changed

4 files changed

+58
-5
lines changed

scripts/models/glm4.7-30B-A3B.sh

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
# Model definition for GLM-4.7 Flash 30B-A3B (1 dense layer + 46 MoE layers,
# 64 routed experts, top-4 routing, MLA attention). Sourced by the training
# launcher; exports MODEL_ARGS for the Megatron command line.

# Shared-expert sizing: total shared-expert intermediate size is the MoE FFN
# hidden size times the number of shared experts.
MOE_SHARED_EXPERTS=1
MOE_FFN_HIDDEN=1536
MOE_SHARED_EXPERT_INTERMEDIATE_SIZE=$((MOE_FFN_HIDDEN * MOE_SHARED_EXPERTS))

# Layer layout: the first layer is dense, the remaining 46 are MoE.
N_DENSE_LAYERS=1
N_MOE_LAYERS=46

MODEL_ARGS=(
    # --- MoE configuration ---
    --moe-layer-freq [0]*$N_DENSE_LAYERS+[1]*$N_MOE_LAYERS
    --num-experts 64
    --moe-shared-expert-intermediate-size $MOE_SHARED_EXPERT_INTERMEDIATE_SIZE
    --moe-router-topk 4
    --moe-grouped-gemm
    --moe-permute-fusion
    --moe-ffn-hidden-size $MOE_FFN_HIDDEN
    # Router: sigmoid scoring with expert bias but a frozen bias update rate
    # and zero aux-loss coefficient (load-balance terms effectively disabled).
    --moe-router-score-function sigmoid
    --moe-router-pre-softmax
    --moe-router-enable-expert-bias
    --moe-router-bias-update-rate 0
    --moe-router-load-balancing-type seq_aux_loss
    --moe-router-topk-scaling-factor 1.8
    --moe-aux-loss-coeff 0
    --moe-router-dtype fp32
    # --- Transformer backbone ---
    --num-layers $((N_DENSE_LAYERS + N_MOE_LAYERS))
    --hidden-size 2048
    --ffn-hidden-size 10240
    --num-attention-heads 20
    --disable-bias-linear
    --add-qkv-bias
    --swiglu
    --untie-embeddings-and-output-weights
    --position-embedding-type rope
    --no-position-embedding
    --normalization RMSNorm
    --qk-layernorm
    # --- Multi-latent attention (MLA) geometry ---
    --multi-latent-attention
    --q-lora-rank 768
    --kv-lora-rank 512
    --qk-head-dim 192
    --v-head-dim 256
    --kv-channels 192
    --qk-pos-emb-head-dim 64
    # --- Vocabulary / rotary embedding ---
    --vocab-size 154880
    --rotary-base 1000000
    --enable-experimental
)

slime/backends/megatron_utils/megatron_to_hf/__init__.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,9 @@ def convert_to_hf(args, model_name, name, param, quantization_config=None):
3131

3232
# TODO optimize code details
3333
def _convert_to_hf_core(args, model_name, name, param):
34-
if "glm4moe" in model_name:
34+
if "glm4moelite" in model_name or "deepseekv3" in model_name:
35+
converted_named_tensors = convert_deepseekv3_to_hf(args, name, param)
36+
elif "glm4moe" in model_name:
3537
converted_named_tensors = convert_glm4moe_to_hf(args, name, param)
3638
elif "glm4" in model_name:
3739
converted_named_tensors = convert_glm4_to_hf(args, name, param)
@@ -41,9 +43,6 @@ def _convert_to_hf_core(args, model_name, name, param):
4143
converted_named_tensors = convert_qwen3_next_to_hf(args, name, param)
4244
elif "qwen2" in model_name or "qwen3" in model_name:
4345
converted_named_tensors = convert_qwen2_to_hf(args, name, param)
44-
elif "deepseekv3" in model_name:
45-
converted_named_tensors = convert_deepseekv3_to_hf(args, name, param)
46-
4746
elif "llama" in model_name:
4847
converted_named_tensors = convert_llama_to_hf(args, name, param)
4948
elif "mimo" in model_name:

slime_plugins/mbridge/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from .glm4 import GLM4Bridge
22
from .glm4moe import GLM4MoEBridge
3+
from .glm4moe_lite import GLM4MoELiteBridge
34
from .mimo import MimoBridge
45
from .qwen3_next import Qwen3NextBridge
56

6-
__all__ = ["GLM4Bridge", "GLM4MoEBridge", "Qwen3NextBridge", "MimoBridge"]
7+
__all__ = ["GLM4Bridge", "GLM4MoEBridge", "GLM4MoELiteBridge", "Qwen3NextBridge", "MimoBridge"]
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from mbridge.core import register_model
2+
from mbridge.models import DeepseekV3Bridge
3+
4+
5+
@register_model("glm4_moe_lite")
6+
class GLM4MoELiteBridge(DeepseekV3Bridge):
7+
pass

0 commit comments

Comments
 (0)