
Commit 56e7d53

committed
add more v3 config
1 parent d74b57d commit 56e7d53

File tree

1 file changed (+41, -0 lines)

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
bases:
- deepseek_v3_base.yaml

# 393B total params, 20B active params

tokenizer_type: null
extra_tokenizer_type: DeepSeekV3Tokenizer
tokenizer_model: deepseek-ai/DeepSeek-V3

# model
num_layers: 64
hidden_size: 5120
ffn_hidden_size: 13824
num_attention_heads: 40
# mla
q_lora_rank: null
kv_lora_rank: 512
qk_head_dim: 128
qk_pos_emb_head_dim: 0
v_head_dim: 128
kv_channels: 128
# moe
moe_layer_freq: 2
num_experts: 256
moe_router_topk: 8
# num_shared_experts: 1
moe_ffn_hidden_size: 1536
moe_shared_expert_intermediate_size: 1536 # num_shared_experts * moe_ffn_hidden_size

# device limited routing
expert_model_parallel_size: 8
moe_router_num_groups: 8 # int
moe_router_group_topk: 4 # int
moe_aux_loss_coeff: 1.0e-4 # aux_loss_alpha

# node limited routing (ep=32, 4 nodes)
# expert_model_parallel_size: 32
# moe_router_num_groups: 4 # int, node number
# moe_router_group_topk: 2 # int

moe_router_topk_scaling_factor: 2.5 # float
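
The MLA block above pairs full-rank queries (q_lora_rank: null) with a 512-wide compressed KV latent (kv_lora_rank: 512) that is expanded per head into 128-dim keys and values; qk_pos_emb_head_dim: 0 disables the decoupled RoPE dimensions. Below is a minimal PyTorch shape sketch of those projections, with hypothetical layer names (w_q, w_dkv, w_ukv), not the actual training code:

import torch

hidden_size, num_heads = 5120, 40
kv_lora_rank = 512            # compressed KV latent (q_lora_rank: null -> full-rank queries)
qk_head_dim = v_head_dim = 128
# qk_pos_emb_head_dim: 0 -> no extra decoupled-RoPE dims in this sketch

x = torch.randn(2, 16, hidden_size)                       # (batch, seq, hidden)

w_q = torch.nn.Linear(hidden_size, num_heads * qk_head_dim, bias=False)
q = w_q(x).view(2, 16, num_heads, qk_head_dim)

# KV path: down-project to the latent (this is what the KV cache stores),
# then up-project to per-head keys and values.
w_dkv = torch.nn.Linear(hidden_size, kv_lora_rank, bias=False)
w_ukv = torch.nn.Linear(kv_lora_rank, num_heads * (qk_head_dim + v_head_dim), bias=False)
c_kv = w_dkv(x)                                           # (2, 16, 512)
k, v = w_ukv(c_kv).view(2, 16, num_heads, -1).split([qk_head_dim, v_head_dim], dim=-1)

print(q.shape, k.shape, v.shape, c_kv.shape)
# (2, 16, 40, 128) (2, 16, 40, 128) (2, 16, 40, 128) (2, 16, 512)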

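The routing stanza configures DeepSeek-style group-limited routing: the 256 experts are divided into moe_router_num_groups: 8 groups of 32 (one per EP rank at expert_model_parallel_size: 8), each token's moe_router_topk: 8 experts must come from at most moe_router_group_topk: 4 groups, and the normalized gate weights are scaled by moe_router_topk_scaling_factor: 2.5. (Note that 8 routed plus 1 shared expert at width 1536 gives 9 * 1536 = 13824 of activated FFN width, matching the dense ffn_hidden_size.) A hedged sketch of that selection step, assuming DeepSeek-V3's sigmoid gating and top-2-sum group scoring; variable names are illustrative, not Megatron's:

import torch

num_experts, num_groups, group_topk, topk, scale = 256, 8, 4, 8, 2.5
experts_per_group = num_experts // num_groups             # 32 experts per group

logits = torch.randn(4, num_experts)                      # (tokens, experts) router logits
scores = logits.sigmoid()                                 # DeepSeek-V3 uses sigmoid gating

# 1) Score each group by the sum of its top-2 expert scores,
#    then keep only the best `group_topk` groups per token.
grouped = scores.view(-1, num_groups, experts_per_group)
group_scores = grouped.topk(2, dim=-1).values.sum(dim=-1)             # (tokens, groups)
top_groups = group_scores.topk(group_topk, dim=-1).indices
group_mask = torch.zeros_like(group_scores).scatter_(1, top_groups, 1.0)
masked = (grouped * group_mask.unsqueeze(-1)).reshape(-1, num_experts)

# 2) Top-k experts within the surviving groups; renormalize and rescale.
weights, experts = masked.topk(topk, dim=-1)
weights = scale * weights / weights.sum(dim=-1, keepdim=True)         # topk_scaling_factor
print(experts.shape, weights.shape)                       # both (4, 8)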
0 commit comments