File tree Expand file tree Collapse file tree 1 file changed +41
-0
lines changed
primus/configs/models/megatron Expand file tree Collapse file tree 1 file changed +41
-0
# Model configuration layered on top of the file(s) listed under `bases`.
# NOTE(review): presumably keys set here override the base file's values;
# confirm against the config loader.
bases:
  - deepseek_v3_base.yaml

# 393B total params, 20B active params

tokenizer_type: null
extra_tokenizer_type: DeepSeekV3Tokenizer
tokenizer_model: deepseek-ai/DeepSeek-V3

# model
num_layers: 64
hidden_size: 5120
ffn_hidden_size: 13824
num_attention_heads: 40
# mla
q_lora_rank: null
kv_lora_rank: 512
qk_head_dim: 128
qk_pos_emb_head_dim: 0
v_head_dim: 128
kv_channels: 128
# moe
moe_layer_freq: 2
num_experts: 256
moe_router_topk: 8
# num_shared_experts: 1
moe_ffn_hidden_size: 1536
moe_shared_expert_intermediate_size: 1536  # num_shared_experts * moe_ffn_hidden_size

# device limited routing
expert_model_parallel_size: 8
moe_router_num_groups: 8  # int
moe_router_group_topk: 4  # int
moe_aux_loss_coeff: 1.0e-4  # aux_loss_alpha

# node limited routing (ep=32, 4 nodes)
# Alternative to the device-limited settings above; kept commented out.
# expert_model_parallel_size: 32
# moe_router_num_groups: 4  # int, node number
# moe_router_group_topk: 2  # int

moe_router_topk_scaling_factor: 2.5  # float
You can’t perform that action at this time.
0 commit comments