Commit 34bba0b

Add kimi-k2 (#560)

Authored-by: Gao016gaochang
Co-authored-by: gaochang <gaochang@U-19PX2WQ1-0350.local>

1 parent 2f39bea

File tree

3 files changed: +65 −0 lines

docs/en/get_started/quick_start.md
Lines changed: 1 addition & 0 deletions

@@ -93,6 +93,7 @@ PYTHONPATH=/root/Megatron-LM python tools/convert_hf_to_torch_dist.py \
 ```
 
 For larger models, you can use `torchrun` to launch the conversion script and convert with multiple GPUs or even multiple nodes.
+Note: When converting the kimi-k2 model weights, you need to open config.json in the model path and change "model_type": "kimi_k2" to "model_type": "deepseek_v3".
 
 ### Convert from Megatron Format to Hugging Face Format
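The config.json edit described in the note above can be scripted rather than done by hand. A minimal sketch, assuming the standard Hugging Face layout where config.json sits at the top of the model directory; `patch_model_type` is a hypothetical helper name, not part of the repository:

```python
import json
from pathlib import Path

def patch_model_type(model_path: str) -> None:
    """Rewrite "model_type": "kimi_k2" to "deepseek_v3" in the model's config.json."""
    config_file = Path(model_path) / "config.json"
    config = json.loads(config_file.read_text())
    # Only touch the field when it actually holds the kimi_k2 identifier.
    if config.get("model_type") == "kimi_k2":
        config["model_type"] = "deepseek_v3"
        config_file.write_text(json.dumps(config, indent=2))
```

Run it once against the downloaded checkpoint directory before starting the conversion; it leaves every other field of config.json untouched.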

docs/zh/get_started/quick_start.md
Lines changed: 1 addition & 0 deletions

@@ -93,6 +93,7 @@ PYTHONPATH=/root/Megatron-LM python tools/convert_hf_to_torch_dist.py \
 ```
 
 For larger models, you can use `torchrun` to launch the conversion script and convert with multiple GPUs or even multiple machines.
+Note: When converting the kimi-k2 model weights, you need to open config.json in the model path and change "model_type": "kimi_k2" to "model_type": "deepseek_v3".
 
 ### Convert from Megatron Format to Hugging Face Format

scripts/models/kimi-k2.sh
Lines changed: 63 additions & 0 deletions

NLAYERS=61
FIRST_K_DENSE_REPLACE=1

# Build the MoE layer pattern: the first FIRST_K_DENSE_REPLACE layers
# are dense (0), the remaining layers are MoE (1).
arr=()
for ((i=0; i<NLAYERS; i++)); do
    if (( i < FIRST_K_DENSE_REPLACE )); then
        arr+=(0)
    else
        arr+=(1)
    fi
done

# Render the pattern as a bracketed list for --moe-layer-freq.
printf -v MOE_LAYER_FREQ "[%s]" "$(IFS=', '; echo "${arr[*]}")"

# kimi-k2
MODEL_ARGS=(
    --disable-bias-linear
    --num-layers 61
    --hidden-size 7168
    --ffn-hidden-size 18432
    --num-attention-heads 64
    --kv-channels 64
    --normalization RMSNorm
    --position-embedding-type rope
    --norm-epsilon 1e-6
    --swiglu
    --untie-embeddings-and-output-weights
    --vocab-size 163840

    --multi-latent-attention
    --q-lora-rank 1536
    --kv-lora-rank 512
    --qk-head-dim 128
    --qk-pos-emb-head-dim 64
    --v-head-dim 128
    --qk-layernorm
    --rotary-scaling-factor 32.0
    --rotary-base 50000
    --mscale 1.0
    --mscale-all-dim 1.0
    --attention-softmax-in-fp32
    --no-rope-fusion

    # moe
    --num-experts 384
    --moe-layer-freq $MOE_LAYER_FREQ
    --moe-ffn-hidden-size 2048
    --moe-router-topk 8
    --moe-shared-expert-intermediate-size 2048
    --moe-router-pre-softmax
    --moe-router-score-function sigmoid
    --moe-router-enable-expert-bias
    --moe-router-load-balancing-type seq_aux_loss
    --moe-token-dispatcher-type alltoall
    --moe-aux-loss-coeff 0
    --moe-router-bias-update-rate 0
    --moe-router-group-topk 1
    --moe-router-num-groups 1
    --moe-grouped-gemm
    --moe-router-topk-scaling-factor 2.827
    --moe-router-dtype fp32
    --moe-permute-fusion
)
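The MOE_LAYER_FREQ construction in the script above can be mirrored in Python to see exactly what string the shell loop produces. A small sketch for checking; note that bash's `"${arr[*]}"` joins elements using only the first character of IFS, so with `IFS=', '` the separator is a bare comma:

```python
NLAYERS = 61
FIRST_K_DENSE_REPLACE = 1

# 0 = dense FFN layer, 1 = MoE layer; only the first layer stays dense here.
arr = [0 if i < FIRST_K_DENSE_REPLACE else 1 for i in range(NLAYERS)]

# bash "${arr[*]}" with IFS=', ' joins on "," (first IFS character only)
moe_layer_freq = "[" + ",".join(map(str, arr)) + "]"
```

For NLAYERS=61 and FIRST_K_DENSE_REPLACE=1 this yields a 61-element pattern starting `[0,1,1,…` with 60 MoE layers, matching kimi-k2's single leading dense layer.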
