Skip to content

Commit dbb8f97

Browse files
authored
support mtp; update readme (#11)
1 parent f06200c commit dbb8f97

File tree

8 files changed

+88
-11
lines changed

8 files changed

+88
-11
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,5 @@ logs
99
local/
1010
.gitmodules
1111
output
12-
experiment
12+
experiment
13+
data

README.md

Lines changed: 54 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,57 @@
11
# Primus
22

3-
```shell
4-
# install pre-commit
5-
pip install pre-commit
6-
# the first time you download the repo, it will be cached for future use
7-
cd path_to_primus && pre-commit install
3+
## Overview
4+
Primus is a training framework that supports different training and inference backends. It is designed for pre-training, post-training, and reinforcement-learning tasks.
5+
6+
## Setup Environment
7+
Use the following command to create a container:
8+
```bash
9+
# pull the public docker image
10+
docker pull rocm/megatron-lm:latest
11+
12+
# create a container
13+
docker run -d \
14+
--name=dev_username \
15+
  --network=host \
16+
--ipc=host \
17+
--device /dev/dri \
18+
--device /dev/kfd \
19+
--group-add video \
20+
--cap-add=SYS_PTRACE \
21+
--security-opt seccomp=unconfined \
22+
--shm-size=64G \
23+
rocm/megatron-lm:latest sleep infinity
24+
25+
# get into the container
26+
docker exec -it dev_username bash
827
```
28+
29+
30+
Use the following command to clone the repo:
31+
- [ ] Set Megatron-LM as a submodule repo
32+
```bash
33+
mkdir workspace && cd workspace
34+
git clone [email protected]:AMD-AIG-AIMA/Primus.git
35+
git clone [email protected]:NVIDIA/Megatron-LM.git
36+
# version 20250324
37+
cd Megatron-LM && git checkout d61821b7174bac690afbad9134bcb4983521052f
38+
```
39+
40+
## Setup Primus
41+
```bash
42+
cd workspace/Primus
43+
# Install the required dependencies using:
44+
pip install -r requirements.txt
45+
# setup the pre-commit for your repo
46+
pre-commit install
47+
```
48+
49+
## Examples
50+
```bash
51+
cd workspace/Primus
52+
# deepseek pretrain (default use deepseek_v2_lite model)
53+
./examples/deepseek/run_pretrain.sh
54+
```
55+
56+
57+

examples/deepseek/run_pretrain.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ echo "PRIMUS_PATH: $PRIMUS_PATH"
1616
echo "MEGATRON_PATH: $MEGATRON_PATH"
1717

1818
# check megatron path
19-
[[ -z "${MEGATRON_PATH}" ]] && {
20-
echo "MEGATRON_PATH path is not set"
19+
[[ ! -d "${MEGATRON_PATH}" ]] && {
20+
echo "Error: MEGATRON_PATH (${MEGATRON_PATH}) does not exist"
2121
exit 1
2222
}
2323

@@ -42,8 +42,8 @@ export NCCL_IB_HCA=rdma0:1,rdma1:1,rdma2:1,rdma3:1,rdma4:1,rdma5:1,rdma6:1,rdma7
4242
export NCCL_IB_GID_INDEX=3
4343
export NCCL_CROSS_NIC=0
4444
export HSA_ENABLE_SDMA=0
45-
export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-eth0}
46-
export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-eth0}
45+
export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-ens51f0}
46+
export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-ens51f0}
4747
export CUDA_DEVICE_MAX_CONNECTIONS=1 # Reducing to 1 ensures no PCIE traffic (even on single node)
4848
export NCCL_PROTO=Simple
4949
export RCCL_MSCCL_ENABLE=0

primus/configs/models/megatron/deepseek_v3.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ qk_head_dim: 128
2020
qk_pos_emb_head_dim: 64
2121
v_head_dim: 128
2222
kv_channels: 128
23+
# mtp
24+
mtp_num_layers: 1
25+
mtp_loss_scaling_factor: 0.1
2326
# moe
2427
moe_layer_freq: 3
2528
num_experts: 256

primus/configs/models/megatron/deepseek_v3_base.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ multi_latent_attention: true
99
# multi_latent_attention does not support apply_rope_fusion
1010
apply_rope_fusion: false
1111

12+
# mtp
13+
mtp_num_layers: null # num_nextn_predict_layers
14+
mtp_loss_scaling_factor: 0.1
15+
1216
# moe
1317
moe_layer_freq: 1
1418
moe_router_topk: 6

primus/configs/models/megatron/language_model.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,10 @@ rotary_scaling_factor: 1.0 # float
8888
mscale: 1.0 # float
8989
mscale_all_dim: 1.0 # float
9090

91+
# MTP
92+
mtp_num_layers: null # int
93+
mtp_loss_scaling_factor: 0.1 # float
94+
9195
# MoE related
9296
num_experts: null
9397
moe_layer_freq: 1 # int

primus/configs/modules/megatron/trainer_base.yaml

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -325,10 +325,24 @@ log_straggler: false
325325
disable_straggler_on_startup: false
326326
straggler_ctrlr_port: 65535
327327
straggler_minmax_count: 1
328+
# inference
328329
inference_batch_times_seqlen_threshold: -1
330+
inference_dynamic_batching: false
331+
inference_dynamic_batching_buffer_size_gb: 40.0 # float
332+
inference_dynamic_batching_buffer_guaranteed_fraction: 0.2 # float
333+
inference_dynamic_batching_buffer_overflow_factor: null # float
334+
inference_dynamic_batching_max_requests_override: null # int
335+
inference_dynamic_batching_max_tokens_override: null # int
329336
max_tokens_to_oom: 12000
330337
output_bert_embeddings: false
331-
bert_embedder_type: megatron
338+
bert_embedder_type: megatron # "megatron", "huggingface"
339+
flash_decode: false
340+
enable_cuda_graph: false
341+
cuda_graph_warmup_steps: 3 # int
342+
external_cuda_graph: false
343+
cuda_graph_scope: full # full, attn
344+
inference_max_requests: 8 # int
345+
inference_max_seq_length: 2560 # int, (prefill + decode)
332346

333347
create_attention_mask_in_dataloader: true
334348
num_dataset_builder_threads: 1
@@ -354,3 +368,4 @@ parallel_output: false
354368
enable_ft_package: false
355369
calc_ft_timeouts: false
356370
run_workload_inspector_server: false
371+
is_hybrid_model: false

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
loguru
22
wandb
3+
pre-commit

0 commit comments

Comments
 (0)