Commit 9940a17

moe config and megatron init (#5)
1 parent 5ab247e commit 9940a17

29 files changed: +597 -87 lines

.github/workflows/ci.yaml

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-name: Megatron-LM-X-CI
+name: Primus-CI

 on:
   workflow_dispatch:
@@ -16,7 +16,7 @@ jobs:
       matrix:
         python-version: ["3.12"]
     steps:
-      - run: echo "🎉 Begin Megatron-LM-X Python Lint."
+      - run: echo "🎉 Begin Primus Python Lint."
       - run: echo "🐧 This job is now running on a ${{ runner.os }} server hosted by GitHub!"
       - run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}."
       - uses: actions/checkout@v4

examples/deepseek_v3/exp_pretrain.yaml

Lines changed: 30 additions & 9 deletions
@@ -14,28 +14,49 @@ modules:
     config: pre_trainer.yaml
     model: deepseek_v3_45BA3B.yaml
     overrides:
+      # log
       wandb_project: "Primus_DeepSeekV3_Pretrain"
-      # TODO
-      disable_wandb: true
+      # disable_wandb: false
       stderr_sink_level: DEBUG

-      # TODO
-      build_tokenizer: false
+      # debug
+      num_layers: 4

+      # hyber parameters
       train_iters: 10
       micro_batch_size: 1
       global_batch_size: 16
       seq_length: 4096
-      tensor_model_parallel_size: 1
-      pipeline_model_parallel_size: 1
-      # TODO(wenx)
-      expert_model_parallel_size: 1
-
+      max_position_embeddings: 4096
       lr: 1.0e-5
       min_lr: 0.0
       lr_warmup_iters: 2
       lr_decay_iters: null
       lr_decay_style: cosine

+      # parallel
+      tensor_model_parallel_size: 1
+      pipeline_model_parallel_size: 1
+      expert_model_parallel_size: 8
+
+      # data
+      train_data_path: /home/azureuser/tas-public/data/deepseek-datasets/mmap_deepseekv2_datasets_text_document
+      valid_data_path: /home/azureuser/tas-public/data/deepseek-datasets/mmap_deepseekv2_datasets_text_document
+      test_data_path: /home/azureuser/tas-public/data/deepseek-datasets/mmap_deepseekv2_datasets_text_document
+
+      # fusion
+      # 20250317: need latest apex in docker image
+      gradient_accumulation_fusion: false
+      # 20250317: TE grouped gemm has numerical issue
+      moe_use_legacy_grouped_gemm: true
+
+      # ckpt
       finetune: false
       auto_continue_train: true
+      load: null
+      no_load_optim: null
+      no_load_rng: null
+      save: null
+      save_interval: 20000
+      no_save_optim: null
+      no_save_rng: null
examples/deepseek_v3/run_pretrain.sh

Lines changed: 3 additions & 3 deletions
@@ -11,6 +11,8 @@ export PYTHONPATH=${SITE_PACKAGES}:${MEGATRON_PATH}:${PRIMUS_PATH}:${PYTHONPATH}
     echo "MEGATRON_PATH path is not set"
     exit 1
 }
+# build helper_cpp
+pushd "${MEGATRON_PATH}/megatron/core/datasets" && make && popd || exit 1

 export EXP_CONFIG=${EXP_CONFIG:-examples/deepseek_v3/exp_pretrain.yaml}

@@ -32,9 +34,6 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 # export AMD_SERIALIZE_KERNEL=3
 # export HSA_NO_SCRATCH_RECLAIM=1

-# TODO(wenx)
-export GPUS_PER_NODE=2
-
 # cluster node envs
 RUN_ENV="${RUN_ENV:-torchrun}"
 if [ "$RUN_ENV" = "torchrun" ]; then
@@ -71,6 +70,7 @@ echo "GPUS_PER_NODE: $GPUS_PER_NODE"
 echo "HIP_VISIBLE_DEVICES: $HIP_VISIBLE_DEVICES"
 echo ""

+
 DISTRIBUTED_ARGS=(
     --nproc_per_node "${GPUS_PER_NODE}"
     --nnodes "${NNODES}"
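The new # build helper_cpp step compiles Megatron's C++ dataset helpers before training starts, and the DISTRIBUTED_ARGS array is what eventually gets handed to torchrun. For orientation, a minimal single-node invocation might look like the sketch below; the entry-point script (TRAIN_SCRIPT) and the address/port values are placeholders, not taken from this repo, and only standard torchrun flags are used.

# Hedged single-node launch sketch; TRAIN_SCRIPT and the master address/port
# are hypothetical placeholders, while the torchrun flags are standard.
export GPUS_PER_NODE=8
export NNODES=1
export NODE_RANK=0
export MASTER_ADDR=localhost
export MASTER_PORT=29500

DISTRIBUTED_ARGS=(
    --nproc_per_node "${GPUS_PER_NODE}"
    --nnodes "${NNODES}"
    --node_rank "${NODE_RANK}"
    --master_addr "${MASTER_ADDR}"
    --master_port "${MASTER_PORT}"
)

EXP_CONFIG=examples/deepseek_v3/exp_pretrain.yaml \
    torchrun "${DISTRIBUTED_ARGS[@]}" "${TRAIN_SCRIPT}"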
3 files renamed without changes.
