---
# Primus experiment config: DeepSeek-V3 pretraining via the Megatron backend.
# `${VAR:default}` placeholders are resolved by Primus env substitution at launch.
work_group: ${TEAM:amd}
user_name: ${USER:root}
exp_name: ${EXP_NAME:deepseek_v3-pretrain}
workspace: ./output

modules:
  pre_trainer:
    framework: megatron
    config: pre_trainer.yaml

    # model to run
    model: ${PRIMUS_MODEL:deepseek_v3}.yaml
    overrides:
      # log
      wandb_project: "Primus_DeepSeek_Pretrain"
      stderr_sink_level: DEBUG

      # debug
      moe_router_force_load_balancing: true
      log_avg_skip_iterations: 2
      log_avg_reset_interval: 50

      # hyper parameters
      train_iters: 50
      micro_batch_size: 4
      global_batch_size: 256
      seq_length: ${PRIMUS_SEQ_LENGTH:4096}
      max_position_embeddings: ${PRIMUS_MAX_POSITION_EMBEDDINGS:4096}
      lr: 1.0e-5
      min_lr: 0.0
      lr_warmup_iters: 2
      lr_decay_iters: null
      lr_decay_style: cosine
      weight_decay: 0.1
      adam_beta1: 0.9
      adam_beta2: 0.95
      eod_mask_loss: true
      init_method_std: 0.008
      norm_epsilon: 1.0e-6

      # parallel
      tensor_model_parallel_size: ${PRIMUS_TP:1}
      pipeline_model_parallel_size: ${PRIMUS_PP:1}
      expert_model_parallel_size: ${PRIMUS_EP:8}
      overlap_grad_reduce: true
      overlap_param_gather: true

      # data
      mock_data: true
      train_data_path: ${TOKENIZED_DATA_PATH:null}
      valid_data_path: null
      test_data_path: null

      # fusion
      # 20250321: need latest megatron docker image
      moe_permute_fusion: false
      # fused wgrad gemm and accumulation
      gradient_accumulation_fusion: false
      # recommend set `false` in fp8
      moe_use_legacy_grouped_gemm: true
      # fused topk router with aux score
      moe_use_fused_router_with_aux_score: false
      # pad 192/128 for deepseek attention
      fused_padded_mla_attention: false

      # Performance toggles
      # multi_latent_attention: false
      # apply_rope_fusion: true

      # ckpt
      finetune: false
      auto_continue_train: false
      load: null
      no_load_optim: null
      no_load_rng: null
      save: null
      save_interval: 20000
      no_save_optim: null
      no_save_rng: null
      disable_last_saving: true
      ckpt_format: torch
      eval_iters: 0