
Commit f2ed869

deepseek v3 pretrain
1 parent 9940a17 commit f2ed869

9 files changed (+582, -36 lines)

examples/deepseek_v3/exp_pretrain.yaml

Lines changed: 3 additions & 2 deletions

@@ -6,7 +6,7 @@ workspace: ./output
 platform:
   config: platform_azure.yaml
   overrides:
-    master_sink_level: DEBUG
+    master_sink_level: INFO

 modules:
   pre_trainer:
@@ -16,7 +16,7 @@ modules:
     overrides:
       # log
       wandb_project: "Primus_DeepSeekV3_Pretrain"
-      # disable_wandb: false
+      disable_wandb: false
      stderr_sink_level: DEBUG

      # debug
@@ -60,3 +60,4 @@ modules:
      save_interval: 20000
      no_save_optim: null
      no_save_rng: null
+     disable_last_saving: true

examples/deepseek_v3/pretrain.py

Lines changed: 1 addition & 1 deletion

@@ -32,4 +32,4 @@
 log_init(primus_cfg, trainer.platform)

 trainer.init()
-# trainer.run()
+trainer.run()

primus/configs/models/megatron/deepseek_v3_base.yaml

Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@ moe_router_bias_update_rate: 1.0e-3
 moe_router_load_balancing_type: seq_aux_loss
 moe_token_dispatcher_type: alltoall
 moe_shared_expert_overlap: true
-moe_aux_loss_coeff: ${moe_aux_loss_coeff:1.0e-2}
+moe_aux_loss_coeff: 1.0e-2

 # parallel and optimization
 expert_model_parallel_size: 1

primus/configs/modules/megatron/trainer_base.yaml

Lines changed: 3 additions & 0 deletions

@@ -179,6 +179,7 @@ train_iters: null
 eval_iters: 32
 eval_interval: 2000
 skip_train: false
+train_sync_interval: null # int

 adlr_autoresume: false
 adlr_autoresume_interval: 1000
@@ -228,6 +229,7 @@ profile: false
 profile_ranks: [0]
 profile_step_end: 12
 profile_step_start: 10
+iterations_to_skip: null
 result_rejected_tracker_filename: null
 enable_gloo_process_groups: true
 record_memory_history: false
@@ -351,3 +353,4 @@ parallel_output: false

 enable_ft_package: false
 calc_ft_timeouts: false
+run_workload_inspector_server: false

primus/modules/base_module.py

Lines changed: 1 addition & 0 deletions

@@ -107,6 +107,7 @@ def setup_worker_logger(self, rank, world_size):

         # monkey patch print function of builtins
         self.original_print = builtins.print
+        # builtins.print = log_rank_all
         builtins.print = debug_rank_all

         # disable all logging handlers
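The added comment records the alternative that was considered: routing print through log_rank_all (the INFO-on-every-rank helper added in module_utils.py below) rather than the existing debug_rank_all. A minimal sketch of what the active patch means for worker code, assuming setup_worker_logger() has already run in the process (debug_rank_all's behaviour is inferred from its name):

import builtins

# Once setup_worker_logger() has run, builtins.print points at debug_rank_all,
# so a bare print() in any worker module goes through the rank-aware logger
# instead of being written raw to stdout.
print("dataloader ready")  # routed through debug_rank_all

# The pre-patch function is kept on the module instance and can be restored:
# builtins.print = self.original_print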

primus/modules/module_utils.py

Lines changed: 25 additions & 0 deletions

@@ -27,6 +27,31 @@ def log_rank_0(msg, *args, **kwargs):
         log_func(msg, module_name, function_name, line)


+def log_rank_last(msg, *args, **kwargs):
+    log_func = logger.info_with_caller
+
+    caller = inspect.stack()[1]
+    caller_frame = caller.frame
+    function_name = caller_frame.f_code.co_name
+    module_name = caller_frame.f_globals["__name__"].split(".")[-1]
+    line = caller.lineno
+
+    if _rank == _world_size - 1:
+        log_func(msg, module_name, function_name, line)
+
+
+def log_rank_all(msg, *args, **kwargs):
+    log_func = logger.info_with_caller
+
+    caller = inspect.stack()[1]
+    caller_frame = caller.frame
+    function_name = caller_frame.f_code.co_name
+    module_name = caller_frame.f_globals["__name__"].split(".")[-1]
+    line = caller.lineno
+
+    log_func(msg, module_name, function_name, line)
+
+
 def log_kv_rank_0(key, value):
     log_func = logger.log_kv_with_caller
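For reference, a usage sketch of the new helpers alongside the existing log_rank_0 (the _rank and _world_size globals they consult are module state initialized elsewhere in module_utils; the call sites below are illustrative):

from primus.modules.module_utils import log_rank_0, log_rank_all, log_rank_last

log_rank_0("building tokenizer")            # emitted only on the first rank
log_rank_last("last pipeline stage ready")  # emitted only on rank _world_size - 1
log_rank_all("per-rank data shard loaded")  # emitted on every rank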

primus/modules/trainer/base_trainer.py

Lines changed: 9 additions & 7 deletions

@@ -6,20 +6,22 @@

 from abc import ABC, abstractmethod

+import torch
+from megatron.core.models.gpt import GPTModel

-class BaseTrainer(ABC):
-    @abstractmethod
-    def get_batch_func(self):
-        raise NotImplementedError

+class BaseTrainer(ABC):
+    # def get_batch_func(self):
     @abstractmethod
-    def get_loss_func(self):
+    def get_batch(self, data_iterator):
         raise NotImplementedError

+    # def get_loss_func(self):
     @abstractmethod
-    def build_dataset_and_tokenizer(self):
+    def loss_func(self, loss_mask: torch.Tensor, output_tensor: torch.Tensor):
         raise NotImplementedError

+    # def get_forward_step_func(self):
     @abstractmethod
-    def get_forward_step_func(self):
+    def forward_step(self, data_iterator, model: GPTModel):
         raise NotImplementedError
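The abstract hooks move from *_func getters to the get_batch / loss_func / forward_step shape that Megatron-LM's training loop calls directly, and build_dataset_and_tokenizer is no longer declared abstract here. Because the new names stay @abstractmethod, the rename is enforced at construction time; a self-contained sketch of that effect (the BaseTrainer stand-in below mirrors the new interface rather than importing the real class):

from abc import ABC, abstractmethod


class BaseTrainer(ABC):  # stand-in mirroring primus/modules/trainer/base_trainer.py
    @abstractmethod
    def get_batch(self, data_iterator): ...

    @abstractmethod
    def loss_func(self, loss_mask, output_tensor): ...

    @abstractmethod
    def forward_step(self, data_iterator, model): ...


class LegacyTrainer(BaseTrainer):
    def get_batch_func(self):  # old hook name, no longer part of the interface
        return None


LegacyTrainer()  # raises TypeError: abstract methods forward_step, get_batch, loss_func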

primus/modules/trainer/megatron/pre_trainer.py

Lines changed: 111 additions & 8 deletions

@@ -4,6 +4,17 @@
 # See LICENSE for license information.
 #################################################################################

+from functools import partial
+
+import torch
+from megatron.core import mpu
+from megatron.core.models.gpt import GPTModel
+from megatron.core.rerun_state_machine import get_rerun_state_machine
+from megatron.core.utils import StragglerDetector
+from megatron.training import get_args, get_timers
+from megatron.training.utils import get_batch_on_this_cp_rank, get_batch_on_this_tp_rank
+
+stimer = StragglerDetector()

 from .trainer import MegatronTrainer

@@ -13,14 +24,106 @@ def __init__(self, *args, **kwargs):
         kwargs["module_name"] = "pre_trainer"
         super().__init__(*args, **kwargs)

-    def get_batch_func(self):
-        raise NotImplementedError
+    def get_batch(self, data_iterator):
+        """Generate a batch."""
+
+        # TODO: this is pretty hacky, find a better way
+        if (not mpu.is_pipeline_first_stage()) and (not mpu.is_pipeline_last_stage()):
+            return None, None, None, None, None
+
+        # get batches based on the TP rank you are on
+        batch = get_batch_on_this_tp_rank(data_iterator)
+
+        # slice batch along sequence dimension for context parallelism
+        batch = get_batch_on_this_cp_rank(batch)
+
+        return batch.values()
+
+    def loss_func(self, loss_mask: torch.Tensor, output_tensor: torch.Tensor):
+        """Loss function.
+
+        Args:
+            loss_mask (torch.Tensor): Used to mask out some portions of the loss
+            output_tensor (torch.Tensor): The tensor with the losses
+
+        Returns:
+            the loss scalar for this micro-batch
+            the number of non-padded tokens in this microbatch
+            a dict containing reporting metrics on the loss and number of tokens across
+                the data parallel ranks
+        """
+        args = get_args()
+
+        losses = output_tensor.float()
+        loss_mask = loss_mask.view(-1).float()
+        total_tokens = loss_mask.sum()
+        loss = torch.cat([torch.sum(losses.view(-1) * loss_mask).view(1), total_tokens.view(1)])
+
+        if args.context_parallel_size > 1:
+            torch.distributed.all_reduce(loss, group=mpu.get_context_parallel_group())
+
+        # Check individual rank losses are not NaN prior to DP all-reduce.
+        rerun_state_machine = get_rerun_state_machine()
+        if args.check_for_nan_in_loss_and_grad:
+            rerun_state_machine.validate_result(
+                result=loss[0],
+                rejection_func=torch.isnan,
+                message="found NaN in local forward loss calculation",
+                tolerance=0.0,  # forward pass calculations are deterministic
+                fatal=True,
+            )
+            rerun_state_machine.validate_result(
+                result=loss[0],
+                rejection_func=torch.isinf,
+                message="found Inf in local forward loss calculation",
+                tolerance=0.0,  # forward pass calculations are deterministic
+                fatal=True,
+            )
+        # Check for spiky loss
+        if args.check_for_spiky_loss:
+            rerun_state_machine.validate_result(
+                result=loss[0],
+                rejection_func=partial(
+                    rerun_state_machine.is_unexpectedly_large,
+                    threshold=SPIKY_LOSS_FACTOR,
+                    context="loss",
+                ),
+                message="Spiky loss",
+                tolerance=0.0,  # forward pass calculations are deterministic
+                fatal=False,
+            )
+        # Reduce loss for logging.
+        reporting_loss = loss.clone().detach()
+        torch.distributed.all_reduce(reporting_loss, group=mpu.get_data_parallel_group())

+        # loss[0] is a view of loss, so it has ._base not None, which triggers assert error
+        # in core/pipeline_parallel/schedule.py::deallocate_output_tensor, calling .clone()
+        # on loss[0] fixes this
+        local_num_tokens = loss[1].clone().detach().to(torch.int)
+        return (
+            loss[0].clone(),
+            local_num_tokens,
+            {"lm loss": (reporting_loss[0], reporting_loss[1])},
+        )
+
+    def forward_step(self, data_iterator, model: GPTModel):
+        """Forward training step.
+
+        Args:
+            data_iterator : Input data iterator
+            model (GPTModel): The GPT Model
+        """
+        get_args()
+        timers = get_timers()

-    def get_loss_func(self):
-        raise NotImplementedError
+        # Get the batch.
+        timers("batch-generator", log_level=2).start()
+        global stimer
+        with stimer(bdata=True):
+            tokens, labels, loss_mask, attention_mask, position_ids = self.get_batch(data_iterator)
+        timers("batch-generator").stop()

-    def build_dataset_and_tokenizer(self):
-        raise NotImplementedError
+        with stimer:
+            output_tensor = model(tokens, position_ids, attention_mask, labels=labels)

-    def get_forward_step_func(self):
-        raise NotImplementedError
+        return output_tensor, partial(self.loss_func, loss_mask)
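These methods closely follow the get_batch / loss_func / forward_step functions in Megatron-LM's pretrain_gpt.py, moved onto the trainer class (SPIKY_LOSS_FACTOR is not imported in this hunk; it is a module-level constant in the upstream file, and the branch that uses it is only reached when args.check_for_spiky_loss is enabled). forward_step returns the model output plus a loss closure built with functools.partial, which is the contract Megatron's pipeline schedules expect from a forward-step callback. A sketch of how a run loop could hand the bound method to Megatron; the wiring below is an assumption about MegatronTrainer.run, not part of this diff:

from megatron.core.pipeline_parallel import get_forward_backward_func


def run_one_step(trainer, train_data_iterator, model, args, num_microbatches):
    """Hypothetical helper showing how the bound forward_step plugs into Megatron."""
    forward_backward_func = get_forward_backward_func()
    # forward_step is invoked as forward_step(data_iterator, model) and must return
    # (output_tensor, loss_closure) -- exactly what PreTrainer.forward_step produces.
    return forward_backward_func(
        forward_step_func=trainer.forward_step,
        data_iterator=train_data_iterator,
        model=model,
        num_microbatches=num_microbatches,
        seq_length=args.seq_length,
        micro_batch_size=args.micro_batch_size,
        forward_only=False,
    )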
