Commit adff795

Authored by arvyanh, ArronHZG, and gemini-code-assist[bot]
[megatron] feat: Support MTP training in SFT (verl-project#4981)
### What does this PR do?

SFT training with MTP is supported, using the same MTP training configuration as RL training. An example configuration for running SFT can be found in `examples/sft/gsm8k/run_mimo_megatron_mtp.sh`.

**SFT result**

The experiment was conducted with the following setup:

- model = mimo-7B-math
- dataset = gsm8k

Result: [wandb link](https://wandb.ai/hou-zg-meituan/mimo-7b-sft-mtp?nw=nwuserhouzg)

The presence of the MTP layer has a limited effect on the main loss. However, when the MTP layer is detached, the `mtp_loss` converges to a higher value.

### Checklist Before Starting

- [X] Search for similar PRs. Paste at least one query link here: ...
- [X] Format the PR title as `[{modules}] {type}: {description}` (this will be checked by the CI)
  - `{modules}` include `fsdp`, `megatron`, `veomni`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`, `cfg`, `reward`
  - If this PR involves multiple modules, separate them with `,` like `[megatron, fsdp, doc]`
  - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test`
  - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title.
  - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching`

### Test

> For changes that can not be tested by CI (e.g., algorithm implementation, new model support), validate by experiment(s) and show results like training curve plots, evaluation results, etc.

### Design & Code Changes

Support SFT training with MTP, with the core change based on verl-project#4936.

### Checklist Before Submitting

> [!IMPORTANT]
> Please check all the following items before requesting a review, otherwise the reviewer might deprioritize this PR for review.

- [X] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md).
- [X] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always`
- [X] Add / Update [the documentation](https://github.com/volcengine/verl/tree/main/docs).
- [X] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: uses non-standard mbridge/mcore
- [X] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ). (If not accessible, please try [the Feishu group (飞书群)](https://applink.larkoffice.com/client/chat/chatter/add_by_link?link_token=772jd4f1-cd91-441e-a820-498c6614126a).)
- [X] If your PR is related to the `recipe` submodule, please also update the reference to the submodule commit via `git submodule update --remote` or `cd recipe && git pull origin main`.

---------

Co-authored-by: ArronHZG <hou.zg@foxmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent: 016572f

File tree: 11 files changed (+303, -34 lines)

**docs/advance/mtp.md** (21 additions, 2 deletions)

```diff
@@ -1,8 +1,8 @@
-# Guide to Using MTP in RL Training and Inference
+# Guide to Using MTP in SFT/RL Training and Inference
 
 **Author**: `https://github.com/meituan-search`
 
-Last updated: 01/16/2026
+Last updated: 01/21/2026
 
 # 1. Scope of Support
 
@@ -41,6 +41,8 @@ Experiment chart:
 ![fully_async_policy_revenue](
 https://github.com/ArronHZG/verl-community/blob/main/docs/mimo-7b-mtp.png?raw=true)
 
+The wandb link for the graph: [wandb](https://wandb.ai/hou-zg-meituan/mimo-7b-sft-mtp?nw=nwuserhouzg)
+
 **Scenarios with No Significant Effect**
 
 The following configurations will not have a noticeable impact on training results:
@@ -82,3 +84,20 @@ Taking the mimo-7B model deployed separately on H20 hardware using SGLang as an
 - Current priority recommendation: Do not enable MTP acceleration during the inference phase for now;
 
 - Future planning: Further optimization of the speculative logic in the Rollout phase will be conducted to improve throughput performance.
+
+# 5. SFT training
+
+The SFT training with MTP is supported, using the same MTP training configuration as RL training.
+
+An example configuration for running SFT can be found in `examples/sft/gsm8k/run_mimo_megatron_mtp.sh`
+
+**SFT result**
+
+The experiment was conducted using following data:
+- model = mimo-7B-math
+- dataset = gsm8k
+
+The result: [wandb link](https://wandb.ai/hou-zg-meituan/mimo-7b-sft-mtp?nw=nwuserhouzg)
+
+The presence of mtp layer has limited effect on main loss. However, when MTP layer is detached, the mtp_loss converges to a higher value.
```
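For context, multi-token prediction trains one or more auxiliary heads to predict tokens further ahead and folds their loss into the main objective. A common formulation (an assumption for illustration here, not necessarily verl's exact implementation) is:

$$\mathcal{L} = \mathcal{L}_{\text{main}} + \lambda \sum_{k=1}^{K} \mathcal{L}_{\text{MTP}}^{(k)},$$

where $K$ is the number of MTP heads and $\lambda$ weights the auxiliary term. Detaching the MTP branch stops the $\mathcal{L}_{\text{MTP}}$ gradients from flowing into the trunk, so the shared representations no longer adapt to the MTP objective, which is consistent with the higher converged `mtp_loss` the doc reports.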
**examples/sft/gsm8k/run_mimo_megatron_mtp.sh** (new file, 102 additions)

```bash
#!/usr/bin/env bash
set -xeuo pipefail

NUM_GPUS=${NUM_GPUS:-8}
SP_SIZE=${SP_SIZE:-1}
TP_SIZE=${TP_SIZE:-1}
PP_SIZE=${PP_SIZE:-1}
VPP_SIZE=${VPP_SIZE:-null}
CP_SIZE=${CP_SIZE:-1}
PAD_MODE=${PAD_MODE:-no_padding}
USE_REMOVE_PADDING=${USE_REMOVE_PADDING:-False}
LR="1e-5"
MINLR="1e-6"

export VERL_SFT_LOGGING_LEVEL=INFO

backend=${BACKEND:-megatron}

TENSORBOARD_DIR=~/tensorboard

MASTER_ADDR=${MASTER_ADDR:-localhost}
MASTER_PORT=${MASTER_PORT:-29500}
NNODES=${NNODES:-1}
RANK=${RANK:-0}

ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"}

# Note the default MultiturnSFT Dataset requires all the sys/user/assistant in 'data.message_key'
DATASET_DIR=${DATASET_DIR:-~/dataset/rl/gsm8k}
TRAIN_FILES=${DATASET_DIR}/train.parquet
VAL_FILES=${DATASET_DIR}/eval.parquet

project_name=verl_sft_test

RESUME_MODE=disable

MODEL_PATH="XiaomiMiMo/MiMo-7B-RL"
ckpts_home=${ckpts_home:-~/verl/test/gsm8k-sft-${backend}}

# currently relies on these two commits that is not on master
PYPATH=$HOME/pythonpath
mkdir -p $PYPATH && cd $PYPATH
[ -d Megatron-LM ] || git clone https://github.com/NVIDIA/Megatron-LM -b dev && (cd Megatron-LM; git checkout 23e092f41ec8bc659020e401ddac9576c1cfed7e)
[ -d mbridge ] || git clone https://github.com/ArronHZG/mbridge -b feature/verl_mtp && (cd mbridge; git checkout 6bf2d45a15dc4fb52d2f0c38ff546bee33447d10)
cd -
export PYTHONPATH=$PYTHONPATH:$PYPATH/mbridge:$PYPATH/Megatron-LM


MEGATRON_ENGINE_CONFIG="\
    engine=${backend} \
    optim=${backend} \
    optim.lr=${LR} \
    optim.min_lr=${MINLR} \
    optim.lr_warmup_steps=10 \
    optim.weight_decay=0.1 \
    optim.betas='[0.9,0.95]' \
    optim.clip_grad=1.0 \
    optim.lr_warmup_init=0 \
    optim.lr_decay_style=cosine \
    engine.override_transformer_config.recompute_method=uniform \
    engine.override_transformer_config.recompute_granularity=full \
    engine.override_transformer_config.recompute_num_layers=1 \
    engine.use_dist_checkpointing=False \
    engine.tensor_model_parallel_size=${TP_SIZE} \
    engine.pipeline_model_parallel_size=${PP_SIZE} \
    engine.virtual_pipeline_model_parallel_size=${VPP_SIZE} \
    engine.context_parallel_size=${CP_SIZE} \
    engine.use_mbridge=True \
"

ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG"
echo "Using megatron engine"
exp_name=gsm8k-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-vpp${VPP_SIZE}-cp${CP_SIZE}-lr-${MINLR}-${LR}

mkdir -p "${ckpts_home}"

$COMMAND \
    data.train_files="${TRAIN_FILES}" \
    data.val_files="${TRAIN_FILES}" \
    data.train_batch_size=64 \
    data.micro_batch_size_per_gpu=2 \
    data.pad_mode=${PAD_MODE} \
    data.truncation=error \
    data.max_length=1024 \
    data.use_dynamic_bsz=True \
    data.max_token_len_per_gpu=2048 \
    data.messages_key=prompt \
    data.num_workers=0 \
    model.path=$MODEL_PATH \
    model.use_remove_padding=${USE_REMOVE_PADDING} \
    model.trust_remote_code=True \
    model.mtp.enable=True \
    ${ENGINE_CONFIG} \
    trainer.test_freq=after_each_epoch \
    trainer.save_freq=-1 \
    trainer.logger="['console']" \
    trainer.project_name="${project_name}" \
    trainer.experiment_name="${exp_name}" \
    trainer.total_epochs=1 \
    trainer.default_local_dir="${ckpts_home}" \
    trainer.resume_mode=${RESUME_MODE}
```

**verl/models/mcore/model_forward.py** (20 additions, 0 deletions)

```diff
@@ -168,6 +168,7 @@ def gptmodel_forward_no_padding(
     vision_model=False,
     pad_token_id=None,
     data_format: str = "thd",
+    enable_mtp: bool = False,
 ):
     """Default forward pass for GPT models with optional sequence packing."""
 
@@ -190,6 +191,15 @@ def gptmodel_forward_no_padding(
     input_ids_rmpad, packed_seq_params = preprocess_thd_no_padding(input_ids, pre_process=pre_process)
     input_ids_rmpad = input_ids_rmpad.contiguous()
 
+    if enable_mtp and post_process:
+        args = {
+            k: preprocess_thd_no_padding(v, pre_process=True, need_roll=(k == "label" or k == "loss_mask"))[0]
+            for k, v in logits_processor_args.items()
+        }
+        model_kwargs["labels"] = args["label"].contiguous()
+        model_kwargs["loss_mask"] = args["loss_mask"].contiguous()
+        logits_processor_args.pop("loss_mask")
+
     # For VLM model, need to pass bshd format `input_ids` and `attention_mask`.
     attention_mask = None
     if vision_model:
@@ -233,6 +243,16 @@ def gptmodel_forward_no_padding(
     input_ids_bshd, attention_mask_bshd, position_ids_bshd = preprocess_bshd_no_padding(
         input_ids, pre_process=pre_process
     )
+
+    if enable_mtp and post_process:
+        args = {
+            k: preprocess_bshd_no_padding(v, pre_process=True, need_roll=(k == "label" or k == "loss_mask"))[0]
+            for k, v in logits_processor_args.items()
+        }
+        model_kwargs["labels"] = args["label"].contiguous()
+        model_kwargs["loss_mask"] = args["loss_mask"].contiguous()
+        logits_processor_args.pop("loss_mask")
+
     output_orig = model(
         input_ids=input_ids_bshd,
         attention_mask=attention_mask_bshd,
```
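The `need_roll` handling above shifts labels and loss masks so that, when the model computes the loss internally on the MTP path, position *t* already holds the target for token *t+1*. A minimal sketch of that shift (illustrative names only, not verl's API):

```python
def roll_left(seq, pad=0):
    """Shift a sequence one position left and pad the tail.

    Mimics what need_roll=True accomplishes for `label` and `loss_mask`:
    after the shift, index t holds the supervision target for the token
    the model predicts at position t.
    """
    return seq[1:] + [pad]


# token ids of a packed sample; the final 0 in the mask marks padding
input_ids = [101, 7, 42, 9, 102]
labels = roll_left(input_ids)            # token t is supervised by token t+1
loss_mask = roll_left([1, 1, 1, 1, 0])   # mask shifts in lockstep with labels
```

The mask must be rolled with the labels, otherwise padding positions would be scored against shifted targets.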

**verl/trainer/config/sft_trainer_engine.yaml** (2 additions, 0 deletions)

```diff
@@ -26,6 +26,7 @@ data:
   messages_key: messages # Key for messages list in multi-turn mode
   tools_key: tools # Key for tools list in multi-turn mode
   enable_thinking_key: enable_thinking # Whether to enable thinking in multi-turn mode
+  enable_thinking_default: none # The default value when enable_thinking_key is not present in the dataset
   pad_mode: no_padding
   # for right padding
   max_length: 1024
@@ -36,6 +37,7 @@ data:
   name: null
   use_shm: False
   apply_chat_template_kwargs: {}
+  num_workers: 8
 
 # MultiTurnSFTDataset apply_chat_template to each turn separately and concat `input_ids`
 # as a whole sequence, which may not equal to apply_chat_template to whole messages at once.
```
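The two new keys are plain fallbacks: `num_workers` is forwarded to the dataloader, and `enable_thinking_default` applies only when a row carries no `enable_thinking` value. A small sketch of the fallback (a simplified stand-in for the dataset logic, not the actual class):

```python
def resolve_enable_thinking(row, default=None, key="enable_thinking"):
    """A per-row value wins; otherwise the configured default applies."""
    return row[key] if key in row else default


# a row-level value overrides the default; rows without the key fall back
explicit = resolve_enable_thinking({"enable_thinking": False}, default=True)
fallback = resolve_enable_thinking({}, default=True)
unset = resolve_enable_thinking({})  # no default configured either
```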

**verl/trainer/sft_trainer.py** (17 additions, 6 deletions)

```diff
@@ -38,6 +38,8 @@
 from verl.utils.device import auto_set_device, get_device_name
 from verl.utils.distributed import destroy_global_process_group
 from verl.utils.logger import log_with_rank
+from verl.utils.memory_utils import aggressive_empty_cache
+from verl.utils.profiler import log_gpu_memory_usage
 from verl.utils.tracking import Tracking
 from verl.workers.engine_workers import TrainingWorker
 
@@ -52,6 +54,8 @@ def __init__(
     ):
         self.config = config
 
+        log_gpu_memory_usage(f"rank {torch.distributed.get_rank()}: Before SFTTrainer init", logger=logger)
+
         self.rank = torch.distributed.get_rank()
 
         self._build_config()
@@ -73,6 +77,8 @@ def __init__(
         if self.rank == 0:
             print(self.config)
 
+        log_gpu_memory_usage(f"rank {self.rank}: After SFTTrainer init", logger=logger)
+
     def _build_ckpt_handler(self):
         resume_mode = getattr(self.config.trainer, "resume_mode", "auto")
         resume_from_path = getattr(self.config.trainer, "resume_from_path", None)
@@ -200,7 +206,7 @@ def _build_dataloader(self):
             batch_size=self.train_batch_size_per_dp,
             sampler=self.train_sampler,
             collate_fn=self.collate_fn,
-            num_workers=8,
+            num_workers=self.config.data.num_workers,
             pin_memory=False,
             drop_last=True,
             pin_memory_device=device_name,
@@ -215,7 +221,7 @@ def _build_dataloader(self):
             batch_size=self.train_batch_size_per_dp,
             sampler=self.val_sampler,
             collate_fn=self.collate_fn,
-            num_workers=8,
+            num_workers=self.config.data.num_workers,
             pin_memory=False,
             drop_last=True,
             pin_memory_device=device_name,
@@ -298,6 +304,9 @@ def fit(self):
         for epoch in range(start_epoch, self.config.trainer.total_epochs):
             self.train_sampler.set_epoch(epoch=epoch)
 
+            aggressive_empty_cache(force_sync=True)
+            log_gpu_memory_usage(f"rank {self.rank}: At start of epoch {epoch}", logger=logger)
+
             for step_in_epoch, data in enumerate(
                 tqdm(
                     self.train_dataloader,
@@ -330,10 +339,11 @@ def fit(self):
                 metrics = tu.get(output, "metrics")
 
                 # TODO: we can actual accumulate metrics for N steps and perform aggregate metrics
-                metrics["train/loss"] = metrics.pop("loss")
-                metrics["train/grad_norm"] = metrics.pop("grad_norm")
-                metrics["train/lr"] = metrics.pop("lr")
-                metrics["train/mfu"] = metrics.pop("mfu")
+                for k in ["loss", "grad_norm", "lr", "mfu"]:
+                    if k in metrics.keys():
+                        value = metrics.pop(k)
+                        metrics[f"train/{k}"] = value
+
                 metrics["train/global_tokens"] = torch.sum(
                     torch.tensor(batch_seqlens, device=self.device_name)
                 ).item()
@@ -373,6 +383,7 @@ def fit(self):
                 torch.distributed.barrier()
 
                 if is_last_step or (self.save_freq > 0 and is_save_step):
+                    aggressive_empty_cache(force_sync=True)
                     self.ckpt_handler.save_checkpoint(step=global_step)
 
                 if is_last_step:
```
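The loop rewrite makes the metric renaming tolerant of engines that do not report every key, instead of raising on a missing `pop`. The pattern in isolation:

```python
def namespace_metrics(metrics, keys=("loss", "grad_norm", "lr", "mfu"), prefix="train/"):
    """Move each present key under the `train/` namespace, skipping absent ones."""
    for k in keys:
        if k in metrics:
            metrics[f"{prefix}{k}"] = metrics.pop(k)
    return metrics


# "grad_norm" and "mfu" are absent and are silently skipped;
# unrelated keys pass through untouched
m = namespace_metrics({"loss": 0.5, "lr": 1e-5, "extra": 1})
```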

**verl/utils/checkpoint/megatron_checkpoint_manager.py** (4 additions, 1 deletion)

```diff
@@ -259,7 +259,10 @@ def generate_state_dict(
         key = "model"
         if hasattr(model, "module"):
             model = model.module
-        state_dict[key] = model.sharded_state_dict()
+
+        # GPTModel's sharded_state_dict function when having mtp requires metadata['dp_cp_group']
+        kwargs = {"metadata": {"dp_cp_group": mpu.get_data_parallel_group(with_context_parallel=True)}}
+        state_dict[key] = model.sharded_state_dict(**kwargs)
 
         # Optimizer State Dict
         if generate_optimizer:
```
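The same pattern in isolation, with a stub standing in for an MTP-enabled `GPTModel` (the stub and its failure mode are assumptions for illustration; the real requirement is that Megatron's `sharded_state_dict` needs `metadata['dp_cp_group']` when MTP layers are present, as the diff comment states):

```python
class FakeMTPModel:
    """Stub: fails like an MTP-enabled model would without the metadata."""

    def sharded_state_dict(self, metadata=None):
        if not metadata or "dp_cp_group" not in metadata:
            raise KeyError("dp_cp_group")
        return {"embedding.weight": "sharded-tensor"}


def generate_model_state(model, dp_cp_group):
    # unwrap DDP-style wrappers first, as the checkpoint manager does
    if hasattr(model, "module"):
        model = model.module
    kwargs = {"metadata": {"dp_cp_group": dp_cp_group}}
    return model.sharded_state_dict(**kwargs)


state = generate_model_state(FakeMTPModel(), dp_cp_group="dp_cp_group_handle")
```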

**verl/utils/dataset/multiturn_sft_dataset.py** (38 additions, 2 deletions)

```diff
@@ -19,6 +19,7 @@
 import logging
 import os
 import re
+from functools import wraps
 from typing import Any, Optional
 
 import numpy as np
@@ -40,6 +41,33 @@
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
 
 
+def once(func):
+    """Decorator to ensure a function runs only once. Subsequent calls do nothing."""
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        if not hasattr(wrapper, "called"):
+            wrapper.called = True
+            return func(*args, **kwargs)
+
+    return wrapper
+
+
+@once
+def print_assembled_message(tokenizer, message_list, input_ids, loss_mask, attn_mask, tools):
+    """
+    Print the message after applying the chat template
+    """
+
+    tokenized = tokenizer.apply_chat_template(message_list, add_generation_prompt=False, tokenize=False, tools=tools)
+    sep = "\n\n"
+    str = f"tokenized entire message:\n{tokenized}"
+    str += sep
+    str += f"tokenized seperately :\n{tokenizer.decode(input_ids)}"
+
+    logger.debug(str)
+
+
 def convert_nested_value_to_list_recursive(data_item):
     if isinstance(data_item, dict):
         return {k: convert_nested_value_to_list_recursive(v) for k, v in data_item.items()}
@@ -91,6 +119,7 @@ def __init__(
         )
         self.tools_key = config.get("tools_key", "tools")
         self.enable_thinking_key = config.get("enable_thinking_key", "enable_thinking")
+        self.enable_thinking_default = config.get("enable_thinking_default", None)
         self.apply_chat_template_kwargs = config.get("apply_chat_template_kwargs", {})
         self.shuffle = config.get("shuffle", False)
         self.seed = config.get("seed")
@@ -125,7 +154,8 @@ def series_to_item(ls):
 
         dataframes = []
         for parquet_file in self.parquet_files:
-            dataframe = pd.read_parquet(parquet_file)
+            # default loader loads some list as np.ndarray, which fails the tokenizer
+            dataframe = pd.read_parquet(parquet_file, dtype_backend="pyarrow")
             dataframes.append(dataframe)
         self.dataframe = pd.concat(dataframes)
 
@@ -167,6 +197,7 @@ def _process_single_message(
         self,
         index: int,
         message: dict[str, Any],
+        full_message: list,
         tools: Optional[list[dict[str, Any]]] = None,
         enable_thinking: Optional[bool] = None,
     ) -> tuple[list[int], list[int], list[int]]:
@@ -267,14 +298,17 @@ def __getitem__(self, item):
         row_dict: dict = self.dataframe.iloc[item].to_dict()
         messages = self._build_messages(row_dict)
         tools = self.tools[item] if self.tools is not None else None
-        enable_thinking = self.enable_thinking[item] if self.enable_thinking is not None else None
+        enable_thinking = (
+            self.enable_thinking[item] if self.enable_thinking is not None else self.enable_thinking_default
+        )
 
         # 1. tokenize each message
         input_ids, loss_mask, attention_mask, multi_modal_inputs = [], [], [], {}
         for i, message in enumerate(messages):
             _input_ids, _loss_mask, _attention_mask, _inputs = self._process_single_message(
                 index=i,
                 message=message,
+                full_message=messages,
                 tools=tools if i == 0 else None,
                 enable_thinking=enable_thinking,
             )
@@ -290,6 +324,8 @@ def __getitem__(self, item):
         assert input_ids.shape == loss_mask.shape == attention_mask.shape, (
             f"Shape mismatch: {input_ids.shape}, {loss_mask.shape}, {attention_mask.shape}"
         )
+
+        print_assembled_message(self.tokenizer, messages, input_ids, loss_mask, attention_mask, tools)
         self.sanity_check(input_ids, messages, tools, enable_thinking)
 
         # Since the tokenizer may return user-customized results, we need to filter out inconsistent tensor shapes
```
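The `once` decorator keys on an attribute of its own wrapper, so only the first call in a process actually runs; later calls return `None`. A self-contained sketch of the same pattern:

```python
from functools import wraps


def once(func):
    """Run func on the first call only; subsequent calls are no-ops."""

    @wraps(func)
    def wrapper(*args, **kwargs):
        if not hasattr(wrapper, "called"):
            wrapper.called = True
            return func(*args, **kwargs)

    return wrapper


calls = []


@once
def log_sample(msg):
    calls.append(msg)


log_sample("first sample")   # runs
log_sample("second sample")  # silently skipped
```

Note the flag lives on each wrapper, not globally: every decorated function gets its own one-shot state, which is why `print_assembled_message` fires for only the first dataset item per worker process.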
