# [trainer] feat: Add Torchtitan as alternative training engine #5051
Changes from 19 commits
**tests/special_e2e/sft/run_sft_engine.sh test matrix** (`@@ -37,6 +37,14 @@`)

```shell
echo "run with tp2 pp2 vpp2 cp2 num_gpus8 mode=ray"
BACKEND=megatron TP_SIZE=2 PP_SIZE=2 VPP_SIZE=${VPP_SIZE} CP_SIZE=2 NUM_GPUS=8 mode=ray bash tests/special_e2e/sft/run_sft_engine.sh

# test with torchtitan fsdp=2
echo "run with tp1 pp1 cp1 fsdp2 num_gpus2"
BACKEND=torchtitan TP_SIZE=1 PP_SIZE=1 CP_SIZE=1 FSDP_SIZE=2 NUM_GPUS=2 bash tests/special_e2e/sft/run_sft_engine.sh

# test with torchtitan tp2 fsdp=2
echo "run with tp2 pp1 cp1 fsdp2 num_gpus4"
BACKEND=torchtitan TP_SIZE=2 PP_SIZE=1 CP_SIZE=1 FSDP_SIZE=2 NUM_GPUS=4 bash tests/special_e2e/sft/run_sft_engine.sh

python3 tests/special_e2e/sft/compare_sft_engine_results.py

rm -rf ~/verl/test/log
```
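The test matrix above only works when the parallel degrees factor the GPU count. A minimal sketch of that invariant (the `check_mesh` helper is mine, not part of the PR; it assumes torchtitan composes TP/PP/CP/FSDP degrees into a single device mesh):

```python
def check_mesh(num_gpus: int, tp: int = 1, pp: int = 1, cp: int = 1, fsdp: int = 1) -> bool:
    # The test matrix assumes the parallel degrees multiply to the world size:
    # TP_SIZE * PP_SIZE * CP_SIZE * FSDP_SIZE == NUM_GPUS
    return tp * pp * cp * fsdp == num_gpus

# The two torchtitan cases exercised above:
print(check_mesh(2, fsdp=2))        # tp1 pp1 cp1 fsdp2 on 2 GPUs -> True
print(check_mesh(4, tp=2, fsdp=2))  # tp2 pp1 cp1 fsdp2 on 4 GPUs -> True
```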
**New engine config file** (`@@ -0,0 +1,65 @@`)

```yaml
# Target class for this configuration
_target_: verl.workers.config.TorchtitanEngineConfig

# policy for wrapping the model
wrap_policy:
  # Minimum number of parameters to trigger wrapping a layer with FSDP
  min_num_params: 0

# The policy for applying `reshard_after_forward` within an FSDP setup
# Options: "default", "always", "never"
reshard_after_forward: default

# Prefetch the next forward-pass all-gather before the current forward computation
forward_prefetch: false

# Whether to use original parameters
use_orig_params: false

# Mixed precision configuration for FSDP
mixed_precision: false

# Whether to use torch.compile
use_torch_compile: true

# Whether to compute entropy from logits with chunking
entropy_from_logits_with_chunking: false

# Whether to use entropy checkpointing
entropy_checkpointing: false

# Data parallel size (FSDP group size)
data_parallel_size: 1

# Data parallel replicate size
data_parallel_replicate_size: 1

# Data parallel shard size
data_parallel_shard_size: 1

# Tensor parallel size
tensor_parallel_size: 1

# Expert parallel size
expert_parallel_size: 1

# Pipeline parallel size
pipeline_parallel_size: 1

# Context parallel size
context_parallel_size: 1

# Strategy
strategy: torchtitan

# Random seed for reproducibility
seed: 42

# Whether to enable full determinism for distributed training; only for debugging
full_determinism: false

# Whether to run forward only
forward_only: false

# Mixed precision training param dtype
dtype: bfloat16
```

> **Collaborator:** Is there any document that explains these parallelism settings?
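The `_target_` key names the class to construct from this YAML, in the usual Hydra/OmegaConf style. A rough sketch of how such a dotted path is resolved (the `instantiate` helper here is illustrative, demonstrated on a stdlib class rather than the verl config classes):

```python
import importlib

def instantiate(target: str, **kwargs):
    # Split "pkg.module.Class" into module path and class name,
    # import the module, and call the class with the config fields.
    module_path, _, cls_name = target.rpartition(".")
    cls = getattr(importlib.import_module(module_path), cls_name)
    return cls(**kwargs)

# Demonstrated on a stdlib class instead of verl.workers.config.TorchtitanEngineConfig
counter = instantiate("collections.Counter", a=2, b=1)
print(counter["a"])  # 2
```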
**New model config file** (`@@ -0,0 +1,25 @@`)

```yaml
# Format checks enforced on CI:
# 1. Comments must appear above each field.
# 2. There must be a blank line between each field.
# 3. Inline comments (after a field on the same line) are not allowed.
# 4. Indentation level is respected for nested fields.

_target_: verl.workers.config.TorchtitanModelConfig

# Model name (e.g., "qwen3", "llama3")
name: qwen3

# Model flavor/size (e.g., "0.6B", "1.5B", "7B")
flavor: "0.6B"

# Path to HuggingFace model (tokenizer, config, weights, etc.)
path: ./assets/hf/Qwen3-0.6B

# Whether to use remove padding. Only valid when using the HF model definition
use_remove_padding: True

# Attention type for the model (e.g., "sdpa", "flex", "varlen")
attn_type: sdpa

# Attention mask type for the model (e.g., "causal", "document_mask", "block_causal")
attn_mask_type: causal
```
**New optimizer config file** (`@@ -0,0 +1,35 @@`)

```yaml
# Target class for this configuration
_target_: verl.workers.config.TorchtitanOptimizerConfig

# Optimizer name
name: AdamW

# Learning rate
lr: 1e-3

# LR warmup steps ratio
lr_warmup_steps_ratio: 0.0

# Total training steps
total_training_steps: -1

# Weight decay
weight_decay: 0.01

# LR warmup steps
lr_warmup_steps: -1

# Betas for Adam optimizer
betas: [0.9, 0.999]

# Clip gradient
clip_grad: 1.0

# Epsilon for Adam optimizer
eps: 1e-8

# Decay type: "linear", "sqrt", or "cosine"
decay_type: linear

# Minimum LR factor for cosine schedule
min_lr_factor: 0.0
```
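`lr_warmup_steps: -1` and `total_training_steps: -1` read as sentinels meaning "derive at runtime". A hedged sketch of how the warmup step count might be resolved from the ratio (the helper name and the exact precedence are my assumptions, not the PR's code):

```python
def resolve_warmup_steps(lr_warmup_steps: int, lr_warmup_steps_ratio: float,
                         total_training_steps: int) -> int:
    # An explicit non-negative step count wins; the -1 sentinel falls
    # back to deriving warmup from the ratio of total training steps.
    if lr_warmup_steps >= 0:
        return lr_warmup_steps
    return int(lr_warmup_steps_ratio * total_training_steps)

print(resolve_warmup_steps(-1, 0.1, 1000))  # 100, derived from the ratio
print(resolve_warmup_steps(50, 0.1, 1000))  # 50, explicit value wins
```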
**LR schedule hunk** (`@@ -743,6 +743,8 @@ def get_cosine_schedule_with_warmup`)

```python
    assert init_lr_ratio >= 0 and init_lr_ratio <= 1.0

    def lr_lambda(current_step):
        # 0-indexed step, hence the +1 adjustment
        current_step += 1
        if current_step < num_warmup_steps:
            return init_lr_ratio + (1.0 - init_lr_ratio) * (float(current_step) / float(max(1, num_warmup_steps)))
        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
```
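The hunk above cuts off before the cosine tail. A self-contained sketch of the full warmup-then-cosine multiplier (the cosine tail and the `min_lr_ratio` handling are my reconstruction, not the PR's exact code):

```python
import math

def make_cosine_lr_lambda(num_warmup_steps: int, num_training_steps: int,
                          init_lr_ratio: float = 0.0, min_lr_ratio: float = 0.0):
    def lr_lambda(current_step: int) -> float:
        current_step += 1  # 0-indexed step, hence the +1 adjustment
        if current_step < num_warmup_steps:
            # Linear ramp from init_lr_ratio up to 1.0 over the warmup steps
            return init_lr_ratio + (1.0 - init_lr_ratio) * (current_step / max(1, num_warmup_steps))
        progress = (current_step - num_warmup_steps) / max(1, num_training_steps - num_warmup_steps)
        # Cosine decay from 1.0 down to min_lr_ratio
        cosine = 0.5 * (1.0 + math.cos(math.pi * min(1.0, progress)))
        return min_lr_ratio + (1.0 - min_lr_ratio) * cosine
    return lr_lambda

fn = make_cosine_lr_lambda(num_warmup_steps=10, num_training_steps=100)
print(fn(0))   # 0.1, one tenth of the way through warmup
print(fn(99))  # 0.0, fully decayed at the final step
```

This multiplier is the usual shape passed to `torch.optim.lr_scheduler.LambdaLR`.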
**Config dataclass hunks** (`@@ -27,6 +27,7 @@`)

```python
    "FSDPEngineConfig",
    "McoreEngineConfig",
    "TrainingWorkerConfig",
    "TorchtitanEngineConfig",
    "VeOmniEngineConfig",
    "EngineConfig",
    "EngineRouterReplayConfig",
```

(`@@ -309,6 +310,62 @@`)

```python
        assert self.strategy in ["veomni"], f"strategy {self.strategy} not supported"


@dataclass
class TorchtitanEngineConfig(EngineConfig):
    """Configuration for Torchtitan.

    The inheritance from BaseConfig provides omegaconf.DictConfig-like interface for a dataclass config.

    Args:
        wrap_policy (Dict[str, Any]): Configuration for FSDP wrap policy.
        reshard_after_forward (Literal["default", "always", "never"]): The policy for applying
            `reshard_after_forward` within an FSDP setup, default "default"
        forward_prefetch (bool): Whether to prefetch parameters for the next forward pass, default False
        use_orig_params (bool): Whether to use original parameters when initializing FSDP, default False
        mixed_precision (bool): Mixed precision configuration for FSDP, default False
        offload_policy (bool): Whether to offload policy model parameters, default False
        use_torch_compile (bool): Whether to use torch.compile, default True
        entropy_from_logits_with_chunking (bool): Whether to compute entropy from logits with chunking,
            default False
        entropy_checkpointing (bool): Whether to use entropy checkpointing, default False
        data_parallel_size (int): Data parallel group size, default 1
        data_parallel_replicate_size (int): Data parallel replicate size, default 1
        data_parallel_shard_size (int): Data parallel shard degree, default 1
        tensor_parallel_size (int): Tensor parallel size, default 1
        expert_parallel_size (int): Expert parallel size, default 1
        expert_tensor_parallel_size (int): Expert tensor parallel size, default 1
        pipeline_parallel_size (int): Pipeline parallel size, default 1
        context_parallel_size (int): Context parallel size, default 1
        strategy (str): Strategy to use for distributed training, default "torchtitan"
        seed (int): Random seed for reproducibility.
        full_determinism (bool): If true, enable_full_determinism is called to ensure reproducible results
            in distributed training. Important: this will negatively impact performance, so only use it for
            debugging.
    """

    wrap_policy: dict[str, Any] = field(default_factory=dict)
    reshard_after_forward: Literal["default", "always", "never"] = "default"
    forward_prefetch: bool = False
    use_orig_params: bool = False
    mixed_precision: bool = False
    offload_policy: bool = False
    use_torch_compile: bool = True
    entropy_from_logits_with_chunking: bool = False
    entropy_checkpointing: bool = False
    data_parallel_size: int = 1
    data_parallel_replicate_size: int = 1
    data_parallel_shard_size: int = 1
    tensor_parallel_size: int = 1
    expert_parallel_size: int = 1
    expert_tensor_parallel_size: int = 1
    pipeline_parallel_size: int = 1
    context_parallel_size: int = 1
    strategy: str = "torchtitan"
    seed: int = 42
    full_determinism: bool = False

    def __post_init__(self):
        super().__post_init__()
        assert self.strategy in ["torchtitan"], f"strategy {self.strategy} not supported"


@dataclass
class TrainingWorkerConfig(BaseConfig):
    model_type: str = None  # model type (language_model/value_model)
```

> **Contributor** (on lines +328 to +337): The descriptions for …
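The `__post_init__` guard above fails fast on an unsupported strategy. A minimal, self-contained stand-in for that validation pattern (`MiniEngineConfig` is illustrative, not a verl class):

```python
from dataclasses import dataclass

@dataclass
class MiniEngineConfig:
    # Minimal stand-in for the pattern above: reject any strategy
    # other than "torchtitan" at construction time.
    strategy: str = "torchtitan"
    tensor_parallel_size: int = 1

    def __post_init__(self):
        assert self.strategy in ["torchtitan"], f"strategy {self.strategy} not supported"

cfg = MiniEngineConfig()  # constructs fine with the default strategy
try:
    MiniEngineConfig(strategy="fsdp")
except AssertionError as e:
    print(e)  # strategy fsdp not supported
```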
> **Reviewer:** Please verify the different parallelism configurations in `tests/special_e2e/sft/test_sft_engine_all.sh`.
>
> **Author:** Sounds good. I will incorporate TP/SP with this PR; the other parallelism modes will come in separate PRs.