1 change: 1 addition & 0 deletions docs/index.rst
@@ -90,6 +90,7 @@ verl is fast with:
workers/ray_trainer
workers/fsdp_workers
workers/megatron_workers
workers/automodel_workers
workers/sglang_worker
workers/trtllm_worker
workers/model_engine
65 changes: 65 additions & 0 deletions docs/workers/automodel_workers.rst
@@ -0,0 +1,65 @@
Automodel Backend
=================

Last updated: 03/07/2026.

We support the Automodel (nemo_automodel) backend by implementing the
``AutomodelEngine`` and ``AutomodelEngineWithLMHead`` engine classes.
The Automodel backend delegates model building, parallelization, optimizer
sharding, LR scheduling, gradient clipping, and checkpointing to
nemo_automodel's infrastructure while using verl's training loop,
data pipeline, and loss function.
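
The division of labor described above can be sketched in a few lines. This is a minimal, hypothetical illustration of the engine/trainer split; the class and method names (``AutomodelEngineSketch``, ``forward_backward``, ``optimizer_step``, ``train_loop``) are placeholders, not the actual verl or nemo_automodel API:

```python
# Hypothetical sketch of the engine/trainer split described above.
# All names are illustrative placeholders, not the real verl API.

class AutomodelEngineSketch:
    """Backend side: owns model building, sharding, optimizer mechanics."""

    def __init__(self, lr: float):
        self.weights = 0.0  # stand-in for (sharded) model parameters
        self.lr = lr
        self.grad = 0.0

    def forward_backward(self, target: float) -> float:
        # Backend owns parallelization; the trainer only sees a loss value.
        loss = (self.weights - target) ** 2
        self.grad = 2 * (self.weights - target)
        return loss

    def optimizer_step(self) -> None:
        # Backend owns optimizer sharding, LR scheduling, grad clipping.
        self.weights -= self.lr * self.grad


def train_loop(engine, data, steps):
    """verl side: training loop + data pipeline + loss, backend-agnostic."""
    losses = []
    for step in range(steps):
        batch = data[step % len(data)]
        losses.append(engine.forward_backward(batch))
        engine.optimizer_step()
    return losses


losses = train_loop(AutomodelEngineSketch(lr=0.1), data=[1.0], steps=50)
assert losses[-1] < losses[0]  # loss decreases toward the target
```

The point is only that verl drives the loop while the engine hides all distributed mechanics behind a small interface.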

**Requirements**

- Automodel r0.3.0
Collaborator comment: Maybe in another PR, we should refactor docs/start/install.rst to cover the install methods for all model engines and rollout engines, and present the options more clearly.

- transformers v5.0.0

**Pros**

- Supports FSDP2 and TP distributed strategies out of the box.
- Native support for Mixture-of-Experts (MoE) models with Expert Parallelism (EP) via DeepEP.
- TransformerEngine (TE) integration for optimized attention, linear layers, and RMSNorm.
- Readily supports any HuggingFace model without checkpoint conversion.

**Cons**

- Pipeline parallelism is not yet supported.


SFT Examples
------------

We provide example SFT training scripts using the Automodel backend in
`examples/sft/gsm8k/ <https://github.com/volcengine/verl/blob/main/examples/sft/gsm8k/>`_.

Basic: Qwen2.5-0.5B with FSDP2
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

A minimal example using ``Qwen/Qwen2.5-0.5B-Instruct`` with FSDP2 and
no parallelism:

.. code:: shell

   bash examples/sft/gsm8k/run_qwen_05_automodel.sh 4 /tmp/automodel_sft_test

See `run_qwen_05_automodel.sh <https://github.com/volcengine/verl/blob/main/examples/sft/gsm8k/run_qwen_05_automodel.sh>`_.

Advanced: Qwen3-30B MoE with Expert Parallelism
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

A larger-scale example using ``Qwen/Qwen3-30B-A3B-Base`` (MoE model)
with Expert Parallelism (EP=8), DeepEP, TransformerEngine backend, and
torch_mm experts backend:

.. code:: shell

   bash examples/sft/gsm8k/run_qwen3_30b_automodel.sh 8 /tmp/automodel_sft_30b

See `run_qwen3_30b_automodel.sh <https://github.com/volcengine/verl/blob/main/examples/sft/gsm8k/run_qwen3_30b_automodel.sh>`_.
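
Both scripts accept extra Hydra-style ``key=value`` overrides after the two positional arguments and forward them to the trainer via ``shift 2`` and ``"$@"``. A self-contained sketch of that pattern (the ``demo`` function and override names below are illustrative, not part of the scripts):

```shell
#!/usr/bin/env bash
# Sketch of the positional-args + pass-through pattern used by the
# example scripts: the first two args are fixed, the rest are forwarded.
set -eu

demo() {
    nproc_per_node=$1
    save_path=$2
    shift 2                  # now "$@" holds only the extra overrides
    echo "nproc=${nproc_per_node} save=${save_path} extra=$*"
}

demo 4 /tmp/out trainer.total_epochs=1 optim.lr=2e-5
```

So, for example, ``bash examples/sft/gsm8k/run_qwen_05_automodel.sh 4 /tmp/automodel_sft_test trainer.total_epochs=1`` runs the basic example with the epoch count overridden.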
75 changes: 75 additions & 0 deletions examples/sft/gsm8k/run_qwen3_30b_automodel.sh
@@ -0,0 +1,75 @@
# Requires: Automodel, transformers>=5.3.0, torchao
# MoE also requires: grouped_gemm (github.com/fanshiqing/grouped_gemm v1.1.4)

set -x

if [ "$#" -lt 2 ]; then
echo "Usage: run_qwen3_30b_automodel.sh <nproc_per_node> <save_path> [other_configs...]"
exit 1
fi

nproc_per_node=$1
save_path=$2

# Shift the arguments so $@ refers to the rest
shift 2

torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
-m verl.trainer.sft_trainer \
data.train_files=$HOME/data/hellaswag_sft/hellaswag_sft.parquet \
data.val_files=$HOME/data/hellaswag_sft/hellaswag_sft.parquet \
data.train_batch_size=512 \
data.max_length=2048 \
data.truncation=left \
data.use_dynamic_bsz=True \
data.max_token_len_per_gpu=8192 \
data.messages_key=messages \
data.ignore_input_ids_mismatch=True \
data.train_max_samples=-1 \
data.val_max_samples=1024 \
model=hf_model \
model.path=Qwen/Qwen3-30B-A3B-Base \
model.trust_remote_code=True \
model.use_remove_padding=True \
engine=automodel \
engine.distributed_strategy=fsdp2 \
engine.tp_size=1 \
engine.pp_size=1 \
engine.cp_size=1 \
engine.ep_size=8 \
engine.backend_config.dispatcher=deepep \
engine.backend_config.attn=te \
engine.backend_config.linear=te \
engine.backend_config.rms_norm=torch_fp32 \
engine.backend_config.enable_fsdp_optimizations=True \
engine.backend_config.experts=torch_mm \
engine.activation_checkpointing=True \
engine.model_dtype=bf16 \
engine.attn_implementation=te \
engine.use_torch_compile=False \
optim=automodel \
optim.optimizer=FusedAdam \
optim.optimizer_impl=transformer_engine.pytorch.optimizers.fused_adam \
optim.lr=1e-5 \
optim.lr_warmup_steps_ratio=0.1 \
optim.weight_decay=0 \
optim.betas='[0.9,0.95]' \
optim.clip_grad=1.0 \
optim.init_lr_ratio=0.1 \
optim.min_lr_ratio=0.01 \
optim.lr_scheduler_type=cosine \
optim.master_weights=true \
optim.store_param_remainders=true \
optim.exp_avg_dtype=bf16 \
optim.exp_avg_sq_dtype=bf16 \
trainer.default_local_dir=$save_path \
trainer.project_name=hellaswag-sft \
trainer.experiment_name=hellaswag-sft-qwen3-30b-automodel \
trainer.total_epochs=2 \
trainer.total_training_steps=100 \
trainer.save_freq=-1 \
trainer.test_freq=10 \
trainer.logger=console \
trainer.seed=1111 \
trainer.nnodes=1 \
trainer.resume_mode=disable "$@"
55 changes: 55 additions & 0 deletions examples/sft/gsm8k/run_qwen_05_automodel.sh
@@ -0,0 +1,55 @@
# Requires: Automodel, transformers>=5.3.0, torchao
# MoE also requires: grouped_gemm (github.com/fanshiqing/grouped_gemm v1.1.4)

set -x

if [ "$#" -lt 2 ]; then
echo "Usage: run_qwen_05_automodel.sh <nproc_per_node> <save_path> [other_configs...]"
exit 1
fi

nproc_per_node=$1
save_path=$2

# Shift the arguments so $@ refers to the rest
shift 2

torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
-m verl.trainer.sft_trainer \
data.train_files=$HOME/data/gsm8k_sft/train.parquet \
data.val_files=$HOME/data/gsm8k_sft/test.parquet \
data.train_batch_size=128 \
data.pad_mode=no_padding \
data.truncation=error \
data.use_dynamic_bsz=True \
data.max_token_len_per_gpu=2048 \
data.messages_key=messages \
data.ignore_input_ids_mismatch=True \
model=hf_model \
model.path=Qwen/Qwen2.5-0.5B-Instruct \
model.use_remove_padding=True \
engine=automodel \
engine.distributed_strategy=fsdp2 \
engine.tp_size=1 \
engine.pp_size=1 \
engine.cp_size=1 \
engine.ep_size=1 \
engine.use_torch_compile=False \
optim=automodel \
optim.lr=1e-5 \
optim.lr_warmup_steps_ratio=0.2 \
optim.weight_decay=0.1 \
optim.betas='[0.9,0.95]' \
optim.clip_grad=1.0 \
optim.init_lr_ratio=0 \
optim.min_lr_ratio=0.1 \
optim.lr_scheduler_type=cosine \
trainer.default_local_dir=$save_path \
trainer.project_name=gsm8k-sft \
trainer.experiment_name=gsm8k-sft-qwen-2.5-0.5b-automodel \
trainer.total_epochs=2 \
trainer.test_freq=-1 \
trainer.save_freq=-1 \
trainer.logger=console \
trainer.seed=1111 \
trainer.resume_mode=disable "$@"
20 changes: 20 additions & 0 deletions tests/special_e2e/sft/run_sft_engine.sh
@@ -112,6 +112,22 @@ TORCHTITAN_ENGINE_CONFIG="\
engine.data_parallel_shard_size=${FSDP_SIZE} \
engine.use_torch_compile=False"

AUTOMODEL_ENGINE_CONFIG="\
engine=${backend} \
model=hf_model \
model.path=${MODEL_PATH} \
optim=${backend} \
optim.lr=1e-5 \
optim.lr_warmup_steps_ratio=0.2 \
optim.weight_decay=0.1 \
optim.betas='[0.9,0.95]' \
optim.clip_grad=1.0 \
optim.min_lr_ratio=0.1 \
optim.lr_scheduler_type=cosine \
engine.tp_size=${TP_SIZE} \
engine.cp_size=${CP_SIZE} \
engine.use_torch_compile=False"


if [ "$backend" = "fsdp" ]; then
ENGINE_CONFIG="$FSDP_ENGINE_CONFIG"
@@ -125,6 +141,10 @@ elif [ "$backend" = "torchtitan" ]; then
ENGINE_CONFIG="$TORCHTITAN_ENGINE_CONFIG"
echo "Using torchtitan engine"
exp_name=gsm8k-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-cp${CP_SIZE}-dp${FSDP_SIZE}-pad-${PAD_MODE}-use_remove_padding-${USE_REMOVE_PADDING}-mode-${mode}
elif [ "$backend" = "automodel" ]; then
ENGINE_CONFIG="$AUTOMODEL_ENGINE_CONFIG"
echo "Using automodel engine"
exp_name=gsm8k-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-cp${CP_SIZE}-pad-${PAD_MODE}-use_remove_padding-${USE_REMOVE_PADDING}-mode-${mode}
else
ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG"
echo "Using megatron engine"
8 changes: 8 additions & 0 deletions tests/special_e2e/sft/test_sft_engine_all.sh
@@ -46,6 +46,14 @@ BACKEND=megatron TP_SIZE=2 PP_SIZE=2 VPP_SIZE=${VPP_SIZE} CP_SIZE=2 NUM_GPUS=8 m
# echo "run with tp2 pp1 cp1 fsdp2 num_gpus4"
# BACKEND=torchtitan TP_SIZE=2 PP_SIZE=1 CP_SIZE=1 FSDP_SIZE=2 NUM_GPUS=4 bash tests/special_e2e/sft/run_sft_engine.sh

# # test with automodel dp=2
# echo "run with automodel tp1 pp1 cp1 dp2 num_gpus2"
# BACKEND=automodel TP_SIZE=1 PP_SIZE=1 CP_SIZE=1 FSDP_SIZE=2 NUM_GPUS=2 bash tests/special_e2e/sft/run_sft_engine.sh

# # test with automodel tp2 dp=2
# echo "run with automodel tp2 pp1 cp1 dp2 num_gpus4"
# BACKEND=automodel TP_SIZE=2 PP_SIZE=1 CP_SIZE=1 FSDP_SIZE=2 NUM_GPUS=4 bash tests/special_e2e/sft/run_sft_engine.sh

python3 tests/special_e2e/sft/compare_sft_engine_results.py

rm -rf ~/verl/test/log
1 change: 1 addition & 0 deletions tests/special_sanity/check_device_api_usage.py
@@ -44,6 +44,7 @@
"verl/workers/engine/veomni/transformer_impl.py", # appear in default device_name
"verl/workers/engine/torchtitan/transformer_impl.py", # appear in default device_name
"verl/workers/engine/torchtitan/utils.py", # appear in torch.cuda.empty_cache()
"verl/workers/engine/automodel/transformer_impl.py", # appear in default device_name
"verl/workers/rollout/vllm_rollout/vllm_async_server.py", # appear in config.cudagraph_capture_sizes
"verl/workers/rollout/sglang_rollout/async_sglang_server.py", # manually set CUDA_VISIBLE_DEVICES
"verl/workers/rollout/trtllm_rollout/trtllm_async_server.py", # appear in config.cudagraph_capture_sizes
82 changes: 82 additions & 0 deletions verl/trainer/config/engine/automodel.yaml
@@ -0,0 +1,82 @@
# Target class for this configuration
_target_: verl.workers.config.AutomodelEngineConfig

# Backend strategy identifier
strategy: automodel

# Distributed training strategy: "fsdp2", "megatron_fsdp", or "ddp"
distributed_strategy: fsdp2

# Parallelism sizes
tp_size: 1
pp_size: 1
cp_size: 1
ep_size: 1
dp_replicate_size: 1
sequence_parallel: false
defer_fsdp_grad_sync: true

# Whether to offload model parameters to CPU
param_offload: false

# Whether to offload optimizer state to CPU
optimizer_offload: false

# Whether to enable activation checkpointing
activation_checkpointing: false

# Whether to enable FP8 training
enable_fp8: false

# Whether to enable torch.compile for the model
enable_compile: false

# Model data type for loading weights ("fp32", "bf16", "fp16")
model_dtype: fp32

# Attention implementation ("sdpa", "flash_attention_2", "eager", "te")
attn_implementation: flash_attention_2

# Backend settings
backend_config:
  attn: sdpa # "te", "sdpa"
  linear: te # "torch", "te"
  rms_norm: torch_fp32 # "torch", "torch_fp32", "te"
  rope_fusion: true
  dispatcher: torch # "torch", "deepep"
  experts: gmm # "gmm", "torch_mm", "torch", "te"
  gate_precision: null
  enable_hf_state_dict_adapter: true
  enable_fsdp_optimizations: false
  fake_balanced_gate: false
  fake_gate_noise: 0.0

# MoE settings (MoEParallelizerConfig)
moe_config:
  ignore_router_for_ac: false
  reshard_after_forward: false
  lm_head_precision: null
  wrap_outer_model: true

# Mixed precision policy (FSDP2 MixedPrecisionPolicy)
mp_param_dtype: bf16
mp_reduce_dtype: fp32
mp_output_dtype: bf16

# Random seed for reproducibility
seed: 42

# Whether to enable full determinism for distributed training, only for debugging
full_determinism: false

# Whether to use forward only mode
forward_only: false

# Whether to use torch compile for entropy computation
use_torch_compile: false

# Whether to use chunked entropy computation
entropy_from_logits_with_chunking: false

# Whether to use checkpointing for entropy computation
entropy_checkpointing: false
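
The ``key=value`` arguments in the launch scripts (e.g. ``engine.backend_config.attn=te``) override fields of this nested config. verl resolves them with Hydra/OmegaConf; the stdlib-only sketch below, with a hypothetical ``apply_overrides`` helper, only illustrates how dotted overrides map onto a nested dict:

```python
# Illustrative only: how dotted CLI overrides resolve against a nested
# config dict. verl actually uses Hydra/OmegaConf for this.

def apply_overrides(config: dict, overrides: list) -> dict:
    for item in overrides:
        dotted_key, _, raw = item.partition("=")
        *path, leaf = dotted_key.split(".")
        node = config
        for part in path:                 # walk/create intermediate dicts
            node = node.setdefault(part, {})
        # Minimal value coercion; Hydra/OmegaConf do this more thoroughly.
        try:
            value = int(raw)
        except ValueError:
            try:
                value = float(raw)
            except ValueError:
                value = {"true": True, "false": False}.get(raw.lower(), raw)
        node[leaf] = value
    return config

engine = {"tp_size": 1, "backend_config": {"attn": "sdpa", "experts": "gmm"}}
apply_overrides(engine, ["tp_size=2", "backend_config.attn=te"])
assert engine["tp_size"] == 2
assert engine["backend_config"]["attn"] == "te"
```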