diff --git a/docs/index.rst b/docs/index.rst index 3b27486e062..1d3bcf239ff 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -90,6 +90,7 @@ verl is fast with: workers/ray_trainer workers/fsdp_workers workers/megatron_workers + workers/automodel_workers workers/sglang_worker workers/trtllm_worker workers/model_engine diff --git a/docs/workers/automodel_workers.rst b/docs/workers/automodel_workers.rst new file mode 100644 index 00000000000..55864db4360 --- /dev/null +++ b/docs/workers/automodel_workers.rst @@ -0,0 +1,65 @@ +Automodel Backend +================= + +Last updated: 03/07/2026. + +We support the Automodel (nemo_automodel) backend by implementing the +``AutomodelEngine`` and ``AutomodelEngineWithLMHead`` engine classes. +The Automodel backend delegates model building, parallelization, optimizer +sharding, LR scheduling, gradient clipping, and checkpointing to +nemo_automodel's infrastructure while using verl's training loop, +data pipeline, and loss function. + +**Requirements** + +- Automodel r0.3.0 +- transformers v5.0.0 + +**Pros** + +- Supports FSDP2 and TP distributed strategies out of + the box. + +- Native support for Mixture-of-Experts (MoE) models with Expert + Parallelism (EP) via DeepEP. + +- TransformerEngine (TE) integration for optimized attention, linear + layers, and RMSNorm. + +- Readily supports any HuggingFace model without checkpoint conversion. + +**Cons** + +- Pipeline parallelism is not yet supported. + + +SFT Examples +------------ + +We provide example SFT training scripts using the Automodel backend in +`examples/sft/gsm8k/ `_. + +Basic: Qwen2.5-0.5B with FSDP2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A minimal example using ``Qwen/Qwen2.5-0.5B-Instruct`` with FSDP2 and +no parallelism: + +.. code:: shell + + bash examples/sft/gsm8k/run_qwen_05_automodel.sh 4 /tmp/automodel_sft_test + +See `run_qwen_05_automodel.sh `_. 
+ +Advanced: Qwen3-30B MoE with Expert Parallelism +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A larger-scale example using ``Qwen/Qwen3-30B-A3B-Base`` (MoE model) +with Expert Parallelism (EP=8), DeepEP, TransformerEngine backend, and +torch_mm experts backend: + +.. code:: shell + + bash examples/sft/gsm8k/run_qwen3_30b_automodel.sh 8 /tmp/automodel_sft_30b + +See `run_qwen3_30b_automodel.sh `_. diff --git a/examples/sft/gsm8k/run_qwen3_30b_automodel.sh b/examples/sft/gsm8k/run_qwen3_30b_automodel.sh new file mode 100644 index 00000000000..95d699d218a --- /dev/null +++ b/examples/sft/gsm8k/run_qwen3_30b_automodel.sh @@ -0,0 +1,75 @@ +# Requires: Automodel, transformers>=5.3.0, torchao +# MoE also requires: grouped_gemm (github.com/fanshiqing/grouped_gemm v1.1.4) + +set -x + +if [ "$#" -lt 2 ]; then + echo "Usage: run_qwen3_30b_automodel.sh [other_configs...]" + exit 1 +fi + +nproc_per_node=$1 +save_path=$2 + +# Shift the arguments so $@ refers to the rest +shift 2 + +torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \ + -m verl.trainer.sft_trainer \ + data.train_files=$HOME/data/hellaswag_sft/hellaswag_sft.parquet \ + data.val_files=$HOME/data/hellaswag_sft/hellaswag_sft.parquet \ + data.train_batch_size=512 \ + data.max_length=2048 \ + data.truncation=left \ + data.use_dynamic_bsz=True \ + data.max_token_len_per_gpu=8192 \ + data.messages_key=messages \ + data.ignore_input_ids_mismatch=True \ + data.train_max_samples=-1 \ + data.val_max_samples=1024 \ + model=hf_model \ + model.path=Qwen/Qwen3-30B-A3B-Base \ + model.trust_remote_code=True \ + model.use_remove_padding=True \ + engine=automodel \ + engine.distributed_strategy=fsdp2 \ + engine.tp_size=1 \ + engine.pp_size=1 \ + engine.cp_size=1 \ + engine.ep_size=8 \ + engine.backend_config.dispatcher=deepep \ + engine.backend_config.attn=te \ + engine.backend_config.linear=te \ + engine.backend_config.rms_norm=torch_fp32 \ + engine.backend_config.enable_fsdp_optimizations=True \ + 
engine.backend_config.experts=torch_mm \ + engine.activation_checkpointing=True \ + engine.model_dtype=bf16 \ + engine.attn_implementation=te \ + engine.use_torch_compile=False \ + optim=automodel \ + optim.optimizer=FusedAdam \ + optim.optimizer_impl=transformer_engine.pytorch.optimizers.fused_adam \ + optim.lr=1e-5 \ + optim.lr_warmup_steps_ratio=0.1 \ + optim.weight_decay=0 \ + optim.betas='[0.9,0.95]' \ + optim.clip_grad=1.0 \ + optim.init_lr_ratio=0.1 \ + optim.min_lr_ratio=0.01 \ + optim.lr_scheduler_type=cosine \ + optim.master_weights=true \ + optim.store_param_remainders=true \ + optim.exp_avg_dtype=bf16 \ + optim.exp_avg_sq_dtype=bf16 \ + trainer.default_local_dir=$save_path \ + trainer.project_name=hellaswag-sft \ + trainer.experiment_name=hellaswag-sft-qwen3-30b-automodel \ + trainer.total_epochs=2 \ + trainer.total_training_steps=100 \ + trainer.save_freq=-1 \ + trainer.test_freq=10 \ + trainer.logger=console \ + trainer.seed=1111 \ + trainer.nnodes=1 \ + trainer.resume_mode=disable $@ diff --git a/examples/sft/gsm8k/run_qwen_05_automodel.sh b/examples/sft/gsm8k/run_qwen_05_automodel.sh new file mode 100644 index 00000000000..d3c7dd8b01c --- /dev/null +++ b/examples/sft/gsm8k/run_qwen_05_automodel.sh @@ -0,0 +1,55 @@ +# Requires: Automodel, transformers>=5.3.0, torchao +# MoE also requires: grouped_gemm (github.com/fanshiqing/grouped_gemm v1.1.4) + +set -x + +if [ "$#" -lt 2 ]; then + echo "Usage: run_qwen_05_automodel.sh [other_configs...]" + exit 1 +fi + +nproc_per_node=$1 +save_path=$2 + +# Shift the arguments so $@ refers to the rest +shift 2 + +torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \ + -m verl.trainer.sft_trainer \ + data.train_files=$HOME/data/gsm8k_sft/train.parquet \ + data.val_files=$HOME/data/gsm8k_sft/test.parquet \ + data.train_batch_size=128 \ + data.pad_mode=no_padding \ + data.truncation=error \ + data.use_dynamic_bsz=True \ + data.max_token_len_per_gpu=2048 \ + data.messages_key=messages \ + 
data.ignore_input_ids_mismatch=True \ + model=hf_model \ + model.path=Qwen/Qwen2.5-0.5B-Instruct \ + model.use_remove_padding=True \ + engine=automodel \ + engine.distributed_strategy=fsdp2 \ + engine.tp_size=1 \ + engine.pp_size=1 \ + engine.cp_size=1 \ + engine.ep_size=1 \ + engine.use_torch_compile=False \ + optim=automodel \ + optim.lr=1e-5 \ + optim.lr_warmup_steps_ratio=0.2 \ + optim.weight_decay=0.1 \ + optim.betas='[0.9,0.95]' \ + optim.clip_grad=1.0 \ + optim.init_lr_ratio=0 \ + optim.min_lr_ratio=0.1 \ + optim.lr_scheduler_type=cosine \ + trainer.default_local_dir=$save_path \ + trainer.project_name=gsm8k-sft \ + trainer.experiment_name=gsm8k-sft-qwen-2.5-0.5b-automodel \ + trainer.total_epochs=2 \ + trainer.test_freq=-1 \ + trainer.save_freq=-1 \ + trainer.logger=console \ + trainer.seed=1111 \ + trainer.resume_mode=disable $@ diff --git a/tests/special_e2e/sft/run_sft_engine.sh b/tests/special_e2e/sft/run_sft_engine.sh index 9fe80afae13..e7350ee99cf 100644 --- a/tests/special_e2e/sft/run_sft_engine.sh +++ b/tests/special_e2e/sft/run_sft_engine.sh @@ -112,6 +112,22 @@ TORCHTITAN_ENGINE_CONFIG="\ engine.data_parallel_shard_size=${FSDP_SIZE} \ engine.use_torch_compile=False" +AUTOMODEL_ENGINE_CONFIG="\ + engine=${backend} \ + model=hf_model \ + model.path=${MODEL_PATH} \ + optim=${backend} \ + optim.lr=1e-5 \ + optim.lr_warmup_steps_ratio=0.2 \ + optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + optim.min_lr_ratio=0.1 \ + optim.lr_scheduler_type=cosine \ + engine.tp_size=${TP_SIZE} \ + engine.cp_size=${CP_SIZE} \ + engine.use_torch_compile=False" + if [ "$backend" = "fsdp" ]; then ENGINE_CONFIG="$FSDP_ENGINE_CONFIG" @@ -125,6 +141,10 @@ elif [ "$backend" = "torchtitan" ]; then ENGINE_CONFIG="$TORCHTITAN_ENGINE_CONFIG" echo "Using torchtitan engine" exp_name=gsm8k-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-cp${CP_SIZE}-dp${FSDP_SIZE}-pad-${PAD_MODE}-use_remove_padding-${USE_REMOVE_PADDING}-mode-${mode} +elif [ "$backend" = 
"automodel" ]; then + ENGINE_CONFIG="$AUTOMODEL_ENGINE_CONFIG" + echo "Using automodel engine" + exp_name=gsm8k-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-cp${CP_SIZE}-pad-${PAD_MODE}-use_remove_padding-${USE_REMOVE_PADDING}-mode-${mode} else ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG" echo "Using megatron engine" diff --git a/tests/special_e2e/sft/test_sft_engine_all.sh b/tests/special_e2e/sft/test_sft_engine_all.sh index 21524ce1d09..5bf2927eb46 100644 --- a/tests/special_e2e/sft/test_sft_engine_all.sh +++ b/tests/special_e2e/sft/test_sft_engine_all.sh @@ -46,6 +46,14 @@ BACKEND=megatron TP_SIZE=2 PP_SIZE=2 VPP_SIZE=${VPP_SIZE} CP_SIZE=2 NUM_GPUS=8 m # echo "run with tp2 pp1 cp1 fsdp2 num_gpus4" # BACKEND=torchtitan TP_SIZE=2 PP_SIZE=1 CP_SIZE=1 FSDP_SIZE=2 NUM_GPUS=4 bash tests/special_e2e/sft/run_sft_engine.sh +# # test with automodel dp=2 +# echo "run with automodel tp1 pp1 cp1 dp2 num_gpus2" +# BACKEND=automodel TP_SIZE=1 PP_SIZE=1 CP_SIZE=1 FSDP_SIZE=2 NUM_GPUS=2 bash tests/special_e2e/sft/run_sft_engine.sh + +# # test with automodel tp2 dp=2 +# echo "run with automodel tp2 pp1 cp1 dp2 num_gpus4" +# BACKEND=automodel TP_SIZE=2 PP_SIZE=1 CP_SIZE=1 FSDP_SIZE=2 NUM_GPUS=4 bash tests/special_e2e/sft/run_sft_engine.sh + python3 tests/special_e2e/sft/compare_sft_engine_results.py rm -rf ~/verl/test/log diff --git a/tests/special_sanity/check_device_api_usage.py b/tests/special_sanity/check_device_api_usage.py index 46461590e94..dda18d5278d 100644 --- a/tests/special_sanity/check_device_api_usage.py +++ b/tests/special_sanity/check_device_api_usage.py @@ -44,6 +44,7 @@ "verl/workers/engine/veomni/transformer_impl.py", # appear in default device_name "verl/workers/engine/torchtitan/transformer_impl.py", # appear in default device_name "verl/workers/engine/torchtitan/utils.py", # appear in torch.cuda.empty_cache() + "verl/workers/engine/automodel/transformer_impl.py", # appear in default device_name "verl/workers/rollout/vllm_rollout/vllm_async_server.py", # appear in 
config.cudagraph_capture_sizes "verl/workers/rollout/sglang_rollout/async_sglang_server.py", # manually set CUDA_VISIBLE_DEVICES "verl/workers/rollout/trtllm_rollout/trtllm_async_server.py", # appear in config.cudagraph_capture_sizes diff --git a/verl/trainer/config/engine/automodel.yaml b/verl/trainer/config/engine/automodel.yaml new file mode 100644 index 00000000000..ea731aec88c --- /dev/null +++ b/verl/trainer/config/engine/automodel.yaml @@ -0,0 +1,82 @@ +# Target class for this configuration +_target_: verl.workers.config.AutomodelEngineConfig + +# Backend strategy identifier +strategy: automodel + +# Distributed training strategy: "fsdp2", "megatron_fsdp", or "ddp" +distributed_strategy: fsdp2 + +# Parallelism sizes +tp_size: 1 +pp_size: 1 +cp_size: 1 +ep_size: 1 +dp_replicate_size: 1 +sequence_parallel: false +defer_fsdp_grad_sync: true + +# Whether to offload model parameters to CPU +param_offload: false + +# Whether to offload optimizer state to CPU +optimizer_offload: false + +# Whether to enable activation checkpointing +activation_checkpointing: false + +# Whether to enable FP8 training +enable_fp8: false + +# Whether to enable torch.compile for the model +enable_compile: false + +# Model data type for loading weights ("fp32", "bf16", "fp16") +model_dtype: fp32 + +# Attention implementation ("sdpa", "flash_attention_2", "eager", "te") +attn_implementation: flash_attention_2 + +# Backend settings +backend_config: + attn: sdpa # "te", "sdpa" + linear: te # "torch", "te" + rms_norm: torch_fp32 # "torch", "torch_fp32", "te" + rope_fusion: true + dispatcher: torch # "torch", "deepep" + experts: gmm # "gmm", "torch_mm", "torch", "te" + gate_precision: null + enable_hf_state_dict_adapter: true + enable_fsdp_optimizations: false + fake_balanced_gate: false + fake_gate_noise: 0.0 + +# MoE settings (MoEParallelizerConfig) +moe_config: + ignore_router_for_ac: false + reshard_after_forward: false + lm_head_precision: null + wrap_outer_model: true + +# Mixed 
precision policy (FSDP2 MixedPrecisionPolicy) +mp_param_dtype: bf16 +mp_reduce_dtype: fp32 +mp_output_dtype: bf16 + +# Random seed for reproducibility +seed: 42 + +# Whether to enable full determinism for distributed training, only for debugging +full_determinism: false + +# Whether to use forward only mode +forward_only: false + +# Whether to use torch compile for entropy computation +use_torch_compile: false + +# Whether to use chunked entropy computation +entropy_from_logits_with_chunking: false + +# Whether to use checkpointing for entropy computation +entropy_checkpointing: false diff --git a/verl/trainer/config/optim/automodel.yaml b/verl/trainer/config/optim/automodel.yaml new file mode 100644 index 00000000000..9e06ffc6ce0 --- /dev/null +++ b/verl/trainer/config/optim/automodel.yaml @@ -0,0 +1,56 @@ +# Target class for this configuration +_target_: verl.workers.config.AutomodelOptimizerConfig + +optimizer: AdamW + +# Module path to import optimizer from +optimizer_impl: torch.optim + +# Learning rate (maps to max_lr in Automodel's OptimizerParamScheduler) +lr: 1e-5 + +# LR warmup steps ratio (used when lr_warmup_steps <= 0) +lr_warmup_steps_ratio: 0.0 + +# Total training steps (injected at runtime) +total_training_steps: -1 + +# Weight decay +weight_decay: 0.01 + +# LR warmup steps (set > 0 to override lr_warmup_steps_ratio) +lr_warmup_steps: -1 + +# Betas for Adam optimizer +betas: [0.9, 0.999] + +# Clip gradient norm +clip_grad: 1.0 + +# Initial LR ratio for warmup start (init_lr = lr * init_lr_ratio) +init_lr_ratio: 0.1 + +# Minimum LR ratio after decay (min_lr = lr * min_lr_ratio) +min_lr_ratio: 0.01 + +# LR scheduler type (Automodel OptimizerParamScheduler decay style) +# Options: "constant", "cosine", "linear", "inverse-square-root" +lr_scheduler_type: cosine + +# Weight decay increment style: "constant", "linear", or "cosine" +wd_incr_style: constant + +# Kept for backward compatibility (unused by Automodel scheduler) +num_cycles: 0.5 
+zero_indexed_step: true + +# Common optimizer kwargs +eps: 1e-8 +master_weights: false +store_param_remainders: false +exp_avg_dtype: null # "fp32", "bf16" +exp_avg_sq_dtype: null # "fp32", "bf16" +master_weight_dtype: null # "fp32", "bf16" + +# Additional optimizer kwargs (passed directly to constructor) +override_optimizer_config: {} diff --git a/verl/utils/dataset/multiturn_sft_dataset.py b/verl/utils/dataset/multiturn_sft_dataset.py index 081d1dcfafa..5e950842298 100644 --- a/verl/utils/dataset/multiturn_sft_dataset.py +++ b/verl/utils/dataset/multiturn_sft_dataset.py @@ -64,7 +64,8 @@ def print_assembled_message(tokenizer, message_list, input_ids, loss_mask, attn_ sep = "\n\n" str = f"tokenized entire message:\n{tokenized}" str += sep - str += f"tokenized seperately :\n{tokenizer.decode(input_ids)}" + decoded_ids = input_ids.tolist() if hasattr(input_ids, "tolist") else input_ids + str += f"tokenized seperately :\n{tokenizer.decode(decoded_ids)}" logger.debug(str) diff --git a/verl/workers/config/engine.py b/verl/workers/config/engine.py index 41fc8181c2a..b193c6f0669 100644 --- a/verl/workers/config/engine.py +++ b/verl/workers/config/engine.py @@ -29,6 +29,7 @@ "TrainingWorkerConfig", "TorchtitanEngineConfig", "VeOmniEngineConfig", + "AutomodelEngineConfig", "EngineConfig", "EngineRouterReplayConfig", "QATEngineConfig", @@ -396,6 +397,127 @@ def __post_init__(self): assert self.strategy in ["torchtitan"], f"strategy {self.strategy} not supported" +@dataclass +class AutomodelEngineConfig(EngineConfig): + """Configuration for Automodel (nemo_automodel) backend. + + The Automodel backend uses NeMoAutoModelForCausalLM for model loading and + supports FSDP2, MegatronFSDP, and DDP distributed strategies with optional + TP, CP, and EP parallelism. + + Args: + strategy (str): Backend strategy identifier, must be "automodel". + distributed_strategy (str): Distributed training strategy: "fsdp2", "megatron_fsdp", or "ddp". + tp_size (int): Tensor parallel size. 
+ pp_size (int): Pipeline parallel size (only pp_size=1 supported initially). + cp_size (int): Context parallel size. + ep_size (int): Expert parallel size for MoE models. + dp_replicate_size (int): Data-parallel replicate size for HSDP. 1 = pure sharding. + sequence_parallel (bool): Enable sequence parallelism in the TP plan. + defer_fsdp_grad_sync (bool): Defer FSDP gradient sync to the final micro-batch. + activation_checkpointing (bool): Whether to enable activation checkpointing. + enable_fp8 (bool): Whether to enable FP8 training. + enable_compile (bool): Whether to enable torch.compile for the model. + model_dtype (str): Model data type for loading weights. "fp32" loads in float32 + (matching FSDP golden), "auto" uses the dtype from the model config. + attn_implementation (str): Attention implementation to use ("sdpa", "flash_attention_2", "eager", "te"). + + Backend settings (nemo_automodel BackendConfig): + backend_config (dict): Dict of kwargs passed directly to + nemo_automodel.components.models.common.BackendConfig(**backend_config). + Controls how model layers are implemented (TE vs PyTorch) and MoE dispatch. + See automodel.yaml for all predefined keys with defaults. + Key fields: + attn (str): Attention backend. "te" = TransformerEngine fused attention, + "sdpa" = PyTorch scaled dot-product attention. Default: "sdpa". + linear (str): Linear layer backend. "te" = TE fused linear (with FP8 support), + "torch" = standard PyTorch linear. Default: "te". + rms_norm (str): RMSNorm backend. "te" = TE fused RMSNorm, "torch" = PyTorch, + "torch_fp32" = PyTorch in FP32 (better numerical stability for MoE). + Default: "torch_fp32". + rope_fusion (bool): Enable fused RoPE kernel (requires CP=1). Default: true. + experts (str): MoE expert computation backend. + "gmm" = grouped_gemm (requires pip install grouped_gemm), + "torch_mm" = torch._grouped_mm (no external dependency), + "te" = TE GroupedLinear. Default: "gmm". 
+ dispatcher (str): MoE token dispatch strategy. + "torch" = standard all-gather + local compute, + "deepep" = DeepEP optimized all-to-all (higher throughput). + Default: "torch". + Note: "deepep" with experts="gmm" matches the legacy enable_deepep=True behavior. + enable_fsdp_optimizations (bool): Enable FSDP-specific optimizations in Automodel. + Default: false. + enable_hf_state_dict_adapter (bool): Enable HuggingFace state dict adapter for + checkpoint compatibility. Default: true. + fake_balanced_gate (bool): Use fake balanced gating for debugging. Default: false. + fake_gate_noise (float): Noise added to fake balanced gate. Default: 0.0. + gate_precision: Gate computation precision. Default: null (auto). + Full reference: nemo_automodel/components/models/common/backend_config.py + + MoE / Expert Parallelism settings: + moe_config (dict): Dict of kwargs passed directly to + nemo_automodel.components.moe.parallelizer.MoEParallelizerConfig(**moe_config). + Controls MoE parallelization behavior within FSDP2. + See automodel.yaml for all predefined keys with defaults. + Key fields: + ignore_router_for_ac (bool): Exclude router from activation checkpointing. + Default: false. + reshard_after_forward (bool): Reshard expert params after forward pass + (trades compute for memory). Default: false. + lm_head_precision: Precision for the LM head. Default: null (auto). + wrap_outer_model (bool): Whether to FSDP-wrap the outermost model module. + Default: true. + Full reference: nemo_automodel/components/moe/parallelizer.py + + Mixed precision policy (FSDP2): + mp_param_dtype (str): Parameter dtype for FSDP2 mixed precision policy. + mp_reduce_dtype (str): Reduce dtype for FSDP2 mixed precision policy. + mp_output_dtype (str): Output dtype for FSDP2 mixed precision policy. + + Entropy computation: + entropy_from_logits_with_chunking (bool): Whether to use chunked entropy computation. + use_torch_compile (bool): Whether to use torch.compile for entropy computation. 
+ entropy_checkpointing (bool): Whether to use checkpointing for entropy computation. + """ + + strategy: str = "automodel" + distributed_strategy: str = "fsdp2" + # Parallelism sizes + tp_size: int = 1 + pp_size: int = 1 + cp_size: int = 1 + ep_size: int = 1 + dp_replicate_size: int = 1 + sequence_parallel: bool = False + defer_fsdp_grad_sync: bool = True + # Model settings + activation_checkpointing: bool = False + enable_fp8: bool = False + enable_compile: bool = False + model_dtype: str = "fp32" + attn_implementation: str = "flash_attention_2" + # Backend settings + backend_config: dict = field(default_factory=dict) + # MoE settings + moe_config: dict = field(default_factory=dict) + # Mixed precision policy + mp_param_dtype: str = "bf16" + mp_reduce_dtype: str = "fp32" + mp_output_dtype: str = "bf16" + # Entropy computation + entropy_from_logits_with_chunking: bool = False + use_torch_compile: bool = True + entropy_checkpointing: bool = False + + def __post_init__(self): + super().__post_init__() + assert self.strategy == "automodel", f"strategy must be 'automodel', got {self.strategy}" + assert self.distributed_strategy in ["fsdp2", "megatron_fsdp", "ddp"], ( + f"distributed_strategy {self.distributed_strategy} not supported" + ) + assert self.pp_size == 1, "Pipeline parallelism (pp_size > 1) is not yet supported for automodel backend" + + @dataclass class TrainingWorkerConfig(BaseConfig): model_type: str = None # model type (language_model/value_model) diff --git a/verl/workers/config/optimizer.py b/verl/workers/config/optimizer.py index b7f05bef518..47afdd3bf2e 100644 --- a/verl/workers/config/optimizer.py +++ b/verl/workers/config/optimizer.py @@ -26,6 +26,7 @@ "build_optimizer", "VeOmniOptimizerConfig", "TorchtitanOptimizerConfig", + "AutomodelOptimizerConfig", ] @@ -170,6 +171,50 @@ class TorchtitanOptimizerConfig(OptimizerConfig): min_lr_factor: float = 0.0 +@dataclass +class AutomodelOptimizerConfig(OptimizerConfig): + """Automodel optimizer 
configuration extending base OptimizerConfig. + + Uses the same optimizer building mechanism as FSDP (dynamic import from optimizer_impl). + LR scheduling is handled by Automodel's OptimizerParamScheduler. + + Args: + optimizer (str): Optimizer class name (e.g., "AdamW"). + optimizer_impl (str): Module path to import optimizer from (e.g., "torch.optim"). + lr (float): Learning rate (maps to max_lr in OptimizerParamScheduler). + init_lr_ratio (Optional[float]): Initial LR ratio for warmup start (init_lr = lr * init_lr_ratio). + min_lr_ratio (Optional[float]): Minimum LR ratio after decay (min_lr = lr * min_lr_ratio). + lr_scheduler_type (str): LR decay style: "constant", "cosine", "linear", or "inverse-square-root". + wd_incr_style (str): Weight decay increment style: "constant", "linear", or "cosine". + num_cycles (float): Kept for backward compatibility (unused by Automodel scheduler). + zero_indexed_step (bool): Kept for backward compatibility (unused by Automodel scheduler). + """ + + _mutable_fields = OptimizerConfig._mutable_fields.copy() + _mutable_fields.add("lr_scheduler_type") + + optimizer: str = "AdamW" + optimizer_impl: str = "torch.optim" + init_lr_ratio: Optional[float] = 0.1 + min_lr_ratio: Optional[float] = 0.01 + lr_scheduler_type: str = "cosine" + wd_incr_style: str = "constant" + num_cycles: float = 0.5 + zero_indexed_step: bool = True + # Common optimizer kwargs + eps: float = 1e-8 + master_weights: bool = False + store_param_remainders: bool = False + exp_avg_dtype: Optional[str] = None # "fp32", "bf16", "fp16", or "torch.float32" etc. + exp_avg_sq_dtype: Optional[str] = None # "fp32", "bf16", "fp16", or "torch.float32" etc. + master_weight_dtype: Optional[str] = None # "fp32", "bf16", "fp16", or "torch.float32" etc. 
+ override_optimizer_config: Optional[dict] = None + + def __post_init__(self): + assert self.lr_scheduler_type in ["constant", "cosine", "linear", "inverse-square-root"] + return super().__post_init__() + + def build_optimizer(parameters, config: FSDPOptimizerConfig): """Build an optimizer based on the configuration. diff --git a/verl/workers/engine/__init__.py b/verl/workers/engine/__init__.py index 8f01080fdcb..009f0a8fc8b 100644 --- a/verl/workers/engine/__init__.py +++ b/verl/workers/engine/__init__.py @@ -37,6 +37,14 @@ VeOmniEngine = None VeOmniEngineWithLMHead = None +try: + from .automodel import AutomodelEngine, AutomodelEngineWithLMHead + + __all__ += ["AutomodelEngine", "AutomodelEngineWithLMHead"] +except ImportError: + AutomodelEngine = None + AutomodelEngineWithLMHead = None + # Mindspeed must be imported before Megatron to ensure the related monkey patches take effect as expected try: from .mindspeed import MindspeedEngineWithLMHead diff --git a/verl/workers/engine/automodel/__init__.py b/verl/workers/engine/automodel/__init__.py new file mode 100644 index 00000000000..a839342706b --- /dev/null +++ b/verl/workers/engine/automodel/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .transformer_impl import AutomodelEngine, AutomodelEngineWithLMHead + +__all__ = [ + "AutomodelEngine", + "AutomodelEngineWithLMHead", +] diff --git a/verl/workers/engine/automodel/transformer_impl.py b/verl/workers/engine/automodel/transformer_impl.py new file mode 100644 index 00000000000..fc71384a323 --- /dev/null +++ b/verl/workers/engine/automodel/transformer_impl.py @@ -0,0 +1,713 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Automodel (nemo_automodel) engine for verl SFT training. + +This engine delegates model building, parallelization, optimizer sharding, +LR scheduling, gradient clipping, and checkpointing to Automodel's +infrastructure while using verl's training loop, data pipeline, and loss function. 
+""" + +import gc +import logging +import os +from contextlib import nullcontext +from typing import Any, Callable, Optional + +import torch +import torch.distributed +from huggingface_hub.constants import HF_HUB_CACHE +from nemo_automodel.components.checkpoint.checkpointing import Checkpointer, CheckpointingConfig +from nemo_automodel.components.optim.scheduler import OptimizerParamScheduler +from nemo_automodel.components.training.utils import ( + prepare_for_final_backward, + prepare_for_grad_accumulation, + scale_grads_and_clip_grad_norm, +) +from tensordict import TensorDict +from torch.distributed.tensor import DTensor + +import verl.utils.torch_functional as verl_F +from verl.trainer.config import CheckpointConfig +from verl.utils import tensordict_utils as tu +from verl.utils.dataset.dataset_utils import DatasetPadMode +from verl.utils.debug import log_gpu_memory_usage +from verl.utils.device import get_device_id, get_device_name +from verl.utils.model import convert_weight_keys, extract_multi_modal_inputs +from verl.utils.torch_functional import logprobs_from_logits +from verl.workers.config import AutomodelEngineConfig, AutomodelOptimizerConfig, HFModelConfig + +from ..base import BaseEngine, BaseEngineCtx, EngineRegistry +from ..utils import enable_full_determinism, postprocess_batch_func, prepare_micro_batches +from .utils import ( + build_automodel_model, + build_distributed_config_from_engine_config, + get_dp_group_size, + get_dp_rank, + get_pp_rank, + get_tp_rank, + load_automodel_model_to_gpu, + load_automodel_optimizer, + maybe_fully_shard_optimizer, + offload_automodel_model_to_cpu, + offload_automodel_optimizer, +) + +logger = logging.getLogger(__file__) +logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) + + +class AutomodelEngine(BaseEngine): + """Engine implementation using Automodel for distributed training.""" + + def __init__( + self, + model_config: HFModelConfig, + engine_config: AutomodelEngineConfig, + optimizer_config: 
AutomodelOptimizerConfig, + checkpoint_config: CheckpointConfig, + **kwargs, + ): + super().__init__() + + self.model_config = model_config + self.engine_config = engine_config + self.optimizer_config = optimizer_config + self.checkpoint_config = checkpoint_config + + self.mode = None + self.rank = torch.distributed.get_rank() + + # Apply compatibility patches early in the process + from nemo_automodel._transformers.utils import apply_cache_compatibility_patches + from nemo_automodel.shared.te_patches import apply_te_patches + + apply_cache_compatibility_patches() + apply_te_patches() + + world_size = torch.distributed.get_world_size() + self.distributed_config, self.device_mesh, self.moe_mesh = build_distributed_config_from_engine_config( + self.engine_config, world_size + ) + + if self.engine_config.full_determinism: + enable_full_determinism(seed=self.engine_config.seed) + + self._is_offload_param = self.engine_config.param_offload + self._is_offload_optimizer = self.engine_config.optimizer_offload + + if self.engine_config.entropy_from_logits_with_chunking: + entropy_from_logits = verl_F.entropy_from_logits_with_chunking + else: + entropy_from_logits = verl_F.entropy_from_logits + + self.compute_entropy_from_logits = ( + torch.compile(entropy_from_logits, dynamic=True) + if self.engine_config.use_torch_compile + else entropy_from_logits + ) + + @property + def is_param_offload_enabled(self) -> bool: + return self._is_offload_param + + @property + def is_optimizer_offload_enabled(self) -> bool: + return self._is_offload_optimizer + + def initialize(self): + """Build the model, optimizer, LR scheduler, and checkpointer using Automodel infrastructure.""" + self.module = build_automodel_model( + self.model_config, self.engine_config, self.distributed_config, self.device_mesh, self.moe_mesh + ) + log_gpu_memory_usage("After Automodel model build", logger=logger) + + if not self.engine_config.forward_only: + self.optimizer = self._build_optimizer(self.module) + # 
# NOTE(review): this span was a whitespace-mangled unified diff; newlines and
# indentation have been reconstructed. Only comments/docstrings/annotations added.
            # maybe shard optimizer for MegatronFSDP
            maybe_fully_shard_optimizer(self.module, self.optimizer, self.distributed_config)
            self.lr_scheduler = self._build_lr_scheduler(self.optimizer)
        else:
            # forward-only engines (e.g. reference/rollout) carry no optimizer state
            self.optimizer = None
            self.lr_scheduler = None
        self._build_checkpointer()

        # Immediately offload whatever the config asks for; params/grads share the
        # `_is_offload_param` flag, optimizer state has its own flag.
        self.to(
            device="cpu",
            model=self._is_offload_param,
            optimizer=self._is_offload_optimizer,
            grad=self._is_offload_param,
        )

        log_gpu_memory_usage("After offload model/optimizer/grad during init", logger=logger)
        torch.cuda.empty_cache()

    def _build_optimizer(self, module):
        """Build the optimizer via Automodel's ``build_optimizer``.

        Translates verl's ``optimizer_config`` into the ConfigNode dict that
        nemo_automodel expects (``_target_`` plus keyword arguments), so
        optimizer sharding decisions stay inside Automodel.
        """
        from nemo_automodel.components.config.loader import ConfigNode
        from nemo_automodel.recipes.llm.train_ft import build_optimizer as automodel_build_optimizer

        config = self.optimizer_config

        # ``_target_`` is the dotted path Automodel instantiates, e.g.
        # "torch.optim.AdamW" from (optimizer_impl="torch.optim", optimizer="AdamW").
        opt_dict = {
            "_target_": f"{config.optimizer_impl}.{config.optimizer}",
            "lr": config.lr,
            "weight_decay": config.weight_decay,
            "eps": config.eps,
            "betas": list(config.betas),
        }

        # Optional (e.g. TE/Apex fused optimizer) flags — only forwarded when set.
        if config.master_weights:
            opt_dict["master_weights"] = config.master_weights
        if config.store_param_remainders:
            opt_dict["store_param_remainders"] = config.store_param_remainders

        # Map verl's short dtype names to the dotted names Automodel resolves;
        # unknown values pass through unchanged.
        _short_to_torch = {"bf16": "torch.bfloat16", "fp32": "torch.float32", "fp16": "torch.float16"}
        for attr in ("exp_avg_dtype", "exp_avg_sq_dtype", "master_weight_dtype"):
            val = getattr(config, attr, None)
            if val is not None:
                opt_dict[attr] = _short_to_torch.get(val, val)

        # User-supplied overrides win over everything derived above.
        if config.override_optimizer_config:
            opt_dict.update(config.override_optimizer_config)

        cfg_opt = ConfigNode(opt_dict)
        optimizers = automodel_build_optimizer(module, cfg_opt, self.distributed_config, self.device_mesh)
        # Automodel may return one optimizer per model part; without pipeline
        # parallelism exactly one is expected here.
        assert len(optimizers) == 1, f"Expected 1 optimizer, got {len(optimizers)}"
        return optimizers[0]

    def _build_lr_scheduler(self, optimizer):
        """Build Automodel's ``OptimizerParamScheduler`` (warmup + decay, constant WD)."""
        cfg = self.optimizer_config
        total_steps = cfg.total_training_steps
        num_warmup_steps = cfg.lr_warmup_steps

        # An explicit step count takes precedence; otherwise derive from ratio.
        if num_warmup_steps <= 0:
            num_warmup_steps = int(cfg.lr_warmup_steps_ratio * total_steps)

        base_lr = cfg.lr
        init_lr_ratio = cfg.init_lr_ratio if cfg.init_lr_ratio is not None else 0.1
        min_lr_ratio = cfg.min_lr_ratio if cfg.min_lr_ratio is not None else 0.01

        if self.rank == 0:
            print(
                f"Automodel LR Scheduler: total_steps={total_steps}, warmup={num_warmup_steps}, "
                f"decay_style={cfg.lr_scheduler_type}, init_lr={base_lr * init_lr_ratio:.2e}, "
                f"max_lr={base_lr:.2e}, min_lr={base_lr * min_lr_ratio:.2e}"
            )

        # Weight decay is held constant (start_wd == end_wd).
        scheduler = OptimizerParamScheduler(
            optimizer=optimizer,
            init_lr=base_lr * init_lr_ratio,
            max_lr=base_lr,
            min_lr=base_lr * min_lr_ratio,
            lr_warmup_steps=num_warmup_steps,
            lr_decay_steps=total_steps,
            lr_decay_style=cfg.lr_scheduler_type,
            start_wd=cfg.weight_decay,
            end_wd=cfg.weight_decay,
            wd_incr_steps=total_steps,
            wd_incr_style=getattr(cfg, "wd_incr_style", "constant"),
        )
        return scheduler

    def forward_backward_batch(self, data: TensorDict, loss_function: Callable, forward_only: bool = False) -> Any:
        """Split ``data`` into micro-batches and run forward (and optionally backward) on each.

        Gradient accumulation happens implicitly via repeated ``loss.backward()``
        calls; the optimizer step is NOT taken here (see ``optimizer_step``).
        """
        # Token count is all-reduced over the DP group so per-token loss
        # normalization is consistent across ranks.
        batch_num_tokens = data["loss_mask"].sum().to(get_device_id())
        torch.distributed.all_reduce(
            batch_num_tokens, op=torch.distributed.ReduceOp.SUM, group=self.get_data_parallel_group()
        )
        tu.assign_non_tensor(data, batch_num_tokens=batch_num_tokens.item())
        tu.assign_non_tensor(data, dp_size=self.get_data_parallel_size())

        # same_micro_num_in_dp=True keeps every DP rank at the same micro-batch
        # count so collectives inside the model cannot desynchronize.
        micro_batches, indices = prepare_micro_batches(
            data=data, dp_group=self.get_data_parallel_group(), same_micro_num_in_dp=True
        )

        output_lst = []
        ctx = torch.no_grad() if forward_only else nullcontext()

        if not forward_only:
            prepare_for_grad_accumulation([self.module])

        # Set MoE aux loss backward scale to counteract FSDP's gradient allreduce.
        if self.engine_config.ep_size > 1:
            from nemo_automodel.components.moe.megatron.moe_utils import MoEAuxLossAutoScaler

            MoEAuxLossAutoScaler.main_loss_backward_scale = torch.tensor(
                float(get_dp_group_size(self.device_mesh, include_cp=True))
            )

        num_micro_batches = len(micro_batches)
        for i, micro_batch in enumerate(micro_batches):
            # Signal final backward for MoE
            if not forward_only and i == num_micro_batches - 1:
                prepare_for_final_backward([self.module])

            with ctx:
                loss, meta_info = self.forward_step(micro_batch, loss_function=loss_function, forward_only=forward_only)
                if not forward_only:
                    loss.backward()
                output_lst.append(meta_info)

        # Re-assemble per-micro-batch outputs back into the original batch order.
        return postprocess_batch_func(output_lst=output_lst, indices=indices, data=data)

    def forward_step(self, micro_batch: TensorDict, loss_function, forward_only):
        """Abstract hook: run one micro-batch forward pass and compute the loss."""
        raise NotImplementedError("forward_step must be implemented in subclass")

    def optimizer_zero_grad(self):
        """Clear accumulated gradients."""
        self.optimizer.zero_grad()

    def optimizer_step(self):
        """Clip gradients, take an optimizer step, and return the grad norm (float).

        Non-finite grad norms skip the update and zero the gradients instead.
        """
        # Automodel's helper both scales grads (for grad accumulation/DP) and
        # clips by global norm; EP-sharded params are handled via moe_mesh.
        grad_norm = scale_grads_and_clip_grad_norm(
            max_grad_norm=self.optimizer_config.clip_grad,
            model_parts=[self.module],
            norm_type=2.0,
            pp_enabled=False,
            device_mesh=self.device_mesh,
            moe_mesh=self.moe_mesh,
            ep_axis_name="ep" if self.moe_mesh is not None and "ep" in self.moe_mesh.mesh_dim_names else None,
            pp_axis_name=None,
            foreach=True,
            num_label_tokens=None,
            dp_group_size=get_dp_group_size(self.device_mesh, include_cp=True),
        )

        if isinstance(grad_norm, torch.Tensor):
            grad_norm_val = grad_norm.item()
        else:
            grad_norm_val = float(grad_norm)

        # If grad_norm is not finite, skip the update
        if not torch.isfinite(torch.tensor(grad_norm_val)):
            print(f"WARN: grad_norm is not finite: {grad_norm_val}")
            self.optimizer.zero_grad()
        else:
            self.optimizer.step()
            # DeepSeek-style aux-loss-free MoE load balancing updates gate bias
            # after each optimizer step, when the model exposes the hook.
            if hasattr(self.module, "update_moe_gate_bias"):
                self.module.update_moe_gate_bias()

        return grad_norm_val

    def lr_scheduler_step(self):
        """Step Automodel's OptimizerParamScheduler and return current LR."""
        self.lr_scheduler.step(increment=1)
        lr = self.optimizer.param_groups[0]["lr"]
        return lr

    def get_data_parallel_rank(self):
        """This rank's index along the DP mesh dim (global rank if no mesh)."""
        if self.device_mesh is not None:
            return self.device_mesh.get_local_rank("dp")
        return torch.distributed.get_rank()

    def get_data_parallel_size(self):
        """Size of the DP mesh dim (world size if no mesh)."""
        if self.device_mesh is not None:
            return self.device_mesh["dp"].size()
        return torch.distributed.get_world_size()

    def get_data_parallel_group(self):
        """Process group for DP collectives (WORLD if no mesh)."""
        if self.device_mesh is not None:
            return self.device_mesh.get_group(mesh_dim="dp")
        return torch.distributed.group.WORLD

    def is_mp_src_rank_with_outputs(self):
        """True when this rank owns model outputs — i.e. TP rank 0, or TP disabled."""
        if self.device_mesh is not None and "tp" in self.device_mesh.mesh_dim_names:
            if self.device_mesh["tp"].size() > 1:
                return self.device_mesh.get_local_rank("tp") == 0
        return True

    def train_mode(self, **kwargs):
        """Context manager putting the module in train mode (zeroes grads on exit)."""
        return AutomodelTrainModeCtx(self, **kwargs)

    def eval_mode(self, **kwargs):
        """Context manager putting the module in eval mode (reshards FSDP on exit)."""
        return AutomodelEvalModeCtx(self, **kwargs)

    def to(self, device: str, model: bool = True, optimizer: bool = True, grad: bool = True):
        """Move model and/or optimizer state between the accelerator and CPU.

        ``device`` must be either the accelerator name (``get_device_name()``)
        or ``"cpu"``; anything else raises ``ValueError``.
        """
        super().to(device=device, model=model, optimizer=optimizer, grad=grad)

        # Forward-only engines have no optimizer/grad state to shuttle.
        if self.engine_config.forward_only:
            return

        device_name = get_device_name()
        assert device in (device_name, "cpu")

        if device == device_name:
            if model:
                load_automodel_model_to_gpu(self.module)
            if optimizer and self.optimizer is not None:
                load_automodel_optimizer(self.optimizer, get_device_id())
            gc.collect()
        elif device == "cpu":
            if model:
                offload_automodel_model_to_cpu(self.module)
            if optimizer and self.optimizer is not None:
                offload_automodel_optimizer(self.optimizer)
        else:
            raise ValueError(f"Invalid device type: {device}")

    def _build_checkpointer(self):
        """Create Automodel's ``Checkpointer`` (consolidated safetensors output)."""
        ckpt_config = CheckpointingConfig(
            enabled=True,
            checkpoint_dir="checkpoints/",
            model_save_format="safetensors",
            model_cache_dir=HF_HUB_CACHE,
            model_repo_id=self.model_config.path,
            save_consolidated=True,
            is_peft=False,
        )
        self.checkpointer = Checkpointer(
            config=ckpt_config,
            dp_rank=get_dp_rank(self.device_mesh, include_cp=True),
            tp_rank=get_tp_rank(self.device_mesh),
            pp_rank=get_pp_rank(self.device_mesh),
            moe_mesh=self.moe_mesh,
        )

    def save_checkpoint(
        self,
        local_path: str,
        hdfs_path: Optional[str] = None,
        global_step: int = 0,
        max_ckpt_to_keep: Optional[int] = None,
        **kwargs,
    ) -> None:
        """Save model, optimizer, and LR scheduler using Automodel's Checkpointer."""
        # Checkpointer needs params on the accelerator; load if offloaded
        # (or if some earlier path left the module on CPU).
        origin_module_device = next(self.module.parameters()).device.type
        if self._is_offload_param or origin_module_device == "cpu":
            load_automodel_model_to_gpu(self.module)

        # Save model weights
        self.checkpointer.save_model(self.module, local_path)

        # Save optimizer and LR scheduler state
        if self.optimizer is not None:
            scheduler_list = [self.lr_scheduler] if self.lr_scheduler is not None else None
            self.checkpointer.save_optimizer(self.optimizer, self.module, local_path, scheduler=scheduler_list)

        # All ranks must finish writing before anyone proceeds (and re-offloads).
        torch.distributed.barrier()
        if self._is_offload_param:
            offload_automodel_model_to_cpu(self.module)

    def load_checkpoint(
        self, local_path: str, hdfs_path: Optional[str] = None, del_local_after_load: bool = True, **kwargs
    ) -> None:
        """Load model, optimizer, and LR scheduler using Automodel's Checkpointer."""
        if self._is_offload_param:
            load_automodel_model_to_gpu(self.module)

        # Accept both layouts: <local_path>/model/... and weights directly in <local_path>.
        model_path = os.path.join(local_path, "model")
        if not os.path.isdir(model_path):
            model_path = local_path
        self.checkpointer.load_model(self.module, model_path)

        if self.optimizer is not None:
            scheduler_list = [self.lr_scheduler] if self.lr_scheduler is not None else None
            self.checkpointer.load_optimizer(self.optimizer, self.module, local_path, scheduler=scheduler_list)

        torch.distributed.barrier()
        if self._is_offload_param:
            offload_automodel_model_to_cpu(self.module)

        if self._is_offload_optimizer and self.optimizer is not None:
            offload_automodel_optimizer(self.optimizer)

    def get_per_tensor_param(self, **kwargs):
        """Yield ``(name, full_tensor)`` pairs for every parameter (DTensors gathered).

        Returns a ``(generator, None)`` tuple. NOTE(review): the generator is
        consumed after the module may have been offloaded back to CPU — callers
        presumably materialize from the captured ``params`` dict; verify usage.
        """
        load_automodel_model_to_gpu(self.module)

        params = self.module.state_dict()
        # Strip wrapper prefixes so keys match the plain HF module's naming.
        params = convert_weight_keys(params, getattr(self.module, "_fsdp_wrapped_module", self.module))

        if self._is_offload_param:
            offload_automodel_model_to_cpu(self.module)

        def param_generator():
            for name, param in params.items():
                # DTensor shards are gathered into a regular full tensor.
                unsharded_tensor = param.full_tensor() if isinstance(param, DTensor) else param
                yield name, unsharded_tensor

        return param_generator(), None


class AutomodelEvalModeCtx(BaseEngineCtx):
    """Eval-mode context: ``module.eval()`` on enter, FSDP reshard on exit."""

    def __init__(self, engine: AutomodelEngine, **kwargs):
        super().__init__(engine=engine, mode="eval", **kwargs)

    def __enter__(self):
        assert isinstance(self.engine, AutomodelEngine)
        super().__enter__()
        self.engine.module.eval()

    def __exit__(self, exc_type, exc_value, traceback):
        assert isinstance(self.engine, AutomodelEngine)
        # Reshard the root FSDP module
        if hasattr(self.engine.module, "reshard"):
            self.engine.module.reshard()
        super().__exit__(exc_type, exc_value, traceback)


class AutomodelTrainModeCtx(BaseEngineCtx):
    """Train-mode context: ``module.train()`` on enter, grads zeroed on exit."""

    def __init__(self, engine: AutomodelEngine, **kwargs):
        super().__init__(engine=engine, mode="train", **kwargs)

    def __enter__(self):
        assert isinstance(self.engine, AutomodelEngine)
        super().__enter__()
        self.engine.module.train()

    def __exit__(self, exc_type, exc_value, traceback):
        assert isinstance(self.engine, AutomodelEngine)
        # Leave no stale gradients behind for the next phase.
        self.engine.optimizer_zero_grad()
        super().__exit__(exc_type, exc_value, traceback)


@EngineRegistry.register(model_type="language_model", backend=["automodel"], device=["cuda"])
class AutomodelEngineWithLMHead(AutomodelEngine):
    """Automodel engine for language model with LM head training."""

    def prepare_model_inputs(self, micro_batch: TensorDict):
        """Build model kwargs from a micro-batch.

        Returns ``(model_inputs, output_args)`` where ``output_args`` carries
        tensors that ``prepare_model_outputs`` needs (rolled labels, temperature).
        Inputs arrive as jagged nested tensors (pad_mode NO_PADDING); with
        remove-padding they are flattened to a single packed sequence, otherwise
        they are padded back to a dense (batch, max_seq_len) layout.
        """
        use_remove_padding = tu.get_non_tensor_data(data=micro_batch, key="use_remove_padding", default=True)
        pad_mode = tu.get_non_tensor_data(data=micro_batch, key="pad_mode", default=DatasetPadMode.NO_PADDING)
        use_fused_kernels = tu.get_non_tensor_data(data=micro_batch, key="use_fused_kernels", default=False)
        temperature = micro_batch["temperature"]
        # Keep the scalar form for the fused-kernel path (asserted non-tensor below).
        temperature_item = temperature
        if use_fused_kernels:
            assert not isinstance(temperature, torch.Tensor), (
                "use_fused_kernels does not support per sample temperature yet"
            )
            assert pad_mode == DatasetPadMode.NO_PADDING, f"pad_mode {pad_mode} not supported"

        multi_modal_inputs = extract_multi_modal_inputs(micro_batch.get("multi_modal_inputs", []))
        input_ids = micro_batch["input_ids"]
        position_ids = micro_batch["position_ids"]

        # Normalize temperature to a per-sample float32 tensor.
        if not isinstance(temperature, torch.Tensor):
            temperature = torch.tensor([temperature] * input_ids.shape[0], device=input_ids.device)

        temperature = temperature.to(torch.float32)
        assert temperature.shape[0] == input_ids.shape[0]

        output_args = {}

        if use_remove_padding:
            # Expand per-sample temperature to per-token, matching the packed layout.
            temperature_rmpad = verl_F.expand_as_nested(temperature, input_ids).values()
            temperature_rmpad = temperature_rmpad.unsqueeze(0)

            if pad_mode == DatasetPadMode.NO_PADDING:
                # Pack all sequences into one (1, total_tokens) row.
                input_ids_rmpad = input_ids.values().unsqueeze(0)
                if position_ids.dim() == 3:
                    # mrope-style position ids carry an extra leading axis.
                    position_ids_rmpad = position_ids.values().unsqueeze(1)
                else:
                    position_ids_rmpad = position_ids.values().unsqueeze(0)
            else:
                raise NotImplementedError(f"pad_mode {pad_mode} not implemented")

            # Next-token labels: shift left by one over the packed sequence.
            input_ids_rmpad_rolled = torch.roll(input_ids_rmpad, shifts=-1, dims=1)

            input_ids_rmpad_rolled = input_ids_rmpad_rolled.squeeze(0)
            temperature_rmpad = temperature_rmpad.squeeze(0)
            output_args["input_ids_rmpad_rolled"] = input_ids_rmpad_rolled
            output_args["temperature_rmpad"] = temperature_rmpad

            model_inputs = {
                "input_ids": input_ids_rmpad,
                "attention_mask": None,
                "position_ids": position_ids_rmpad,
            }

            # For TE attention backend, pass cu_seqlens
            if self.engine_config.attn_implementation == "te":
                cu_seqlens = input_ids.offsets().to(torch.int32)
                max_seqlen = cu_seqlens.diff().max().item()
                model_inputs["qkv_format"] = "thd"
                model_inputs["cu_seqlens"] = cu_seqlens.unsqueeze(0)
                model_inputs["max_seqlen"] = max_seqlen

        else:
            if pad_mode == DatasetPadMode.NO_PADDING:
                input_ids = micro_batch["input_ids"]
                position_ids = micro_batch["position_ids"]
                loss_mask = micro_batch["loss_mask"]

                pad_token_id = tu.get_non_tensor_data(data=micro_batch, key="pad_token_id", default=0)
                batch_size = micro_batch.batch_size[0]
                seq_len_effective = input_ids.offsets().diff()
                max_seq_len = max(seq_len_effective)

                # Labels are computed on the jagged values BEFORE padding so the
                # roll does not cross into pad tokens of the dense layout.
                input_ids_rmpad_rolled = torch.roll(input_ids.values(), shifts=-1, dims=0)
                output_args["input_ids_rmpad_rolled"] = input_ids_rmpad_rolled
                output_args["temperature"] = temperature

                input_ids = torch.nested.to_padded_tensor(
                    input_ids, padding=pad_token_id, output_size=(batch_size, max_seq_len)
                )

                if position_ids.dim() == 3:
                    # 4-channel (mrope) position ids; pad then move channels first.
                    position_ids = torch.nested.to_padded_tensor(
                        position_ids, padding=0, output_size=(batch_size, 4, max_seq_len)
                    ).transpose(0, 1)
                else:
                    position_ids = torch.nested.to_padded_tensor(
                        position_ids, padding=0, output_size=(batch_size, max_seq_len)
                    )

                # Attention mask: 1 over each real token (derived from loss_mask
                # lengths), 0 over padding.
                attention_mask_list = [torch.ones_like(t, dtype=torch.int32) for t in loss_mask]
                attention_mask = torch.nested.as_nested_tensor(attention_mask_list, layout=torch.jagged)
                attention_mask = torch.nested.to_padded_tensor(
                    attention_mask, padding=0, output_size=(batch_size, max_seq_len)
                )

                model_inputs = {
                    "input_ids": input_ids,
                    "attention_mask": attention_mask,
                    "position_ids": position_ids,
                }

            else:
                raise NotImplementedError(f"pad_mode {pad_mode} not implemented")

        extra_args = {}
        if use_fused_kernels:
            extra_args["temperature"] = temperature_item
            extra_args["return_dict"] = True

        model_inputs.update(multi_modal_inputs)
        model_inputs.update(extra_args)

        return model_inputs, output_args

    def prepare_model_outputs(self, output, output_args, micro_batch: TensorDict):
        """Turn raw model output into ``{"log_probs": ..., "entropy": ...}``.

        ``log_probs``/``entropy`` are returned as jagged nested tensors keyed by
        the micro-batch's sequence offsets.
        """
        use_remove_padding = tu.get_non_tensor_data(data=micro_batch, key="use_remove_padding", default=True)
        pad_mode = tu.get_non_tensor_data(data=micro_batch, key="pad_mode", default=DatasetPadMode.NO_PADDING)
        use_fused_kernels = tu.get_non_tensor_data(data=micro_batch, key="use_fused_kernels", default=False)
        calculate_entropy = tu.get_non_tensor_data(data=micro_batch, key="calculate_entropy", default=False)

        # Some model impls return bare logits; normalize to an object with .logits.
        if isinstance(output, torch.Tensor):
            from types import SimpleNamespace

            output = SimpleNamespace(logits=output)

        model_output = {}
        input_ids = micro_batch["input_ids"]

        if use_remove_padding:
            input_ids_rmpad_rolled = output_args["input_ids_rmpad_rolled"]
            temperature_rmpad = output_args["temperature_rmpad"]

            if use_fused_kernels:
                # Fused path already produced per-token log-probs/entropy.
                log_probs = output.log_probs.squeeze(0)
                entropy_rmpad = output.entropy.squeeze(0)
            else:
                logits_rmpad = output.logits.squeeze(0)
                # With TP, logits are DTensors sharded on vocab dim; gather for log_softmax.
                if isinstance(logits_rmpad, DTensor):
                    logits_rmpad = logits_rmpad.full_tensor()
                # clamp guards against division by ~0 temperatures.
                logits_rmpad = logits_rmpad / temperature_rmpad.clamp(min=1e-8).unsqueeze(-1).to(logits_rmpad.dtype)

                # In-place logprob backward is incompatible with reusing logits
                # for the entropy computation below.
                inplace_backward = True
                if calculate_entropy:
                    inplace_backward = False
                log_probs = logprobs_from_logits(
                    logits=logits_rmpad,
                    labels=input_ids_rmpad_rolled,
                    inplace_backward=inplace_backward,
                )

                if calculate_entropy:
                    if not self.engine_config.entropy_checkpointing:
                        entropy_rmpad = self.compute_entropy_from_logits(logits_rmpad)
                    else:
                        # Recompute in backward to save the (tokens, vocab) activation.
                        entropy_rmpad = torch.utils.checkpoint.checkpoint(
                            self.compute_entropy_from_logits, logits_rmpad
                        )

            if pad_mode == DatasetPadMode.NO_PADDING:
                # Re-jag the flat per-token results using the input offsets.
                cu_seqlens = input_ids.offsets()
                log_probs = torch.nested.nested_tensor_from_jagged(log_probs, cu_seqlens)
                if calculate_entropy:
                    entropy = torch.nested.nested_tensor_from_jagged(entropy_rmpad, cu_seqlens)
            else:
                raise NotImplementedError(f"pad_mode {pad_mode} not implemented")

        else:
            response_length = tu.get_non_tensor_data(data=micro_batch, key="max_response_length", default=1024)
            if use_fused_kernels:
                # Slice the response window: positions predicting response tokens.
                log_probs = output.log_probs[:, -response_length - 1 : -1]
                entropy = output.entropy[:, -response_length - 1 : -1]
            else:
                logits = output.logits
                # With TP, logits are DTensors sharded on vocab dim; gather for log_softmax.
                if isinstance(logits, DTensor):
                    logits = logits.full_tensor()
                temperature = output_args["temperature"]
                temperature = temperature.unsqueeze(-1).unsqueeze(-1)
                logits = logits / temperature.clamp(min=1e-8).to(logits.dtype)

                if calculate_entropy:
                    if not self.engine_config.entropy_checkpointing:
                        entropy = verl_F.entropy_from_logits(logits)
                    else:
                        entropy = torch.utils.checkpoint.checkpoint(verl_F.entropy_from_logits, logits)

                if pad_mode == DatasetPadMode.NO_PADDING:
                    # Narrow the dense logits back to each sequence's true length,
                    # flatten, then compute log-probs against the rolled labels.
                    cu_seqlens = input_ids.offsets()
                    seq_lengths = cu_seqlens.diff()
                    starts = torch.zeros_like(seq_lengths, dtype=torch.int64)
                    logits = torch.nested.narrow(logits, 1, starts, seq_lengths, layout=torch.jagged)
                    logits_rmpad = torch.cat([t for t in logits.unbind()])
                    input_ids_rmpad_rolled = output_args["input_ids_rmpad_rolled"]
                    log_probs = logprobs_from_logits(logits=logits_rmpad, labels=input_ids_rmpad_rolled)
                    log_probs = torch.nested.nested_tensor_from_jagged(log_probs, cu_seqlens)
                    if calculate_entropy:
                        entropy = torch.nested.narrow(entropy, 1, starts, seq_lengths, layout=torch.jagged)
                        entropy_rmpad = torch.cat([t for t in entropy.unbind()])
                        entropy = torch.nested.nested_tensor_from_jagged(entropy_rmpad, cu_seqlens)
                else:
                    raise NotImplementedError(f"pad_mode {pad_mode} not implemented")

        model_output["log_probs"] = log_probs
        if calculate_entropy:
            model_output["entropy"] = entropy

        return model_output

    def forward_step(self, micro_batch: TensorDict, loss_function, forward_only):
        """Run forward pass, compute loss, and return outputs."""
        device_name = get_device_name()
        micro_batch = micro_batch.to(get_device_id())
        model_inputs, output_args = self.prepare_model_inputs(micro_batch=micro_batch)

        # bf16 autocast for the model forward; loss math happens outside.
        with torch.autocast(device_type=device_name, dtype=torch.bfloat16):
            raw_output = self.module(
                **model_inputs,
                use_cache=False,
            )

        model_output = self.prepare_model_outputs(
            output=raw_output, output_args=output_args, micro_batch=micro_batch
        )

        if loss_function is not None:
            loss, metrics = loss_function(
                model_output=model_output, data=micro_batch, dp_group=self.get_data_parallel_group()
            )
        else:
            assert forward_only, "forward_only must be True when loss_function is None"
            # Dummy loss: never backpropagated on the forward-only path.
            loss = torch.tensor(1.0, device=device_name)
            metrics = {}

        output = {
            "model_output": model_output,
            "loss": loss.detach().item(),
            "metrics": metrics,
        }

        return loss, output

# ---------------------------------------------------------------------------
# (diff metadata) new file: verl/workers/engine/automodel/utils.py
# index 00000000000..c10cf9a2db2
# ---------------------------------------------------------------------------
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE(review): this span was a whitespace-mangled unified diff; newlines and
# indentation have been reconstructed. Only comments/docstrings/annotations added.

"""Utility functions for the Automodel engine integration."""

import torch
import torch.distributed

from verl.utils.device import get_device_id, get_torch_device


def get_dp_rank(device_mesh, include_cp=False):
    """Get data-parallel rank from device mesh.

    With ``include_cp`` and an active CP dim (>1), the flattened ``dp_cp``
    dim is used; without a mesh, rank 0 is returned.
    """
    if device_mesh is None:
        return 0
    if include_cp and "cp" in device_mesh.mesh_dim_names and device_mesh["cp"].size() > 1:
        return device_mesh.get_local_rank("dp_cp")
    return device_mesh.get_local_rank("dp")


def get_tp_rank(device_mesh):
    """Get tensor-parallel rank from device mesh (0 when TP absent or size 1)."""
    if device_mesh is None or "tp" not in device_mesh.mesh_dim_names or device_mesh["tp"].size() == 1:
        return 0
    return device_mesh.get_local_rank("tp")


def get_pp_rank(device_mesh):
    """Get pipeline-parallel rank from device mesh (0 when PP absent or size 1)."""
    if device_mesh is None or "pp" not in device_mesh.mesh_dim_names or device_mesh["pp"].size() == 1:
        return 0
    return device_mesh.get_local_rank("pp")


def get_dp_group_size(device_mesh, include_cp=False):
    """Get data-parallel group size from device mesh.

    Falls back to the world size when no mesh (or no ``dp`` dim) exists.
    """
    if device_mesh is None:
        return torch.distributed.get_world_size()
    if include_cp and "cp" in device_mesh.mesh_dim_names and device_mesh["cp"].size() > 1:
        return device_mesh["dp_cp"].size()
    if "dp" in device_mesh.mesh_dim_names:
        return device_mesh["dp"].size()
    return torch.distributed.get_world_size()


def maybe_fully_shard_optimizer(model, optimizer, distributed_config):
    """Call fully_shard_optimizer for MegatronFSDP strategy.

    No-op for FSDP2/DDP configs or single-process runs.
    """
    from nemo_automodel.components.distributed.config import MegatronFSDPConfig

    if isinstance(distributed_config, MegatronFSDPConfig) and torch.distributed.get_world_size() > 1:
        from megatron_fsdp.fully_shard import fully_shard_optimizer

        fully_shard_optimizer(model, optimizer)


def build_distributed_config_from_engine_config(engine_config, world_size):
    """Build v5 distributed config, device_mesh, and moe_mesh from engine config.

    Args:
        engine_config: AutomodelEngineConfig instance.
        world_size: Total number of processes in the job.

    Returns:
        Tuple of (distributed_config, device_mesh, moe_mesh).

    Raises:
        ValueError: for an unrecognized ``distributed_strategy``.
    """
    from nemo_automodel.components.distributed.config import DDPConfig, FSDP2Config, MegatronFSDPConfig
    from nemo_automodel.components.distributed.mesh_utils import create_device_mesh

    strategy = engine_config.distributed_strategy

    if strategy == "fsdp2":
        from torch.distributed.fsdp import MixedPrecisionPolicy

        from verl.utils.torch_dtypes import PrecisionType

        # Mixed-precision policy comes straight from the engine config's
        # param/reduce/output dtype strings (e.g. "bf16").
        mp_policy = MixedPrecisionPolicy(
            param_dtype=PrecisionType.to_dtype(engine_config.mp_param_dtype),
            reduce_dtype=PrecisionType.to_dtype(engine_config.mp_reduce_dtype),
            output_dtype=PrecisionType.to_dtype(engine_config.mp_output_dtype),
            cast_forward_inputs=True,
        )

        distributed_config = FSDP2Config(
            sequence_parallel=engine_config.sequence_parallel,
            mp_policy=mp_policy,
            activation_checkpointing=engine_config.activation_checkpointing,
            defer_fsdp_grad_sync=engine_config.defer_fsdp_grad_sync,
        )

    elif strategy == "megatron_fsdp":
        distributed_config = MegatronFSDPConfig(
            activation_checkpointing=engine_config.activation_checkpointing,
        )

    elif strategy == "ddp":
        distributed_config = DDPConfig(
            activation_checkpointing=engine_config.activation_checkpointing,
        )

    else:
        raise ValueError(f"Unsupported distributed_strategy: {strategy}")

    device_mesh, moe_mesh = create_device_mesh(
        distributed_config,
        tp_size=engine_config.tp_size,
        pp_size=engine_config.pp_size,
        cp_size=engine_config.cp_size,
        ep_size=engine_config.ep_size,
        dp_replicate_size=engine_config.dp_replicate_size,
        world_size=world_size,
    )

    return distributed_config, device_mesh, moe_mesh


def build_automodel_model(model_config, engine_config, distributed_config, device_mesh, moe_mesh):
    """Build a model using NeMoAutoModelForCausalLM.from_pretrained().

    Args:
        model_config: HFModelConfig with model path and settings.
        engine_config: AutomodelEngineConfig with distributed settings.
        distributed_config: FSDP2Config, MegatronFSDPConfig, or DDPConfig instance.
        device_mesh: Pre-created device mesh (or None for DDP).
        moe_mesh: Pre-created MoE mesh (or None).

    Returns:
        A HuggingFace model with Automodel's distributed infrastructure applied.
    """
    from nemo_automodel._transformers.auto_model import NeMoAutoModelForCausalLM

    kwargs = {}

    if engine_config.enable_fp8:
        from nemo_automodel.components.quantization.fp8 import FP8Config

        kwargs["fp8_config"] = FP8Config()

    if engine_config.enable_compile:
        from nemo_automodel.components.utils.compile_utils import CompileConfig

        kwargs["compile_config"] = CompileConfig()

    # Qwen/Llama with ep_size<=1: use HF implementation.
    from transformers import AutoConfig

    _cfg = AutoConfig.from_pretrained(model_config.path, trust_remote_code=model_config.trust_remote_code)
    _arch = (getattr(_cfg, "architectures", None) or [""])[0].lower()
    if engine_config.ep_size <= 1 and ("qwen" in _arch or "llama" in _arch):
        kwargs["force_hf"] = True

    # Backend selection (attention/linear/rms_norm impls) only applies to
    # Automodel-native model implementations, not the forced-HF path.
    if engine_config.backend_config and not kwargs.get("force_hf", False):
        from nemo_automodel.components.models.common.utils import BackendConfig

        backend_kwargs = dict(engine_config.backend_config)
        kwargs["backend"] = BackendConfig(**backend_kwargs)

    # MoE config for MoEParallelizerConfig
    if engine_config.ep_size > 1:
        from nemo_automodel.components.moe.config import MoEParallelizerConfig

        moe_kwargs = dict(engine_config.moe_config) if engine_config.moe_config else {}
        # Reuse the FSDP2 mixed-precision policy for MoE parts unless the user
        # explicitly set one.
        if hasattr(distributed_config, "mp_policy"):
            moe_kwargs.setdefault("mp_policy", distributed_config.mp_policy)

        kwargs["moe_config"] = MoEParallelizerConfig(**moe_kwargs)

    kwargs["attn_implementation"] = engine_config.attn_implementation

    from verl.utils.torch_dtypes import PrecisionType

    kwargs["torch_dtype"] = PrecisionType.to_dtype(engine_config.model_dtype)

    model = NeMoAutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=model_config.path,
        device_mesh=device_mesh,
        moe_mesh=moe_mesh,
        distributed_config=distributed_config,
        activation_checkpointing=engine_config.activation_checkpointing,
        trust_remote_code=model_config.trust_remote_code,
        **kwargs,
    )

    return model


@torch.no_grad()
def offload_automodel_model_to_cpu(model, empty_cache=True):
    """Offload an FSDP2-wrapped model to CPU (reshard, move to CPU, optional cache clear).

    NOTE(review): touches FSDP2 private state (``_fsdp_param_group``,
    ``TrainingState``) to force IDLE before resharding — re-verify against the
    pinned torch version on upgrades.
    """
    from torch.distributed.fsdp._fully_shard._fsdp_common import TrainingState
    from torch.distributed.fsdp._fully_shard._fsdp_state import _get_module_fsdp_state

    for module in model.modules():
        state = _get_module_fsdp_state(module)
        if state is None:
            continue
        fsdp_param_group = state._fsdp_param_group

        if fsdp_param_group is None:
            continue

        # Force IDLE so reshard() is legal regardless of where training stopped.
        fsdp_param_group._training_state = TrainingState.IDLE

    model.reshard()
    model.cpu()
    if empty_cache:
        get_torch_device().empty_cache()


@torch.no_grad()
def load_automodel_model_to_gpu(model):
    """Load model back to GPU."""
    device = get_device_id()
    model.to(device, non_blocking=True)


@torch.no_grad()
def offload_automodel_optimizer(optimizer):
    """Offload optimizer state tensors to CPU (no-op for an empty/stateless optimizer).

    NOTE(review): device-to-CPU copies with ``non_blocking=True`` are only safe
    if a synchronization happens before the CPU tensors are read — confirm
    callers synchronize (or that the copy targets pinned memory).
    """
    if not optimizer.state:
        return
    for param_group in optimizer.param_groups:
        for param in param_group["params"]:
            state = optimizer.state[param]
            for key, value in state.items():
                if isinstance(value, torch.Tensor):
                    state[key] = value.to("cpu", non_blocking=True)


@torch.no_grad()
def load_automodel_optimizer(optimizer, device_id):
    """Load optimizer state tensors back onto ``device_id``."""
    if not optimizer.state:
        return
    for param_group in optimizer.param_groups:
        for param in param_group["params"]:
            state = optimizer.state[param]
            for key, value in state.items():
                if isinstance(value, torch.Tensor):
                    state[key] = value.to(device_id, non_blocking=True)