149 changes: 5 additions & 144 deletions dockerfiles/Dockerfile.nemo-rl
@@ -1,149 +1,10 @@
# syntax=docker/dockerfile:1
# copied and edited from https://github.com/NVIDIA/NeMo-RL/blob/main/docker/Dockerfile
# TODO: with the next update, try to re-use their Dockerfile as-is, since it supports specifying the commit
# Lightweight Dockerfile: use pre-built nvcr.io/nvidian/nemo-rl:nightly and only add NeMo-Skills.
# To use the image without building at all, set containers.nemo-rl to nvcr.io/nvidian/nemo-rl:nightly
# in your cluster config (see cluster_configs/example-local.yaml).

ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.05-cuda12.9-devel-ubuntu24.04
ARG NEMO_RL_IMAGE=nvcr.io/nvidian/nemo-rl:nightly

FROM scratch AS nemo-rl
FROM ${NEMO_RL_IMAGE}

ARG NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-e95efb912a6909b5da91ffeb197debe91fd480d8}
ADD --keep-git-dir=true https://github.com/NVIDIA-NeMo/RL.git#${NEMO_RL_COMMIT} /


FROM ${BASE_IMAGE} AS base
# An environment variable to indicate that we are in a container.
ENV NRL_CONTAINER=1

# It is more convenient for users to run as root
USER root

RUN <<"EOF" bash -exu -o pipefail
export DEBIAN_FRONTEND=noninteractive
export TZ=America/Los_Angeles

apt-get update
apt-get install -y --no-install-recommends \
jq \
curl \
git \
rsync \
wget \
less \
vim

# Nsight
apt install -y --no-install-recommends gnupg
echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu$(source /etc/lsb-release; echo "$DISTRIB_RELEASE" | tr -d .)/$(dpkg --print-architecture) /" | tee /etc/apt/sources.list.d/nvidia-devtools.list
apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
apt update
apt install -y nsight-systems-cli

# To fix CVE-2025-68973
apt install -y --only-upgrade gnupg

apt-get clean
rm -rf /var/lib/apt/lists/*
EOF

# Install uv and python
ARG UV_VERSION=0.9.7
ARG PYTHON_VERSION=3.12
ENV PATH="/root/.local/bin:$PATH"
RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh && \
uv python install ${PYTHON_VERSION}

# Disable usage stats by default for users who are sensitive to sharing usage.
# Users are encouraged to enable it if they wish.
ENV RAY_USAGE_STATS_ENABLED=0
# After ray>=2.47, this feature is enabled by default; it creates uv venvs for any py_executable starting with `uv run`.
# With our large dependencies (which occasionally need to be compiled), this causes severe contention and performance issues,
# so NeMo RL has an implementation in nemo_rl/utils/venv.py that does it once per node as opposed to once per task.
ENV RAY_ENABLE_UV_RUN_RUNTIME_ENV=0
ENV NEMO_RL_VENV_DIR=/opt/ray_venvs


FROM base AS hermetic

WORKDIR /opt/NeMo-RL

# Variables to control the build of TE. If there are issues with parallelization, consider
# setting these to 1.
ARG MAX_JOBS
ARG NVTE_BUILD_THREADS_PER_JOB
# Only use for custom vllm installs. Learn more at https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/use-custom-vllm.md
ARG BUILD_CUSTOM_VLLM

ENV UV_PROJECT_ENVIRONMENT=/opt/nemo_rl_venv
ENV UV_LINK_MODE=copy

# Ensure DeepEP is built for H100 and B200 (also mcore inference unified memory API now invokes a torch API that requires these to be set)
ENV TORCH_CUDA_ARCH_LIST="9.0 10.0"

# First copy only the dependency files
COPY --from=nemo-rl pyproject.toml uv.lock ./
# Copy in the top level __init__.py/package_info.py since build-custom-vllm.sh needs the nemo_rl package to exist.
COPY --from=nemo-rl nemo_rl/__init__.py nemo_rl/package_info.py ./nemo_rl/
COPY --from=nemo-rl tools/build-custom-vllm.sh ./tools/build-custom-vllm.sh
COPY --from=nemo-rl --link research/ ./research/
COPY --from=nemo-rl --link 3rdparty/ ./3rdparty/

RUN --mount=type=ssh <<"EOF" bash -exu
uv venv --seed
if [[ -n "${BUILD_CUSTOM_VLLM:-}" ]]; then
bash tools/build-custom-vllm.sh
source 3rdparty/vllm/nemo-rl.env
fi
# uv sync has a more reliable resolver than a plain uv pip install, which can fail

# Sync each training + inference backend one at a time (since they may conflict)
# to warm the uv cache, then at the end just sync the default dependencies.
# Do everything in one layer to prevent large layers.

# The venv is symlinked to avoid bloating the layer size
uv sync --link-mode symlink --locked --no-install-project
uv sync --link-mode symlink --locked --extra vllm --no-install-project
uv sync --link-mode symlink --locked --extra mcore --no-install-project
uv sync --link-mode symlink --locked --extra automodel --no-install-project
uv sync --link-mode symlink --locked --all-groups --no-install-project

# Remove the aiohttp in this uv cache dir to fully address CVE GHSA-mqqc-3gqh-h2x8
# The ray install will include the older aiohttp version in its cache
find /root/.cache/uv -type d -path "*ray/_private/runtime_env/agent/thirdparty_files/aiohttp*" -exec rm -rf {} +
EOF

ENV PATH="/opt/nemo_rl_venv/bin:$PATH"
ENV NEMO_RL_VENV_DIR=/opt/ray_venvs

WORKDIR /opt/NeMo-RL

FROM hermetic AS release

ARG NVIDIA_BUILD_ID
ARG NVIDIA_BUILD_REF
ARG RC_DATE=00.00
ARG TARGETARCH
ENV NVIDIA_BUILD_ID=${NVIDIA_BUILD_ID:-<unknown>}
ENV NVIDIA_BUILD_REF=${NVIDIA_BUILD_REF:-<unknown>}
LABEL com.nvidia.build.id="${NVIDIA_BUILD_ID}"
LABEL com.nvidia.build.ref="${NVIDIA_BUILD_REF}"

ENV NEMO_RL_VENV_DIR=/opt/ray_venvs

# Copy in source from build context (defaults to cloned repo, can be overridden)
# Exclude pyproject.toml and uv.lock since those may be altered by build-custom-vllm.sh
COPY --from=nemo-rl --exclude=pyproject.toml --exclude=uv.lock . /opt/NeMo-RL
# Unshallow the repo to get the full history (in case it came from the scratch layer).
# This may be unnecessary if the repo was passed in as a complete repository (with full git history),
# so do a quick check before trying to unshallow.
RUN git rev-parse --is-shallow-repository | grep -q true && git fetch --unshallow || true
RUN UV_LINK_MODE=symlink uv run nemo_rl/utils/prefetch_venvs.py

# Generate container fingerprint for frozen environment support
# Store outside /opt/NeMo-RL to avoid being overwritten by user mounts
RUN python tools/generate_fingerprint.py > /opt/nemo_rl_container_fingerprint

# NOTICES.txt file points to where the OSS source code is archived
RUN echo "This distribution includes open source which is archived at the following URL: https://opensource.nvidia.com/oss/teams/nvidia/nemo-rl/${RC_DATE}:linux-${TARGETARCH}/index.html" > NOTICES.txt && \
echo "For further inquiries or assistance, contact us at oss-requests@nvidia.com" >> NOTICES.txt

RUN git clone https://github.com/NVIDIA-NeMo/Skills.git /opt/NeMo-Skills && cd /opt/NeMo-Skills && uv pip install .
4 changes: 2 additions & 2 deletions nemo_skills/pipeline/nemo_rl/grpo.py
@@ -133,8 +133,8 @@ def get_cmd(self):
self.logging_params = self.format_wandb_args()
nsight_cmd = get_nsight_cmd(self.profile_step_range)
cmd = (
f"export PYTHONPATH=$PYTHONPATH:/nemo_run/code:/opt/NeMo-RL && "
f"export UV_PROJECT=/opt/NeMo-RL && "
f"export PYTHONPATH=$PYTHONPATH:/nemo_run/code:/opt/nemo-rl && "
f"export UV_PROJECT=/opt/nemo-rl && "
f"{nsight_cmd}"
f"echo 'Starting training' && "
f"uv run --active python /nemo_run/code/nemo_skills/training/nemo_rl/start_grpo.py "
7 changes: 4 additions & 3 deletions nemo_skills/pipeline/nemo_rl/sft.py
@@ -116,11 +116,12 @@ def get_cmd(self):

nsight_cmd = get_nsight_cmd(self.profile_step_range)
cmd = (
"export PYTHONPATH=$PYTHONPATH:/nemo_run/code:/opt/NeMo-RL && "
"export UV_PROJECT=/opt/NeMo-RL && "
"export PYTHONPATH=$PYTHONPATH:/nemo_run/code:/opt/nemo-rl && "
"export UV_PROJECT=/opt/nemo-rl && "
f"{nsight_cmd}"
"echo 'Starting training' && "
"NRL_FORCE_REBUILD_VENVS=true uv run --active "
# "NRL_FORCE_REBUILD_VENVS=true uv run --active "
"uv run --active "
"python /nemo_run/code/nemo_skills/training/nemo_rl/start_sft.py "
f"{self.format_train_args()} {self.format_data_args()} "
f"{self.logging_params} {self.extra_arguments}"
20 changes: 17 additions & 3 deletions nemo_skills/training/nemo_rl/configs/grpo.yaml
@@ -10,6 +10,7 @@ grpo:
use_leave_one_out_baseline: true
val_period: 0 # disabled
val_at_start: false
val_at_end: false
overlong_filtering: false
max_val_samples: 256
val_batch_size: 256
@@ -22,13 +23,20 @@
overlong_buffer_length: 128
overlong_buffer_penalty: 1
max_response_length: ${policy.max_total_sequence_length}
stop_properly_penalty_coef: null
reward_scaling:
enabled: false
source_min: 0.0
source_max: 1.0
target_min: 0.0
target_max: 1.0

seq_logprob_error_threshold: null
adv_estimator:
name: "grpo" # Use "reinforce_plus_plus" for Reinforce++ estimator
normalize_rewards: true
use_leave_one_out_baseline: false
minus_baseline: true
# Reinforce++-baseline specific: subtract per-prompt mean baseline
async_grpo:
enabled: false # Set to true to enable async training mode
# Max age (in training steps) for trajectories used in training
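
Note on the new `reward_scaling` and `adv_estimator` fields above: the config suggests a linear min-max remap of rewards followed by a leave-one-out baseline over each prompt's generations. A minimal sketch of that arithmetic, assuming the conventional semantics of these fields (helper names are hypothetical; the actual implementation lives in NeMo-RL and may differ):

```python
import numpy as np

def scale_reward(r, source_min=0.0, source_max=1.0, target_min=0.0, target_max=1.0):
    # Linear remap from [source_min, source_max] to [target_min, target_max],
    # as the reward_scaling fields suggest (assumed semantics).
    t = (r - source_min) / (source_max - source_min)
    return target_min + t * (target_max - target_min)

def leave_one_out_advantages(rewards):
    # rewards: rewards for all generations of one prompt, shape (num_generations,).
    # With use_leave_one_out_baseline, sample i's baseline is the mean of the
    # other samples' rewards; the advantage subtracts that baseline.
    rewards = np.asarray(rewards, dtype=np.float64)
    n = rewards.size
    baseline = (rewards.sum() - rewards) / (n - 1)
    return rewards - baseline
```
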
@@ -52,9 +60,12 @@ loss_fn:
# Set to true when async_grpo.enabled is true
use_importance_sampling_correction: false
truncated_importance_sampling_ratio: null
truncated_importance_sampling_ratio_min: null # Lower bound for ICE-POP
truncated_importance_sampling_type: tis # "tis" (clamp to max) or "icepop" (filter outside [min, max])
sequence_level_importance_ratios: false
token_level_loss: true
force_on_policy_ratio: false # Set to true to force ratio=1.0 (requires train_global_batch_size == num_prompts_per_step * num_generations_per_prompt)
use_kl_in_reward: false # Reinforce++: add KL penalty to reward instead of loss

checkpointing:
enabled: true
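
On the `truncated_importance_sampling_*` options added to `loss_fn` above: going only by the inline comments ("tis" clamps to the max, "icepop" filters ratios outside [min, max]), a hypothetical sketch of the two behaviors on token-level importance ratios (function names are illustrative, not NeMo-RL API):

```python
import torch

def apply_tis(ratios: torch.Tensor, max_ratio: float) -> torch.Tensor:
    # "tis": clamp importance ratios from above.
    return ratios.clamp(max=max_ratio)

def apply_icepop(ratios: torch.Tensor, min_ratio: float, max_ratio: float) -> torch.Tensor:
    # "icepop": zero out tokens whose ratio falls outside [min_ratio, max_ratio]
    # instead of clamping them, effectively dropping those tokens from the loss.
    keep = (ratios >= min_ratio) & (ratios <= max_ratio)
    return ratios * keep
```
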
@@ -74,9 +85,9 @@ policy:
chat_template_kwargs: null # can be used to pass kwargs to the chat template, e.g., enable_thinking=true
hf_config_overrides: {}
train_global_batch_size: 512
train_micro_batch_size: 4
train_micro_batch_size: 1
generation_batch_size: 32 # Only used when generating using HF backend
logprob_batch_size: 4
logprob_batch_size: 1
max_total_sequence_length: 512
precision: "bfloat16"
tensor_model_parallel_size: 1
@@ -123,6 +134,9 @@ policy:
bias_activation_fusion: True
defer_fp32_logits: False
moe_per_layer_logging: False
moe_enable_deepep: false
moe_token_dispatcher_type: "allgather"
moe_shared_expert_overlap: false

optimizer:
optimizer: "adam"
8 changes: 6 additions & 2 deletions nemo_skills/training/nemo_rl/configs/sft.yaml
@@ -11,6 +11,7 @@ sft:
val_global_batch_size: 32
val_micro_batch_size: 1
val_at_start: false
val_at_end: false
seed: 42

checkpointing:
@@ -82,7 +83,7 @@ policy:
sequence_parallel: ${policy.sequence_parallel}
freeze_moe_router: false
moe_router_dtype: null
moe_router_load_balancing_type: "aux_loss"
moe_router_load_balancing_type: none
moe_router_bias_update_rate: 1e-3
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
@@ -92,6 +93,9 @@
layernorm_epsilon: 1e-6
defer_fp32_logits: False
moe_per_layer_logging: False
moe_enable_deepep: false
moe_token_dispatcher_type: "allgather"
moe_shared_expert_overlap: false

peft:
enabled: false
@@ -163,7 +167,7 @@ policy:
# makes the training sequence length divisible by the tensor parallel size
# this is useful for sequence parallel training
make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
max_grad_norm: 0.0 # megatron: Zero means no clipping, FSDP: null means no clipping
max_grad_norm: 1.0 # megatron: Zero means no clipping, FSDP: null means no clipping

optimizer:
name: "torch.optim.AdamW"
1 change: 0 additions & 1 deletion nemo_skills/training/nemo_rl/start_sft.py
@@ -368,7 +368,6 @@ def main():
loss_fn,
master_config,
logger,
sft_task_spec,
checkpointer,
sft_save_state,
)
3 changes: 2 additions & 1 deletion tests/gpu-tests/test-local.yaml
@@ -22,13 +22,14 @@ containers:
nemo-skills: dockerfile:dockerfiles/Dockerfile.nemo-skills
megatron: dockerfile:dockerfiles/Dockerfile.megatron
verl: dockerfile:dockerfiles/Dockerfile.verl
nemo-rl: dockerfile:dockerfiles/Dockerfile.nemo-rl
nemo-rl: gitlab-master.nvidia.com/dl/ai-services/docker-images/igitman/nemo-skills-nemo-rl:latest

mounts:
- /tmp:/tmp
# change this if the models are located in a different place
# TODO: can we make it simpler?
- /mnt/datadrive/nemo-skills-test-data:/mnt/datadrive/nemo-skills-test-data
- /home/wedu:/home/wedu

env_vars:
- HF_HOME=/mnt/datadrive/nemo-skills-test-data/hf-cache