Nemotron Ultra & Super launcher examples (#1609)

jenchen13 · web-flow · commit 6b73e933bfc8 · 2026-06-04T17:46:58.000Z
### What does this PR do? Type of change: New example New launcher example for Nemotron Super with PTQ + Export + VLLM smoke test on small GPQA-style dataset ### Usage ```python # Usage: # source .env-slurm # cd tools/launcher # uv run launch.py --yaml examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16/megatron_lm_ptq.yaml --yes ``` ### Testing  ### Before your PR is "*Ready for review*" Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md) and your commits are signed (`git commit -s -S`). Make sure you read and follow the [Security Best Practices](https://github.com/NVIDIA/Model-Optimizer/blob/main/SECURITY.md#security-coding-practices-for-contributors) (e.g. avoiding hardcoded `trust_remote_code=True`, `torch.load(..., weights_only=False)`, `pickle`, etc.). - Is this change backward compatible?: ✅ / ❌ / N/A  - If you copied code from any other sources or added a new PIP dependency, did you follow guidance in `CONTRIBUTING.md`: ✅ / ❌ / N/A  - Did you write any new necessary tests?: ✅ / ❌ / N/A  - Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?: ✅ / ❌ / N/A  - Did you get Claude approval on this PR?: ✅ / ❌ / N/A  ### Additional Information   ## Summary by CodeRabbit * **New Features** * Added checkpoint export capability for quantized models to Hugging Face format. * Introduced complete quantization pipelines with conditional MMLU evaluation and model export stages. * **Bug Fixes** * Fixed num_shards calculation to prevent invalid minimum values. * **Documentation** * Updated vLLM version requirements for optimal NVFP4 model performance. * Enhanced quantization pipeline documentation with improved output paths and conditional execution details. * **Chores** * Updated Megatron-LM module to latest version. * Added sample dataset for model evaluation testing.  --------- Signed-off-by: Jennifer Chen <jennifchen@nvidia.com>
diff --git a/tools/launcher/common/megatron_lm/export/export.sh b/tools/launcher/common/megatron_lm/export/export.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source ${SCRIPT_DIR}/../../service_utils.sh
+
+util_install_extra_dep
+
+trap 'error_handler $0 $LINENO' ERR # ERROR HANDLER
+###################################################################################################
+
+# Export a quantized MCore checkpoint (saved by quantize.sh) to HF format.
+#
+# Required env: MLM_MODEL_CFG, QUANT_CFG.
+# Optional env:
+#   MLM_MODEL_CKPT  Saved PTQ MCore ckpt path (default: /cicd/megatron-lm/${MLM_MODEL_CFG})
+#   EXPORT_DIR      HF output dir            (default: /cicd/export/${MLM_MODEL_CFG}_<basename of QUANT_CFG without .yaml/.yml>)
+#   HF_MODEL_CKPT   HF source ckpt for tokenizer/config (default: /hf-local/${MLM_MODEL_CFG})
+#   TP, PP, EP, ETP Parallelism (defaults: 1, 1, 1, 1)
+
+if [[ -z ${MLM_MODEL_CKPT} ]]; then
+    export MLM_MODEL_CKPT="/cicd/megatron-lm/${MLM_MODEL_CFG}"
+fi
+if [[ -z ${EXPORT_DIR} ]]; then
+    # Take basename of QUANT_CFG (strip dirs + .yaml/.yml) so recipe paths
+    # collapse to a flat tag in EXPORT_DIR.
+    _QUANT_CFG_TAG="$(basename "${QUANT_CFG}")"
+    _QUANT_CFG_TAG="${_QUANT_CFG_TAG%.yaml}"
+    _QUANT_CFG_TAG="${_QUANT_CFG_TAG%.yml}"
+    export EXPORT_DIR="/cicd/export/${MLM_MODEL_CFG}_${_QUANT_CFG_TAG}"
+fi
+if [[ -z ${HF_MODEL_CKPT} ]]; then
+    export HF_MODEL_CKPT="/hf-local/${MLM_MODEL_CFG}"
+fi
+export MLM_SKIP_INSTALL=1
+
+EXPORT_EXE="bash modules/Megatron-LM/examples/post_training/modelopt/export.sh"
+
+export MLM_EXTRA_ARGS=${@}
+echo "=== Exporting ${MLM_MODEL_CFG} ${QUANT_CFG} (TP=${TP:-1} PP=${PP:-1} EP=${EP:-1} ETP=${ETP:-1}) ==="
+TP=${TP:-1} PP=${PP:-1} EP=${EP:-1} ETP=${ETP:-1} ${EXPORT_EXE} ${MLM_MODEL_CFG}
+ls ${EXPORT_DIR}
+cat ${EXPORT_DIR}/hf_quant_config.json
+
+###################################################################################################
+
+exit_handler $0
diff --git a/tools/launcher/common/megatron_lm/quantize/quantize.sh b/tools/launcher/common/megatron_lm/quantize/quantize.sh
@@ -15,6 +15,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Runs Megatron-LM PTQ quantization. Also runs MMLU + HF export inline unless
+# RUN_MMLU / RUN_EXPORT are set to "false". Larger models that need different
+# parallelism for MMLU/export should set RUN_MMLU=false RUN_EXPORT=false and
+# chain the standalone mmlu/mmlu.sh and export/export.sh scripts as separate
+# pipeline tasks.
+
 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
 source ${SCRIPT_DIR}/../../service_utils.sh
 
@@ -26,25 +32,35 @@ trap 'error_handler $0 $LINENO' ERR # ERROR HANDLER
 if [[ -z ${HF_MODEL_CKPT} ]]; then
     export HF_MODEL_CKPT="/hf-local/${MLM_MODEL_CFG}"
 fi
-export MLM_MODEL_SAVE="/scratchspace/megatron-lm/${MLM_MODEL_CFG}"
-export EXPORT_DIR="/scratchspace/export/${MLM_MODEL_CFG}_${QUANT_CFG}"
+# Persist PTQ ckpt + HF export under /cicd ($SLURM_JOB_DIR/cicd) so later
+# experiments can re-use them. 
+export MLM_MODEL_SAVE="/cicd/megatron-lm/${MLM_MODEL_CFG}"
+# If QUANT_CFG is a recipe path, collapse to a flat tag (strip dirs + .yaml/.yml).
+_QUANT_CFG_TAG="$(basename "${QUANT_CFG}")"
+_QUANT_CFG_TAG="${_QUANT_CFG_TAG%.yaml}"
+_QUANT_CFG_TAG="${_QUANT_CFG_TAG%.yml}"
+export EXPORT_DIR="/cicd/export/${MLM_MODEL_CFG}_${_QUANT_CFG_TAG}"
 export MLM_SKIP_INSTALL=1
 
 QUANTIZE_EXE="bash modules/Megatron-LM/examples/post_training/modelopt/quantize.sh"
 MMLU_EXE="bash modules/Megatron-LM/examples/post_training/modelopt/mmlu.sh"
-CONVERT_EXE="bash modules/Megatron-LM/examples/post_training/modelopt/convert.sh"
 EXPORT_EXE="bash modules/Megatron-LM/examples/post_training/modelopt/export.sh"
 
+# Step 1: quantize
 export MLM_EXTRA_ARGS=${@}
 TP=${TP:-1} PP=${PP:-1} EP=${EP:-1} ETP=${ETP:-1} ${QUANTIZE_EXE} ${MLM_MODEL_CFG} ${QUANT_CFG}
 
-export MLM_EXTRA_ARGS="--mmlu-dataset ${MMLU_DATASET:-/hf-local/cais/mmlu} --fraction 0.01 --lower-bound ${MMLU_LOWER_BOUND:-0.38} --disable-tqdm"
-TP=${TP:-1} PP=${PP:-1} EP=${EP:-1} ETP=${ETP:-1} MLM_MODEL_CKPT=${MLM_MODEL_SAVE} ${MMLU_EXE} ${MLM_MODEL_CFG}
+# Step 2 (optional): MMLU on the saved PTQ ckpt
+if [[ "${RUN_MMLU:-true}" == "true" ]]; then
+    export MLM_EXTRA_ARGS="--mmlu-dataset ${MMLU_DATASET:-/hf-local/cais/mmlu} --fraction 0.01 --lower-bound ${MMLU_LOWER_BOUND:-0.38} --disable-tqdm"
+    TP=${TP:-1} PP=${PP:-1} EP=${EP:-1} ETP=${ETP:-1} MLM_MODEL_CKPT=${MLM_MODEL_SAVE} ${MMLU_EXE} ${MLM_MODEL_CFG}
+fi
 
-# Export quantized checkpoint to HF format
-# Use largest PP <= total GPUs that divides the model's num_hidden_layers
-TOTAL_GPUS=$(python3 -c "import torch; print(torch.cuda.device_count())" 2>/dev/null || echo ${NUM_GPUS:-1})
-EXPORT_PP=$(python3 -c "
+# Step 3 (optional): export PTQ ckpt to HF format
+# Use largest PP <= total GPUs that divides the model's num_hidden_layers.
+if [[ "${RUN_EXPORT:-true}" == "true" ]]; then
+    TOTAL_GPUS=$(python3 -c "import torch; print(torch.cuda.device_count())" 2>/dev/null || echo ${NUM_GPUS:-1})
+    EXPORT_PP=$(python3 -c "
 import json, os
 cfg = os.path.join('${HF_MODEL_CKPT}', 'config.json')
 n_layers = json.load(open(cfg)).get('num_hidden_layers', 1) if os.path.exists(cfg) else 1
@@ -54,11 +70,12 @@ while pp > 1 and n_layers % pp != 0:
     pp -= 1
 print(pp)
 " 2>/dev/null || echo ${TOTAL_GPUS})
-echo "=== Exporting ${MLM_MODEL_CFG} ${QUANT_CFG} (PP=${EXPORT_PP}, ${TOTAL_GPUS} GPUs) ==="
-export MLM_EXTRA_ARGS=
-TP=1 PP=${EXPORT_PP} EP=1 ETP=1 MLM_MODEL_CKPT=${MLM_MODEL_SAVE} ${EXPORT_EXE} ${MLM_MODEL_CFG}
-ls ${EXPORT_DIR}
-cat ${EXPORT_DIR}/hf_quant_config.json
+    echo "=== Exporting ${MLM_MODEL_CFG} ${QUANT_CFG} (PP=${EXPORT_PP}, ${TOTAL_GPUS} GPUs) ==="
+    export MLM_EXTRA_ARGS=
+    TP=1 PP=${EXPORT_PP} EP=1 ETP=1 MLM_MODEL_CKPT=${MLM_MODEL_SAVE} ${EXPORT_EXE} ${MLM_MODEL_CFG}
+    ls ${EXPORT_DIR}
+    cat ${EXPORT_DIR}/hf_quant_config.json
+fi
 
 ###################################################################################################
 
diff --git a/tools/launcher/common/query.py b/tools/launcher/common/query.py
@@ -208,7 +208,7 @@ def synthesize(data):
     dataset = load_dataset(args.data, split=args.data_split)
 
 if args.num_shards * 100 > len(dataset):
-    args.num_shards = min(16, len(dataset) // 100)
+    args.num_shards = max(1, min(16, len(dataset) // 100))
 
 if args.save is not None:
     print(f"Create save dir: {args.save}")
diff --git a/tools/launcher/common/vllm/gpqa_sample.jsonl b/tools/launcher/common/vllm/gpqa_sample.jsonl
@@ -0,0 +1,8 @@
+{"messages": [{"role": "user", "content": "A particle of mass m moves in a one-dimensional infinite square well of width L. What is the energy of the third excited state (n=4) in units of (h^2 / (8 m L^2))?\n\n(A) 4\n(B) 9\n(C) 16\n(D) 25\n\nReply with the single letter A, B, C, or D, then briefly justify."}]}
+{"messages": [{"role": "user", "content": "In the Diels-Alder reaction between 1,3-butadiene and maleic anhydride, what is the stereochemistry of the major cyclohexene product?\n\n(A) cis-fused, endo\n(B) cis-fused, exo\n(C) trans-fused, endo\n(D) trans-fused, exo\n\nReply with the single letter A, B, C, or D, then briefly justify."}]}
+{"messages": [{"role": "user", "content": "Which of the following is the correct order of events in eukaryotic translation initiation?\n\n(A) 40S binds mRNA cap; eIF2-GTP-Met-tRNA joins; scan to AUG; 60S joins\n(B) 60S binds mRNA cap; scan to AUG; eIF2-GTP-Met-tRNA joins; 40S joins\n(C) eIF2-GTP-Met-tRNA binds AUG; 40S joins; 60S joins; cap recognized\n(D) 80S forms first; eIF2 delivers Met-tRNA; cap recognized last\n\nReply with the single letter A, B, C, or D, then briefly justify."}]}
+{"messages": [{"role": "user", "content": "A main-sequence star with a mass of 25 solar masses is expected to end its life as which of the following?\n\n(A) White dwarf\n(B) Brown dwarf\n(C) Neutron star\n(D) Stellar-mass black hole\n\nReply with the single letter A, B, C, or D, then briefly justify."}]}
+{"messages": [{"role": "user", "content": "An ideal gas undergoes a reversible adiabatic expansion. Which thermodynamic quantity is unchanged?\n\n(A) Internal energy U\n(B) Enthalpy H\n(C) Entropy S\n(D) Gibbs free energy G\n\nReply with the single letter A, B, C, or D, then briefly justify."}]}
+{"messages": [{"role": "user", "content": "Which spectroscopic technique is most directly used to determine the local chemical environment of carbon atoms in an organic molecule in solution?\n\n(A) X-ray diffraction\n(B) 13C NMR spectroscopy\n(C) UV-Vis absorption\n(D) Mass spectrometry\n\nReply with the single letter A, B, C, or D, then briefly justify."}]}
+{"messages": [{"role": "user", "content": "In a diploid organism, a gene on the X chromosome with two alleles (A dominant, a recessive) has carrier-mother (X^A X^a) and unaffected-father (X^A Y). What is the probability that a son will be affected (X^a Y)?\n\n(A) 0\n(B) 1/4\n(C) 1/2\n(D) 1\n\nReply with the single letter A, B, C, or D, then briefly justify."}]}
+{"messages": [{"role": "user", "content": "A photon has wavelength 500 nm in vacuum. What is its momentum in units of 10^-27 kg m/s? (Use h = 6.626e-34 J s.)\n\n(A) 1.0\n(B) 1.3\n(C) 1.7\n(D) 2.1\n\nReply with the single letter A, B, C, or D, then briefly justify."}]}
diff --git a/tools/launcher/common/vllm/query.sh b/tools/launcher/common/vllm/query.sh
@@ -41,7 +41,7 @@ source ${SCRIPT_DIR}/../service_utils.sh
 # vLLM notes:
 #   - vLLM manages GPU distribution internally; run with ntasks_per_node: 1
 #     in slurm_config and pass --tensor-parallel-size to match gpus_per_node.
-#   - NVFP4 models require vllm/vllm-openai:v0.15.0+ on Blackwell GPUs.
+#   - NVFP4 models require vllm/vllm-openai:v0.21.0+ on Blackwell GPUs.
 #   - Use --trust-remote-code for models with custom architectures (e.g. Kimi).
 #
 # In a pipeline YAML task config:
diff --git a/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16/megatron_bridge_import.yaml b/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16/megatron_bridge_import.yaml
@@ -1,14 +1,14 @@
 # Megatron-Bridge import for nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16.
 #
 # Imports HF weights to a Megatron-LM checkpoint via AutoBridge.import_ckpt
-# (use_cpu_initialization=True). Uses a single 8xH100 Slurm node — Megatron-Bridge
+# (use_cpu_initialization=True). Uses a single 4-GPU Slurm node — Megatron-Bridge
 # requires at least 1 GPU for nccl init even with CPU-resident weights.
 #
 # Usage:
 #   export SLURM_HOST=<slurm-host>
 #   export SLURM_ACCOUNT=<your-team>
 #   export SLURM_PARTITION=<gpu-partition>   # default: batch
-#   export SLURM_JOB_DIR=/home/scratch.<user>/experiments
+#   export SLURM_JOB_DIR=<remote-job-dir>
 #   export HF_TOKEN=<your-hf-token>          # gated model
 #   cd tools/launcher
 #   uv run launch.py --yaml examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16/megatron_bridge_import.yaml --yes
@@ -17,7 +17,7 @@ job_name: Nemotron-3-Super-120B_bridge_import
 pipeline:
   skip: false
   allow_to_fail: false
-  note: "HF -> MCore import via Megatron-Bridge (8xH100)"
+  note: "HF -> MCore import via Megatron-Bridge (1 node x 4 GPUs)"
 
   global_vars:
     # /cicd is the experiment_title mount = $SLURM_JOB_DIR/cicd on the host
@@ -37,5 +37,5 @@ pipeline:
       partition: batch
       nodes: 1
       ntasks_per_node: 1
-      gpus_per_node: 8
+      gpus_per_node: 4
       time: "04:00:00"
diff --git a/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16/megatron_lm_ptq.yaml b/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16/megatron_lm_ptq.yaml
@@ -0,0 +1,91 @@
+# Nemotron-3-Super-120B-A12B-BF16 PTQ + export + vLLM smoke test.
+# Tested on B200 Blackwell GPUs.
+#
+# Pipeline:
+#   task_0 (quantize): 1 node x 4 GPUs = 4 ranks, TP=1 PP=1 EP=4 ETP=1.
+#                      Loads HF weights from /hf-local, saves PTQ ckpt to /cicd.
+#   task_1 (export):   1 node x 4 GPUs = 4 ranks, TP=1 PP=4 EP=1 ETP=1.
+#                      88 layers / PP=4 = 22 layers/stage.
+#   task_2 (smoke):    1 node x 4 GPUs. Serve exported NVFP4 ckpt with vLLM and
+#                      answer 8 GPQA-style questions.
+#
+# Usage:
+#   source .env-slurm
+#   cd tools/launcher
+#   uv run launch.py --yaml examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16/megatron_lm_ptq.yaml --yes
+
+job_name: Nemotron-3-Super-120B_PTQ
+pipeline:
+  skip: false
+  allow_to_fail: false
+  note: "PTQ on Nemotron-3-Super-120B (super-nvfp4): quantize + export + vLLM smoke, 1 node x 4 GPUs"
+
+  task_0:
+    script: common/megatron_lm/quantize/quantize.sh
+    args:
+      - --seq-length 4096 --max-position-embeddings 4096
+      - --skip-generate
+      # Fast calibration. Bump (e.g. --calib-size 512) for production.
+      - --calib-size 32
+    environment:
+      - MLM_MODEL_CFG: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16
+      - QUANT_CFG: models/Nemotron-3-Super-120B-A12B/super-nvfp4
+      - HF_MODEL_CKPT: /hf-local/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16
+      # MMLU + Export run as separate tasks; quantize.sh does quantize only.
+      - RUN_MMLU: "false"
+      - RUN_EXPORT: "false"
+      - TP: "1"
+      - PP: "1"
+      - EP: "4"
+      - ETP: "1"
+    slurm_config:
+      _factory_: "slurm_factory"
+      container: nvcr.io/nvidia/nemo:26.04
+      modelopt_install_path: /opt/venv/lib/python3.12/site-packages/modelopt
+      partition: batch
+      nodes: 1
+      ntasks_per_node: 4
+      gpus_per_node: 4
+      time: "04:00:00"
+
+  task_1:
+    script: common/megatron_lm/export/export.sh
+    environment:
+      - MLM_MODEL_CFG: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16
+      - QUANT_CFG: models/Nemotron-3-Super-120B-A12B/super-nvfp4
+      - HF_MODEL_CKPT: /hf-local/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16
+      - TP: "1"
+      - PP: "4"
+      - EP: "1"
+      - ETP: "1"
+    slurm_config:
+      _factory_: "slurm_factory"
+      container: nvcr.io/nvidia/nemo:26.04
+      modelopt_install_path: /opt/venv/lib/python3.12/site-packages/modelopt
+      partition: batch
+      nodes: 1
+      ntasks_per_node: 4
+      gpus_per_node: 4
+      time: "02:00:00"
+
+  # vLLM generation test: serve the exported HF NVFP4 ckpt and answer 8
+  # GPQA-style questions. Inspect responses under /cicd/vllm/<model>/.
+  task_2:
+    script: common/vllm/query.sh
+    args:
+      - --model /cicd/export/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16_super-nvfp4
+      - --tensor-parallel-size 4
+      - --trust-remote-code
+      - --
+      - --data common/vllm/gpqa_sample.jsonl
+      - --max-tokens 256
+      - --num-shards 1
+      - --save /cicd/vllm/NVIDIA-Nemotron-3-Super-120B-A12B-BF16_super-nvfp4
+    slurm_config:
+      _factory_: "slurm_factory"
+      container: vllm/vllm-openai:v0.21.0
+      partition: batch
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      time: "01:00:00"
diff --git a/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16/megatron_lm_ptq.yaml b/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16/megatron_lm_ptq.yaml
@@ -0,0 +1,91 @@
+# Nemotron-3-Ultra-550B-A55B-BF16 PTQ quantization + export + vLLM generation test.
+# Tested on B200 Blackwell GPUs. Uses the Super NVFP4 mixed-FP8 max-calibration recipe, similar to the published NVFP4 checkpoint (which uses Four Over Six scales).
+#
+# Pipeline:
+#   task_0 (quantize): 4 nodes x 4 GPUs = 16 ranks, TP=1 PP=1 EP=16 ETP=1.
+#                      Loads HF weights from /hf-local, saves PTQ ckpt to /cicd.
+#   task_1 (export):   3 nodes x 4 GPUs = 12 ranks, TP=1 PP=12 EP=1 ETP=1.
+#                      108 layers / PP=12 = 9 layers/stage.
+#   task_2 (generation test):    1 node x 4 GPUs. Serve exported NVFP4 ckpt with vLLM and
+#                      answer 8 GPQA-style questions.
+#
+# Usage:
+#   source .env-slurm
+#   cd tools/launcher
+#   uv run launch.py --yaml examples/nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16/megatron_lm_ptq.yaml --yes
+
+job_name: Nemotron-3-Ultra_PTQ
+pipeline:
+  skip: false
+  allow_to_fail: false
+  note: "PTQ on Nemotron-3-Ultra-550B-A55B-BF16 (super-nvfp4-max-calib): quantize @ 4 nodes, export @ 3 nodes, vLLM generation test@ 1 node"
+
+  task_0:
+    script: common/megatron_lm/quantize/quantize.sh
+    args:
+      - --seq-length 4096 --max-position-embeddings 4096
+      - --skip-generate
+      # Fast calibration. Bump (e.g. --calib-size 512) for production.
+      - --calib-size 32
+    environment:
+      - MLM_MODEL_CFG: nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16
+      - QUANT_CFG: models/Nemotron-3-Super-120B-A12B/super-nvfp4-max-calib
+      - HF_MODEL_CKPT: /hf-local/nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16
+      # MMLU + Export run as separate tasks; quantize.sh does quantize only.
+      - RUN_MMLU: "false"
+      - RUN_EXPORT: "false"
+      - TP: "1"
+      - PP: "1"
+      - EP: "16"
+      - ETP: "1"
+    slurm_config:
+      _factory_: "slurm_factory"
+      container: nvcr.io/nvidia/nemo:26.04
+      modelopt_install_path: /opt/venv/lib/python3.12/site-packages/modelopt
+      partition: batch
+      nodes: 4
+      ntasks_per_node: 4
+      gpus_per_node: 4
+      time: "04:00:00"
+
+  task_1:
+    script: common/megatron_lm/export/export.sh
+    environment:
+      - MLM_MODEL_CFG: nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16
+      - QUANT_CFG: models/Nemotron-3-Super-120B-A12B/super-nvfp4-max-calib
+      - HF_MODEL_CKPT: /hf-local/nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16
+      - TP: "1"
+      - PP: "12"
+      - EP: "1"
+      - ETP: "1"
+    slurm_config:
+      _factory_: "slurm_factory"
+      container: nvcr.io/nvidia/nemo:26.04
+      modelopt_install_path: /opt/venv/lib/python3.12/site-packages/modelopt
+      partition: batch
+      nodes: 3
+      ntasks_per_node: 4
+      gpus_per_node: 4
+      time: "02:00:00"
+
+  # vLLM generation test: serve the exported HF NVFP4 ckpt and answer 8
+  # GPQA-style questions. Inspect responses under /cicd/vllm/<model>/.
+  task_2:
+    script: common/vllm/query.sh
+    args:
+      - --model /cicd/export/nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16_super-nvfp4-max-calib
+      - --tensor-parallel-size 4
+      - --trust-remote-code
+      - --
+      - --data common/vllm/gpqa_sample.jsonl
+      - --max-tokens 256
+      - --num-shards 1
+      - --save /cicd/vllm/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16_super-nvfp4-max-calib
+    slurm_config:
+      _factory_: "slurm_factory"
+      container: vllm/vllm-openai:v0.21.0
+      partition: batch
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      time: "01:00:00"
diff --git a/tools/launcher/modules/Megatron-LM b/tools/launcher/modules/Megatron-LM
@@ -1 +1 @@
-Subproject commit 86bf47659387383b99bb345cc4f2c090c73100b0
+Subproject commit c69697d0510198acddf124694dcfd5057021092f

Original file line number	Diff line number	Diff line change
`@@ -41,7 +41,7 @@ source ${SCRIPT_DIR}/../service_utils.sh`
`41`	`41`	`# vLLM notes:`
`42`	`42`	`# - vLLM manages GPU distribution internally; run with ntasks_per_node: 1`
`43`	`43`	`# in slurm_config and pass --tensor-parallel-size to match gpus_per_node.`
`44`		`-# - NVFP4 models require vllm/vllm-openai:v0.15.0+ on Blackwell GPUs.`
	`44`	`+# - NVFP4 models require vllm/vllm-openai:v0.21.0+ on Blackwell GPUs.`
`45`	`45`	`# - Use --trust-remote-code for models with custom architectures (e.g. Kimi).`
`46`	`46`	`#`
`47`	`47`	`# In a pipeline YAML task config:`