NVIDIA
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 2 additions & 2 deletions b/‎.pre-commit-config.yaml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎examples/deepseek/README.md‎
Lines changed: 86 additions & 5 deletions b/‎examples/deepseek/README.md‎
Lines changed: 86 additions & 5 deletions
diff --git a/‎examples/deepseek/ptq.py‎ ‎examples/deepseek/deepseek_v3/ptq.py‎examples/deepseek/ptq.py renamed to examples/deepseek/deepseek_v3/ptq.py b/‎examples/deepseek/ptq.py‎ ‎examples/deepseek/deepseek_v3/ptq.py‎examples/deepseek/ptq.py renamed to examples/deepseek/deepseek_v3/ptq.py
diff --git a/‎…amples/deepseek/quantize_fp8_to_nvfp4.sh‎ ‎…eek/deepseek_v3/quantize_fp8_to_nvfp4.sh‎examples/deepseek/quantize_fp8_to_nvfp4.sh renamed to examples/deepseek/deepseek_v3/quantize_fp8_to_nvfp4.sh
Lines changed: 3 additions & 1 deletion b/‎…amples/deepseek/quantize_fp8_to_nvfp4.sh‎ ‎…eek/deepseek_v3/quantize_fp8_to_nvfp4.sh‎examples/deepseek/quantize_fp8_to_nvfp4.sh renamed to examples/deepseek/deepseek_v3/quantize_fp8_to_nvfp4.sh
Lines changed: 3 additions & 1 deletion
diff --git a/‎examples/deepseek/quantize_to_nvfp4.py‎ ‎…eepseek/deepseek_v3/quantize_to_nvfp4.py‎examples/deepseek/quantize_to_nvfp4.py renamed to examples/deepseek/deepseek_v3/quantize_to_nvfp4.py b/‎examples/deepseek/quantize_to_nvfp4.py‎ ‎…eepseek/deepseek_v3/quantize_to_nvfp4.py‎examples/deepseek/quantize_to_nvfp4.py renamed to examples/deepseek/deepseek_v3/quantize_to_nvfp4.py
@@ -103,8 +103,8 @@ repos:
               modelopt/torch/speculative/eagle/utils.py|
               modelopt/torch/speculative/plugins/hf_medusa.py|
               modelopt/torch/utils/plugins/megatron_mmlu.py|
-              examples/deepseek/quantize_to_nvfp4.py|
-              examples/deepseek/ptq.py|
+              examples/deepseek/deepseek_v3/quantize_to_nvfp4.py|
+              examples/deepseek/deepseek_v3/ptq.py|
               examples/diffusers/quantization/onnx_utils/export.py|
               examples/llm_eval/lm_eval_hf.py|
               examples/llm_eval/mmlu.py|
 
@@ -6,7 +6,14 @@ This example will demonstrate the steps to quantize DeepSeek models to FP4 and e
 
 Due to the model size, currently it requires 8xH200 or 16xH100 to quantize the FP8 model, we will use 8xH200 as example.
 
-## Convert the HF checkpoint for deepseek FP8 inference
+## Directory Layout
+
+- `deepseek_v3/`: DeepSeek V3, R1, V3.1, and V3.2 FP4 quantization.
+- `deepseek_v4/`: DeepSeek V4 routed-expert NVFP4 quantization.
+
+## DeepSeek V3 FP4
+
+### Convert the HF checkpoint for DeepSeek FP8 inference
 
 ```bash
 # set up variables to run the example
@@ -54,13 +61,13 @@ python inference/convert.py --hf-ckpt-path $HF_FP8_CKPT --save-path $DS_CKPT --n
 DeepSeek V3, R1, V3.1
 
 ```bash
-torchrun --nproc-per-node 8 --master_port=12346 ptq.py --model_path $DS_CKPT --config DeepSeek-V3/inference/configs/config_671B.json --quant_cfg NVFP4_DEFAULT_CFG --output_path $FP4_QUANT_PATH
+torchrun --nproc-per-node 8 --master_port=12346 deepseek_v3/ptq.py --model_path $DS_CKPT --config DeepSeek-V3/inference/configs/config_671B.json --quant_cfg NVFP4_DEFAULT_CFG --output_path $FP4_QUANT_PATH
 ```
 
 DeepSeek V3.2
 
 ```bash
-torchrun --nproc-per-node 8 --master_port=12346 ptq.py --model_path $DS_CKPT --config DeepSeek-V3.2-Exp/inference/config_671B_v3.2.json --quant_cfg NVFP4_DEFAULT_CFG --output_path $FP4_QUANT_PATH
+torchrun --nproc-per-node 8 --master_port=12346 deepseek_v3/ptq.py --model_path $DS_CKPT --config DeepSeek-V3.2-Exp/inference/config_671B_v3.2.json --quant_cfg NVFP4_DEFAULT_CFG --output_path $FP4_QUANT_PATH
 ```
 
 #### MoE expert calibration
@@ -78,7 +85,7 @@ during calibration (slower, ~2x forwards, no post-calibration sync) — pass
 `--calib_all_experts`:
 
 ```bash
-torchrun --nproc-per-node 8 --master_port=12346 ptq.py --model_path $DS_CKPT --config DeepSeek-V3.2-Exp/inference/config_671B_v3.2.json --quant_cfg NVFP4_DEFAULT_CFG --output_path $FP4_QUANT_PATH --calib_all_experts
+torchrun --nproc-per-node 8 --master_port=12346 deepseek_v3/ptq.py --model_path $DS_CKPT --config DeepSeek-V3.2-Exp/inference/config_671B_v3.2.json --quant_cfg NVFP4_DEFAULT_CFG --output_path $FP4_QUANT_PATH --calib_all_experts
 ```
 
 A summary of every TensorQuantizer is written to `$FP4_QUANT_PATH/.quant_summary.txt`.
@@ -91,5 +98,79 @@ We provide a one-step-script which will:
 - Copy miscellaneous files to the quantized checkpoint
 
 ```bash
-./quantize_fp8_to_nvfp4.sh --amax_path $FP4_QUANT_PATH --fp4_output_path $HF_FP4_PATH --fp8_hf_path $HF_FP8_CKPT --world_size 8
+./deepseek_v3/quantize_fp8_to_nvfp4.sh --amax_path $FP4_QUANT_PATH --fp4_output_path $HF_FP4_PATH --fp8_hf_path $HF_FP8_CKPT --world_size 8
 ```
+
+## DeepSeek V4 routed-expert NVFP4
+
+DeepSeek V4 uses a mixed native checkpoint layout. The V4 recipe quantizes
+only the routed experts to NVFP4 W4A4 and leaves attention projections, the
+router gate, shared experts, embeddings, and the LM head in their original
+formats.
+
+### Prepare the MP checkpoint
+
+Keep experts in MXFP4 when resharding with DeepSeek's own `convert.py`:
+
+```bash
+export DS_V4=/path/to/DeepSeek-V4-Pro
+export MP=8
+export MP_CKPT=/path/to/DeepSeek-V4-Pro-mp${MP}-mxfp4
+export AMAX=/path/to/amax-nvfp4-experts
+export HF_NVFP4_PATH=/path/to/DeepSeek-V4-Pro-nvfp4-experts
+
+python ${DS_V4}/inference/convert.py \
+    --hf-ckpt-path ${DS_V4} \
+    --save-path ${MP_CKPT} \
+    --n-experts 384 \
+    --model-parallel ${MP}
+```
+
+### Calibrate routed experts
+
+Single node:
+
+```bash
+torchrun --nproc-per-node ${MP} --master_port 12346 deepseek_v4/ptq.py \
+    --model_path ${MP_CKPT} \
+    --config ${DS_V4}/inference/config.json \
+    --dsv4_inference_dir ${DS_V4}/inference \
+    --output_path ${AMAX}
+```
+
+Two 4-GPU nodes for `MP=8`:
+
+```bash
+# node 0 (MASTER_ADDR must hold the IP address of node 0, e.g. `export MASTER_ADDR=10.0.0.1`)
+torchrun --nnodes=2 --node_rank=0 --master_addr=${MASTER_ADDR} --master_port=12346 \
+    --nproc-per-node 4 deepseek_v4/ptq.py \
+    --model_path ${MP_CKPT} \
+    --config ${DS_V4}/inference/config.json \
+    --dsv4_inference_dir ${DS_V4}/inference \
+    --output_path ${AMAX}
+
+# node 1 (use the same MASTER_ADDR as on node 0, i.e. the IP address of node 0)
+torchrun --nnodes=2 --node_rank=1 --master_addr=${MASTER_ADDR} --master_port=12346 \
+    --nproc-per-node 4 deepseek_v4/ptq.py \
+    --model_path ${MP_CKPT} \
+    --config ${DS_V4}/inference/config.json \
+    --dsv4_inference_dir ${DS_V4}/inference \
+    --output_path ${AMAX}
+```
+
+### Export back to HF shard layout
+
+`deepseek_v4/quantize_to_nvfp4.py` takes the calibrated amax values (`--amax_path`) and the
+original HF-style V4 checkpoint, and produces a new HF-style checkpoint with routed expert
+weights replaced by NVFP4 tensors plus `weight_scale`, `weight_scale_2`, and `input_scale`.
+
+```bash
+python deepseek_v4/quantize_to_nvfp4.py \
+    --amax_path ${AMAX} \
+    --source_ckpt ${DS_V4} \
+    --output_ckpt ${HF_NVFP4_PATH}
+```
+
+The output includes an updated `model.safetensors.index.json`, a `config.json`
+with `quantization_config.moe_quant_algo = "NVFP4"`, and `hf_quant_config.json`
+describing the mixed NVFP4 expert layers.
@@ -16,6 +16,8 @@
 
 set -e  # Exit immediately if any command fails
 
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
 usage() {
     echo "Usage: $0 --amax_path <path> --fp4_output_path <path> --fp8_hf_path <path> [--world_size <n>]"
     exit 1
@@ -84,7 +86,7 @@ cp -r $FP8_HF_PATH/assets $FP4_PATH/ || true
 
 # Run the quantization command
 echo "Running quantization..."
-python quantize_to_nvfp4.py \
+python "$SCRIPT_DIR/quantize_to_nvfp4.py" \
     --amax_path "$AMAX_PATH" \
     --fp4_path "$FP4_PATH" \
     --fp8_hf_path "$FP8_HF_PATH" \