NVIDIA-NeMo · dingqingy-nv · Jun 10, 2026 · Jun 9, 2026 · Jun 9, 2026 · claude
diff --git a/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py b/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py
@@ -118,6 +118,9 @@ def deepseek_v3_pretrain_config_gb200(
     cfg = pretrain_config()
     cfg.mixed_precision = precision_config
 
+    if cfg.mixed_precision.fp8_recipe == "mxfp8":
+        cfg.model.fp8_output_proj = True
+
     # Apply model-specific settings that were previously passed as constructor args
     cfg.model.pipeline_model_parallel_size = base_cfg.pipeline_model_parallel_size
     cfg.model.virtual_pipeline_model_parallel_size = base_cfg.virtual_pipeline_model_parallel_size

diff --git a/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py b/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py
@@ -95,7 +95,14 @@
     recompute_modules=["mla_up_proj"],
 )
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_CS_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1
-DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1
+DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = replace(
+    DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1,  # base; previously MX_V1 was a plain alias of this config
+    cuda_graph_impl="full_iteration",
+    cuda_graph_scope=[],
+    moe_a2a_overlap=True,
+    cutedsl_fused_grouped_mlp=True,
+    recompute_modules=["mla_up_proj"],  # NOTE(review): same value as the definition just above; redundant if that is GB200_V1 - confirm
+)
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_NVFP4_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1
 
 
@@ -199,7 +206,10 @@
     global_batch_size=4096,
 )
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_CS_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2
-DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2
+DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V2 = replace(
+    DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V1,  # NOTE(review): based on MX_V1, no longer on GB200_V2 - confirm no other V2 fields are needed
+    global_batch_size=4096,
+)
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_NVFP4_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2