Skip to content

Commit cd3fc0c

Browse files
dingqingy-nv authored and vasunvidia committed
feat(recipe): DSV3 GB200 MXFP8 full-iter CG recipe (NVIDIA-NeMo#4226)
Signed-off-by: Dingqing Yang <dingqingy@nvidia.com>
Signed-off-by: Vasudevan Rengasamy <vrengasamy@nvidia.com>
1 parent ccff5cb commit cd3fc0c

2 files changed

Lines changed: 15 additions & 2 deletions

File tree

scripts/performance/configs/deepseek/deepseek_llm_pretrain.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -118,6 +118,9 @@ def deepseek_v3_pretrain_config_gb200(
118118
cfg = pretrain_config()
119119
cfg.mixed_precision = precision_config
120120

121+
if cfg.mixed_precision.fp8_recipe == "mxfp8":
122+
cfg.model.fp8_output_proj = True
123+
121124
# Apply model-specific settings that were previously passed as constructor args
122125
cfg.model.pipeline_model_parallel_size = base_cfg.pipeline_model_parallel_size
123126
cfg.model.virtual_pipeline_model_parallel_size = base_cfg.virtual_pipeline_model_parallel_size

scripts/performance/configs/deepseek/deepseek_workload_base_configs.py

Lines changed: 12 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -95,7 +95,14 @@
9595
recompute_modules=["mla_up_proj"],
9696
)
9797
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_CS_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1
98-
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1
98+
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = replace(
99+
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1,
100+
cuda_graph_impl="full_iteration",
101+
cuda_graph_scope=[],
102+
moe_a2a_overlap=True,
103+
cutedsl_fused_grouped_mlp=True,
104+
recompute_modules=["mla_up_proj"],
105+
)
99106
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_NVFP4_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1
100107

101108

@@ -199,7 +206,10 @@
199206
global_batch_size=4096,
200207
)
201208
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_CS_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2
202-
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2
209+
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V2 = replace(
210+
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V1,
211+
global_batch_size=4096,
212+
)
203213
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_NVFP4_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2
204214

205215

0 commit comments

Comments
 (0)