Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions scripts/performance/configs/deepseek/deepseek_llm_pretrain.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,9 @@ def deepseek_v3_pretrain_config_gb200(
cfg = pretrain_config()
cfg.mixed_precision = precision_config

if cfg.mixed_precision.fp8_recipe == "mxfp8":
cfg.model.fp8_output_proj = True

# Apply model-specific settings that were previously passed as constructor args
cfg.model.pipeline_model_parallel_size = base_cfg.pipeline_model_parallel_size
cfg.model.virtual_pipeline_model_parallel_size = base_cfg.virtual_pipeline_model_parallel_size
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,14 @@
recompute_modules=["mla_up_proj"],
)
# FP8_CS V1 variant: a plain alias of the base GB200 V1 config (same object, no overrides).
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_CS_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1
# NOTE(review): this plain alias is rebound by the replace(...) assignment directly
# below. The hunk header (-95,7 +95,14) suggests it is the pre-change line of the diff
# rather than a live statement — confirm against the merged file.
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1
# FP8_MX V1 variant: derived from the base GB200 V1 config with the keyword overrides
# listed below. Presumably `replace` is dataclasses.replace (import not visible here),
# in which case the base config is left unmodified — TODO confirm.
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = replace(
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1,
cuda_graph_impl="full_iteration",
cuda_graph_scope=[],
moe_a2a_overlap=True,
cutedsl_fused_grouped_mlp=True,
# NOTE(review): the config that ends just above this block also sets
# recompute_modules=["mla_up_proj"]; if that is the base GB200 V1 config, this
# override is redundant — confirm.
recompute_modules=["mla_up_proj"],
# NOTE(review): fp8_dot_product_attention is not overridden here; confirm whether
# that omission is intentional for GB200.
)
Comment on lines +98 to +105

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: GB300 MX V1 sets fp8_dot_product_attention=True (line 74) but the new GB200 MX V1 does not. The PR description says "Recipe shape matches GB300 except for recompute_modules=["mla_up_proj"]" — is the omission of fp8_dot_product_attention intentional (e.g. incompatible with GB200's PP=4/VP=4 layout), or should it be added here?

# NVFP4 V1 variant: a plain alias of the base GB200 V1 config (same object, no overrides).
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_NVFP4_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1


Expand Down Expand Up @@ -199,7 +206,10 @@
global_batch_size=4096,
)
# FP8_CS V2 variant: a plain alias of the base GB200 V2 config (same object, no overrides).
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_CS_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2
# NOTE(review): this plain alias is rebound by the replace(...) assignment directly
# below. The hunk header (-199,7 +206,10) suggests it is the pre-change line of the
# diff rather than a live statement — confirm against the merged file.
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2
# FP8_MX V2 variant: derived from the FP8_MX V1 variant with only global_batch_size
# overridden.
# NOTE(review): the base here is FP8_MX_V1, not GB200_V2, so any setting in which
# GB200_V2 differs from GB200_V1 other than global_batch_size would not carry over —
# confirm that V2 differs from V1 only in global_batch_size.
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V2 = replace(
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V1,
global_batch_size=4096,
)
# NVFP4 V2 variant: a plain alias of the base GB200 V2 config (same object, no overrides).
DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_NVFP4_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2


Expand Down
Loading