Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ policy:
empty_unused_memory_level: 2
enabled: true
activation_checkpointing: true
moe_grouped_gemm: true
tensor_model_parallel_size: 8
expert_model_parallel_size: 32
pipeline_model_parallel_size: 8
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ policy:
pipeline_model_parallel_size: 16
expert_model_parallel_size: 16
activation_checkpointing: true
moe_grouped_gemm: true
num_layers_in_first_pipeline_stage: 3
num_layers_in_last_pipeline_stage: 2
apply_rope_fusion: false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ policy:
context_parallel_size: 2
expert_model_parallel_size: 16
activation_checkpointing: true
moe_grouped_gemm: true
num_layers_in_first_pipeline_stage: 11
num_layers_in_last_pipeline_stage: 11
defer_fp32_logits: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ policy:
pipeline_model_parallel_size: 1
expert_model_parallel_size: 16
sequence_parallel: false
moe_grouped_gemm: true
optimizer:
lr: 3.0e-07
min_lr: 3.0e-08
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ policy:
expert_model_parallel_size: 8
sequence_parallel: true
context_parallel_size: 8
moe_grouped_gemm: true
optimizer:
lr: 3.0e-07
min_lr: 3.0e-08
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ policy:
pipeline_model_parallel_size: 1
expert_model_parallel_size: 8
sequence_parallel: false
moe_grouped_gemm: true
optimizer:
lr: 3.0e-07
min_lr: 3.0e-08
Expand Down
3 changes: 3 additions & 0 deletions nemo_rl/models/megatron/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,9 @@ def _apply_moe_config(model_cfg: Any, config: PolicyConfig) -> None:

model_cfg.moe_permute_fusion = config["megatron_cfg"]["moe_permute_fusion"]

if "moe_grouped_gemm" in config["megatron_cfg"]:
model_cfg.moe_grouped_gemm = config["megatron_cfg"]["moe_grouped_gemm"]


def _apply_mtp_config(model_cfg: Any, config: PolicyConfig) -> None:
if "mtp_num_layers" in config["megatron_cfg"]:
Expand Down
4 changes: 4 additions & 0 deletions nemo_rl/models/policy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,10 @@ class MegatronConfig(TypedDict):
moe_token_dispatcher_type: str
# Can be used only with 'alltoall' token dispatcher
moe_shared_expert_overlap: bool
# Enable grouped GEMM for MoE experts via CUTLASS. This gives a significant
# throughput gain when multiple experts are assigned per rank
# (num_local_experts > 1).
# Requires an Ampere (sm_80) or newer GPU; FP8 additionally requires TE >= 1.11.0.
moe_grouped_gemm: NotRequired[bool]
Comment thread
seonjinn marked this conversation as resolved.
peft: NotRequired[MegatronPeftConfig | MegatronPeftConfigDisabled]
optimizer: MegatronOptimizerConfig
scheduler: MegatronSchedulerConfig
Expand Down
Loading